Stable-X commited on
Commit
f3ff4f1
·
verified ·
1 Parent(s): 3652684

Upload 288 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. README.md +8 -5
  3. app.py +392 -0
  4. assets/example_multi_image/SpongeBob_1.png +3 -0
  5. assets/example_multi_image/SpongeBob_2.png +3 -0
  6. assets/example_multi_image/SpongeBob_3.png +3 -0
  7. assets/example_multi_image/SpongeBob_4.png +3 -0
  8. assets/example_multi_image/chair_1.png +3 -0
  9. assets/example_multi_image/chair_2.png +3 -0
  10. assets/example_multi_image/chair_3.png +3 -0
  11. assets/example_multi_image/flower_1.png +3 -0
  12. assets/example_multi_image/flower_2.png +3 -0
  13. assets/example_multi_image/flower_3.png +3 -0
  14. assets/example_multi_image/flower_4.png +3 -0
  15. assets/example_multi_image/flower_5.png +3 -0
  16. assets/example_multi_image/flower_6.png +3 -0
  17. assets/example_multi_image/flower_7.png +3 -0
  18. assets/example_multi_image/flower_8.png +3 -0
  19. assets/example_multi_image/monkey_1.png +3 -0
  20. assets/example_multi_image/monkey_2.png +3 -0
  21. assets/example_multi_image/monkey_3.png +3 -0
  22. assets/example_multi_image/monkey_4.png +3 -0
  23. assets/example_multi_image/paopao_1.png +3 -0
  24. assets/example_multi_image/paopao_2.png +3 -0
  25. assets/example_multi_image/paopao_3.png +3 -0
  26. assets/example_multi_image/paopao_4.png +3 -0
  27. assets/example_multi_image/paopao_5.png +3 -0
  28. assets/example_multi_image/paopao_6.png +3 -0
  29. assets/example_multi_image/paopao_7.png +3 -0
  30. assets/example_multi_image/paopao_8.png +3 -0
  31. assets/example_multi_image/puppet_1.png +3 -0
  32. assets/example_multi_image/puppet_2.png +3 -0
  33. assets/example_multi_image/puppet_3.png +3 -0
  34. assets/example_multi_image/robot_1.png +3 -0
  35. assets/example_multi_image/robot_2.png +3 -0
  36. assets/example_multi_image/toolcar_1.png +3 -0
  37. assets/example_multi_image/toolcar_2.png +3 -0
  38. assets/example_multi_image/toolcar_3.png +3 -0
  39. extensions/nvdiffrast/LICENSE.txt +97 -0
  40. extensions/nvdiffrast/README.md +42 -0
  41. extensions/nvdiffrast/nvdiffrast/__init__.py +9 -0
  42. extensions/nvdiffrast/nvdiffrast/common/antialias.cu +558 -0
  43. extensions/nvdiffrast/nvdiffrast/common/antialias.h +50 -0
  44. extensions/nvdiffrast/nvdiffrast/common/common.cpp +60 -0
  45. extensions/nvdiffrast/nvdiffrast/common/common.h +263 -0
  46. extensions/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp +63 -0
  47. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl +423 -0
  48. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp +94 -0
  49. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp +55 -0
  50. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl +730 -0
.gitattributes CHANGED
@@ -33,3 +33,38 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/example_multi_image/chair_1.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/example_multi_image/chair_2.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/example_multi_image/chair_3.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/example_multi_image/flower_1.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/example_multi_image/flower_2.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/example_multi_image/flower_3.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/example_multi_image/flower_4.png filter=lfs diff=lfs merge=lfs -text
43
+ assets/example_multi_image/flower_5.png filter=lfs diff=lfs merge=lfs -text
44
+ assets/example_multi_image/flower_6.png filter=lfs diff=lfs merge=lfs -text
45
+ assets/example_multi_image/flower_7.png filter=lfs diff=lfs merge=lfs -text
46
+ assets/example_multi_image/flower_8.png filter=lfs diff=lfs merge=lfs -text
47
+ assets/example_multi_image/monkey_1.png filter=lfs diff=lfs merge=lfs -text
48
+ assets/example_multi_image/monkey_2.png filter=lfs diff=lfs merge=lfs -text
49
+ assets/example_multi_image/monkey_3.png filter=lfs diff=lfs merge=lfs -text
50
+ assets/example_multi_image/monkey_4.png filter=lfs diff=lfs merge=lfs -text
51
+ assets/example_multi_image/paopao_1.png filter=lfs diff=lfs merge=lfs -text
52
+ assets/example_multi_image/paopao_2.png filter=lfs diff=lfs merge=lfs -text
53
+ assets/example_multi_image/paopao_3.png filter=lfs diff=lfs merge=lfs -text
54
+ assets/example_multi_image/paopao_4.png filter=lfs diff=lfs merge=lfs -text
55
+ assets/example_multi_image/paopao_5.png filter=lfs diff=lfs merge=lfs -text
56
+ assets/example_multi_image/paopao_6.png filter=lfs diff=lfs merge=lfs -text
57
+ assets/example_multi_image/paopao_7.png filter=lfs diff=lfs merge=lfs -text
58
+ assets/example_multi_image/paopao_8.png filter=lfs diff=lfs merge=lfs -text
59
+ assets/example_multi_image/puppet_1.png filter=lfs diff=lfs merge=lfs -text
60
+ assets/example_multi_image/puppet_2.png filter=lfs diff=lfs merge=lfs -text
61
+ assets/example_multi_image/puppet_3.png filter=lfs diff=lfs merge=lfs -text
62
+ assets/example_multi_image/robot_1.png filter=lfs diff=lfs merge=lfs -text
63
+ assets/example_multi_image/robot_2.png filter=lfs diff=lfs merge=lfs -text
64
+ assets/example_multi_image/SpongeBob_1.png filter=lfs diff=lfs merge=lfs -text
65
+ assets/example_multi_image/SpongeBob_2.png filter=lfs diff=lfs merge=lfs -text
66
+ assets/example_multi_image/SpongeBob_3.png filter=lfs diff=lfs merge=lfs -text
67
+ assets/example_multi_image/SpongeBob_4.png filter=lfs diff=lfs merge=lfs -text
68
+ assets/example_multi_image/toolcar_1.png filter=lfs diff=lfs merge=lfs -text
69
+ assets/example_multi_image/toolcar_2.png filter=lfs diff=lfs merge=lfs -text
70
+ assets/example_multi_image/toolcar_3.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,16 @@
1
  ---
2
  title: ReconViaGen
3
- emoji: 💻
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
  title: ReconViaGen
3
+ emoji: 🖥️
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.34.2
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ short_description: High-fidelity 3D Geometry Generation from single view image
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+ Project Page: https://jiahao620.github.io/reconviagen/
app.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ from gradio_litmodel3d import LitModel3D
4
+
5
+ import os
6
+ import shutil
7
+ os.environ['SPCONV_ALGO'] = 'native'
8
+ from typing import *
9
+ import torch
10
+ import numpy as np
11
+ import imageio
12
+ from easydict import EasyDict as edict
13
+ from PIL import Image
14
+ from trellis.pipelines import TrellisVGGTTo3DPipeline
15
+ from trellis.representations import Gaussian, MeshExtractResult
16
+ from trellis.utils import render_utils, postprocessing_utils
17
+
18
+
19
+
20
+ MAX_SEED = np.iinfo(np.int32).max
21
+ # TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
22
+ TMP_DIR = "tmp/Trellis-demo"
23
+ os.environ['GRADIO_TEMP_DIR'] = 'tmp'
24
+ os.makedirs(TMP_DIR, exist_ok=True)
25
+
26
+ def start_session(req: gr.Request):
27
+ user_dir = os.path.join(TMP_DIR, str(req.session_hash))
28
+ os.makedirs(user_dir, exist_ok=True)
29
+
30
+
31
+ def end_session(req: gr.Request):
32
+ user_dir = os.path.join(TMP_DIR, str(req.session_hash))
33
+ shutil.rmtree(user_dir)
34
+ def preprocess_image(image: Image.Image) -> Image.Image:
35
+ """
36
+ Preprocess the input image for 3D generation.
37
+
38
+ This function is called when a user uploads an image or selects an example.
39
+ It applies background removal and other preprocessing steps necessary for
40
+ optimal 3D model generation.
41
+
42
+ Args:
43
+ image (Image.Image): The input image from the user
44
+
45
+ Returns:
46
+ Image.Image: The preprocessed image ready for 3D generation
47
+ """
48
+ processed_image = pipeline.preprocess_image(image)
49
+ return processed_image
50
+
51
+
52
+ def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
53
+ """
54
+ Preprocess a list of input images for multi-image 3D generation.
55
+
56
+ This function is called when users upload multiple images in the gallery.
57
+ It processes each image to prepare them for the multi-image 3D generation pipeline.
58
+
59
+ Args:
60
+ images (List[Tuple[Image.Image, str]]): The input images from the gallery
61
+
62
+ Returns:
63
+ List[Image.Image]: The preprocessed images ready for 3D generation
64
+ """
65
+ images = [image[0] for image in images]
66
+ processed_images = [pipeline.preprocess_image(image) for image in images]
67
+ return processed_images
68
+
69
+
70
+ def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
71
+ return {
72
+ 'gaussian': {
73
+ **gs.init_params,
74
+ '_xyz': gs._xyz.cpu().numpy(),
75
+ '_features_dc': gs._features_dc.cpu().numpy(),
76
+ '_scaling': gs._scaling.cpu().numpy(),
77
+ '_rotation': gs._rotation.cpu().numpy(),
78
+ '_opacity': gs._opacity.cpu().numpy(),
79
+ },
80
+ 'mesh': {
81
+ 'vertices': mesh.vertices.cpu().numpy(),
82
+ 'faces': mesh.faces.cpu().numpy(),
83
+ },
84
+ }
85
+
86
+
87
+ def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
88
+ gs = Gaussian(
89
+ aabb=state['gaussian']['aabb'],
90
+ sh_degree=state['gaussian']['sh_degree'],
91
+ mininum_kernel_size=state['gaussian']['mininum_kernel_size'],
92
+ scaling_bias=state['gaussian']['scaling_bias'],
93
+ opacity_bias=state['gaussian']['opacity_bias'],
94
+ scaling_activation=state['gaussian']['scaling_activation'],
95
+ )
96
+ gs._xyz = torch.tensor(state['gaussian']['_xyz'], device='cuda')
97
+ gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device='cuda')
98
+ gs._scaling = torch.tensor(state['gaussian']['_scaling'], device='cuda')
99
+ gs._rotation = torch.tensor(state['gaussian']['_rotation'], device='cuda')
100
+ gs._opacity = torch.tensor(state['gaussian']['_opacity'], device='cuda')
101
+
102
+ mesh = edict(
103
+ vertices=torch.tensor(state['mesh']['vertices'], device='cuda'),
104
+ faces=torch.tensor(state['mesh']['faces'], device='cuda'),
105
+ )
106
+
107
+ return gs, mesh
108
+
109
+
110
+ def get_seed(randomize_seed: bool, seed: int) -> int:
111
+ """
112
+ Get the random seed for generation.
113
+
114
+ This function is called by the generate button to determine whether to use
115
+ a random seed or the user-specified seed value.
116
+
117
+ Args:
118
+ randomize_seed (bool): Whether to generate a random seed
119
+ seed (int): The user-specified seed value
120
+
121
+ Returns:
122
+ int: The seed to use for generation
123
+ """
124
+ return np.random.randint(0, MAX_SEED) if randomize_seed else seed
125
+
126
+
127
+ @spaces.GPU(duration=120)
128
+ def generate_and_extract_glb(
129
+ multiimages: List[Tuple[Image.Image, str]],
130
+ seed: int,
131
+ ss_guidance_strength: float,
132
+ ss_sampling_steps: int,
133
+ slat_guidance_strength: float,
134
+ slat_sampling_steps: int,
135
+ multiimage_algo: Literal["multidiffusion", "stochastic"],
136
+ mesh_simplify: float,
137
+ texture_size: int,
138
+ req: gr.Request,
139
+ ) -> Tuple[dict, str, str, str]:
140
+ """
141
+ Convert an image to a 3D model and extract GLB file.
142
+
143
+ Args:
144
+ image (Image.Image): The input image.
145
+ multiimages (List[Tuple[Image.Image, str]]): The input images in multi-image mode.
146
+ is_multiimage (bool): Whether is in multi-image mode.
147
+ seed (int): The random seed.
148
+ ss_guidance_strength (float): The guidance strength for sparse structure generation.
149
+ ss_sampling_steps (int): The number of sampling steps for sparse structure generation.
150
+ slat_guidance_strength (float): The guidance strength for structured latent generation.
151
+ slat_sampling_steps (int): The number of sampling steps for structured latent generation.
152
+ multiimage_algo (Literal["multidiffusion", "stochastic"]): The algorithm for multi-image generation.
153
+ mesh_simplify (float): The mesh simplification factor.
154
+ texture_size (int): The texture resolution.
155
+
156
+ Returns:
157
+ dict: The information of the generated 3D model.
158
+ str: The path to the video of the 3D model.
159
+ str: The path to the extracted GLB file.
160
+ str: The path to the extracted GLB file (for download).
161
+ """
162
+ user_dir = os.path.join(TMP_DIR, str(req.session_hash))
163
+ image_files = [image[0] for image in multiimages]
164
+
165
+ # Generate 3D model
166
+ outputs = pipeline.run(
167
+ image=image_files,
168
+ seed=seed,
169
+ formats=["gaussian", "mesh"],
170
+ preprocess_image=False,
171
+ sparse_structure_sampler_params={
172
+ "steps": ss_sampling_steps,
173
+ "cfg_strength": ss_guidance_strength,
174
+ },
175
+ slat_sampler_params={
176
+ "steps": slat_sampling_steps,
177
+ "cfg_strength": slat_guidance_strength,
178
+ },
179
+ mode=multiimage_algo,
180
+ )
181
+
182
+ # Render video
183
+ video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
184
+ video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
185
+ video = [np.concatenate([video[i], video_geo[i]], axis=1) for i in range(len(video))]
186
+ video_path = os.path.join(user_dir, 'sample.mp4')
187
+ imageio.mimsave(video_path, video, fps=15)
188
+
189
+ # Extract GLB
190
+ gs = outputs['gaussian'][0]
191
+ mesh = outputs['mesh'][0]
192
+ glb = postprocessing_utils.to_glb(gs, mesh, simplify=mesh_simplify, texture_size=texture_size, verbose=False)
193
+ glb_path = os.path.join(user_dir, 'sample.glb')
194
+ glb.export(glb_path)
195
+
196
+ # Pack state for optional Gaussian extraction
197
+ state = pack_state(gs, mesh)
198
+
199
+ torch.cuda.empty_cache()
200
+ return state, video_path, glb_path, glb_path
201
+
202
+
203
+ @spaces.GPU
204
+ def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
205
+ """
206
+ Extract a Gaussian splatting file from the generated 3D model.
207
+
208
+ This function is called when the user clicks "Extract Gaussian" button.
209
+ It converts the 3D model state into a .ply file format containing
210
+ Gaussian splatting data for advanced 3D applications.
211
+
212
+ Args:
213
+ state (dict): The state of the generated 3D model containing Gaussian data
214
+ req (gr.Request): Gradio request object for session management
215
+
216
+ Returns:
217
+ Tuple[str, str]: Paths to the extracted Gaussian file (for display and download)
218
+ """
219
+ user_dir = os.path.join(TMP_DIR, str(req.session_hash))
220
+ gs, _ = unpack_state(state)
221
+ gaussian_path = os.path.join(user_dir, 'sample.ply')
222
+ gs.save_ply(gaussian_path)
223
+ torch.cuda.empty_cache()
224
+ return gaussian_path, gaussian_path
225
+
226
+
227
+ def prepare_multi_example() -> List[Image.Image]:
228
+ multi_case = list(set([i.split('_')[0] for i in os.listdir("assets/example_multi_image")]))
229
+ images = []
230
+ for case in multi_case:
231
+ _images = []
232
+ for i in range(1, 9):
233
+ if os.path.exists(f'assets/example_multi_image/{case}_{i}.png'):
234
+ img = Image.open(f'assets/example_multi_image/{case}_{i}.png')
235
+ W, H = img.size
236
+ img = img.resize((int(W / H * 512), 512))
237
+ _images.append(np.array(img))
238
+ if len(_images) > 0:
239
+ images.append(Image.fromarray(np.concatenate(_images, axis=1)))
240
+ return images
241
+
242
+
243
+ def split_image(image: Image.Image) -> List[Image.Image]:
244
+ """
245
+ Split a multi-view image into separate view images.
246
+
247
+ This function is called when users select multi-image examples that contain
248
+ multiple views in a single concatenated image. It automatically splits them
249
+ based on alpha channel boundaries and preprocesses each view.
250
+
251
+ Args:
252
+ image (Image.Image): A concatenated image containing multiple views
253
+
254
+ Returns:
255
+ List[Image.Image]: List of individual preprocessed view images
256
+ """
257
+ image = np.array(image)
258
+ alpha = image[..., 3]
259
+ alpha = np.any(alpha>0, axis=0)
260
+ start_pos = np.where(~alpha[:-1] & alpha[1:])[0].tolist()
261
+ end_pos = np.where(alpha[:-1] & ~alpha[1:])[0].tolist()
262
+ images = []
263
+ for s, e in zip(start_pos, end_pos):
264
+ images.append(Image.fromarray(image[:, s:e+1]))
265
+ return [preprocess_image(image) for image in images]
266
+
267
+
268
+ with gr.Blocks(delete_cache=(600, 600)) as demo:
269
+ gr.Markdown("""
270
+ ## Multi-view images to 3D Asset with [ReconViaGen](https://jiahao620.github.io/reconviagen/)
271
+ * Upload an image and click "Generate & Extract GLB" to create a 3D asset and automatically extract the GLB file.
272
+ * If you want the Gaussian file as well, click "Extract Gaussian" after generation.
273
+ * If the image has alpha channel, it will be used as the mask. Otherwise, we use `rembg` to remove the background.
274
+
275
+ ✨This demo is partial. We will release the whole model later. Stay tuned!✨
276
+ """)
277
+
278
+ with gr.Row():
279
+ with gr.Column():
280
+ with gr.Tabs() as input_tabs:
281
+ with gr.Tab(label="Multiple Images", id=0) as multiimage_input_tab:
282
+ image_prompt = gr.Image(label="Image Prompt", format="png", visible=False, image_mode="RGBA", type="pil", height=300)
283
+ multiimage_prompt = gr.Gallery(label="Image Prompt", format="png", type="pil", height=300, columns=3)
284
+ gr.Markdown("""
285
+ Input different views of the object in separate images.
286
+
287
+ *NOTE: this is an experimental algorithm without training a specialized model. It may not produce the best results for all images, especially those having different poses or inconsistent details.*
288
+ """)
289
+
290
+ with gr.Accordion(label="Generation Settings", open=False):
291
+ seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
292
+ randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)
293
+ gr.Markdown("Stage 1: Sparse Structure Generation")
294
+ with gr.Row():
295
+ ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
296
+ ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=30, step=1)
297
+ gr.Markdown("Stage 2: Structured Latent Generation")
298
+ with gr.Row():
299
+ slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=3.0, step=0.1)
300
+ slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
301
+ multiimage_algo = gr.Radio(["stochastic", "multidiffusion"], label="Multi-image Algorithm", value="multidiffusion")
302
+
303
+ with gr.Accordion(label="GLB Extraction Settings", open=False):
304
+ mesh_simplify = gr.Slider(0.9, 0.98, label="Simplify", value=0.95, step=0.01)
305
+ texture_size = gr.Slider(512, 2048, label="Texture Size", value=1024, step=512)
306
+
307
+ generate_btn = gr.Button("Generate & Extract GLB", variant="primary")
308
+ extract_gs_btn = gr.Button("Extract Gaussian", interactive=False)
309
+ gr.Markdown("""
310
+ *NOTE: Gaussian file can be very large (~50MB), it will take a while to display and download.*
311
+ """)
312
+
313
+ with gr.Column():
314
+ video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
315
+ model_output = LitModel3D(label="Extracted GLB/Gaussian", exposure=10.0, height=300)
316
+
317
+ with gr.Row():
318
+ download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
319
+ download_gs = gr.DownloadButton(label="Download Gaussian", interactive=False)
320
+
321
+ output_buf = gr.State()
322
+
323
+ # Example images at the bottom of the page
324
+ with gr.Row() as multiimage_example:
325
+ examples_multi = gr.Examples(
326
+ examples=prepare_multi_example(),
327
+ inputs=[image_prompt],
328
+ fn=split_image,
329
+ outputs=[multiimage_prompt],
330
+ run_on_click=True,
331
+ examples_per_page=8,
332
+ )
333
+
334
+ # Handlers
335
+ demo.load(start_session)
336
+ demo.unload(end_session)
337
+
338
+
339
+ multiimage_prompt.upload(
340
+ preprocess_images,
341
+ inputs=[multiimage_prompt],
342
+ outputs=[multiimage_prompt],
343
+ )
344
+
345
+ generate_btn.click(
346
+ get_seed,
347
+ inputs=[randomize_seed, seed],
348
+ outputs=[seed],
349
+ ).then(
350
+ lambda: [None, None, None, None], # 先清空 video_output
351
+ inputs=[],
352
+ outputs=[video_output, model_output, download_glb, download_gs],
353
+ ).then(
354
+ generate_and_extract_glb,
355
+ inputs=[multiimage_prompt, seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps, multiimage_algo, mesh_simplify, texture_size],
356
+ outputs=[output_buf, video_output, model_output, download_glb],
357
+ ).then(
358
+ lambda: tuple([gr.Button(interactive=True), gr.Button(interactive=True)]),
359
+ outputs=[extract_gs_btn, download_glb],
360
+ )
361
+
362
+ video_output.clear(
363
+ lambda: tuple([gr.Button(interactive=False), gr.Button(interactive=False), gr.Button(interactive=False)]),
364
+ outputs=[extract_gs_btn, download_glb, download_gs],
365
+ )
366
+
367
+ extract_gs_btn.click(
368
+ extract_gaussian,
369
+ inputs=[output_buf],
370
+ outputs=[model_output, download_gs],
371
+ ).then(
372
+ lambda: gr.Button(interactive=True),
373
+ outputs=[download_gs],
374
+ )
375
+
376
+ model_output.clear(
377
+ lambda: tuple([gr.Button(interactive=False), gr.Button(interactive=False)]),
378
+ outputs=[download_glb, download_gs],
379
+ )
380
+
381
+
382
+ # Launch the Gradio app
383
+ if __name__ == "__main__":
384
+ pipeline = TrellisVGGTTo3DPipeline.from_pretrained("weights/trellis-vggt-v0-1")
385
+ # pipeline = TrellisVGGTTo3DPipeline.from_pretrained("Stable-X/trellis-vggt-v0-1")
386
+ pipeline.cuda()
387
+ pipeline.VGGT_model.cuda()
388
+ try:
389
+ pipeline.preprocess_image(Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))) # Preload rembg
390
+ except:
391
+ pass
392
+ demo.launch()
assets/example_multi_image/SpongeBob_1.png ADDED

Git LFS Details

  • SHA256: a022951d5eb80145eb7a523786c217e680b40b91e2aae1b8369aff35d849da55
  • Pointer size: 131 Bytes
  • Size of remote file: 274 kB
assets/example_multi_image/SpongeBob_2.png ADDED

Git LFS Details

  • SHA256: 2ea5243c1decd64fca9db076de0857a90eb173c4deb6ff5feb7af0412f99d0c4
  • Pointer size: 131 Bytes
  • Size of remote file: 239 kB
assets/example_multi_image/SpongeBob_3.png ADDED

Git LFS Details

  • SHA256: 2bdb9dc4a6215ba93b3a489ac05c4bd0c713578690e31f9c6c7bf9a801e35160
  • Pointer size: 131 Bytes
  • Size of remote file: 149 kB
assets/example_multi_image/SpongeBob_4.png ADDED

Git LFS Details

  • SHA256: 9274122325c00f390cf7ed91fee5051774791a91718c932b87e8bcf4518d262a
  • Pointer size: 131 Bytes
  • Size of remote file: 184 kB
assets/example_multi_image/chair_1.png ADDED

Git LFS Details

  • SHA256: e60f01e62be25418ce96581b4ed2268a011b32f7c6d5409697a3f297f95fea4c
  • Pointer size: 131 Bytes
  • Size of remote file: 171 kB
assets/example_multi_image/chair_2.png ADDED

Git LFS Details

  • SHA256: 0ac39f31bb7f0173fc545796f280bd449f6a67a017966d701067b8faf26060aa
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
assets/example_multi_image/chair_3.png ADDED

Git LFS Details

  • SHA256: 1b3cc4debfba605ba1eaf540d9e7b3d77248e42e84c4301da6685ce9248db1ee
  • Pointer size: 131 Bytes
  • Size of remote file: 148 kB
assets/example_multi_image/flower_1.png ADDED

Git LFS Details

  • SHA256: 3888a244c5e11b7d31db48ecef13835436b55f7f89a4a335bd3c92d411e19dd7
  • Pointer size: 131 Bytes
  • Size of remote file: 159 kB
assets/example_multi_image/flower_2.png ADDED

Git LFS Details

  • SHA256: b0e20b8cba9027f725b6d7d8893a9d8b6bb2ee1dc6d7613992cafb4642b3fe34
  • Pointer size: 131 Bytes
  • Size of remote file: 161 kB
assets/example_multi_image/flower_3.png ADDED

Git LFS Details

  • SHA256: b93c7ff6093cfc5a41a37b9956211c50406652dd754178f99fd3b76b9eb2b5f0
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
assets/example_multi_image/flower_4.png ADDED

Git LFS Details

  • SHA256: 4f8230cabcdc9d189307d22b8b999ba3258fecf32f2406ec0f2d1fe22d527071
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
assets/example_multi_image/flower_5.png ADDED

Git LFS Details

  • SHA256: 288268846343ba9b14fa5645c8f27d0f6a481645437355e26ae05f58a3f86826
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
assets/example_multi_image/flower_6.png ADDED

Git LFS Details

  • SHA256: fca1cd7c68af1b35cb70ac775f37fd8dd65883e32a5f1cf4b2d3dbbe1b68fcb4
  • Pointer size: 131 Bytes
  • Size of remote file: 156 kB
assets/example_multi_image/flower_7.png ADDED

Git LFS Details

  • SHA256: 52e025b07dc2ee6e55c154d3c1cee598d19fc47d66d6f1954230f0622813d1d1
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
assets/example_multi_image/flower_8.png ADDED

Git LFS Details

  • SHA256: b3c2c05502cc77cb59704cbeaf63d30da5492f08ce24bbc9b1a527cb057a9841
  • Pointer size: 131 Bytes
  • Size of remote file: 159 kB
assets/example_multi_image/monkey_1.png ADDED

Git LFS Details

  • SHA256: 4b3e610685ddaa1375af40b96f97dbfa4f4f53be6756c9529e91dab5ae7d7292
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
assets/example_multi_image/monkey_2.png ADDED

Git LFS Details

  • SHA256: c0386797982ec0da142abc85aa66a3f6b65f779540a07b27d25e7ffc64ebb9c7
  • Pointer size: 131 Bytes
  • Size of remote file: 129 kB
assets/example_multi_image/monkey_3.png ADDED

Git LFS Details

  • SHA256: 95aa11f4e5616c2e0d6eca7331e0ddc6c6d9a1fa9c18ea186abb73e66b78d8a8
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
assets/example_multi_image/monkey_4.png ADDED

Git LFS Details

  • SHA256: a13a9537461baf5ff842bac59f5b3a27a8d77340b2be2f527dc8302c502fedcb
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB
assets/example_multi_image/paopao_1.png ADDED

Git LFS Details

  • SHA256: 6765b458154ebd0e84af5e3b281b07418afc617b64b02fa6f9d041a4f1810630
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
assets/example_multi_image/paopao_2.png ADDED

Git LFS Details

  • SHA256: c1076a73d7f83225e24fccb4f33805012f1e4229afb4cece06f5eff8a88d8986
  • Pointer size: 131 Bytes
  • Size of remote file: 120 kB
assets/example_multi_image/paopao_3.png ADDED

Git LFS Details

  • SHA256: 8a2b7ee735c94f1cc6102db02eb08bf6516ba5d7e30cd4b0f0a5437a247f6343
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
assets/example_multi_image/paopao_4.png ADDED

Git LFS Details

  • SHA256: 359d7b28b03f309b2b602c2f2003f57b71e239a90fbc8971c99b52c93f5ea05e
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
assets/example_multi_image/paopao_5.png ADDED

Git LFS Details

  • SHA256: 28d360a528c1e9661245eea3c14c2c3111dcb26b8dc13f7efaed8ecedc451ab3
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
assets/example_multi_image/paopao_6.png ADDED

Git LFS Details

  • SHA256: 65d356f07f6e57b44604768e6ffae771af98ea7e2b553839500f37d8119f2655
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
assets/example_multi_image/paopao_7.png ADDED

Git LFS Details

  • SHA256: 6859331313770c2174b361291d8a79523a11a9acb9c456617855a8b1f2ef036b
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
assets/example_multi_image/paopao_8.png ADDED

Git LFS Details

  • SHA256: df1a323c8f879df98594d3c47a2eb5991c2f0415f73c4d083b80261d090f4d8d
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
assets/example_multi_image/puppet_1.png ADDED

Git LFS Details

  • SHA256: 4f8c870d6abb33eb6de29e4adba0afb06415d487ea17ee92d761733cb4dddeed
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
assets/example_multi_image/puppet_2.png ADDED

Git LFS Details

  • SHA256: 29ab76e25445690226148f6b71f1710d217dbfe7cdb5b11d60b4193dba45158d
  • Pointer size: 131 Bytes
  • Size of remote file: 188 kB
assets/example_multi_image/puppet_3.png ADDED

Git LFS Details

  • SHA256: 5b0aa58ac7dadcd4fb15d3ddc80d8c07ab0572018e6af12ffb417b308c868319
  • Pointer size: 131 Bytes
  • Size of remote file: 240 kB
assets/example_multi_image/robot_1.png ADDED

Git LFS Details

  • SHA256: 0da58dacc1fa327e2a084661bcb00328dba5febc4c8a5c8592d63a7c59f1925c
  • Pointer size: 131 Bytes
  • Size of remote file: 169 kB
assets/example_multi_image/robot_2.png ADDED

Git LFS Details

  • SHA256: 98becf95105550816b2bd95ada56219655895bf2aa60b373068f3df164fe6a15
  • Pointer size: 131 Bytes
  • Size of remote file: 225 kB
assets/example_multi_image/toolcar_1.png ADDED

Git LFS Details

  • SHA256: b48779b49b026b33742e1a84c51c0ef1c299ed0dd3bb215736fa5d4380109cda
  • Pointer size: 131 Bytes
  • Size of remote file: 167 kB
assets/example_multi_image/toolcar_2.png ADDED

Git LFS Details

  • SHA256: 2e04e2715577bf790b131491c3b70b9d658ae10ba3980419c6cee356996cd5db
  • Pointer size: 131 Bytes
  • Size of remote file: 151 kB
assets/example_multi_image/toolcar_3.png ADDED

Git LFS Details

  • SHA256: 8eb50ffe75dca646a1578d00bfc3003e7d5bdf506fcb4f7840f2436c4e08d6d1
  • Pointer size: 131 Bytes
  • Size of remote file: 147 kB
extensions/nvdiffrast/LICENSE.txt ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
2
+
3
+
4
+ Nvidia Source Code License (1-Way Commercial)
5
+
6
+ =======================================================================
7
+
8
+ 1. Definitions
9
+
10
+ "Licensor" means any person or entity that distributes its Work.
11
+
12
+ "Software" means the original work of authorship made available under
13
+ this License.
14
+
15
+ "Work" means the Software and any additions to or derivative works of
16
+ the Software that are made available under this License.
17
+
18
+ The terms "reproduce," "reproduction," "derivative works," and
19
+ "distribution" have the meaning as provided under U.S. copyright law;
20
+ provided, however, that for the purposes of this License, derivative
21
+ works shall not include works that remain separable from, or merely
22
+ link (or bind by name) to the interfaces of, the Work.
23
+
24
+ Works, including the Software, are "made available" under this License
25
+ by including in or with the Work either (a) a copyright notice
26
+ referencing the applicability of this License to the Work, or (b) a
27
+ copy of this License.
28
+
29
+ 2. License Grants
30
+
31
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
32
+ License, each Licensor grants to you a perpetual, worldwide,
33
+ non-exclusive, royalty-free, copyright license to reproduce,
34
+ prepare derivative works of, publicly display, publicly perform,
35
+ sublicense and distribute its Work and any resulting derivative
36
+ works in any form.
37
+
38
+ 3. Limitations
39
+
40
+ 3.1 Redistribution. You may reproduce or distribute the Work only
41
+ if (a) you do so under this License, (b) you include a complete
42
+ copy of this License with your distribution, and (c) you retain
43
+ without modification any copyright, patent, trademark, or
44
+ attribution notices that are present in the Work.
45
+
46
+ 3.2 Derivative Works. You may specify that additional or different
47
+ terms apply to the use, reproduction, and distribution of your
48
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
49
+ provide that the use limitation in Section 3.3 applies to your
50
+ derivative works, and (b) you identify the specific derivative
51
+ works that are subject to Your Terms. Notwithstanding Your Terms,
52
+ this License (including the redistribution requirements in Section
53
+ 3.1) will continue to apply to the Work itself.
54
+
55
+ 3.3 Use Limitation. The Work and any derivative works thereof only
56
+ may be used or intended for use non-commercially. The Work or
57
+ derivative works thereof may be used or intended for use by Nvidia
58
+ or its affiliates commercially or non-commercially. As used herein,
59
+ "non-commercially" means for research or evaluation purposes only
60
+ and not for any direct or indirect monetary gain.
61
+
62
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
+ against any Licensor (including any claim, cross-claim or
64
+ counterclaim in a lawsuit) to enforce any patents that you allege
65
+ are infringed by any Work, then your rights under this License from
66
+ such Licensor (including the grant in Section 2.1) will terminate
67
+ immediately.
68
+
69
+ 3.5 Trademarks. This License does not grant any rights to use any
70
+ Licensor's or its affiliates' names, logos, or trademarks, except
71
+ as necessary to reproduce the notices described in this License.
72
+
73
+ 3.6 Termination. If you violate any term of this License, then your
74
+ rights under this License (including the grant in Section 2.1) will
75
+ terminate immediately.
76
+
77
+ 4. Disclaimer of Warranty.
78
+
79
+ THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
+ THIS LICENSE.
84
+
85
+ 5. Limitation of Liability.
86
+
87
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
+ COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
+ THE POSSIBILITY OF SUCH DAMAGES.
96
+
97
+ =======================================================================
extensions/nvdiffrast/README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
2
+
3
+ ![Teaser image](./docs/img/teaser.png)
4
+
5
+ **Modular Primitives for High-Performance Differentiable Rendering**<br>
6
+ Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
7
+ [http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
8
+
9
+ Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
10
+ Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
11
+
12
+ ## Licenses
13
+
14
+ Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
15
+
16
+ This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
17
+
18
+ For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
19
+
20
+ We do not currently accept outside code contributions in the form of pull requests.
21
+
22
+ Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
23
+ [sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
24
+ originally shared under
25
+ [MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
26
+ Mesh and texture stored as part of `samples/data/earth.npz` are derived from
27
+ [3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
28
+ model originally made available under
29
+ [TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
30
+
31
+ ## Citation
32
+
33
+ ```
34
+ @article{Laine2020diffrast,
35
+ title = {Modular Primitives for High-Performance Differentiable Rendering},
36
+ author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
37
+ journal = {ACM Transactions on Graphics},
38
+ year = {2020},
39
+ volume = {39},
40
+ number = {6}
41
+ }
42
+ ```
extensions/nvdiffrast/nvdiffrast/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ __version__ = '0.3.3'
extensions/nvdiffrast/nvdiffrast/common/antialias.cu ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "antialias.h"
10
+
11
+ //------------------------------------------------------------------------
12
+ // Helpers.
13
+
14
+ #define F32_MAX (3.402823466e+38f)
15
+ static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }
16
+ static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }
17
+ static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
18
+ {
19
+ bool g10 = rational_gt(n1, n0, d1, d0);
20
+ bool g20 = rational_gt(n2, n0, d2, d0);
21
+ bool g21 = rational_gt(n2, n1, d2, d1);
22
+ if (g20 && g21) return 2;
23
+ if (g10) return 1;
24
+ return 0;
25
+ }
26
+
27
+ //------------------------------------------------------------------------
28
+ // Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
29
+
30
+ struct AAWorkItem
31
+ {
32
+ enum
33
+ {
34
+ EDGE_MASK = 3, // Edge index in lowest bits.
35
+ FLAG_DOWN_BIT = 2, // Down instead of right.
36
+ FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle.
37
+ };
38
+
39
+ int px, py; // Pixel x, y.
40
+ unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags.
41
+ float alpha; // Antialiasing alpha value. Zero if no AA.
42
+ };
43
+
44
+ //------------------------------------------------------------------------
45
+ // Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
46
+
47
+ #define JENKINS_MAGIC (0x9e3779b9u)
48
+ static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
49
+ {
50
+ a -= b; a -= c; a ^= (c>>13);
51
+ b -= c; b -= a; b ^= (a<<8);
52
+ c -= a; c -= b; c ^= (b>>13);
53
+ a -= b; a -= c; a ^= (c>>12);
54
+ b -= c; b -= a; b ^= (a<<16);
55
+ c -= a; c -= b; c ^= (b>>5);
56
+ a -= b; a -= c; a ^= (c>>3);
57
+ b -= c; b -= a; b ^= (a<<10);
58
+ c -= a; c -= b; c ^= (b>>15);
59
+ }
60
+
61
+ // Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
62
+ class HashIndex
63
+ {
64
+ public:
65
+ __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
66
+ {
67
+ m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
68
+ m_idx = (uint32_t)(key & 0xffffffffu);
69
+ m_skip = (uint32_t)(key >> 32);
70
+ uint32_t dummy = JENKINS_MAGIC;
71
+ jenkins_mix(m_idx, m_skip, dummy);
72
+ m_idx &= m_mask;
73
+ m_skip &= m_mask;
74
+ m_skip |= 1;
75
+ }
76
+ __device__ __forceinline__ int get(void) const { return m_idx; }
77
+ __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; }
78
+ private:
79
+ uint32_t m_idx, m_skip, m_mask;
80
+ };
81
+
82
+ static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
83
+ {
84
+ HashIndex idx(p, key);
85
+ while(1)
86
+ {
87
+ uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
88
+ if (prev == 0 || prev == key)
89
+ break;
90
+ idx.next();
91
+ }
92
+ int* q = (int*)&p.evHash[idx.get()];
93
+ int a = atomicCAS(q+2, 0, v);
94
+ if (a != 0 && a != v)
95
+ atomicCAS(q+3, 0, v);
96
+ }
97
+
98
+ static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
99
+ {
100
+ HashIndex idx(p, key);
101
+ while(1)
102
+ {
103
+ uint4 entry = p.evHash[idx.get()];
104
+ uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
105
+ if (k == key || k == 0)
106
+ return make_int2((int)entry.z, (int)entry.w);
107
+ idx.next();
108
+ }
109
+ }
110
+
111
+ static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
112
+ {
113
+ if (va == vb)
114
+ return;
115
+
116
+ uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
117
+ uint64_t v1 = (uint32_t)max(va, vb) + 1;
118
+ uint64_t vk = v0 | (v1 << 32); // hash key
119
+ hash_insert(p, vk, vn + 1);
120
+ }
121
+
122
+ static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
123
+ {
124
+ if (va == vb)
125
+ return -1;
126
+
127
+ uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
128
+ uint64_t v1 = (uint32_t)max(va, vb) + 1;
129
+ uint64_t vk = v0 | (v1 << 32); // hash key
130
+ int2 vn = hash_find(p, vk) - 1;
131
+ if (vn.x == vr) return vn.y;
132
+ if (vn.y == vr) return vn.x;
133
+ return -1;
134
+ }
135
+
136
+ //------------------------------------------------------------------------
137
+ // Mesh analysis kernel.
138
+
139
+ __global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
140
+ {
141
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
142
+ if (idx >= p.numTriangles)
143
+ return;
144
+
145
+ int v0 = p.tri[idx * 3 + 0];
146
+ int v1 = p.tri[idx * 3 + 1];
147
+ int v2 = p.tri[idx * 3 + 2];
148
+
149
+ if (v0 < 0 || v0 >= p.numVertices ||
150
+ v1 < 0 || v1 >= p.numVertices ||
151
+ v2 < 0 || v2 >= p.numVertices)
152
+ return;
153
+
154
+ if (v0 == v1 || v1 == v2 || v2 == v0)
155
+ return;
156
+
157
+ evhash_insert_vertex(p, v1, v2, v0);
158
+ evhash_insert_vertex(p, v2, v0, v1);
159
+ evhash_insert_vertex(p, v0, v1, v2);
160
+ }
161
+
162
+ //------------------------------------------------------------------------
163
+ // Discontinuity finder kernel.
164
+
165
+ __global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
166
+ {
167
+ // Calculate pixel position.
168
+ int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
169
+ int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
170
+ int pz = blockIdx.z;
171
+ if (px >= p.width || py >= p.height || pz >= p.n)
172
+ return;
173
+
174
+ // Pointer to our TriIdx and fetch.
175
+ int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
176
+ float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
177
+
178
+ // Look right, clamp at edge.
179
+ int pidx1 = pidx0;
180
+ if (px < p.width - 1)
181
+ pidx1 += 4;
182
+ float tri1 = p.rasterOut[pidx1];
183
+
184
+ // Look down, clamp at edge.
185
+ int pidx2 = pidx0;
186
+ if (py < p.height - 1)
187
+ pidx2 += p.width << 2;
188
+ float tri2 = p.rasterOut[pidx2];
189
+
190
+ // Determine amount of work.
191
+ int count = 0;
192
+ if (tri1 != tri0) count = 1;
193
+ if (tri2 != tri0) count += 1;
194
+ if (!count)
195
+ return; // Exit warp.
196
+
197
+ // Coalesce work counter update to once per CTA.
198
+ __shared__ int s_temp;
199
+ s_temp = 0;
200
+ __syncthreads();
201
+ int idx = atomicAdd(&s_temp, count);
202
+ __syncthreads();
203
+ if (idx == 0)
204
+ {
205
+ int base = atomicAdd(&p.workBuffer[0].x, s_temp);
206
+ s_temp = base + 1; // don't clobber the counters in first slot.
207
+ }
208
+ __syncthreads();
209
+ idx += s_temp;
210
+
211
+ // Write to memory.
212
+ if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
213
+ if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
214
+ }
215
+
216
+ //------------------------------------------------------------------------
217
+ // Forward analysis kernel.
218
+
219
+ __global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
220
+ {
221
+ __shared__ int s_base;
222
+ int workCount = p.workBuffer[0].x;
223
+ for(;;)
224
+ {
225
+ // Persistent threads work fetcher.
226
+ __syncthreads();
227
+ if (threadIdx.x == 0)
228
+ s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
229
+ __syncthreads();
230
+ int thread_idx = s_base + threadIdx.x;
231
+ if (thread_idx >= workCount)
232
+ return;
233
+
234
+ int4* pItem = p.workBuffer + thread_idx + 1;
235
+ int4 item = *pItem;
236
+ int px = item.x;
237
+ int py = item.y;
238
+ int pz = (int)(((unsigned int)item.z) >> 16);
239
+ int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
240
+
241
+ int pixel0 = px + p.width * (py + p.height * pz);
242
+ int pixel1 = pixel0 + (d ? p.width : 1);
243
+ float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
244
+ float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
245
+ int tri0 = float_to_triidx(zt0.y) - 1;
246
+ int tri1 = float_to_triidx(zt1.y) - 1;
247
+
248
+ // Select triangle based on background / depth.
249
+ int tri = (tri0 >= 0) ? tri0 : tri1;
250
+ if (tri0 >= 0 && tri1 >= 0)
251
+ tri = (zt0.x < zt1.x) ? tri0 : tri1;
252
+ if (tri == tri1)
253
+ {
254
+ // Calculate with respect to neighbor pixel if chose that triangle.
255
+ px += 1 - d;
256
+ py += d;
257
+ }
258
+
259
+ // Bail out if triangle index is corrupt.
260
+ if (tri < 0 || tri >= p.numTriangles)
261
+ continue;
262
+
263
+ // Fetch vertex indices.
264
+ int vi0 = p.tri[tri * 3 + 0];
265
+ int vi1 = p.tri[tri * 3 + 1];
266
+ int vi2 = p.tri[tri * 3 + 2];
267
+
268
+ // Bail out if vertex indices are corrupt.
269
+ if (vi0 < 0 || vi0 >= p.numVertices ||
270
+ vi1 < 0 || vi1 >= p.numVertices ||
271
+ vi2 < 0 || vi2 >= p.numVertices)
272
+ continue;
273
+
274
+ // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
275
+ int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
276
+ int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
277
+ int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
278
+
279
+ // Instance mode: Adjust vertex indices based on minibatch index.
280
+ if (p.instance_mode)
281
+ {
282
+ int vbase = pz * p.numVertices;
283
+ vi0 += vbase;
284
+ vi1 += vbase;
285
+ vi2 += vbase;
286
+ if (op0 >= 0) op0 += vbase;
287
+ if (op1 >= 0) op1 += vbase;
288
+ if (op2 >= 0) op2 += vbase;
289
+ }
290
+
291
+ // Fetch vertex positions.
292
+ float4 p0 = ((float4*)p.pos)[vi0];
293
+ float4 p1 = ((float4*)p.pos)[vi1];
294
+ float4 p2 = ((float4*)p.pos)[vi2];
295
+ float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
296
+ float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
297
+ float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
298
+
299
+ // Project vertices to pixel space.
300
+ float w0 = 1.f / p0.w;
301
+ float w1 = 1.f / p1.w;
302
+ float w2 = 1.f / p2.w;
303
+ float ow0 = 1.f / o0.w;
304
+ float ow1 = 1.f / o1.w;
305
+ float ow2 = 1.f / o2.w;
306
+ float fx = (float)px + .5f - p.xh;
307
+ float fy = (float)py + .5f - p.yh;
308
+ float x0 = p0.x * w0 * p.xh - fx;
309
+ float y0 = p0.y * w0 * p.yh - fy;
310
+ float x1 = p1.x * w1 * p.xh - fx;
311
+ float y1 = p1.y * w1 * p.yh - fy;
312
+ float x2 = p2.x * w2 * p.xh - fx;
313
+ float y2 = p2.y * w2 * p.yh - fy;
314
+ float ox0 = o0.x * ow0 * p.xh - fx;
315
+ float oy0 = o0.y * ow0 * p.yh - fy;
316
+ float ox1 = o1.x * ow1 * p.xh - fx;
317
+ float oy1 = o1.y * ow1 * p.yh - fy;
318
+ float ox2 = o2.x * ow2 * p.xh - fx;
319
+ float oy2 = o2.y * ow2 * p.yh - fy;
320
+
321
+ // Signs to kill non-silhouette edges.
322
+ float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
323
+ float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
324
+ float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
325
+ float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
326
+
327
+ // If no matching signs anywhere, skip the rest.
328
+ if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
329
+ {
330
+ // XY flip for horizontal edges.
331
+ if (d)
332
+ {
333
+ swap(x0, y0);
334
+ swap(x1, y1);
335
+ swap(x2, y2);
336
+ }
337
+
338
+ float dx0 = x2 - x1;
339
+ float dx1 = x0 - x2;
340
+ float dx2 = x1 - x0;
341
+ float dy0 = y2 - y1;
342
+ float dy1 = y0 - y2;
343
+ float dy2 = y1 - y0;
344
+
345
+ // Check if an edge crosses between us and the neighbor pixel.
346
+ float dc = -F32_MAX;
347
+ float ds = (tri == tri0) ? 1.f : -1.f;
348
+ float d0 = ds * (x1*dy0 - y1*dx0);
349
+ float d1 = ds * (x2*dy1 - y2*dx1);
350
+ float d2 = ds * (x0*dy2 - y0*dx2);
351
+
352
+ if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
353
+ if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
354
+ if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
355
+
356
+ int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
357
+ if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
358
+ if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
359
+ if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
360
+ float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
361
+
362
+ // Adjust output image if a suitable edge was found.
363
+ if (dc > -eps && dc < 1.f + eps)
364
+ {
365
+ dc = fminf(fmaxf(dc, 0.f), 1.f);
366
+ float alpha = ds * (.5f - dc);
367
+ const float* pColor0 = p.color + pixel0 * p.channels;
368
+ const float* pColor1 = p.color + pixel1 * p.channels;
369
+ float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
370
+ for (int i=0; i < p.channels; i++)
371
+ atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
372
+
373
+ // Rewrite the work item's flags and alpha. Keep original px, py.
374
+ unsigned int flags = pz << 16;
375
+ flags |= di;
376
+ flags |= d << AAWorkItem::FLAG_DOWN_BIT;
377
+ flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
378
+ ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
379
+ }
380
+ }
381
+ }
382
+ }
383
+
384
+ //------------------------------------------------------------------------
385
+ // Gradient kernel.
386
+
387
+ __global__ void AntialiasGradKernel(const AntialiasKernelParams p)
388
+ {
389
+ // Temporary space for coalesced atomics.
390
+ CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
391
+ __shared__ int s_base; // Work counter communication across entire CTA.
392
+
393
+ int workCount = p.workBuffer[0].x;
394
+
395
+ for(;;)
396
+ {
397
+ // Persistent threads work fetcher.
398
+ __syncthreads();
399
+ if (threadIdx.x == 0)
400
+ s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
401
+ __syncthreads();
402
+ int thread_idx = s_base + threadIdx.x;
403
+ if (thread_idx >= workCount)
404
+ return;
405
+
406
+ // Read work item filled out by forward kernel.
407
+ int4 item = p.workBuffer[thread_idx + 1];
408
+ unsigned int amask = __ballot_sync(0xffffffffu, item.w);
409
+ if (item.w == 0)
410
+ continue; // No effect.
411
+
412
+ // Unpack work item and replicate setup from forward analysis kernel.
413
+ int px = item.x;
414
+ int py = item.y;
415
+ int pz = (int)(((unsigned int)item.z) >> 16);
416
+ int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
417
+ float alpha = __int_as_float(item.w);
418
+ int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
419
+ int di = item.z & AAWorkItem::EDGE_MASK;
420
+ float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31));
421
+ int pixel0 = px + p.width * (py + p.height * pz);
422
+ int pixel1 = pixel0 + (d ? p.width : 1);
423
+ int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
424
+ if (tri1)
425
+ {
426
+ px += 1 - d;
427
+ py += d;
428
+ }
429
+
430
+ // Bail out if triangle index is corrupt.
431
+ bool triFail = (tri < 0 || tri >= p.numTriangles);
432
+ amask = __ballot_sync(amask, !triFail);
433
+ if (triFail)
434
+ continue;
435
+
436
+ // Outgoing color gradients.
437
+ float* pGrad0 = p.gradColor + pixel0 * p.channels;
438
+ float* pGrad1 = p.gradColor + pixel1 * p.channels;
439
+
440
+ // Incoming color gradients.
441
+ const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
442
+
443
+ // Position gradient weight based on colors and incoming gradients.
444
+ float dd = 0.f;
445
+ const float* pColor0 = p.color + pixel0 * p.channels;
446
+ const float* pColor1 = p.color + pixel1 * p.channels;
447
+
448
+ // Loop over channels and accumulate.
449
+ for (int i=0; i < p.channels; i++)
450
+ {
451
+ float dy = pDy[i];
452
+ if (dy != 0.f)
453
+ {
454
+ // Update position gradient weight.
455
+ dd += dy * (pColor1[i] - pColor0[i]);
456
+
457
+ // Update color gradients. No coalescing because all have different targets.
458
+ float v = alpha * dy;
459
+ atomicAdd(&pGrad0[i], -v);
460
+ atomicAdd(&pGrad1[i], v);
461
+ }
462
+ }
463
+
464
+ // If position weight is zero, skip the rest.
465
+ bool noGrad = (dd == 0.f);
466
+ amask = __ballot_sync(amask, !noGrad);
467
+ if (noGrad)
468
+ continue;
469
+
470
+ // Fetch vertex indices of the active edge and their positions.
471
+ int i1 = (di < 2) ? (di + 1) : 0;
472
+ int i2 = (i1 < 2) ? (i1 + 1) : 0;
473
+ int vi1 = p.tri[3 * tri + i1];
474
+ int vi2 = p.tri[3 * tri + i2];
475
+
476
+ // Bail out if vertex indices are corrupt.
477
+ bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
478
+ amask = __ballot_sync(amask, !vtxFail);
479
+ if (vtxFail)
480
+ continue;
481
+
482
+ // Instance mode: Adjust vertex indices based on minibatch index.
483
+ if (p.instance_mode)
484
+ {
485
+ vi1 += pz * p.numVertices;
486
+ vi2 += pz * p.numVertices;
487
+ }
488
+
489
+ // Fetch vertex positions.
490
+ float4 p1 = ((float4*)p.pos)[vi1];
491
+ float4 p2 = ((float4*)p.pos)[vi2];
492
+
493
+ // Project vertices to pixel space.
494
+ float pxh = p.xh;
495
+ float pyh = p.yh;
496
+ float fx = (float)px + .5f - pxh;
497
+ float fy = (float)py + .5f - pyh;
498
+
499
+ // XY flip for horizontal edges.
500
+ if (d)
501
+ {
502
+ swap(p1.x, p1.y);
503
+ swap(p2.x, p2.y);
504
+ swap(pxh, pyh);
505
+ swap(fx, fy);
506
+ }
507
+
508
+ // Gradient calculation setup.
509
+ float w1 = 1.f / p1.w;
510
+ float w2 = 1.f / p2.w;
511
+ float x1 = p1.x * w1 * pxh - fx;
512
+ float y1 = p1.y * w1 * pyh - fy;
513
+ float x2 = p2.x * w2 * pxh - fx;
514
+ float y2 = p2.y * w2 * pyh - fy;
515
+ float dx = x2 - x1;
516
+ float dy = y2 - y1;
517
+ float db = x1*dy - y1*dx;
518
+
519
+ // Compute inverse delta-y with epsilon.
520
+ float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
521
+ float iy = 1.f / (dy + ep);
522
+
523
+ // Compute position gradients.
524
+ float dby = db * iy;
525
+ float iw1 = -w1 * iy * dd;
526
+ float iw2 = w2 * iy * dd;
527
+ float gp1x = iw1 * pxh * y2;
528
+ float gp2x = iw2 * pxh * y1;
529
+ float gp1y = iw1 * pyh * (dby - x2);
530
+ float gp2y = iw2 * pyh * (dby - x1);
531
+ float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
532
+ float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
533
+
534
+ // XY flip the gradients.
535
+ if (d)
536
+ {
537
+ swap(gp1x, gp1y);
538
+ swap(gp2x, gp2y);
539
+ }
540
+
541
+ // Kill position gradients if alpha was saturated.
542
+ if (fabsf(alpha) >= 0.5f)
543
+ {
544
+ gp1x = gp1y = gp1w = 0.f;
545
+ gp2x = gp2y = gp2w = 0.f;
546
+ }
547
+
548
+ // Initialize coalesced atomics. Match both triangle ID and edge index.
549
+ // Also note that some threads may be inactive.
550
+ CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
551
+
552
+ // Accumulate gradients.
553
+ caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
554
+ caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
555
+ }
556
+ }
557
+
558
+ //------------------------------------------------------------------------
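
The per-channel loop in the gradient kernel above (the `dd` accumulation and the paired `atomicAdd` calls on `pGrad0`/`pGrad1`) is the heart of the backward pass for one blended pixel pair. The following single-threaded C++ sketch mirrors that arithmetic as a reading aid; the function name and the use of `std::vector` are illustrative and not part of nvdiffrast.

```cpp
// Sketch only: CPU mirror of the per-channel accumulation in AntialiasGradKernel.
// Given the incoming gradient dy, the two source colors and the signed blend
// weight alpha, it accumulates the color gradients of both pixels and returns
// the scalar weight dd that later scales the vertex position gradients.
#include <cstddef>
#include <vector>

float accumulatePixelPairGrad(const std::vector<float>& dy,       // incoming gradient, one per channel
                              const std::vector<float>& color0,   // color of pixel0
                              const std::vector<float>& color1,   // color of pixel1
                              float alpha,                        // signed blend weight from the work item
                              std::vector<float>& gradColor0,     // accumulated color gradient, pixel0
                              std::vector<float>& gradColor1)     // accumulated color gradient, pixel1
{
    float dd = 0.f;
    for (std::size_t i = 0; i < dy.size(); i++)
    {
        if (dy[i] == 0.f)
            continue;
        dd += dy[i] * (color1[i] - color0[i]);   // position-gradient weight
        float v = alpha * dy[i];
        gradColor0[i] -= v;                      // same signs as the atomicAdd(-v) / atomicAdd(v) pair
        gradColor1[i] += v;
    }
    return dd;                                   // if zero, the kernel skips the position-gradient part
}
```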
extensions/nvdiffrast/nvdiffrast/common/antialias.h ADDED
@@ -0,0 +1,50 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include "common.h"
11
+
12
+ //------------------------------------------------------------------------
13
+ // Constants and helpers.
14
+
15
+ #define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32
16
+ #define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8
17
+ #define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256
18
+ #define AA_MESH_KERNEL_THREADS_PER_BLOCK 256
19
+ #define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions.
20
+ #define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 2 : 3)
21
+ #define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256
22
+
23
+ //------------------------------------------------------------------------
24
+ // CUDA kernel params.
25
+
26
+ struct AntialiasKernelParams
27
+ {
28
+ const float* color; // Incoming color buffer.
29
+ const float* rasterOut; // Incoming rasterizer output buffer.
30
+ const int* tri; // Incoming triangle buffer.
31
+ const float* pos; // Incoming position buffer.
32
+ float* output; // Output buffer of forward kernel.
33
+ const float* dy; // Incoming gradients.
34
+ float* gradColor; // Output buffer, color gradient.
35
+ float* gradPos; // Output buffer, position gradient.
36
+ int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters.
37
+ uint4* evHash; // Edge-vertex hash.
38
+ int allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
39
+ int numTriangles; // Number of triangles.
40
+ int numVertices; // Number of vertices.
41
+ int width; // Input width.
42
+ int height; // Input height.
43
+ int n; // Minibatch size.
44
+ int channels; // Channel count in color input.
45
+ float xh, yh; // Transfer to pixel space.
46
+ int instance_mode; // 0=normal, 1=instance mode.
47
+ int tri_const; // 1 if triangle array is known to be constant.
48
+ };
49
+
50
+ //------------------------------------------------------------------------
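
For orientation, the sketch below shows how the `AntialiasKernelParams` fields map to what a caller supplies for the forward pass. It is an illustrative host-side helper, not the actual nvdiffrast binding code: all pointers are assumed to be pre-allocated device buffers, and setting `xh`/`yh` to half the output resolution is an assumption inferred from how the gradient kernel converts clip-space positions to pixels.

```cpp
// Illustrative host-side setup only; not the actual nvdiffrast binding code.
#include <cuda_runtime.h>
#include "antialias.h"

AntialiasKernelParams makeForwardParams(
    const float* color, const float* rasterOut, const int* tri, const float* pos,
    float* output, int4* workBuffer, uint4* evHash, int allocTriangles,
    int numTriangles, int numVertices, int width, int height, int n, int channels,
    bool instanceMode)
{
    AntialiasKernelParams p = {};       // zero-init; gradient fields stay null in the forward pass
    p.color          = color;           // input color buffer
    p.rasterOut      = rasterOut;       // 4-channel rasterizer output; channel 3 holds the triangle id
    p.tri            = tri;
    p.pos            = pos;
    p.output         = output;
    p.workBuffer     = workBuffer;      // first int4 is reserved for counters
    p.evHash         = evHash;          // edge-vertex hash sized from allocTriangles
    p.allocTriangles = allocTriangles;  // power of two >= numTriangles
    p.numTriangles   = numTriangles;
    p.numVertices    = numVertices;
    p.width          = width;
    p.height         = height;
    p.n              = n;
    p.channels       = channels;
    p.xh             = 0.5f * (float)width;   // assumed clip-to-pixel scale, see note above
    p.yh             = 0.5f * (float)height;
    p.instance_mode  = instanceMode ? 1 : 0;
    return p;
}
```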
extensions/nvdiffrast/nvdiffrast/common/common.cpp ADDED
@@ -0,0 +1,60 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include <cuda_runtime.h>
10
+
11
+ //------------------------------------------------------------------------
12
+ // Block and grid size calculators for kernel launches.
13
+
14
+ dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height)
15
+ {
16
+ int maxThreads = maxWidth * maxHeight;
17
+ if (maxThreads <= 1 || (width * height) <= 1)
18
+ return dim3(1, 1, 1); // Degenerate.
19
+
20
+ // Start from max size.
21
+ int bw = maxWidth;
22
+ int bh = maxHeight;
23
+
24
+ // Optimizations for weirdly sized buffers.
25
+ if (width < bw)
26
+ {
27
+ // Decrease block width to smallest power of two that covers the buffer width.
28
+ while ((bw >> 1) >= width)
29
+ bw >>= 1;
30
+
31
+ // Maximize height.
32
+ bh = maxThreads / bw;
33
+ if (bh > height)
34
+ bh = height;
35
+ }
36
+ else if (height < bh)
37
+ {
38
+ // Halve height and double width until fits completely inside buffer vertically.
39
+ while (bh > height)
40
+ {
41
+ bh >>= 1;
42
+ if (bw < width)
43
+ bw <<= 1;
44
+ }
45
+ }
46
+
47
+ // Done.
48
+ return dim3(bw, bh, 1);
49
+ }
50
+
51
+ dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth)
52
+ {
53
+ dim3 gridSize;
54
+ gridSize.x = (width - 1) / blockSize.x + 1;
55
+ gridSize.y = (height - 1) / blockSize.y + 1;
56
+ gridSize.z = (depth - 1) / blockSize.z + 1;
57
+ return gridSize;
58
+ }
59
+
60
+ //------------------------------------------------------------------------
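
The two helpers are meant to compose: pick an upper bound on the block shape, let `getLaunchBlockSize` shrink it to fit the buffer, then cover the whole width x height x depth volume with `getLaunchGridSize`. A minimal usage sketch, with the 32x8 block bound and the commented-out kernel launch as illustrative assumptions:

```cpp
// Illustrative usage of the launch-size helpers above (not part of the original file).
#include <cuda_runtime.h>
#include "common.h"

void launchOverImage(int width, int height, int depth)
{
    // At most 32x8 = 256 threads per block; narrower/shorter buffers shrink the block.
    dim3 blockSize = getLaunchBlockSize(32, 8, width, height);
    // Enough blocks to cover every pixel of every image in the minibatch.
    dim3 gridSize = getLaunchGridSize(blockSize, width, height, depth);
    // exampleKernel<<<gridSize, blockSize>>>(...);   // hypothetical kernel, one thread per pixel
    (void)gridSize;
}
```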
extensions/nvdiffrast/nvdiffrast/common/common.h ADDED
@@ -0,0 +1,263 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include <cuda.h>
11
+ #include <stdint.h>
12
+
13
+ //------------------------------------------------------------------------
14
+ // C++ helper function prototypes.
15
+
16
+ dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height);
17
+ dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth);
18
+
19
+ //------------------------------------------------------------------------
20
+ // The rest is CUDA device code specific stuff.
21
+
22
+ #ifdef __CUDACC__
23
+
24
+ //------------------------------------------------------------------------
25
+ // Helpers for CUDA vector types.
26
+
27
+ static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; }
28
+ static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; }
29
+ static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; }
30
+ static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; }
31
+ static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; }
32
+ static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; }
33
+ static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); }
34
+ static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); }
35
+ static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); }
36
+ static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); }
37
+ static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); }
38
+ static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); }
39
+ static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); }
40
+ static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); }
41
+ static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); }
42
+ static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); }
43
+ static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
44
+ static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
45
+ static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
46
+ static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; }
47
+ static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; }
48
+ static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; }
49
+ static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
50
+ static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
51
+ static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
52
+ static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); }
53
+ static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); }
54
+ static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); }
55
+ static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); }
56
+ static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); }
57
+ static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); }
58
+ static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); }
59
+ static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
60
+ static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
61
+ static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
62
+ static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
63
+ static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
64
+ static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
65
+ static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
66
+ static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
67
+ static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
68
+ static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
69
+ static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
70
+ static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
71
+ static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); }
72
+ static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); }
73
+ static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); }
74
+ static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); }
75
+ static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; }
76
+ static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; }
77
+ static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; }
78
+ static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; }
79
+ static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; }
80
+ static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; }
81
+ static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); }
82
+ static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); }
83
+ static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); }
84
+ static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); }
85
+ static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); }
86
+ static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); }
87
+ static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); }
88
+ static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); }
89
+ static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); }
90
+ static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); }
91
+ static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
92
+ static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
93
+ static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
94
+ static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
95
+ static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; }
96
+ static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
97
+ static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
98
+ static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
99
+ static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
100
+ static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); }
101
+ static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); }
102
+ static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); }
103
+ static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); }
104
+ static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); }
105
+ static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); }
106
+ static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); }
107
+ static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
108
+ static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
109
+ static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
110
+ static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
111
+ static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
112
+ static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
113
+ static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
114
+ static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
115
+ static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
116
+ static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); }
117
+ static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
118
+ static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); }
119
+ static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); }
120
+ static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); }
121
+ static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); }
122
+ static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); }
123
+ static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; }
124
+ static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; }
125
+ static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; }
126
+ static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; }
127
+ static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; }
128
+ static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; }
129
+ static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); }
130
+ static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); }
131
+ static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); }
132
+ static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); }
133
+ static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); }
134
+ static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); }
135
+ static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); }
136
+ static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); }
137
+ static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); }
138
+ static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
139
+ static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
140
+ static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
141
+ static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
142
+ static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; }
143
+ static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
144
+ static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
145
+ static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
146
+ static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
147
+ static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); }
148
+ static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); }
149
+ static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); }
150
+ static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); }
151
+ static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); }
152
+ static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); }
153
+ static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
154
+ static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
155
+ static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
156
+ static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
157
+ static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
158
+ static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
159
+ static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
160
+ static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
161
+ static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
162
+ static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); }
163
+ static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
164
+ static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); }
165
+ static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); }
166
+ static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); }
167
+ static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); }
168
+
169
+ template<class T> static __device__ __forceinline__ T zero_value(void);
170
+ template<> __device__ __forceinline__ float zero_value<float> (void) { return 0.f; }
171
+ template<> __device__ __forceinline__ float2 zero_value<float2>(void) { return make_float2(0.f, 0.f); }
172
+ template<> __device__ __forceinline__ float4 zero_value<float4>(void) { return make_float4(0.f, 0.f, 0.f, 0.f); }
173
+ static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); }
174
+ static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); }
175
+ static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); }
176
+ static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); }
177
+ static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); }
178
+ static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); }
179
+ static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); }
180
+ static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); }
181
+ static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); }
182
+
183
+ template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; }
184
+
185
+ //------------------------------------------------------------------------
186
+ // Triangle ID <-> float32 conversion functions to support very large triangle IDs.
187
+ //
188
+ // Values up to and including 16777216 (also, negative values) are converted trivially and retain
189
+ // compatibility with previous versions. Larger values are mapped to unique float32 that are not equal to
190
+ // the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447.
191
+
192
+ static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; }
193
+ static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); }
194
+
195
+ //------------------------------------------------------------------------
196
+ // Coalesced atomics. These are all done via macros.
197
+
198
+ #if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
199
+
200
+ #define CA_TEMP _ca_temp
201
+ #define CA_TEMP_PARAM float* CA_TEMP
202
+ #define CA_DECLARE_TEMP(threads_per_block) \
203
+ __shared__ float CA_TEMP[(threads_per_block)]
204
+
205
+ #define CA_SET_GROUP_MASK(group, thread_mask) \
206
+ bool _ca_leader; \
207
+ float* _ca_ptr; \
208
+ do { \
209
+ int tidx = threadIdx.x + blockDim.x * threadIdx.y; \
210
+ int lane = tidx & 31; \
211
+ int warp = tidx >> 5; \
212
+ int tmask = __match_any_sync((thread_mask), (group)); \
213
+ int leader = __ffs(tmask) - 1; \
214
+ _ca_leader = (leader == lane); \
215
+ _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \
216
+ } while(0)
217
+
218
+ #define CA_SET_GROUP(group) \
219
+ CA_SET_GROUP_MASK((group), 0xffffffffu)
220
+
221
+ #define caAtomicAdd(ptr, value) \
222
+ do { \
223
+ if (_ca_leader) \
224
+ *_ca_ptr = 0.f; \
225
+ atomicAdd(_ca_ptr, (value)); \
226
+ if (_ca_leader) \
227
+ atomicAdd((ptr), *_ca_ptr); \
228
+ } while(0)
229
+
230
+ #define caAtomicAdd3_xyw(ptr, x, y, w) \
231
+ do { \
232
+ caAtomicAdd((ptr), (x)); \
233
+ caAtomicAdd((ptr)+1, (y)); \
234
+ caAtomicAdd((ptr)+3, (w)); \
235
+ } while(0)
236
+
237
+ #define caAtomicAddTexture(ptr, level, idx, value) \
238
+ do { \
239
+ CA_SET_GROUP((idx) ^ ((level) << 27)); \
240
+ caAtomicAdd((ptr)+(idx), (value)); \
241
+ } while(0)
242
+
243
+ //------------------------------------------------------------------------
244
+ // Disable atomic coalescing for compute capability lower than 7.x
245
+
246
+ #else // __CUDA_ARCH__ >= 700
247
+ #define CA_TEMP _ca_temp
248
+ #define CA_TEMP_PARAM float CA_TEMP
249
+ #define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
250
+ #define CA_SET_GROUP_MASK(group, thread_mask)
251
+ #define CA_SET_GROUP(group)
252
+ #define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
253
+ #define caAtomicAdd3_xyw(ptr, x, y, w) \
254
+ do { \
255
+ atomicAdd((ptr), (x)); \
256
+ atomicAdd((ptr)+1, (y)); \
257
+ atomicAdd((ptr)+3, (w)); \
258
+ } while(0)
259
+ #define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
260
+ #endif // __CUDA_ARCH__ >= 700
261
+
262
+ //------------------------------------------------------------------------
263
+ #endif // __CUDACC__
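
A minimal kernel sketch of how these macros are meant to be combined: declare the shared scratch once per block, establish a per-warp group, then add. The scatter-add use case, buffer names, and the 256-thread block size are assumptions for illustration; the antialias gradient kernel earlier in this commit applies the same pattern via `CA_SET_GROUP_MASK` and `caAtomicAdd3_xyw`.

```cuda
// Illustrative kernel only (not part of the original header): scatter-add float
// values into bins, letting warp lanes that hit the same bin combine their adds
// in shared memory so that only the group leader issues a global atomicAdd.
#include "common.h"

__global__ void binnedScatterAdd(const int* bin, const float* val, float* out, int count)
{
    CA_DECLARE_TEMP(256);                                     // one float of scratch per thread (blockDim.x == 256 assumed)
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    bool active = (idx < count);
    unsigned int amask = __ballot_sync(0xffffffffu, active);  // lanes still in range
    if (!active)
        return;
    CA_SET_GROUP_MASK(bin[idx], amask);                       // group lanes of this warp that target the same bin
    caAtomicAdd(out + bin[idx], val[idx]);                    // leader writes the combined sum to global memory
}
```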
extensions/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp ADDED
@@ -0,0 +1,63 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+
11
+ //------------------------------------------------------------------------
12
+ // This is a slimmed-down and modernized version of the original
13
+ // CudaRaster codebase that accompanied the HPG 2011 paper
14
+ // "High-Performance Software Rasterization on GPUs" by Laine and Karras.
15
+ // Modifications have been made to accommodate post-Volta execution model
16
+ // with warp divergence. Support for shading, blending, quad rendering,
17
+ // and supersampling has been removed as unnecessary for nvdiffrast.
18
+ //------------------------------------------------------------------------
19
+
20
+ namespace CR
21
+ {
22
+
23
+ class RasterImpl;
24
+
25
+ //------------------------------------------------------------------------
26
+ // Interface class to isolate user from implementation details.
27
+ //------------------------------------------------------------------------
28
+
29
+ class CudaRaster
30
+ {
31
+ public:
32
+ enum
33
+ {
34
+ RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
35
+ RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
36
+ };
37
+
38
+ public:
39
+ CudaRaster (void);
40
+ ~CudaRaster (void);
41
+
42
+ void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
43
+ void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
44
+ void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
45
+ void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
46
+ void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
47
+ void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
48
+ bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
49
+ void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
50
+ void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
51
+ void swapDepthAndPeel (void); // Swap depth and peeling buffers.
52
+
53
+ private:
54
+ CudaRaster (const CudaRaster&); // forbidden
55
+ CudaRaster& operator= (const CudaRaster&); // forbidden
56
+
57
+ private:
58
+ RasterImpl* m_impl; // Opaque pointer to implementation.
59
+ };
60
+
61
+ //------------------------------------------------------------------------
62
+ } // namespace CR
63
+
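
Taken together, the methods above imply roughly the following per-frame call sequence. This is a usage sketch of the interface, not code from the repository; buffer allocation, the vertex/index contents, and error handling remain the caller's responsibility.

```cpp
// Illustrative usage of the CudaRaster interface (sketch only).
#include <cuda_runtime.h>
#include "CudaRaster.hpp"

void rasterizeOnce(void* d_vertices, int numVertices,   // device float4 clip-space positions
                   void* d_indices, int numTriangles,   // device uint4 (idx0, idx1, idx2, color)
                   int width, int height, cudaStream_t stream)
{
    CR::CudaRaster cr;
    cr.setBufferSize(width, height, /*numImages=*/1);    // rounded up internally to 8x8 tiles
    cr.setViewport(width, height, /*offsetX=*/0, /*offsetY=*/0);
    cr.setVertexBuffer(d_vertices, numVertices);
    cr.setIndexBuffer(d_indices, numTriangles);
    cr.deferredClear(0u);                                 // clear color/depth on the next draw
    bool ok = cr.drawTriangles(nullptr, /*peel=*/false, stream); // NULL ranges => draw all triangles
    if (ok)
    {
        void* colorBuffer = cr.getColorBuffer();          // device pointer owned by CudaRaster
        (void)colorBuffer;                                 // copy out or consume as needed
    }
}
```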
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl ADDED
@@ -0,0 +1,423 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+
11
+ __device__ __inline__ void binRasterImpl(const CRParams p)
12
+ {
13
+ __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
14
+ __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
15
+ __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
16
+ __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
17
+ __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
18
+ __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
19
+ __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
20
+ __shared__ volatile U32 s_batchPos;
21
+ __shared__ volatile U32 s_bufCount;
22
+ __shared__ volatile U32 s_overTotal;
23
+ __shared__ volatile U32 s_allocBase;
24
+
25
+ const CRImageParams& ip = getImageParams(p, blockIdx.z);
26
+ CRAtomics& atomics = p.atomics[blockIdx.z];
27
+ const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
28
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
29
+
30
+ S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
31
+ S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
32
+ S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
33
+ S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
34
+ S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
35
+
36
+ if (atomics.numSubtris > p.maxSubtris)
37
+ return;
38
+
39
+ // per-thread state
40
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
41
+ int batchPos = 0;
42
+
43
+ // first 16 elements of s_broadcast are always zero
44
+ if (thrInBlock < 16)
45
+ s_broadcast[thrInBlock] = 0;
46
+
47
+ // initialize output linked lists and offsets
48
+ if (thrInBlock < p.numBins)
49
+ {
50
+ binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
51
+ s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
52
+ s_outTotal[thrInBlock] = 0;
53
+ }
54
+
55
+ // repeat until done
56
+ for(;;)
57
+ {
58
+ // get batch
59
+ if (thrInBlock == 0)
60
+ s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
61
+ __syncthreads();
62
+ batchPos = s_batchPos;
63
+
64
+ // all batches done?
65
+ if (batchPos >= ip.triCount)
66
+ break;
67
+
68
+ // per-thread state
69
+ int bufIndex = 0;
70
+ int bufCount = 0;
71
+ int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
72
+
73
+ // loop over batch as long as we have triangles in it
74
+ do
75
+ {
76
+ // read more triangles
77
+ while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
78
+ {
79
+ // get subtriangle count
80
+
81
+ int triIdx = batchPos + thrInBlock;
82
+ int num = 0;
83
+ if (triIdx < batchEnd)
84
+ num = triSubtris[triIdx];
85
+
86
+ // cumulative sum of subtriangles within each warp
87
+ U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
88
+ if (__any_sync(~0u, num > 1))
89
+ {
90
+ myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
91
+ myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
92
+ }
93
+ if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
94
+ s_broadcast[threadIdx.y + 16] = myIdx + num;
95
+ __syncthreads();
96
+
97
+ // cumulative sum of per-warp subtriangle counts
98
+ // Note: cannot have more than 32 warps or this needs to sync between each step.
99
+ bool act = (thrInBlock < CR_BIN_WARPS);
100
+ U32 actMask = __ballot_sync(~0u, act);
101
+ if (threadIdx.y == 0 && act)
102
+ {
103
+ volatile U32* ptr = &s_broadcast[thrInBlock + 16];
104
+ U32 val = *ptr;
105
+ #if (CR_BIN_WARPS > 1)
106
+ val += ptr[-1]; __syncwarp(actMask);
107
+ *ptr = val; __syncwarp(actMask);
108
+ #endif
109
+ #if (CR_BIN_WARPS > 2)
110
+ val += ptr[-2]; __syncwarp(actMask);
111
+ *ptr = val; __syncwarp(actMask);
112
+ #endif
113
+ #if (CR_BIN_WARPS > 4)
114
+ val += ptr[-4]; __syncwarp(actMask);
115
+ *ptr = val; __syncwarp(actMask);
116
+ #endif
117
+ #if (CR_BIN_WARPS > 8)
118
+ val += ptr[-8]; __syncwarp(actMask);
119
+ *ptr = val; __syncwarp(actMask);
120
+ #endif
121
+ #if (CR_BIN_WARPS > 16)
122
+ val += ptr[-16]; __syncwarp(actMask);
123
+ *ptr = val; __syncwarp(actMask);
124
+ #endif
125
+
126
+ // initially assume that we consume everything
127
+ // only last active thread does the writes
128
+ if (threadIdx.x == CR_BIN_WARPS - 1)
129
+ {
130
+ s_batchPos = batchPos + CR_BIN_WARPS * 32;
131
+ s_bufCount = bufCount + val;
132
+ }
133
+ }
134
+ __syncthreads();
135
+
136
+ // skip if no subtriangles
137
+ if (num)
138
+ {
139
+ // calculate write position for first subtriangle
140
+ U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
141
+
142
+ // only write if entire triangle fits
143
+ if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
144
+ {
145
+ pos += bufIndex; // adjust for current start position
146
+ pos &= CR_ARRAY_SIZE(s_triBuf)-1;
147
+ if (num == 1)
148
+ s_triBuf[pos] = triIdx * 8 + 7; // single triangle
149
+ else
150
+ {
151
+ for (int i=0; i < num; i++)
152
+ {
153
+ s_triBuf[pos] = triIdx * 8 + i;
154
+ pos++;
155
+ pos &= CR_ARRAY_SIZE(s_triBuf)-1;
156
+ }
157
+ }
158
+ } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
159
+ {
160
+ // this triangle is the first that failed, overwrite total count and triangle count
161
+ s_batchPos = batchPos + thrInBlock;
162
+ s_bufCount = pos;
163
+ }
164
+ }
165
+
166
+ // update triangle counts
167
+ __syncthreads();
168
+ batchPos = s_batchPos;
169
+ bufCount = s_bufCount;
170
+ }
171
+
172
+ // make every warp clear its output buffers
173
+ for (int i=threadIdx.x; i < p.numBins; i += 32)
174
+ s_outMask[threadIdx.y][i] = 0;
175
+ __syncwarp();
176
+
177
+ // choose our triangle
178
+ uint4 triData = make_uint4(0, 0, 0, 0);
179
+ if (thrInBlock < bufCount)
180
+ {
181
+ U32 triPos = bufIndex + thrInBlock;
182
+ triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
183
+
184
+ // find triangle
185
+ int triIdx = s_triBuf[triPos];
186
+ int dataIdx = triIdx >> 3;
187
+ int subtriIdx = triIdx & 7;
188
+ if (subtriIdx != 7)
189
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
190
+
191
+ // read triangle
192
+
193
+ triData = *(((const uint4*)triHeader) + dataIdx);
194
+ }
195
+
196
+ // setup bounding box and edge functions, and rasterize
197
+ S32 lox, loy, hix, hiy;
198
+ bool hasTri = (thrInBlock < bufCount);
199
+ U32 hasTriMask = __ballot_sync(~0u, hasTri);
200
+ if (hasTri)
201
+ {
202
+ S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
203
+ S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
204
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
205
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
206
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
207
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
208
+ int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
209
+ lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
210
+ loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
211
+ hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
212
+ hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
213
+
214
+ U32 bit = 1 << threadIdx.x;
215
+ #if __CUDA_ARCH__ >= 700
216
+ bool multi = (hix != lox || hiy != loy);
217
+ if (!__any_sync(hasTriMask, multi))
218
+ {
219
+ int binIdx = lox + p.widthBins * loy;
220
+ U32 mask = __match_any_sync(hasTriMask, binIdx);
221
+ s_outMask[threadIdx.y][binIdx] = mask;
222
+ __syncwarp(hasTriMask);
223
+ } else
224
+ #endif
225
+ {
226
+ bool complex = (hix > lox+1 || hiy > loy+1);
227
+ if (!__any_sync(hasTriMask, complex))
228
+ {
229
+ int binIdx = lox + p.widthBins * loy;
230
+ atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
231
+ if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
232
+ if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
233
+ if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
234
+ } else
235
+ {
236
+ S32 d12x = d02x - d01x, d12y = d02y - d01y;
237
+ v0x -= lox << binLog, v0y -= loy << binLog;
238
+
239
+ S32 t01 = v0x * d01y - v0y * d01x;
240
+ S32 t02 = v0y * d02x - v0x * d02y;
241
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
242
+ S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
243
+ S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
244
+ S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
245
+
246
+ int width = hix - lox + 1;
247
+ d01x += width * d01y;
248
+ d02x += width * d02y;
249
+ d12x += width * d12y;
250
+
251
+ U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
252
+ U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
253
+ U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
254
+ int stride = p.widthBins * 4;
255
+ int ptrYInc = stride - width * 4;
256
+
257
+ do
258
+ {
259
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
260
+ atomicOr((U32*)currPtr, bit);
261
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
262
+ if (currPtr == skipPtr)
263
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
264
+ }
265
+ while (currPtr != endPtr);
266
+ }
267
+ }
268
+ }
269
+
270
+ // count per-bin contributions
271
+ if (thrInBlock == 0)
272
+ s_overTotal = 0; // overflow counter
273
+
274
+ // ensure that out masks are done
275
+ __syncthreads();
276
+
277
+ int overIndex = -1;
278
+ bool act = (thrInBlock < p.numBins);
279
+ U32 actMask = __ballot_sync(~0u, act);
280
+ if (act)
281
+ {
282
+ U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
283
+ U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
284
+ int total = 0;
285
+ for (int i = 0; i < CR_BIN_WARPS; i++)
286
+ {
287
+ total += __popc(*(U32*)srcPtr);
288
+ *(U32*)dstPtr = total;
289
+ srcPtr += (CR_MAXBINS_SQR + 1) * 4;
290
+ dstPtr += (CR_MAXBINS_SQR + 1) * 4;
291
+ }
292
+
293
+ // overflow => request a new segment
294
+ int ofs = s_outOfs[thrInBlock];
295
+ bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
296
+ U32 ovrMask = __ballot_sync(actMask, ovr);
297
+ if (ovr)
298
+ {
299
+ overIndex = __popc(ovrMask & getLaneMaskLt());
300
+ if (overIndex == 0)
301
+ s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
302
+ __syncwarp(ovrMask);
303
+ overIndex += s_broadcast[threadIdx.y + 16];
304
+ s_overIndex[thrInBlock] = overIndex;
305
+ }
306
+ }
307
+
308
+ // sync after overTotal is ready
309
+ __syncthreads();
310
+
311
+ // at least one segment overflowed => allocate segments
312
+ U32 overTotal = s_overTotal;
313
+ U32 allocBase = 0;
314
+ if (overTotal > 0)
315
+ {
316
+ // allocate memory
317
+ if (thrInBlock == 0)
318
+ {
319
+ U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
320
+ s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
321
+ }
322
+ __syncthreads();
323
+ allocBase = s_allocBase;
324
+
325
+ // did my bin overflow?
326
+ if (overIndex != -1)
327
+ {
328
+ // calculate new segment index
329
+ int segIdx = allocBase + overIndex;
330
+
331
+ // add to linked list
332
+ if (s_outOfs[thrInBlock] < 0)
333
+ binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
334
+ else
335
+ binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
336
+
337
+ // defaults
338
+ binSegNext [segIdx] = -1;
339
+ binSegCount[segIdx] = CR_BIN_SEG_SIZE;
340
+ }
341
+ }
342
+
343
+ // concurrent emission -- each warp handles its own triangle
344
+ if (thrInBlock < bufCount)
345
+ {
346
+ int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
347
+ int currBin = lox + loy * p.widthBins;
348
+ int skipBin = (hix + 1) + loy * p.widthBins;
349
+ int endBin = lox + (hiy + 1) * p.widthBins;
350
+ int binYInc = p.widthBins - (hix - lox + 1);
351
+
352
+ // loop over triangle's bins
353
+ do
354
+ {
355
+ U32 outMask = s_outMask[threadIdx.y][currBin];
356
+ if (outMask & (1<<threadIdx.x))
357
+ {
358
+ int idx = __popc(outMask & getLaneMaskLt());
359
+ if (threadIdx.y > 0)
360
+ idx += s_outCount[threadIdx.y-1][currBin];
361
+
362
+ int base = s_outOfs[currBin];
363
+ int free = (-base) & (CR_BIN_SEG_SIZE - 1);
364
+ if (idx >= free)
365
+ idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
366
+ else
367
+ idx += base;
368
+
369
+ binSegData[idx] = s_triBuf[triPos];
370
+ }
371
+
372
+ currBin++;
373
+ if (currBin == skipBin)
374
+ currBin += binYInc, skipBin += p.widthBins;
375
+ }
376
+ while (currBin != endBin);
377
+ }
378
+
379
+ // wait for all triangles to finish, then replace overflowed segment offsets
380
+ __syncthreads();
381
+ if (thrInBlock < p.numBins)
382
+ {
383
+ U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
384
+ U32 oldOfs = s_outOfs[thrInBlock];
385
+ if (overIndex == -1)
386
+ s_outOfs[thrInBlock] = oldOfs + total;
387
+ else
388
+ {
389
+ int addr = oldOfs + total;
390
+ addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
391
+ addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
392
+ s_outOfs[thrInBlock] = addr;
393
+ }
394
+ s_outTotal[thrInBlock] += total;
395
+ }
396
+
397
+ // these triangles are now done
398
+ int count = ::min(bufCount, CR_BIN_WARPS * 32);
399
+ bufCount -= count;
400
+ bufIndex += count;
401
+ bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
402
+ }
403
+ while (bufCount > 0 || batchPos < batchEnd);
404
+
405
+ // flush all bins
406
+ if (thrInBlock < p.numBins)
407
+ {
408
+ int ofs = s_outOfs[thrInBlock];
409
+ if (ofs & (CR_BIN_SEG_SIZE-1))
410
+ {
411
+ int seg = ofs >> CR_BIN_SEG_LOG2;
412
+ binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
413
+ s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
414
+ }
415
+ }
416
+ }
417
+
418
+ // output totals
419
+ if (thrInBlock < p.numBins)
420
+ binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
421
+ }
422
+
423
+ //------------------------------------------------------------------------
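
One detail worth calling out from the kernel above is the encoding of `s_triBuf` entries: each entry stores `triangleIndex * 8 + slot`, where slot 7 marks an unsplit triangle and slots 0-6 select a subtriangle resolved through `triHeader[...].misc`. A tiny host-side round-trip of that packing, purely as a reading aid:

```cpp
// Sketch only: the s_triBuf entry packing used by the bin rasterizer above.
#include <cassert>

int  packEntry(int triIdx, int slot) { return triIdx * 8 + slot; } // slot 7 == whole triangle
int  entryTri (int entry)            { return entry >> 3; }
int  entrySlot(int entry)            { return entry & 7; }

int main()
{
    int whole = packEntry(1234, 7);            // unsplit triangle, read directly from triHeader[1234]
    assert(entryTri(whole) == 1234 && entrySlot(whole) == 7);

    int sub = packEntry(1234, 2);              // third subtriangle; header index is triHeader[1234].misc + 2
    assert(entryTri(sub) == 1234 && entrySlot(sub) == 2);
    return 0;
}
```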
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp ADDED
@@ -0,0 +1,94 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "../../framework.h"
10
+ #include "Buffer.hpp"
11
+
12
+ using namespace CR;
13
+
14
+ //------------------------------------------------------------------------
15
+ // GPU buffer.
16
+ //------------------------------------------------------------------------
17
+
18
+ Buffer::Buffer(void)
19
+ : m_gpuPtr(NULL),
20
+ m_bytes (0)
21
+ {
22
+ // empty
23
+ }
24
+
25
+ Buffer::~Buffer(void)
26
+ {
27
+ if (m_gpuPtr)
28
+ cudaFree(m_gpuPtr); // Don't throw an exception.
29
+ }
30
+
31
+ void Buffer::reset(size_t bytes)
32
+ {
33
+ if (bytes == m_bytes)
34
+ return;
35
+
36
+ if (m_gpuPtr)
37
+ {
38
+ NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
39
+ m_gpuPtr = NULL;
40
+ }
41
+
42
+ if (bytes > 0)
43
+ NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
44
+
45
+ m_bytes = bytes;
46
+ }
47
+
48
+ void Buffer::grow(size_t bytes)
49
+ {
50
+ if (bytes > m_bytes)
51
+ reset(bytes);
52
+ }
53
+
54
+ //------------------------------------------------------------------------
55
+ // Host buffer with page-locked memory.
56
+ //------------------------------------------------------------------------
57
+
58
+ HostBuffer::HostBuffer(void)
59
+ : m_hostPtr(NULL),
60
+ m_bytes (0)
61
+ {
62
+ // empty
63
+ }
64
+
65
+ HostBuffer::~HostBuffer(void)
66
+ {
67
+ if (m_hostPtr)
68
+ cudaFreeHost(m_hostPtr); // Don't throw an exception.
69
+ }
70
+
71
+ void HostBuffer::reset(size_t bytes)
72
+ {
73
+ if (bytes == m_bytes)
74
+ return;
75
+
76
+ if (m_hostPtr)
77
+ {
78
+ NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
79
+ m_hostPtr = NULL;
80
+ }
81
+
82
+ if (bytes > 0)
83
+ NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
84
+
85
+ m_bytes = bytes;
86
+ }
87
+
88
+ void HostBuffer::grow(size_t bytes)
89
+ {
90
+ if (bytes > m_bytes)
91
+ reset(bytes);
92
+ }
93
+
94
+ //------------------------------------------------------------------------
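
The two methods differ in when they reallocate: `reset()` frees and reallocates whenever the requested size changes, so existing contents are lost, while `grow()` only does so when the request exceeds the current capacity. A short illustration of the intended usage; the buffer name and sizes are arbitrary:

```cpp
// Illustrative only: reallocation behaviour of CR::Buffer as implemented above.
#include "Buffer.hpp"

void scratchExample()
{
    CR::Buffer scratch;
    scratch.grow(1 << 20);        // allocates 1 MiB
    scratch.grow(1 << 16);        // no-op: request is smaller than the current capacity
    scratch.grow(1 << 22);        // frees and reallocates at 4 MiB (contents are not preserved)
    scratch.reset(0);             // explicit release; the destructor would also free it
}
```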
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp ADDED
@@ -0,0 +1,55 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include "Defs.hpp"
11
+
12
+ namespace CR
13
+ {
14
+ //------------------------------------------------------------------------
15
+
16
+ class Buffer
17
+ {
18
+ public:
19
+ Buffer (void);
20
+ ~Buffer (void);
21
+
22
+ void reset (size_t bytes);
23
+ void grow (size_t bytes);
24
+ void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
25
+ size_t getSize (void) const { return m_bytes; }
26
+
27
+ void setPtr (void* ptr) { m_gpuPtr = ptr; }
28
+
29
+ private:
30
+ void* m_gpuPtr;
31
+ size_t m_bytes;
32
+ };
33
+
34
+ //------------------------------------------------------------------------
35
+
36
+ class HostBuffer
37
+ {
38
+ public:
39
+ HostBuffer (void);
40
+ ~HostBuffer (void);
41
+
42
+ void reset (size_t bytes);
43
+ void grow (size_t bytes);
44
+ void* getPtr (void) { return m_hostPtr; }
45
+ size_t getSize (void) const { return m_bytes; }
46
+
47
+ void setPtr (void* ptr) { m_hostPtr = ptr; }
48
+
49
+ private:
50
+ void* m_hostPtr;
51
+ size_t m_bytes;
52
+ };
53
+
54
+ //------------------------------------------------------------------------
55
+ }
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl ADDED
@@ -0,0 +1,730 @@
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+ //
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
+ // and proprietary rights in and to this software, related documentation
+ // and any modifications thereto. Any use, reproduction, disclosure or
+ // distribution of this software and related documentation without an express
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ //------------------------------------------------------------------------
+
+ __device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
+ {
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
+ int tileY = tileInBin >> CR_BIN_LOG2;
+ return tileX + tileY * widthTiles;
+ }
+
+ //------------------------------------------------------------------------
+
+ __device__ __inline__ void coarseRasterImpl(const CRParams p)
+ {
+ // Common.
+
+ __shared__ volatile U32 s_workCounter;
+ __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
+
+ // Input.
+
+ __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
+ __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
+ __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
+ __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
+ __shared__ volatile S32 s_triQueueWritePos;
+ __shared__ volatile U32 s_binStreamSelectedOfs;
+ __shared__ volatile U32 s_binStreamSelectedSize;
+
+ // Output.
+
+ __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
+ __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
+ __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
+ __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
+ __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
+ __shared__ volatile U32 s_firstAllocSeg;
+ __shared__ volatile U32 s_firstActiveIdx;
+
+ // Pointers and constants.
+
+ CRAtomics& atomics = p.atomics[blockIdx.z];
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
+ const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+ const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+ const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
+ const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
+ const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
+ S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
+ S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
+ S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
+ S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
+ S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
+
+ int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
+ int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
+
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
+ return;
+
+ // Initialize sharedmem arrays.
+
+ if (thrInBlock == 0)
+ {
+ s_tileEmitPrefixSum[0] = 0;
+ s_tileAllocPrefixSum[0] = 0;
+ }
+ s_scanTemp[threadIdx.y][threadIdx.x] = 0;
+
+ // Sort bins in descending order of triangle count.
+
+ for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
+ {
+ int count = 0;
+ for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
+ count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
+ s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
+ }
+
+ __syncthreads();
+ sortShared(s_binOrder, p.numBins);
+
+ // Process each bin by one block.
+
+ for (;;)
+ {
+ // Pick a bin for the block.
+
+ if (thrInBlock == 0)
+ s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
+ __syncthreads();
+
+ int workCounter = s_workCounter;
+ if (workCounter >= p.numBins)
+ break;
+
+ U32 binOrder = s_binOrder[workCounter];
+ bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
+ if (binEmpty && !p.deferredClear)
+ break;
+
+ int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
+
+ // Initialize input/output streams.
+
+ int triQueueWritePos = 0;
+ int triQueueReadPos = 0;
+
+ if (thrInBlock < CR_BIN_STREAMS_SIZE)
+ {
+ int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
+ s_binStreamCurrSeg[thrInBlock] = segIdx;
+ s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
+ }
+
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+ s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
+
+ // Initialize per-bin state.
+
+ int binY = idiv_fast(binIdx, p.widthBins);
+ int binX = binIdx - binY * p.widthBins;
+ int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
+ int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
+ int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
+
+ // Entire block: Merge input streams and process triangles.
+
+ if (!binEmpty)
+ do
+ {
+ //------------------------------------------------------------------------
+ // Merge.
+ //------------------------------------------------------------------------
+
+ // Entire block: Not enough triangles => merge and queue segments.
+ // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
+
+ while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
+ {
+ // First warp: Choose the segment with the lowest initial triangle index.
+
+ bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
+ U32 hasStreamMask = __ballot_sync(~0u, hasStream);
+ if (hasStream)
+ {
+ // Find the stream with the lowest triangle index.
+
+ U32 firstTri = s_binStreamFirstTri[thrInBlock];
+ U32 t = firstTri;
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+
+ #if (CR_BIN_STREAMS_SIZE > 1)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 2)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 4)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 8)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 16)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
+ #endif
+ v[0] = t; __syncwarp(hasStreamMask);
+
+ // Consume and broadcast.
+
+ bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
+ U32 firstMask = __ballot_sync(hasStreamMask, first);
+ if (first && (firstMask >> threadIdx.x) == 1u)
+ {
+ int segIdx = s_binStreamCurrSeg[thrInBlock];
+ s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
+ if (segIdx != -1)
+ {
+ int segSize = binSegCount[segIdx];
+ int segNext = binSegNext[segIdx];
+ s_binStreamSelectedSize = segSize;
+ s_triQueueWritePos = triQueueWritePos + segSize;
+ s_binStreamCurrSeg[thrInBlock] = segNext;
+ s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
+ }
+ }
+ }
+
+ // No more segments => break.
+
+ __syncthreads();
+ triQueueWritePos = s_triQueueWritePos;
+ int segOfs = s_binStreamSelectedOfs;
+ if (segOfs < 0)
+ break;
+
+ int segSize = s_binStreamSelectedSize;
+ __syncthreads();
+
+ // Fetch triangles into the queue.
+
+ for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
+ {
+ S32 triIdx = binSegData[segOfs + idxInSeg];
+ s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
+ }
+ }
+
+ // All threads: Clear emit masks.
+
+ for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
+ s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
+
+ __syncthreads();
+
+ //------------------------------------------------------------------------
+ // Raster.
+ //------------------------------------------------------------------------
+
+ // Triangle per thread: Read from the queue.
+
+ int triIdx = -1;
+ if (triQueueReadPos + thrInBlock < triQueueWritePos)
+ triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
+
+ uint4 triData = make_uint4(0, 0, 0, 0);
+ if (triIdx != -1)
+ {
+ int dataIdx = triIdx >> 3;
+ int subtriIdx = triIdx & 7;
+ if (subtriIdx != 7)
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
+ triData = *((uint4*)triHeader + dataIdx);
+ }
+
+ // 32 triangles per warp: Record emits (= tile intersections).
+
+ if (__any_sync(~0u, triIdx != -1))
+ {
+ S32 v0x = sub_s16lo_s16lo(triData.x, originX);
+ S32 v0y = sub_s16hi_s16lo(triData.x, originY);
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
+
+ // Compute tile-based AABB.
+
+ int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
+ int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
+ int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
+ int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
+ int sizex = add_sub(hix, 1, lox);
+ int sizey = add_sub(hiy, 1, loy);
+ int area = sizex * sizey;
+
+ // Miscellaneous init.
+
+ U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
+ int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
+ U32 maskBit = 1 << threadIdx.x;
+
+ // Case A: All AABBs are small => record the full AABB using atomics.
+
+ if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
+ {
+ if (triIdx != -1)
+ {
+ atomicOr((U32*)currPtr, maskBit);
+ if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
+ if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
+ if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
+ }
+ }
+ else
+ {
+ // Compute warp-AABB (scan-32).
+
+ U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
+ if (triIdx == -1)
+ aabbMask = 0;
+
+ volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
+
+ U32 maskX = aabbMask & 0xFFFF;
+ U32 maskY = aabbMask >> 16;
+ int wlox = findLeadingOne(maskX ^ (maskX - 1));
+ int wloy = findLeadingOne(maskY ^ (maskY - 1));
+ int whix = findLeadingOne(maskX);
+ int whiy = findLeadingOne(maskY);
+ int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
+
+ // Initialize edge functions.
+
+ S32 d12x = d02x - d01x;
+ S32 d12y = d02y - d01y;
+ v0x -= lox << tileLog;
+ v0y -= loy << tileLog;
+
+ S32 t01 = v0x * d01y - v0y * d01x;
+ S32 t02 = v0y * d02x - v0x * d02y;
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
+ S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
+ S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
+ S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
+
+ d01x += sizex * d01y;
+ d02x += sizex * d02y;
+ d12x += sizex * d12y;
+
+ // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
+ if (__any_sync(~0u, warea * 4 <= area * 8))
+ {
+ // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
+ bool act = (triIdx != -1);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ for (int y = wloy; y <= whiy; y++)
+ {
+ bool yIn = (y >= loy && y <= hiy);
+ U32 yMask = __ballot_sync(actMask, yIn);
+ if (yIn)
+ {
+ for (int x = wlox; x <= whix; x++)
+ {
+ bool xyIn = (x >= lox && x <= hix);
+ U32 xyMask = __ballot_sync(yMask, xyIn);
+ if (xyIn)
+ {
+ U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
+ if (threadIdx.x == 31 - __clz(xyMask))
+ *(U32*)currPtr = res;
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+ }
+ }
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
+ }
+ }
+ }
+ }
+
+ // Case C: General case => Check tiles in AABB, record using atomics.
+
+ else
+ {
+ if (triIdx != -1)
+ {
+ U8* skipPtr = currPtr + (sizex << 2);
+ U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
+ do
+ {
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
+ atomicOr((U32*)currPtr, maskBit);
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+ if (currPtr == skipPtr)
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
+ }
+ while (currPtr != endPtr);
+ }
+ }
+ }
+ }
+
+ __syncthreads();
+
+ //------------------------------------------------------------------------
+ // Count.
+ //------------------------------------------------------------------------
+
+ // Tile per thread: Initialize prefix sums.
+
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+ {
+ int tileInBin = tileInBin_base + thrInBlock;
+ bool act = (tileInBin < CR_BIN_SQR);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ // Compute prefix sum of emits over warps.
+
+ U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
+ U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
+ int tileEmits = 0;
+ for (int i = 0; i < CR_COARSE_WARPS; i++)
+ {
+ tileEmits += __popc(*(U32*)srcPtr);
+ *(U32*)dstPtr = tileEmits;
+ srcPtr += (CR_BIN_SQR + 1) * 4;
+ dstPtr += (CR_BIN_SQR + 1) * 4;
+ }
+
+ // Determine the number of segments to allocate.
+
+ int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
+ int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
+ volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
+
+ // All counters within the warp are small => compute prefix sum using ballot.
+
+ if (!__any_sync(actMask, tileEmits >= 2))
+ {
+ U32 m = getLaneMaskLe();
+ *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
+ }
+
+ // Otherwise => scan-32 within the warp.
+
+ else
+ {
+ U32 sum = (tileEmits << emitShift) | tileAllocs;
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask);
+ }
+ }
+ }
+
+ // First warp: Scan-8.
+
+ __syncthreads();
+
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
+ if (scan8)
+ {
+ int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+ v[0] = sum; __syncwarp(scan8Mask);
+ #if (CR_BIN_SQR > 1 * 32)
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 2 * 32)
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 4 * 32)
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ }
+
+ __syncthreads();
+
+ // Tile per thread: Finalize prefix sums.
+ // Single thread: Allocate segments.
+
+ for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+ {
+ int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
+ int numEmits = sum >> emitShift;
+ int numAllocs = sum & ((1 << emitShift) - 1);
+ s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
+ s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
+
+ if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
+ {
+ int t = atomicAdd(&atomics.numTileSegs, numAllocs);
+ s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
+ }
+ }
+
+ __syncthreads();
+ int firstAllocSeg = s_firstAllocSeg;
+ int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
+ int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
+
+ //------------------------------------------------------------------------
+ // Emit.
+ //------------------------------------------------------------------------
+
+ // Emit per thread: Write triangle index to globalmem.
+
+ for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
+ {
+ // Find tile in bin.
+
+ U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
+ U8* tilePtr = tileBase;
+ U8* ptr;
+
+ #if (CR_BIN_SQR > 128)
+ ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 64)
+ ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 32)
+ ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 16)
+ ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 8)
+ ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 4)
+ ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 2)
+ ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 1)
+ ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+
+ int tileInBin = (tilePtr - tileBase) >> 2;
+ int emitInTile = emitInBin - *(U32*)tilePtr;
+
+ // Find warp in tile.
+
+ int warpStep = (CR_BIN_SQR + 1) * 4;
+ U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
+ U8* warpPtr = warpBase;
+
+ #if (CR_COARSE_WARPS > 8)
+ ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+ #if (CR_COARSE_WARPS > 4)
+ ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+ #if (CR_COARSE_WARPS > 2)
+ ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+ #if (CR_COARSE_WARPS > 1)
+ ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+
+ int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
+ U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
+ int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
+
+ // Find thread in warp.
+
+ int threadInWarp = 0;
+ int pop = __popc(emitMask & 0xFFFF);
+ bool pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x10;
+ if (pred) threadInWarp += 0x10;
+
+ pop = __popc(emitMask & 0xFF);
+ pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x08;
+ if (pred) threadInWarp += 0x08;
+
+ pop = __popc(emitMask & 0xF);
+ pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x04;
+ if (pred) threadInWarp += 0x04;
+
+ pop = __popc(emitMask & 0x3);
+ pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x02;
+ if (pred) threadInWarp += 0x02;
+
+ if (emitInWarp >= (emitMask & 1))
+ threadInWarp++;
+
+ // Figure out where to write.
+
+ int currOfs = s_tileStreamCurrOfs[tileInBin];
+ int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
+ int outOfs = emitInTile;
+
+ if (outOfs < spaceLeft)
+ outOfs += currOfs;
+ else
+ {
+ int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
+ outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
+ }
+
+ // Write.
+
+ int queueIdx = warpInTile * 32 + threadInWarp;
+ int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
+
+ tileSegData[outOfs] = triIdx;
+ }
+
+ //------------------------------------------------------------------------
+ // Patch.
+ //------------------------------------------------------------------------
+
+ // Allocated segment per thread: Initialize next-pointer and count.
+
+ for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
+ {
+ int segIdx = firstAllocSeg + i;
+ tileSegNext[segIdx] = segIdx + 1;
+ tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
+ }
+
+ // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
+
+ __syncthreads();
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+ {
+ int oldOfs = s_tileStreamCurrOfs[tileInBin];
+ int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
+ int allocLo = s_tileAllocPrefixSum[tileInBin];
+ int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
+
+ if (allocLo != allocHi)
+ {
+ S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
+ if (oldOfs < 0)
+ nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
+ *nextPtr = firstAllocSeg + allocLo;
+
+ newOfs--;
+ newOfs &= CR_TILE_SEG_SIZE - 1;
+ newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
+ newOfs++;
+ }
+ s_tileStreamCurrOfs[tileInBin] = newOfs;
+ }
+
+ // Advance queue read pointer.
+ // Queue became empty => bin done.
+
+ triQueueReadPos += CR_COARSE_WARPS * 32;
+ }
+ while (triQueueReadPos < triQueueWritePos);
+
+ // Tile per thread: Fix next-pointer and count of the last segment.
+ // 32 tiles per warp: Count active tiles.
+
+ __syncthreads();
+
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+ {
+ int tileInBin = tileInBin_base + thrInBlock;
+ bool act = (tileInBin < CR_BIN_SQR);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
+ int tileY = tileInBin >> CR_BIN_LOG2;
+ bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
+
+ int ofs = s_tileStreamCurrOfs[tileInBin];
+ int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
+ int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
+
+ if (ofs >= 0)
+ tileSegNext[segIdx] = -1;
+ else if (force)
+ {
+ s_tileStreamCurrOfs[tileInBin] = 0;
+ tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
+ }
+
+ if (segCount != 0)
+ tileSegCount[segIdx] = segCount;
+
+ U32 res = __ballot_sync(actMask, ofs >= 0 | force);
+ if (threadIdx.x == 0)
+ s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
+ }
+ }
+
+ // First warp: Scan-8.
+ // One thread: Allocate space for active tiles.
+
+ __syncthreads();
+
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
+ if (scan8)
+ {
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+ U32 sum = v[0];
+ #if (CR_BIN_SQR > 1 * 32)
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 2 * 32)
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 4 * 32)
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+
+ if (thrInBlock == CR_BIN_SQR / 32 - 1)
+ s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
+ }
+
+ // Tile per thread: Output active tiles.
+
+ __syncthreads();
+
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+ {
+ int tileInBin = tileInBin_base + thrInBlock;
+ bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ int activeIdx = s_firstActiveIdx;
+ activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
+ activeIdx += __popc(actMask & getLaneMaskLt());
+ activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
+ }
+ }
+ }
+ }
+
+ //------------------------------------------------------------------------
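One detail of coarseRasterImpl worth calling out is the bin-ordering key: each bin packs (~count, binIdx) into a single 32-bit word, so an ascending sort orders bins by descending triangle count, the bin index is recovered with a mask, and a zero complemented-count field marks an empty bin. The following host-side sketch reproduces the trick in isolation; it is not part of the uploaded kernel, and BINS_LOG2 and the sample counts are illustrative assumptions (the packing requires counts to fit in the bits above the index).

    // Illustrative host-side sketch of the packed bin-ordering key (not part of the upload).
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int BINS_LOG2 = 5;                           // 32x32 bins => 10 index bits (illustrative).
        std::vector<uint32_t> counts = { 0, 7, 3, 0, 12 }; // Triangles per bin (illustrative).

        std::vector<uint32_t> keys(counts.size());
        for (uint32_t i = 0; i < counts.size(); i++)
            keys[i] = (~counts[i] << (BINS_LOG2 * 2)) | i; // Counts must fit above the index bits.

        std::sort(keys.begin(), keys.end());               // Ascending key == descending count.

        for (uint32_t key : keys)
        {
            uint32_t binIdx = key & ((1u << (BINS_LOG2 * 2)) - 1);
            bool     empty  = ((~key >> (BINS_LOG2 * 2)) == 0);
            printf("bin %u: count=%u%s\n", binIdx, counts[binIdx], empty ? " (empty)" : "");
        }
        return 0;
    }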