TalHach61 committed (verified)
Commit 710e08b
Parent: 87da024

Upload 40 files

Files changed (40)
  1. ControlNetUnion-space/.DS_Store +0 -0
  2. ControlNetUnion-space/app.py +264 -0
  3. ControlNetUnion-space/controlnet_aux/.DS_Store +0 -0
  4. ControlNetUnion-space/controlnet_aux/__init__.py +5 -0
  5. ControlNetUnion-space/controlnet_aux/canny/__init__.py +36 -0
  6. ControlNetUnion-space/controlnet_aux/open_pose/LICENSE +108 -0
  7. ControlNetUnion-space/controlnet_aux/open_pose/__init__.py +234 -0
  8. ControlNetUnion-space/controlnet_aux/open_pose/body.py +260 -0
  9. ControlNetUnion-space/controlnet_aux/open_pose/face.py +364 -0
  10. ControlNetUnion-space/controlnet_aux/open_pose/hand.py +90 -0
  11. ControlNetUnion-space/controlnet_aux/open_pose/model.py +217 -0
  12. ControlNetUnion-space/controlnet_aux/open_pose/util.py +383 -0
  13. ControlNetUnion-space/controlnet_aux/util.py +146 -0
  14. ControlNetUnion-space/depth_anything_v2/.DS_Store +0 -0
  15. ControlNetUnion-space/depth_anything_v2/dinov2.py +415 -0
  16. ControlNetUnion-space/depth_anything_v2/dinov2_layers/__init__.py +11 -0
  17. ControlNetUnion-space/depth_anything_v2/dinov2_layers/attention.py +83 -0
  18. ControlNetUnion-space/depth_anything_v2/dinov2_layers/block.py +252 -0
  19. ControlNetUnion-space/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
  20. ControlNetUnion-space/depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
  21. ControlNetUnion-space/depth_anything_v2/dinov2_layers/mlp.py +41 -0
  22. ControlNetUnion-space/depth_anything_v2/dinov2_layers/patch_embed.py +89 -0
  23. ControlNetUnion-space/depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
  24. ControlNetUnion-space/depth_anything_v2/dpt.py +221 -0
  25. ControlNetUnion-space/depth_anything_v2/util/blocks.py +148 -0
  26. ControlNetUnion-space/depth_anything_v2/util/transform.py +158 -0
  27. ControlNetUnion-space/requirements.txt +17 -0
  28. app.py +264 -0
  29. controlnet_aux/.DS_Store +0 -0
  30. controlnet_aux/__init__.py +5 -0
  31. controlnet_aux/canny/__init__.py +36 -0
  32. controlnet_aux/open_pose/LICENSE +108 -0
  33. controlnet_aux/open_pose/__init__.py +234 -0
  34. controlnet_aux/open_pose/body.py +260 -0
  35. controlnet_aux/open_pose/face.py +364 -0
  36. controlnet_aux/open_pose/hand.py +90 -0
  37. controlnet_aux/open_pose/model.py +217 -0
  38. controlnet_aux/open_pose/util.py +383 -0
  39. controlnet_aux/util.py +146 -0
  40. requirements.txt +17 -0
ControlNetUnion-space/.DS_Store ADDED
Binary file (6.15 kB).
 
ControlNetUnion-space/app.py ADDED
@@ -0,0 +1,264 @@
+ import sys
+ sys.path.append('./')
+
+ import gradio as gr
+ import spaces
+ import os
+ import sys
+ import subprocess
+ import numpy as np
+ from PIL import Image
+ import cv2
+ import torch
+ import random
+
+ os.system("pip install -e ./controlnet_aux")
+
+ from controlnet_aux import OpenposeDetector, CannyDetector
+ from depth_anything_v2.dpt import DepthAnythingV2
+
+ from huggingface_hub import hf_hub_download
+
+ from huggingface_hub import login
+ hf_token = os.environ.get("HF_TOKEN_GATED")
+ login(token=hf_token)
+
+ MAX_SEED = np.iinfo(np.int32).max
+
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+ model_configs = {
+     'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+     'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+     'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+     'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
+ }
+
+ encoder = 'vitl'
+ model = DepthAnythingV2(**model_configs[encoder])
+ filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-Large", filename=f"depth_anything_v2_vitl.pth", repo_type="model")
+ state_dict = torch.load(filepath, map_location="cpu")
+ model.load_state_dict(state_dict)
+ model = model.to(DEVICE).eval()
+
+ import torch
+ from diffusers.utils import load_image
+ from diffusers import FluxControlNetPipeline, FluxControlNetModel
+ from diffusers.models import FluxMultiControlNetModel
+
+ base_model = 'black-forest-labs/FLUX.1-dev'
+ controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'
+ controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
+ controlnet = FluxMultiControlNetModel([controlnet])
+ pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
+ pipe.to("cuda")
+
+ mode_mapping = {"canny":0, "tile":1, "depth":2, "blur":3, "openpose":4, "gray":5, "low quality": 6}
+ strength_mapping = {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}
+
+ canny = CannyDetector()
+ open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
+
+ torch.backends.cuda.matmul.allow_tf32 = True
+ pipe.vae.enable_tiling()
+ pipe.vae.enable_slicing()
+ pipe.enable_model_cpu_offload() # for saving memory
+
+ def convert_from_image_to_cv2(img: Image) -> np.ndarray:
+     return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+
+ def convert_from_cv2_to_image(img: np.ndarray) -> Image:
+     return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+
+ def extract_depth(image):
+     image = np.asarray(image)
+     depth = model.infer_image(image[:, :, ::-1])
+     depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+     depth = depth.astype(np.uint8)
+     gray_depth = Image.fromarray(depth).convert('RGB')
+     return gray_depth
+
+ def extract_openpose(img):
+     processed_image_open_pose = open_pose(img, hand_and_face=True)
+     return processed_image_open_pose
+
+ def extract_canny(image):
+     processed_image_canny = canny(image)
+     return processed_image_canny
+
+ def apply_gaussian_blur(image, kernel_size=(21, 21)):
+     image = convert_from_image_to_cv2(image)
+     blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
+     return blurred_image
+
+ def convert_to_grayscale(image):
+     image = convert_from_image_to_cv2(image)
+     gray_image = convert_from_cv2_to_image(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
+     return gray_image
+
+ def add_gaussian_noise(image, mean=0, sigma=10):
+     image = convert_from_image_to_cv2(image)
+     noise = np.random.normal(mean, sigma, image.shape)
+     noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
+     return noisy_image
+
+ def tile(input_image, resolution=768):
+     input_image = convert_from_image_to_cv2(input_image)
+     H, W, C = input_image.shape
+     H = float(H)
+     W = float(W)
+     k = float(resolution) / min(H, W)
+     H *= k
+     W *= k
+     H = int(np.round(H / 64.0)) * 64
+     W = int(np.round(W / 64.0)) * 64
+     img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
+     img = convert_from_cv2_to_image(img)
+     return img
+
+ def resize_img(input_image, max_side=768, min_side=512, size=None,
+                pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
+
+     w, h = input_image.size
+     if size is not None:
+         w_resize_new, h_resize_new = size
+     else:
+         ratio = min_side / min(h, w)
+         w, h = round(ratio*w), round(ratio*h)
+         ratio = max_side / max(h, w)
+         input_image = input_image.resize([round(ratio*w), round(ratio*h)], mode)
+         w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
+         h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
+     input_image = input_image.resize([w_resize_new, h_resize_new], mode)
+
+     if pad_to_max_side:
+         res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
+         offset_x = (max_side - w_resize_new) // 2
+         offset_y = (max_side - h_resize_new) // 2
+         res[offset_y:offset_y+h_resize_new, offset_x:offset_x+w_resize_new] = np.array(input_image)
+         input_image = Image.fromarray(res)
+     return input_image
+
+ @spaces.GPU(duration=180)
+ def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed, progress=gr.Progress(track_tqdm=True)):
+
+     control_mode_num = mode_mapping[control_mode]
+
+     if cond_in is None:
+         if image_in is not None:
+             image_in = resize_img(load_image(image_in))
+             if control_mode == "canny":
+                 control_image = extract_canny(image_in)
+             elif control_mode == "depth":
+                 control_image = extract_depth(image_in)
+             elif control_mode == "openpose":
+                 control_image = extract_openpose(image_in)
+             elif control_mode == "blur":
+                 control_image = apply_gaussian_blur(image_in)
+             elif control_mode == "low quality":
+                 control_image = add_gaussian_noise(image_in)
+             elif control_mode == "gray":
+                 control_image = convert_to_grayscale(image_in)
+             elif control_mode == "tile":
+                 control_image = tile(image_in)
+     else:
+         control_image = resize_img(load_image(cond_in))
+
+     width, height = control_image.size
+
+     image = pipe(
+         prompt,
+         control_image=[control_image],
+         control_mode=[control_mode_num],
+         width=width,
+         height=height,
+         controlnet_conditioning_scale=[control_strength],
+         num_inference_steps=inference_steps,
+         guidance_scale=guidance_scale,
+         generator=torch.manual_seed(seed),
+     ).images[0]
+
+     torch.cuda.empty_cache()
+
+     return image, control_image, gr.update(visible=True)
+
+
+ css="""
+ #col-container{
+     margin: 0 auto;
+     max-width: 1080px;
+ }
+ """
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("""
+         # FLUX.1-dev-ControlNet-Union-Pro
+         A unified ControlNet for FLUX.1-dev model from the InstantX team and Shakker Labs. Model card: [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro). <br />
+         The recommended strength: {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}. Long prompt is preferred by FLUX.1.
+         """)
+
+         with gr.Column():
+
+             with gr.Row():
+                 with gr.Column():
+
+                     with gr.Row(equal_height=True):
+                         cond_in = gr.Image(label="Upload a processed control image", sources=["upload"], type="filepath")
+                         image_in = gr.Image(label="Extract condition from a reference image (Optional)", sources=["upload"], type="filepath")
+
+                     prompt = gr.Textbox(label="Prompt", value="best quality")
+
+                     with gr.Accordion("Controlnet"):
+                         control_mode = gr.Radio(
+                             ["canny", "depth", "openpose", "gray", "blur", "tile", "low quality"], label="Mode", value="gray",
+                             info="select the control mode, one for all"
+                         )
+
+                         control_strength = gr.Slider(
+                             label="control strength",
+                             minimum=0,
+                             maximum=1.0,
+                             step=0.05,
+                             value=0.50,
+                         )
+
+                         seed = gr.Slider(
+                             label="Seed",
+                             minimum=0,
+                             maximum=MAX_SEED,
+                             step=1,
+                             value=42,
+                         )
+                         randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+
+                     with gr.Accordion("Advanced settings", open=False):
+                         with gr.Column():
+                             with gr.Row():
+                                 inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=50, step=1, value=24)
+                                 guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)
+
+                     submit_btn = gr.Button("Submit")
+
+                 with gr.Column():
+                     result = gr.Image(label="Result")
+                     processed_cond = gr.Image(label="Preprocessed Cond")
+
+             submit_btn.click(
+                 fn=randomize_seed_fn,
+                 inputs=[seed, randomize_seed],
+                 outputs=seed,
+                 queue=False,
+                 api_name=False
+             ).then(
+                 fn = infer,
+                 inputs = [cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
+                 outputs = [result, processed_cond],
+                 show_api=False
+             )
+
+ demo.queue(api_open=False)
+ demo.launch()
ControlNetUnion-space/controlnet_aux/.DS_Store ADDED
Binary file (6.15 kB).
 
ControlNetUnion-space/controlnet_aux/__init__.py ADDED
@@ -0,0 +1,5 @@
+ __version__ = "0.0.9"
+
+ from .canny import CannyDetector
+ from .open_pose import OpenposeDetector
+
ControlNetUnion-space/controlnet_aux/canny/__init__.py ADDED
@@ -0,0 +1,36 @@
+ import warnings
+ import cv2
+ import numpy as np
+ from PIL import Image
+ from ..util import HWC3, resize_image
+
+ class CannyDetector:
+     def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, image_resolution=512, output_type=None, **kwargs):
+         if "img" in kwargs:
+             warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
+             input_image = kwargs.pop("img")
+
+         if input_image is None:
+             raise ValueError("input_image must be defined.")
+
+         if not isinstance(input_image, np.ndarray):
+             input_image = np.array(input_image, dtype=np.uint8)
+             output_type = output_type or "pil"
+         else:
+             output_type = output_type or "np"
+
+         input_image = HWC3(input_image)
+         input_image = resize_image(input_image, detect_resolution)
+
+         detected_map = cv2.Canny(input_image, low_threshold, high_threshold)
+         detected_map = HWC3(detected_map)
+
+         img = resize_image(input_image, image_resolution)
+         H, W, C = img.shape
+
+         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+
+         if output_type == "pil":
+             detected_map = Image.fromarray(detected_map)
+
+         return detected_map
ControlNetUnion-space/controlnet_aux/open_pose/LICENSE ADDED
@@ -0,0 +1,108 @@
1
+ OPENPOSE: MULTIPERSON KEYPOINT DETECTION
2
+ SOFTWARE LICENSE AGREEMENT
3
+ ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
4
+
5
+ BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
6
+
7
+ This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
8
+
9
+ RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
10
+ Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
11
+ non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
12
+
13
+ CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
14
+
15
+ COPYRIGHT: The Software is owned by Licensor and is protected by United
16
+ States copyright laws and applicable international treaties and/or conventions.
17
+
18
+ PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
19
+
20
+ DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
21
+
22
+ BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
23
+
24
+ USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
25
+
26
+ You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
27
+
28
+ ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
29
+
30
+ TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
31
+
32
+ The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
33
+
34
+ FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
35
+
36
+ DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
37
+
38
+ SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
39
+
40
+ EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
41
+
42
+ EXPORT REGULATION: Licensee agrees to comply with any and all applicable
43
+ U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
44
+
45
+ SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
46
+
47
+ NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
48
+
49
+ GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
50
+
51
+ ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
52
+
53
+
54
+
55
+ ************************************************************************
56
+
57
+ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
58
+
59
+ This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
60
+
61
+ 1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
62
+
63
+ COPYRIGHT
64
+
65
+ All contributions by the University of California:
66
+ Copyright (c) 2014-2017 The Regents of the University of California (Regents)
67
+ All rights reserved.
68
+
69
+ All other contributions:
70
+ Copyright (c) 2014-2017, the respective contributors
71
+ All rights reserved.
72
+
73
+ Caffe uses a shared copyright model: each contributor holds copyright over
74
+ their contributions to Caffe. The project versioning records all such
75
+ contribution and copyright details. If a contributor wants to further mark
76
+ their specific copyright on a particular contribution, they should indicate
77
+ their copyright solely in the commit message of the change when it is
78
+ committed.
79
+
80
+ LICENSE
81
+
82
+ Redistribution and use in source and binary forms, with or without
83
+ modification, are permitted provided that the following conditions are met:
84
+
85
+ 1. Redistributions of source code must retain the above copyright notice, this
86
+ list of conditions and the following disclaimer.
87
+ 2. Redistributions in binary form must reproduce the above copyright notice,
88
+ this list of conditions and the following disclaimer in the documentation
89
+ and/or other materials provided with the distribution.
90
+
91
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
92
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
93
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
94
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
95
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
96
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
97
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
98
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
99
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
100
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
101
+
102
+ CONTRIBUTION AGREEMENT
103
+
104
+ By contributing to the BVLC/caffe repository through pull-request, comment,
105
+ or otherwise, the contributor releases their content to the
106
+ license and copyright terms herein.
107
+
108
+ ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
ControlNetUnion-space/controlnet_aux/open_pose/__init__.py ADDED
@@ -0,0 +1,234 @@
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+ # 4th Edited by ControlNet (added face and correct hands)
6
+ # 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixs)
7
+ # This preprocessor is licensed by CMU for non-commercial use only.
8
+
9
+
10
+ import os
11
+
12
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
13
+
14
+ import json
15
+ import warnings
16
+ from typing import Callable, List, NamedTuple, Tuple, Union
17
+
18
+ import cv2
19
+ import numpy as np
20
+ import torch
21
+ from huggingface_hub import hf_hub_download
22
+ from PIL import Image
23
+
24
+ from ..util import HWC3, resize_image
25
+ from . import util
26
+ from .body import Body, BodyResult, Keypoint
27
+ from .face import Face
28
+ from .hand import Hand
29
+
30
+ HandResult = List[Keypoint]
31
+ FaceResult = List[Keypoint]
32
+
33
+ class PoseResult(NamedTuple):
34
+ body: BodyResult
35
+ left_hand: Union[HandResult, None]
36
+ right_hand: Union[HandResult, None]
37
+ face: Union[FaceResult, None]
38
+
39
+ def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
40
+ """
41
+ Draw the detected poses on an empty canvas.
42
+
43
+ Args:
44
+ poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
45
+ H (int): The height of the canvas.
46
+ W (int): The width of the canvas.
47
+ draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
48
+ draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
49
+ draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.
50
+
51
+ Returns:
52
+ numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
53
+ """
54
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
55
+
56
+ for pose in poses:
57
+ if draw_body:
58
+ canvas = util.draw_bodypose(canvas, pose.body.keypoints)
59
+
60
+ if draw_hand:
61
+ canvas = util.draw_handpose(canvas, pose.left_hand)
62
+ canvas = util.draw_handpose(canvas, pose.right_hand)
63
+
64
+ if draw_face:
65
+ canvas = util.draw_facepose(canvas, pose.face)
66
+
67
+ return canvas
68
+
69
+
70
+ class OpenposeDetector:
71
+ """
72
+ A class for detecting human poses in images using the Openpose model.
73
+
74
+ Attributes:
75
+ model_dir (str): Path to the directory where the pose models are stored.
76
+ """
77
+ def __init__(self, body_estimation, hand_estimation=None, face_estimation=None):
78
+ self.body_estimation = body_estimation
79
+ self.hand_estimation = hand_estimation
80
+ self.face_estimation = face_estimation
81
+
82
+ @classmethod
83
+ def from_pretrained(cls, pretrained_model_or_path, filename=None, hand_filename=None, face_filename=None, cache_dir=None, local_files_only=False):
84
+
85
+ if pretrained_model_or_path == "lllyasviel/ControlNet":
86
+ filename = filename or "annotator/ckpts/body_pose_model.pth"
87
+ hand_filename = hand_filename or "annotator/ckpts/hand_pose_model.pth"
88
+ face_filename = face_filename or "facenet.pth"
89
+
90
+ face_pretrained_model_or_path = "lllyasviel/Annotators"
91
+ else:
92
+ filename = filename or "body_pose_model.pth"
93
+ hand_filename = hand_filename or "hand_pose_model.pth"
94
+ face_filename = face_filename or "facenet.pth"
95
+
96
+ face_pretrained_model_or_path = pretrained_model_or_path
97
+
98
+ if os.path.isdir(pretrained_model_or_path):
99
+ body_model_path = os.path.join(pretrained_model_or_path, filename)
100
+ hand_model_path = os.path.join(pretrained_model_or_path, hand_filename)
101
+ face_model_path = os.path.join(face_pretrained_model_or_path, face_filename)
102
+ else:
103
+ body_model_path = hf_hub_download(pretrained_model_or_path, filename, cache_dir=cache_dir, local_files_only=local_files_only)
104
+ hand_model_path = hf_hub_download(pretrained_model_or_path, hand_filename, cache_dir=cache_dir, local_files_only=local_files_only)
105
+ face_model_path = hf_hub_download(face_pretrained_model_or_path, face_filename, cache_dir=cache_dir, local_files_only=local_files_only)
106
+
107
+ body_estimation = Body(body_model_path)
108
+ hand_estimation = Hand(hand_model_path)
109
+ face_estimation = Face(face_model_path)
110
+
111
+ return cls(body_estimation, hand_estimation, face_estimation)
112
+
113
+ def to(self, device):
114
+ self.body_estimation.to(device)
115
+ self.hand_estimation.to(device)
116
+ self.face_estimation.to(device)
117
+ return self
118
+
119
+ def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
120
+ left_hand = None
121
+ right_hand = None
122
+ H, W, _ = oriImg.shape
123
+ for x, y, w, is_left in util.handDetect(body, oriImg):
124
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
125
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
126
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
127
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
128
+
129
+ hand_result = [
130
+ Keypoint(x=peak[0], y=peak[1])
131
+ for peak in peaks
132
+ ]
133
+
134
+ if is_left:
135
+ left_hand = hand_result
136
+ else:
137
+ right_hand = hand_result
138
+
139
+ return left_hand, right_hand
140
+
141
+ def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
142
+ face = util.faceDetect(body, oriImg)
143
+ if face is None:
144
+ return None
145
+
146
+ x, y, w = face
147
+ H, W, _ = oriImg.shape
148
+ heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
149
+ peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
150
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
151
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
152
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
153
+ return [
154
+ Keypoint(x=peak[0], y=peak[1])
155
+ for peak in peaks
156
+ ]
157
+
158
+ return None
159
+
160
+ def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
161
+ """
162
+ Detect poses in the given image.
163
+ Args:
164
+ oriImg (numpy.ndarray): The input image for pose detection.
165
+ include_hand (bool, optional): Whether to include hand detection. Defaults to False.
166
+ include_face (bool, optional): Whether to include face detection. Defaults to False.
167
+
168
+ Returns:
169
+ List[PoseResult]: A list of PoseResult objects containing the detected poses.
170
+ """
171
+ oriImg = oriImg[:, :, ::-1].copy()
172
+ H, W, C = oriImg.shape
173
+ with torch.no_grad():
174
+ candidate, subset = self.body_estimation(oriImg)
175
+ bodies = self.body_estimation.format_body_result(candidate, subset)
176
+
177
+ results = []
178
+ for body in bodies:
179
+ left_hand, right_hand, face = (None,) * 3
180
+ if include_hand:
181
+ left_hand, right_hand = self.detect_hands(body, oriImg)
182
+ if include_face:
183
+ face = self.detect_face(body, oriImg)
184
+
185
+ results.append(PoseResult(BodyResult(
186
+ keypoints=[
187
+ Keypoint(
188
+ x=keypoint.x / float(W),
189
+ y=keypoint.y / float(H)
190
+ ) if keypoint is not None else None
191
+ for keypoint in body.keypoints
192
+ ],
193
+ total_score=body.total_score,
194
+ total_parts=body.total_parts
195
+ ), left_hand, right_hand, face))
196
+
197
+ return results
198
+
199
+ def __call__(self, input_image, detect_resolution=512, image_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", **kwargs):
200
+ if hand_and_face is not None:
201
+ warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
202
+ include_hand = hand_and_face
203
+ include_face = hand_and_face
204
+
205
+ if "return_pil" in kwargs:
206
+ warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
207
+ output_type = "pil" if kwargs["return_pil"] else "np"
208
+ if type(output_type) is bool:
209
+ warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
210
+ if output_type:
211
+ output_type = "pil"
212
+
213
+ if not isinstance(input_image, np.ndarray):
214
+ input_image = np.array(input_image, dtype=np.uint8)
215
+
216
+ input_image = HWC3(input_image)
217
+ input_image = resize_image(input_image, detect_resolution)
218
+ H, W, C = input_image.shape
219
+
220
+ poses = self.detect_poses(input_image, include_hand, include_face)
221
+ canvas = draw_poses(poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face)
222
+
223
+ detected_map = canvas
224
+ detected_map = HWC3(detected_map)
225
+
226
+ img = resize_image(input_image, image_resolution)
227
+ H, W, C = img.shape
228
+
229
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
230
+
231
+ if output_type == "pil":
232
+ detected_map = Image.fromarray(detected_map)
233
+
234
+ return detected_map
ControlNetUnion-space/controlnet_aux/open_pose/body.py ADDED
@@ -0,0 +1,260 @@
1
+ import math
2
+ from typing import List, NamedTuple, Union
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from scipy.ndimage.filters import gaussian_filter
8
+
9
+ from . import util
10
+ from .model import bodypose_model
11
+
12
+
13
+ class Keypoint(NamedTuple):
14
+ x: float
15
+ y: float
16
+ score: float = 1.0
17
+ id: int = -1
18
+
19
+
20
+ class BodyResult(NamedTuple):
21
+ # Note: Using `Union` instead of `|` operator as the ladder is a Python
22
+ # 3.10 feature.
23
+ # Annotator code should be Python 3.8 Compatible, as controlnet repo uses
24
+ # Python 3.8 environment.
25
+ # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
26
+ keypoints: List[Union[Keypoint, None]]
27
+ total_score: float
28
+ total_parts: int
29
+
30
+
31
+ class Body(object):
32
+ def __init__(self, model_path):
33
+ self.model = bodypose_model()
34
+ model_dict = util.transfer(self.model, torch.load(model_path))
35
+ self.model.load_state_dict(model_dict)
36
+ self.model.eval()
37
+
38
+ def to(self, device):
39
+ self.model.to(device)
40
+ return self
41
+
42
+ def __call__(self, oriImg):
43
+ device = next(iter(self.model.parameters())).device
44
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
45
+ scale_search = [0.5]
46
+ boxsize = 368
47
+ stride = 8
48
+ padValue = 128
49
+ thre1 = 0.1
50
+ thre2 = 0.05
51
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
52
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
53
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
54
+
55
+ for m in range(len(multiplier)):
56
+ scale = multiplier[m]
57
+ imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
58
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
59
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
60
+ im = np.ascontiguousarray(im)
61
+
62
+ data = torch.from_numpy(im).float()
63
+ data = data.to(device)
64
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
65
+ with torch.no_grad():
66
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
67
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
68
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
69
+
70
+ # extract outputs, resize, and remove padding
71
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
72
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
73
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
74
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
75
+ heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
76
+
77
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
78
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
79
+ paf = util.smart_resize_k(paf, fx=stride, fy=stride)
80
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
81
+ paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
82
+
83
+ heatmap_avg += heatmap_avg + heatmap / len(multiplier)
84
+ paf_avg += + paf / len(multiplier)
85
+
86
+ all_peaks = []
87
+ peak_counter = 0
88
+
89
+ for part in range(18):
90
+ map_ori = heatmap_avg[:, :, part]
91
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
92
+
93
+ map_left = np.zeros(one_heatmap.shape)
94
+ map_left[1:, :] = one_heatmap[:-1, :]
95
+ map_right = np.zeros(one_heatmap.shape)
96
+ map_right[:-1, :] = one_heatmap[1:, :]
97
+ map_up = np.zeros(one_heatmap.shape)
98
+ map_up[:, 1:] = one_heatmap[:, :-1]
99
+ map_down = np.zeros(one_heatmap.shape)
100
+ map_down[:, :-1] = one_heatmap[:, 1:]
101
+
102
+ peaks_binary = np.logical_and.reduce(
103
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
104
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
105
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
106
+ peak_id = range(peak_counter, peak_counter + len(peaks))
107
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
108
+
109
+ all_peaks.append(peaks_with_score_and_id)
110
+ peak_counter += len(peaks)
111
+
112
+ # find connection in the specified sequence, center 29 is in the position 15
113
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
114
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
115
+ [1, 16], [16, 18], [3, 17], [6, 18]]
116
+ # the middle joints heatmap correpondence
117
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
118
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
119
+ [55, 56], [37, 38], [45, 46]]
120
+
121
+ connection_all = []
122
+ special_k = []
123
+ mid_num = 10
124
+
125
+ for k in range(len(mapIdx)):
126
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
127
+ candA = all_peaks[limbSeq[k][0] - 1]
128
+ candB = all_peaks[limbSeq[k][1] - 1]
129
+ nA = len(candA)
130
+ nB = len(candB)
131
+ indexA, indexB = limbSeq[k]
132
+ if (nA != 0 and nB != 0):
133
+ connection_candidate = []
134
+ for i in range(nA):
135
+ for j in range(nB):
136
+ vec = np.subtract(candB[j][:2], candA[i][:2])
137
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
138
+ norm = max(0.001, norm)
139
+ vec = np.divide(vec, norm)
140
+
141
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
142
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
143
+
144
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
145
+ for I in range(len(startend))])
146
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
147
+ for I in range(len(startend))])
148
+
149
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
150
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
151
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
152
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
153
+ criterion2 = score_with_dist_prior > 0
154
+ if criterion1 and criterion2:
155
+ connection_candidate.append(
156
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
157
+
158
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
159
+ connection = np.zeros((0, 5))
160
+ for c in range(len(connection_candidate)):
161
+ i, j, s = connection_candidate[c][0:3]
162
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
163
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
164
+ if (len(connection) >= min(nA, nB)):
165
+ break
166
+
167
+ connection_all.append(connection)
168
+ else:
169
+ special_k.append(k)
170
+ connection_all.append([])
171
+
172
+ # last number in each row is the total parts number of that person
173
+ # the second last number in each row is the score of the overall configuration
174
+ subset = -1 * np.ones((0, 20))
175
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
176
+
177
+ for k in range(len(mapIdx)):
178
+ if k not in special_k:
179
+ partAs = connection_all[k][:, 0]
180
+ partBs = connection_all[k][:, 1]
181
+ indexA, indexB = np.array(limbSeq[k]) - 1
182
+
183
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
184
+ found = 0
185
+ subset_idx = [-1, -1]
186
+ for j in range(len(subset)): # 1:size(subset,1):
187
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
188
+ subset_idx[found] = j
189
+ found += 1
190
+
191
+ if found == 1:
192
+ j = subset_idx[0]
193
+ if subset[j][indexB] != partBs[i]:
194
+ subset[j][indexB] = partBs[i]
195
+ subset[j][-1] += 1
196
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
197
+ elif found == 2: # if found 2 and disjoint, merge them
198
+ j1, j2 = subset_idx
199
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
200
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
201
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
202
+ subset[j1][-2:] += subset[j2][-2:]
203
+ subset[j1][-2] += connection_all[k][i][2]
204
+ subset = np.delete(subset, j2, 0)
205
+ else: # as like found == 1
206
+ subset[j1][indexB] = partBs[i]
207
+ subset[j1][-1] += 1
208
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
209
+
210
+ # if find no partA in the subset, create a new subset
211
+ elif not found and k < 17:
212
+ row = -1 * np.ones(20)
213
+ row[indexA] = partAs[i]
214
+ row[indexB] = partBs[i]
215
+ row[-1] = 2
216
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
217
+ subset = np.vstack([subset, row])
218
+ # delete some rows of subset which has few parts occur
219
+ deleteIdx = []
220
+ for i in range(len(subset)):
221
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
222
+ deleteIdx.append(i)
223
+ subset = np.delete(subset, deleteIdx, axis=0)
224
+
225
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
226
+ # candidate: x, y, score, id
227
+ return candidate, subset
228
+
229
+ @staticmethod
230
+ def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
231
+ """
232
+ Format the body results from the candidate and subset arrays into a list of BodyResult objects.
233
+
234
+ Args:
235
+ candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
236
+ for each body part.
237
+ subset (np.ndarray): An array of subsets containing indices to the candidate array for each
238
+ person detected. The last two columns of each row hold the total score and total parts
239
+ of the person.
240
+
241
+ Returns:
242
+ List[BodyResult]: A list of BodyResult objects, where each object represents a person with
243
+ detected keypoints, total score, and total parts.
244
+ """
245
+ return [
246
+ BodyResult(
247
+ keypoints=[
248
+ Keypoint(
249
+ x=candidate[candidate_index][0],
250
+ y=candidate[candidate_index][1],
251
+ score=candidate[candidate_index][2],
252
+ id=candidate[candidate_index][3]
253
+ ) if candidate_index != -1 else None
254
+ for candidate_index in person[:18].astype(int)
255
+ ],
256
+ total_score=person[18],
257
+ total_parts=person[19]
258
+ )
259
+ for person in subset
260
+ ]
ControlNetUnion-space/controlnet_aux/open_pose/face.py ADDED
@@ -0,0 +1,364 @@
1
+ import logging
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch.nn import Conv2d, MaxPool2d, Module, ReLU, init
7
+ from torchvision.transforms import ToPILImage, ToTensor
8
+
9
+ from . import util
10
+
11
+
12
+ class FaceNet(Module):
13
+ """Model the cascading heatmaps. """
14
+ def __init__(self):
15
+ super(FaceNet, self).__init__()
16
+ # cnn to make feature map
17
+ self.relu = ReLU()
18
+ self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
19
+ self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
20
+ kernel_size=3, stride=1, padding=1)
21
+ self.conv1_2 = Conv2d(
22
+ in_channels=64, out_channels=64, kernel_size=3, stride=1,
23
+ padding=1)
24
+ self.conv2_1 = Conv2d(
25
+ in_channels=64, out_channels=128, kernel_size=3, stride=1,
26
+ padding=1)
27
+ self.conv2_2 = Conv2d(
28
+ in_channels=128, out_channels=128, kernel_size=3, stride=1,
29
+ padding=1)
30
+ self.conv3_1 = Conv2d(
31
+ in_channels=128, out_channels=256, kernel_size=3, stride=1,
32
+ padding=1)
33
+ self.conv3_2 = Conv2d(
34
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
35
+ padding=1)
36
+ self.conv3_3 = Conv2d(
37
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
38
+ padding=1)
39
+ self.conv3_4 = Conv2d(
40
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
41
+ padding=1)
42
+ self.conv4_1 = Conv2d(
43
+ in_channels=256, out_channels=512, kernel_size=3, stride=1,
44
+ padding=1)
45
+ self.conv4_2 = Conv2d(
46
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
47
+ padding=1)
48
+ self.conv4_3 = Conv2d(
49
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
50
+ padding=1)
51
+ self.conv4_4 = Conv2d(
52
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
53
+ padding=1)
54
+ self.conv5_1 = Conv2d(
55
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
56
+ padding=1)
57
+ self.conv5_2 = Conv2d(
58
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
59
+ padding=1)
60
+ self.conv5_3_CPM = Conv2d(
61
+ in_channels=512, out_channels=128, kernel_size=3, stride=1,
62
+ padding=1)
63
+
64
+ # stage1
65
+ self.conv6_1_CPM = Conv2d(
66
+ in_channels=128, out_channels=512, kernel_size=1, stride=1,
67
+ padding=0)
68
+ self.conv6_2_CPM = Conv2d(
69
+ in_channels=512, out_channels=71, kernel_size=1, stride=1,
70
+ padding=0)
71
+
72
+ # stage2
73
+ self.Mconv1_stage2 = Conv2d(
74
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
75
+ padding=3)
76
+ self.Mconv2_stage2 = Conv2d(
77
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
78
+ padding=3)
79
+ self.Mconv3_stage2 = Conv2d(
80
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
81
+ padding=3)
82
+ self.Mconv4_stage2 = Conv2d(
83
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
84
+ padding=3)
85
+ self.Mconv5_stage2 = Conv2d(
86
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
87
+ padding=3)
88
+ self.Mconv6_stage2 = Conv2d(
89
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
90
+ padding=0)
91
+ self.Mconv7_stage2 = Conv2d(
92
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
93
+ padding=0)
94
+
95
+ # stage3
96
+ self.Mconv1_stage3 = Conv2d(
97
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
98
+ padding=3)
99
+ self.Mconv2_stage3 = Conv2d(
100
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
101
+ padding=3)
102
+ self.Mconv3_stage3 = Conv2d(
103
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
104
+ padding=3)
105
+ self.Mconv4_stage3 = Conv2d(
106
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
107
+ padding=3)
108
+ self.Mconv5_stage3 = Conv2d(
109
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
110
+ padding=3)
111
+ self.Mconv6_stage3 = Conv2d(
112
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
113
+ padding=0)
114
+ self.Mconv7_stage3 = Conv2d(
115
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
116
+ padding=0)
117
+
118
+ # stage4
119
+ self.Mconv1_stage4 = Conv2d(
120
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
121
+ padding=3)
122
+ self.Mconv2_stage4 = Conv2d(
123
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
124
+ padding=3)
125
+ self.Mconv3_stage4 = Conv2d(
126
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
127
+ padding=3)
128
+ self.Mconv4_stage4 = Conv2d(
129
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
130
+ padding=3)
131
+ self.Mconv5_stage4 = Conv2d(
132
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
133
+ padding=3)
134
+ self.Mconv6_stage4 = Conv2d(
135
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
136
+ padding=0)
137
+ self.Mconv7_stage4 = Conv2d(
138
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
139
+ padding=0)
140
+
141
+ # stage5
142
+ self.Mconv1_stage5 = Conv2d(
143
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
144
+ padding=3)
145
+ self.Mconv2_stage5 = Conv2d(
146
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
147
+ padding=3)
148
+ self.Mconv3_stage5 = Conv2d(
149
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
150
+ padding=3)
151
+ self.Mconv4_stage5 = Conv2d(
152
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
153
+ padding=3)
154
+ self.Mconv5_stage5 = Conv2d(
155
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
156
+ padding=3)
157
+ self.Mconv6_stage5 = Conv2d(
158
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
159
+ padding=0)
160
+ self.Mconv7_stage5 = Conv2d(
161
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
162
+ padding=0)
163
+
164
+ # stage6
165
+ self.Mconv1_stage6 = Conv2d(
166
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
167
+ padding=3)
168
+ self.Mconv2_stage6 = Conv2d(
169
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
170
+ padding=3)
171
+ self.Mconv3_stage6 = Conv2d(
172
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
173
+ padding=3)
174
+ self.Mconv4_stage6 = Conv2d(
175
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
176
+ padding=3)
177
+ self.Mconv5_stage6 = Conv2d(
178
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
179
+ padding=3)
180
+ self.Mconv6_stage6 = Conv2d(
181
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
182
+ padding=0)
183
+ self.Mconv7_stage6 = Conv2d(
184
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
185
+ padding=0)
186
+
187
+ for m in self.modules():
188
+ if isinstance(m, Conv2d):
189
+ init.constant_(m.bias, 0)
190
+
191
+ def forward(self, x):
192
+ """Return a list of heatmaps."""
193
+ heatmaps = []
194
+
195
+ h = self.relu(self.conv1_1(x))
196
+ h = self.relu(self.conv1_2(h))
197
+ h = self.max_pooling_2d(h)
198
+ h = self.relu(self.conv2_1(h))
199
+ h = self.relu(self.conv2_2(h))
200
+ h = self.max_pooling_2d(h)
201
+ h = self.relu(self.conv3_1(h))
202
+ h = self.relu(self.conv3_2(h))
203
+ h = self.relu(self.conv3_3(h))
204
+ h = self.relu(self.conv3_4(h))
205
+ h = self.max_pooling_2d(h)
206
+ h = self.relu(self.conv4_1(h))
207
+ h = self.relu(self.conv4_2(h))
208
+ h = self.relu(self.conv4_3(h))
209
+ h = self.relu(self.conv4_4(h))
210
+ h = self.relu(self.conv5_1(h))
211
+ h = self.relu(self.conv5_2(h))
212
+ h = self.relu(self.conv5_3_CPM(h))
213
+ feature_map = h
214
+
215
+ # stage1
216
+ h = self.relu(self.conv6_1_CPM(h))
217
+ h = self.conv6_2_CPM(h)
218
+ heatmaps.append(h)
219
+
220
+ # stage2
221
+ h = torch.cat([h, feature_map], dim=1) # channel concat
222
+ h = self.relu(self.Mconv1_stage2(h))
223
+ h = self.relu(self.Mconv2_stage2(h))
224
+ h = self.relu(self.Mconv3_stage2(h))
225
+ h = self.relu(self.Mconv4_stage2(h))
226
+ h = self.relu(self.Mconv5_stage2(h))
227
+ h = self.relu(self.Mconv6_stage2(h))
228
+ h = self.Mconv7_stage2(h)
229
+ heatmaps.append(h)
230
+
231
+ # stage3
232
+ h = torch.cat([h, feature_map], dim=1) # channel concat
233
+ h = self.relu(self.Mconv1_stage3(h))
234
+ h = self.relu(self.Mconv2_stage3(h))
235
+ h = self.relu(self.Mconv3_stage3(h))
236
+ h = self.relu(self.Mconv4_stage3(h))
237
+ h = self.relu(self.Mconv5_stage3(h))
238
+ h = self.relu(self.Mconv6_stage3(h))
239
+ h = self.Mconv7_stage3(h)
240
+ heatmaps.append(h)
241
+
242
+ # stage4
243
+ h = torch.cat([h, feature_map], dim=1) # channel concat
244
+ h = self.relu(self.Mconv1_stage4(h))
245
+ h = self.relu(self.Mconv2_stage4(h))
246
+ h = self.relu(self.Mconv3_stage4(h))
247
+ h = self.relu(self.Mconv4_stage4(h))
248
+ h = self.relu(self.Mconv5_stage4(h))
249
+ h = self.relu(self.Mconv6_stage4(h))
250
+ h = self.Mconv7_stage4(h)
251
+ heatmaps.append(h)
252
+
253
+ # stage5
254
+ h = torch.cat([h, feature_map], dim=1) # channel concat
255
+ h = self.relu(self.Mconv1_stage5(h))
256
+ h = self.relu(self.Mconv2_stage5(h))
257
+ h = self.relu(self.Mconv3_stage5(h))
258
+ h = self.relu(self.Mconv4_stage5(h))
259
+ h = self.relu(self.Mconv5_stage5(h))
260
+ h = self.relu(self.Mconv6_stage5(h))
261
+ h = self.Mconv7_stage5(h)
262
+ heatmaps.append(h)
263
+
264
+ # stage6
265
+ h = torch.cat([h, feature_map], dim=1) # channel concat
266
+ h = self.relu(self.Mconv1_stage6(h))
267
+ h = self.relu(self.Mconv2_stage6(h))
268
+ h = self.relu(self.Mconv3_stage6(h))
269
+ h = self.relu(self.Mconv4_stage6(h))
270
+ h = self.relu(self.Mconv5_stage6(h))
271
+ h = self.relu(self.Mconv6_stage6(h))
272
+ h = self.Mconv7_stage6(h)
273
+ heatmaps.append(h)
274
+
275
+ return heatmaps
276
+
277
+
278
+ LOG = logging.getLogger(__name__)
279
+ TOTEN = ToTensor()
280
+ TOPIL = ToPILImage()
281
+
282
+
283
+ params = {
284
+ 'gaussian_sigma': 2.5,
285
+ 'inference_img_size': 736, # 368, 736, 1312
286
+ 'heatmap_peak_thresh': 0.1,
287
+ 'crop_scale': 1.5,
288
+ 'line_indices': [
289
+ [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
290
+ [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
291
+ [13, 14], [14, 15], [15, 16],
292
+ [17, 18], [18, 19], [19, 20], [20, 21],
293
+ [22, 23], [23, 24], [24, 25], [25, 26],
294
+ [27, 28], [28, 29], [29, 30],
295
+ [31, 32], [32, 33], [33, 34], [34, 35],
296
+ [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
297
+ [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
298
+ [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
299
+ [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
300
+ [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
301
+ [66, 67], [67, 60]
302
+ ],
303
+ }
304
+
305
+
306
+ class Face(object):
307
+ """
308
+ The OpenPose face landmark detector model.
309
+
310
+ Args:
311
+ inference_size: size of the inference image, suggested:
312
+ 368, 736, 1312, default 736
313
+ gaussian_sigma: blur the heatmaps, default 2.5
314
+ heatmap_peak_thresh: return landmark if over threshold, default 0.1
315
+
316
+ """
317
+ def __init__(self, face_model_path,
318
+ inference_size=None,
319
+ gaussian_sigma=None,
320
+ heatmap_peak_thresh=None):
321
+ self.inference_size = inference_size or params["inference_img_size"]
322
+ self.sigma = gaussian_sigma or params['gaussian_sigma']
323
+ self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
324
+ self.model = FaceNet()
325
+ self.model.load_state_dict(torch.load(face_model_path))
326
+ self.model.eval()
327
+
328
+ def to(self, device):
329
+ self.model.to(device)
330
+ return self
331
+
332
+ def __call__(self, face_img):
333
+ device = next(iter(self.model.parameters())).device
334
+ H, W, C = face_img.shape
335
+
336
+ w_size = 384
337
+ x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
338
+
339
+ x_data = x_data.to(device)
340
+
341
+ with torch.no_grad():
342
+ hs = self.model(x_data[None, ...])
343
+ heatmaps = F.interpolate(
344
+ hs[-1],
345
+ (H, W),
346
+ mode='bilinear', align_corners=True).cpu().numpy()[0]
347
+ return heatmaps
348
+
349
+ def compute_peaks_from_heatmaps(self, heatmaps):
350
+ all_peaks = []
351
+ for part in range(heatmaps.shape[0]):
352
+ map_ori = heatmaps[part].copy()
353
+ binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
354
+
355
+ if np.sum(binary) == 0:
356
+ continue
357
+
358
+ positions = np.where(binary > 0.5)
359
+ intensities = map_ori[positions]
360
+ mi = np.argmax(intensities)
361
+ y, x = positions[0][mi], positions[1][mi]
362
+ all_peaks.append([x, y])
363
+
364
+ return np.array(all_peaks)
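
A minimal usage sketch for the Face estimator above. The checkpoint path and the pre-cropped BGR face image are illustrative assumptions; in the real pipeline the crop comes from util.faceDetect further down.

import cv2

# Hypothetical paths/inputs, for illustration only.
face_estimator = Face("facenet.pth").to("cpu")
face_crop = cv2.imread("face_crop.jpg")            # H x W x 3, BGR, uint8

heatmaps = face_estimator(face_crop)               # stack of landmark heatmaps at crop resolution
peaks = face_estimator.compute_peaks_from_heatmaps(heatmaps)
print(peaks.shape)                                 # (n_detected_landmarks, 2) as [x, y] pixels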
ControlNetUnion-space/controlnet_aux/open_pose/hand.py ADDED
@@ -0,0 +1,90 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ from scipy.ndimage import gaussian_filter
5
+ from skimage.measure import label
6
+
7
+ from . import util
8
+ from .model import handpose_model
9
+
10
+
11
+ class Hand(object):
12
+ def __init__(self, model_path):
13
+ self.model = handpose_model()
14
+ model_dict = util.transfer(self.model, torch.load(model_path))
15
+ self.model.load_state_dict(model_dict)
16
+ self.model.eval()
17
+
18
+ def to(self, device):
19
+ self.model.to(device)
20
+ return self
21
+
22
+ def __call__(self, oriImgRaw):
23
+ device = next(iter(self.model.parameters())).device
24
+ scale_search = [0.5, 1.0, 1.5, 2.0]
25
+ # scale_search = [0.5]
26
+ boxsize = 368
27
+ stride = 8
28
+ padValue = 128
29
+ thre = 0.05
30
+ multiplier = [x * boxsize for x in scale_search]
31
+
32
+ wsize = 128
33
+ heatmap_avg = np.zeros((wsize, wsize, 22))
34
+
35
+ Hr, Wr, Cr = oriImgRaw.shape
36
+
37
+ oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)
38
+
39
+ for m in range(len(multiplier)):
40
+ scale = multiplier[m]
41
+ imageToTest = util.smart_resize(oriImg, (scale, scale))
42
+
43
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
44
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
45
+ im = np.ascontiguousarray(im)
46
+
47
+ data = torch.from_numpy(im).float()
48
+ data = data.to(device)
49
+
50
+ with torch.no_grad():
51
+ output = self.model(data).cpu().numpy()
52
+
53
+ # extract outputs, resize, and remove padding
54
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
55
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
56
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
57
+ heatmap = util.smart_resize(heatmap, (wsize, wsize))
58
+
59
+ heatmap_avg += heatmap / len(multiplier)
60
+
61
+ all_peaks = []
62
+ for part in range(21):
63
+ map_ori = heatmap_avg[:, :, part]
64
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
65
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
66
+
67
+ if np.sum(binary) == 0:
68
+ all_peaks.append([0, 0])
69
+ continue
70
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
71
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
72
+ label_img[label_img != max_index] = 0
73
+ map_ori[label_img == 0] = 0
74
+
75
+ y, x = util.npmax(map_ori)
76
+ y = int(float(y) * float(Hr) / float(wsize))
77
+ x = int(float(x) * float(Wr) / float(wsize))
78
+ all_peaks.append([x, y])
79
+ return np.array(all_peaks)
80
+
81
+ if __name__ == "__main__":
82
+ hand_estimation = Hand('../model/hand_pose_model.pth')
83
+
84
+ # test_image = '../images/hand.jpg'
85
+ test_image = '../images/hand.jpg'
86
+ oriImg = cv2.imread(test_image) # B,G,R order
87
+ peaks = hand_estimation(oriImg)
88
+ canvas = util.draw_handpose(oriImg, peaks, True)
89
+ cv2.imshow('', canvas)
90
+ cv2.waitKey(0)
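
For context, a hedged sketch of how the Hand estimator above is usually fed: a square crop taken from a hand box such as those produced by util.handDetect. The checkpoint path, image path, and box values are illustrative assumptions, and the full OpenposeDetector pipeline additionally skips keypoints reported as [0, 0] before shifting them back.

import cv2

hand_estimation = Hand("hand_pose_model.pth").to("cpu")   # hypothetical checkpoint path
ori_img = cv2.imread("person.jpg")                        # BGR, uint8

x, y, w = 100, 200, 160          # one (x, y, w, is_left) box from util.handDetect, for example
crop = ori_img[y:y + w, x:x + w, :]
peaks = hand_estimation(crop)    # (21, 2) array of [x, y] in crop-pixel coordinates
peaks[:, 0] += x                 # naive shift back to full-image coordinates
peaks[:, 1] += y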
ControlNetUnion-space/controlnet_aux/open_pose/model.py ADDED
@@ -0,0 +1,217 @@
1
+ import torch
2
+ from collections import OrderedDict
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ def make_layers(block, no_relu_layers):
8
+ layers = []
9
+ for layer_name, v in block.items():
10
+ if 'pool' in layer_name:
11
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
12
+ padding=v[2])
13
+ layers.append((layer_name, layer))
14
+ else:
15
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
16
+ kernel_size=v[2], stride=v[3],
17
+ padding=v[4])
18
+ layers.append((layer_name, conv2d))
19
+ if layer_name not in no_relu_layers:
20
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
21
+
22
+ return nn.Sequential(OrderedDict(layers))
23
+
24
+ class bodypose_model(nn.Module):
25
+ def __init__(self):
26
+ super(bodypose_model, self).__init__()
27
+
28
+ # these layers have no relu layer
29
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
30
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
31
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
32
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2']
33
+ blocks = {}
34
+ block0 = OrderedDict([
35
+ ('conv1_1', [3, 64, 3, 1, 1]),
36
+ ('conv1_2', [64, 64, 3, 1, 1]),
37
+ ('pool1_stage1', [2, 2, 0]),
38
+ ('conv2_1', [64, 128, 3, 1, 1]),
39
+ ('conv2_2', [128, 128, 3, 1, 1]),
40
+ ('pool2_stage1', [2, 2, 0]),
41
+ ('conv3_1', [128, 256, 3, 1, 1]),
42
+ ('conv3_2', [256, 256, 3, 1, 1]),
43
+ ('conv3_3', [256, 256, 3, 1, 1]),
44
+ ('conv3_4', [256, 256, 3, 1, 1]),
45
+ ('pool3_stage1', [2, 2, 0]),
46
+ ('conv4_1', [256, 512, 3, 1, 1]),
47
+ ('conv4_2', [512, 512, 3, 1, 1]),
48
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
49
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
50
+ ])
51
+
52
+
53
+ # Stage 1
54
+ block1_1 = OrderedDict([
55
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
56
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
57
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
58
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
59
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
60
+ ])
61
+
62
+ block1_2 = OrderedDict([
63
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
64
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
65
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
66
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
67
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
68
+ ])
69
+ blocks['block1_1'] = block1_1
70
+ blocks['block1_2'] = block1_2
71
+
72
+ self.model0 = make_layers(block0, no_relu_layers)
73
+
74
+ # Stages 2 - 6
75
+ for i in range(2, 7):
76
+ blocks['block%d_1' % i] = OrderedDict([
77
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
78
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
79
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
80
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
81
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
82
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
83
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
84
+ ])
85
+
86
+ blocks['block%d_2' % i] = OrderedDict([
87
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
88
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
89
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
90
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
91
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
92
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
93
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
94
+ ])
95
+
96
+ for k in blocks.keys():
97
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
98
+
99
+ self.model1_1 = blocks['block1_1']
100
+ self.model2_1 = blocks['block2_1']
101
+ self.model3_1 = blocks['block3_1']
102
+ self.model4_1 = blocks['block4_1']
103
+ self.model5_1 = blocks['block5_1']
104
+ self.model6_1 = blocks['block6_1']
105
+
106
+ self.model1_2 = blocks['block1_2']
107
+ self.model2_2 = blocks['block2_2']
108
+ self.model3_2 = blocks['block3_2']
109
+ self.model4_2 = blocks['block4_2']
110
+ self.model5_2 = blocks['block5_2']
111
+ self.model6_2 = blocks['block6_2']
112
+
113
+
114
+ def forward(self, x):
115
+
116
+ out1 = self.model0(x)
117
+
118
+ out1_1 = self.model1_1(out1)
119
+ out1_2 = self.model1_2(out1)
120
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
121
+
122
+ out2_1 = self.model2_1(out2)
123
+ out2_2 = self.model2_2(out2)
124
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
125
+
126
+ out3_1 = self.model3_1(out3)
127
+ out3_2 = self.model3_2(out3)
128
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
129
+
130
+ out4_1 = self.model4_1(out4)
131
+ out4_2 = self.model4_2(out4)
132
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
133
+
134
+ out5_1 = self.model5_1(out5)
135
+ out5_2 = self.model5_2(out5)
136
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
137
+
138
+ out6_1 = self.model6_1(out6)
139
+ out6_2 = self.model6_2(out6)
140
+
141
+ return out6_1, out6_2
142
+
143
+ class handpose_model(nn.Module):
144
+ def __init__(self):
145
+ super(handpose_model, self).__init__()
146
+
147
+ # these layers have no relu layer
148
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
149
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
150
+ # stage 1
151
+ block1_0 = OrderedDict([
152
+ ('conv1_1', [3, 64, 3, 1, 1]),
153
+ ('conv1_2', [64, 64, 3, 1, 1]),
154
+ ('pool1_stage1', [2, 2, 0]),
155
+ ('conv2_1', [64, 128, 3, 1, 1]),
156
+ ('conv2_2', [128, 128, 3, 1, 1]),
157
+ ('pool2_stage1', [2, 2, 0]),
158
+ ('conv3_1', [128, 256, 3, 1, 1]),
159
+ ('conv3_2', [256, 256, 3, 1, 1]),
160
+ ('conv3_3', [256, 256, 3, 1, 1]),
161
+ ('conv3_4', [256, 256, 3, 1, 1]),
162
+ ('pool3_stage1', [2, 2, 0]),
163
+ ('conv4_1', [256, 512, 3, 1, 1]),
164
+ ('conv4_2', [512, 512, 3, 1, 1]),
165
+ ('conv4_3', [512, 512, 3, 1, 1]),
166
+ ('conv4_4', [512, 512, 3, 1, 1]),
167
+ ('conv5_1', [512, 512, 3, 1, 1]),
168
+ ('conv5_2', [512, 512, 3, 1, 1]),
169
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
170
+ ])
171
+
172
+ block1_1 = OrderedDict([
173
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
174
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
175
+ ])
176
+
177
+ blocks = {}
178
+ blocks['block1_0'] = block1_0
179
+ blocks['block1_1'] = block1_1
180
+
181
+ # stage 2-6
182
+ for i in range(2, 7):
183
+ blocks['block%d' % i] = OrderedDict([
184
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
185
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
186
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
187
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
188
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
189
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
190
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
191
+ ])
192
+
193
+ for k in blocks.keys():
194
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
195
+
196
+ self.model1_0 = blocks['block1_0']
197
+ self.model1_1 = blocks['block1_1']
198
+ self.model2 = blocks['block2']
199
+ self.model3 = blocks['block3']
200
+ self.model4 = blocks['block4']
201
+ self.model5 = blocks['block5']
202
+ self.model6 = blocks['block6']
203
+
204
+ def forward(self, x):
205
+ out1_0 = self.model1_0(x)
206
+ out1_1 = self.model1_1(out1_0)
207
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
208
+ out_stage2 = self.model2(concat_stage2)
209
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
210
+ out_stage3 = self.model3(concat_stage3)
211
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
212
+ out_stage4 = self.model4(concat_stage4)
213
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
214
+ out_stage5 = self.model5(concat_stage5)
215
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
216
+ out_stage6 = self.model6(concat_stage6)
217
+ return out_stage6
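
A quick shape sanity check for the two models above, run on random input without weights; 368x368 is just the canonical OpenPose inference resolution.

import torch

body = bodypose_model().eval()
hand = handpose_model().eval()
with torch.no_grad():
    x = torch.randn(1, 3, 368, 368)          # three stride-2 pools -> 46 x 46 maps
    pafs, heatmaps = body(x)
    hand_maps = hand(x)
print(pafs.shape)        # torch.Size([1, 38, 46, 46])  part affinity fields
print(heatmaps.shape)    # torch.Size([1, 19, 46, 46])  body keypoint heatmaps
print(hand_maps.shape)   # torch.Size([1, 22, 46, 46])  hand keypoint heatmaps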
ControlNetUnion-space/controlnet_aux/open_pose/util.py ADDED
@@ -0,0 +1,383 @@
1
+ import math
2
+ import numpy as np
3
+ import cv2
4
+ from typing import List, Tuple, Union
5
+
6
+ from .body import BodyResult, Keypoint
7
+
8
+ eps = 0.01
9
+
10
+
11
+ def smart_resize(x, s):
12
+ Ht, Wt = s
13
+ if x.ndim == 2:
14
+ Ho, Wo = x.shape
15
+ Co = 1
16
+ else:
17
+ Ho, Wo, Co = x.shape
18
+ if Co == 3 or Co == 1:
19
+ k = float(Ht + Wt) / float(Ho + Wo)
20
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
21
+ else:
22
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
23
+
24
+
25
+ def smart_resize_k(x, fx, fy):
26
+ if x.ndim == 2:
27
+ Ho, Wo = x.shape
28
+ Co = 1
29
+ else:
30
+ Ho, Wo, Co = x.shape
31
+ Ht, Wt = Ho * fy, Wo * fx
32
+ if Co == 3 or Co == 1:
33
+ k = float(Ht + Wt) / float(Ho + Wo)
34
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
35
+ else:
36
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
37
+
38
+
39
+ def padRightDownCorner(img, stride, padValue):
40
+ h = img.shape[0]
41
+ w = img.shape[1]
42
+
43
+ pad = 4 * [None]
44
+ pad[0] = 0 # up
45
+ pad[1] = 0 # left
46
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
47
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
48
+
49
+ img_padded = img
50
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
51
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
52
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
53
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
54
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
55
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
56
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
57
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
58
+
59
+ return img_padded, pad
60
+
61
+
62
+ def transfer(model, model_weights):
63
+ transfered_model_weights = {}
64
+ for weights_name in model.state_dict().keys():
65
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
66
+ return transfered_model_weights
67
+
68
+
69
+ def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray:
70
+ """
71
+ Draw keypoints and limbs representing body pose on a given canvas.
72
+
73
+ Args:
74
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
75
+ keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.
76
+
77
+ Returns:
78
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.
79
+
80
+ Note:
81
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
82
+ """
83
+ H, W, C = canvas.shape
84
+ stickwidth = 4
85
+
86
+ limbSeq = [
87
+ [2, 3], [2, 6], [3, 4], [4, 5],
88
+ [6, 7], [7, 8], [2, 9], [9, 10],
89
+ [10, 11], [2, 12], [12, 13], [13, 14],
90
+ [2, 1], [1, 15], [15, 17], [1, 16],
91
+ [16, 18],
92
+ ]
93
+
94
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
95
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
96
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
97
+
98
+ for (k1_index, k2_index), color in zip(limbSeq, colors):
99
+ keypoint1 = keypoints[k1_index - 1]
100
+ keypoint2 = keypoints[k2_index - 1]
101
+
102
+ if keypoint1 is None or keypoint2 is None:
103
+ continue
104
+
105
+ Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
106
+ X = np.array([keypoint1.y, keypoint2.y]) * float(H)
107
+ mX = np.mean(X)
108
+ mY = np.mean(Y)
109
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
110
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
111
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
112
+ cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])
113
+
114
+ for keypoint, color in zip(keypoints, colors):
115
+ if keypoint is None:
116
+ continue
117
+
118
+ x, y = keypoint.x, keypoint.y
119
+ x = int(x * W)
120
+ y = int(y * H)
121
+ cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)
122
+
123
+ return canvas
124
+
125
+
126
+ def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
127
+ import matplotlib
128
+ """
129
+ Draw keypoints and connections representing hand pose on a given canvas.
130
+
131
+ Args:
132
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
133
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
134
+ or None if no keypoints are present.
135
+
136
+ Returns:
137
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.
138
+
139
+ Note:
140
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
141
+ """
142
+ if not keypoints:
143
+ return canvas
144
+
145
+ H, W, C = canvas.shape
146
+
147
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
148
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
149
+
150
+ for ie, (e1, e2) in enumerate(edges):
151
+ k1 = keypoints[e1]
152
+ k2 = keypoints[e2]
153
+ if k1 is None or k2 is None:
154
+ continue
155
+
156
+ x1 = int(k1.x * W)
157
+ y1 = int(k1.y * H)
158
+ x2 = int(k2.x * W)
159
+ y2 = int(k2.y * H)
160
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
161
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
162
+
163
+ for keypoint in keypoints:
164
+ x, y = keypoint.x, keypoint.y
165
+ x = int(x * W)
166
+ y = int(y * H)
167
+ if x > eps and y > eps:
168
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
169
+ return canvas
170
+
171
+
172
+ def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
173
+ """
174
+ Draw keypoints representing face pose on a given canvas.
175
+
176
+ Args:
177
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
178
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
179
+ or None if no keypoints are present.
180
+
181
+ Returns:
182
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.
183
+
184
+ Note:
185
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
186
+ """
187
+ if not keypoints:
188
+ return canvas
189
+
190
+ H, W, C = canvas.shape
191
+ for keypoint in keypoints:
192
+ x, y = keypoint.x, keypoint.y
193
+ x = int(x * W)
194
+ y = int(y * H)
195
+ if x > eps and y > eps:
196
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
197
+ return canvas
198
+
199
+
200
+ # detect hand according to body pose keypoints
201
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
202
+ def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
203
+ """
204
+ Detect hands in the input body pose keypoints and calculate the bounding box for each hand.
205
+
206
+ Args:
207
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
208
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
209
+
210
+ Returns:
211
+ List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
212
+ corner of the bounding box, the width (height) of the bounding box, and
213
+ a boolean flag indicating whether the hand is a left hand (True) or a
214
+ right hand (False).
215
+
216
+ Notes:
217
+ - The width and height of the bounding boxes are equal since the network requires squared input.
218
+ - The minimum bounding box size is 20 pixels.
219
+ """
220
+ ratioWristElbow = 0.33
221
+ detect_result = []
222
+ image_height, image_width = oriImg.shape[0:2]
223
+
224
+ keypoints = body.keypoints
225
+ # right hand: wrist 4, elbow 3, shoulder 2
226
+ # left hand: wrist 7, elbow 6, shoulder 5
227
+ left_shoulder = keypoints[5]
228
+ left_elbow = keypoints[6]
229
+ left_wrist = keypoints[7]
230
+ right_shoulder = keypoints[2]
231
+ right_elbow = keypoints[3]
232
+ right_wrist = keypoints[4]
233
+
234
+ # if any of three not detected
235
+ has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
236
+ has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
237
+ if not (has_left or has_right):
238
+ return []
239
+
240
+ hands = []
241
+ #left hand
242
+ if has_left:
243
+ hands.append([
244
+ left_shoulder.x, left_shoulder.y,
245
+ left_elbow.x, left_elbow.y,
246
+ left_wrist.x, left_wrist.y,
247
+ True
248
+ ])
249
+ # right hand
250
+ if has_right:
251
+ hands.append([
252
+ right_shoulder.x, right_shoulder.y,
253
+ right_elbow.x, right_elbow.y,
254
+ right_wrist.x, right_wrist.y,
255
+ False
256
+ ])
257
+
258
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
259
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
260
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
261
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
262
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
263
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
264
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
265
+ x = x3 + ratioWristElbow * (x3 - x2)
266
+ y = y3 + ratioWristElbow * (y3 - y2)
267
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
268
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
269
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
270
+ # x-y refers to the center --> offset to topLeft point
271
+ # handRectangle.x -= handRectangle.width / 2.f;
272
+ # handRectangle.y -= handRectangle.height / 2.f;
273
+ x -= width / 2
274
+ y -= width / 2 # width = height
275
+ # overflow the image
276
+ if x < 0: x = 0
277
+ if y < 0: y = 0
278
+ width1 = width
279
+ width2 = width
280
+ if x + width > image_width: width1 = image_width - x
281
+ if y + width > image_height: width2 = image_height - y
282
+ width = min(width1, width2)
283
+ # keep only boxes that reach the 20 pixel minimum size
284
+ if width >= 20:
285
+ detect_result.append((int(x), int(y), int(width), is_left))
286
+
287
+ '''
288
+ return value: [[x, y, w, True if left hand else False]].
289
+ width=height since the network requires square input.
290
+ x, y are the coordinates of the top-left corner
291
+ '''
292
+ return detect_result
293
+
294
+
295
+ # Written by Lvmin
296
+ def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
297
+ """
298
+ Detect the face in the input body pose keypoints and calculate the bounding box for the face.
299
+
300
+ Args:
301
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
302
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
303
+
304
+ Returns:
305
+ Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
306
+ bounding box and the width (height) of the bounding box, or None if the
307
+ face is not detected or the bounding box width is less than 20 pixels.
308
+
309
+ Notes:
310
+ - The width and height of the bounding box are equal.
311
+ - The minimum bounding box size is 20 pixels.
312
+ """
313
+ # left right eye ear 14 15 16 17
314
+ image_height, image_width = oriImg.shape[0:2]
315
+
316
+ keypoints = body.keypoints
317
+ head = keypoints[0]
318
+ left_eye = keypoints[14]
319
+ right_eye = keypoints[15]
320
+ left_ear = keypoints[16]
321
+ right_ear = keypoints[17]
322
+
323
+ if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
324
+ return None
325
+
326
+ width = 0.0
327
+ x0, y0 = head.x, head.y
328
+
329
+ if left_eye is not None:
330
+ x1, y1 = left_eye.x, left_eye.y
331
+ d = max(abs(x0 - x1), abs(y0 - y1))
332
+ width = max(width, d * 3.0)
333
+
334
+ if right_eye is not None:
335
+ x1, y1 = right_eye.x, right_eye.y
336
+ d = max(abs(x0 - x1), abs(y0 - y1))
337
+ width = max(width, d * 3.0)
338
+
339
+ if left_ear is not None:
340
+ x1, y1 = left_ear.x, left_ear.y
341
+ d = max(abs(x0 - x1), abs(y0 - y1))
342
+ width = max(width, d * 1.5)
343
+
344
+ if right_ear is not None:
345
+ x1, y1 = right_ear.x, right_ear.y
346
+ d = max(abs(x0 - x1), abs(y0 - y1))
347
+ width = max(width, d * 1.5)
348
+
349
+ x, y = x0, y0
350
+
351
+ x -= width
352
+ y -= width
353
+
354
+ if x < 0:
355
+ x = 0
356
+
357
+ if y < 0:
358
+ y = 0
359
+
360
+ width1 = width * 2
361
+ width2 = width * 2
362
+
363
+ if x + width > image_width:
364
+ width1 = image_width - x
365
+
366
+ if y + width > image_height:
367
+ width2 = image_height - y
368
+
369
+ width = min(width1, width2)
370
+
371
+ if width >= 20:
372
+ return int(x), int(y), int(width)
373
+ else:
374
+ return None
375
+
376
+
377
+ # get max index of 2d array
378
+ def npmax(array):
379
+ arrayindex = array.argmax(1)
380
+ arrayvalue = array.max(1)
381
+ i = arrayvalue.argmax()
382
+ j = arrayindex[i]
383
+ return i, j
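
The box geometry used by handDetect above, pulled out as a standalone sketch. It mirrors the formulas inside the loop and makes no claim about the clipping to the image border that the full function also performs.

import math

def hand_box(shoulder, elbow, wrist, ratio_wrist_elbow=0.33):
    """Square hand box extrapolated past the wrist along the elbow -> wrist direction."""
    (x1, y1), (x2, y2), (x3, y3) = shoulder, elbow, wrist
    cx = x3 + ratio_wrist_elbow * (x3 - x2)          # centre, pushed beyond the wrist
    cy = y3 + ratio_wrist_elbow * (y3 - y2)
    d_wrist_elbow = math.hypot(x3 - x2, y3 - y2)
    d_elbow_shoulder = math.hypot(x2 - x1, y2 - y1)
    side = 1.5 * max(d_wrist_elbow, 0.9 * d_elbow_shoulder)
    return cx - side / 2, cy - side / 2, side        # top-left x, top-left y, width (= height)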
ControlNetUnion-space/controlnet_aux/util.py ADDED
@@ -0,0 +1,146 @@
1
+ import os
2
+ import random
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+
8
+ annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
9
+
10
+
11
+ def HWC3(x):
12
+ assert x.dtype == np.uint8
13
+ if x.ndim == 2:
14
+ x = x[:, :, None]
15
+ assert x.ndim == 3
16
+ H, W, C = x.shape
17
+ assert C == 1 or C == 3 or C == 4
18
+ if C == 3:
19
+ return x
20
+ if C == 1:
21
+ return np.concatenate([x, x, x], axis=2)
22
+ if C == 4:
23
+ color = x[:, :, 0:3].astype(np.float32)
24
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
25
+ y = color * alpha + 255.0 * (1.0 - alpha)
26
+ y = y.clip(0, 255).astype(np.uint8)
27
+ return y
28
+
29
+
30
+ def make_noise_disk(H, W, C, F):
31
+ noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
32
+ noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
33
+ noise = noise[F: F + H, F: F + W]
34
+ noise -= np.min(noise)
35
+ noise /= np.max(noise)
36
+ if C == 1:
37
+ noise = noise[:, :, None]
38
+ return noise
39
+
40
+
41
+ def nms(x, t, s):
42
+ x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
43
+
44
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
45
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
46
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
47
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
48
+
49
+ y = np.zeros_like(x)
50
+
51
+ for f in [f1, f2, f3, f4]:
52
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
53
+
54
+ z = np.zeros_like(y, dtype=np.uint8)
55
+ z[y > t] = 255
56
+ return z
57
+
58
+ def min_max_norm(x):
59
+ x -= np.min(x)
60
+ x /= np.maximum(np.max(x), 1e-5)
61
+ return x
62
+
63
+
64
+ def safe_step(x, step=2):
65
+ y = x.astype(np.float32) * float(step + 1)
66
+ y = y.astype(np.int32).astype(np.float32) / float(step)
67
+ return y
68
+
69
+
70
+ def img2mask(img, H, W, low=10, high=90):
71
+ assert img.ndim == 3 or img.ndim == 2
72
+ assert img.dtype == np.uint8
73
+
74
+ if img.ndim == 3:
75
+ y = img[:, :, random.randrange(0, img.shape[2])]
76
+ else:
77
+ y = img
78
+
79
+ y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
80
+
81
+ if random.uniform(0, 1) < 0.5:
82
+ y = 255 - y
83
+
84
+ return y < np.percentile(y, random.randrange(low, high))
85
+
86
+
87
+ def resize_image(input_image, resolution):
88
+ H, W, C = input_image.shape
89
+ H = float(H)
90
+ W = float(W)
91
+ k = float(resolution) / min(H, W)
92
+ H *= k
93
+ W *= k
94
+ H = int(np.round(H / 64.0)) * 64
95
+ W = int(np.round(W / 64.0)) * 64
96
+ img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
97
+ return img
98
+
99
+
100
+ def torch_gc():
101
+ if torch.cuda.is_available():
102
+ torch.cuda.empty_cache()
103
+ torch.cuda.ipc_collect()
104
+
105
+
106
+ def ade_palette():
107
+ """ADE20K palette that maps each class to RGB values."""
108
+ return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
109
+ [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
110
+ [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
111
+ [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
112
+ [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
113
+ [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
114
+ [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
115
+ [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
116
+ [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
117
+ [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
118
+ [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
119
+ [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
120
+ [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
121
+ [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
122
+ [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
123
+ [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
124
+ [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
125
+ [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
126
+ [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
127
+ [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
128
+ [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
129
+ [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
130
+ [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
131
+ [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
132
+ [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
133
+ [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
134
+ [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
135
+ [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
136
+ [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
137
+ [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
138
+ [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
139
+ [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
140
+ [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
141
+ [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
142
+ [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
143
+ [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
144
+ [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
145
+ [102, 255, 0], [92, 0, 255]]
146
+
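
A small worked example of resize_image above: the short side is scaled to the requested resolution and both sides are rounded to multiples of 64. It assumes the local controlnet_aux package from this Space is importable (it is installed with pip install -e in app.py).

import numpy as np
from controlnet_aux.util import resize_image

frame = np.zeros((720, 1280, 3), dtype=np.uint8)
out = resize_image(frame, 1024)
print(out.shape)    # (1024, 1792, 3): k = 1024/720, 1280*k ~= 1820 -> rounded to 28 * 64 = 1792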
ControlNetUnion-space/depth_anything_v2/.DS_Store ADDED
Binary file (6.15 kB). View file
 
ControlNetUnion-space/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
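
A hedged sketch of how a DPT-style depth head consumes the backbone above: a few intermediate blocks are read out as patch-token feature maps. The layer indices here are illustrative assumptions, not necessarily the ones the depth model uses.

import torch

encoder = DINOv2("vits").eval()                 # ViT-S/14, 12 blocks, embed dim 384
with torch.no_grad():
    x = torch.randn(1, 3, 518, 518)             # 518 / 14 = 37 patches per side
    feats = encoder.get_intermediate_layers(x, n=[2, 5, 8, 11], reshape=True)
for f in feats:
    print(f.shape)                              # torch.Size([1, 384, 37, 37]) for each block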
ControlNetUnion-space/depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
ControlNetUnion-space/depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
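
A tiny sketch exercising the Attention module above; MemEffAttention computes the same attention and, as its fallback branch shows, reduces to this implementation when xFormers is not installed.

import torch

attn = Attention(dim=384, num_heads=6, qkv_bias=True).eval()
tokens = torch.randn(2, 1 + 37 * 37, 384)       # [CLS] token + 37 x 37 patch tokens
with torch.no_grad():
    out = attn(tokens)
print(out.shape)                                # torch.Size([2, 1370, 384]), shape-preserving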
ControlNetUnion-space/depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
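
A minimal usage sketch for the block above (not part of the uploaded files; it assumes the repository root is on PYTHONPATH so depth_anything_v2.dinov2_layers resolves):

import torch
from depth_anything_v2.dinov2_layers.block import Block

# One transformer block with LayerScale enabled; drop_path stays at 0, so the
# plain residual branch of forward() is taken rather than stochastic depth.
block = Block(dim=384, num_heads=6, init_values=1e-5)
tokens = torch.randn(2, 197, 384)   # (batch, tokens, dim)
with torch.no_grad():
    out = block(tokens)
print(out.shape)                    # torch.Size([2, 197, 384])
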
ControlNetUnion-space/depth_anything_v2/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
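
A small sketch of the stochastic-depth behaviour implemented above (not part of the upload; same PYTHONPATH assumption as elsewhere):

import torch
from depth_anything_v2.dinov2_layers.drop_path import DropPath

dp = DropPath(drop_prob=0.5)
x = torch.ones(4, 3, 8)

dp.train()
print(dp(x)[:, 0, 0])   # each sample is either zeroed or rescaled by 1/keep_prob, i.e. 0.0 or 2.0

dp.eval()
print(dp(x)[:, 0, 0])   # identity at inference: tensor([1., 1., 1., 1.])
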
ControlNetUnion-space/depth_anything_v2/dinov2_layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
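
A tiny sketch (not part of the upload): LayerScale is a learnable per-channel multiplier, initialised to a small constant so residual branches start out almost disabled.

import torch
from depth_anything_v2.dinov2_layers.layer_scale import LayerScale

ls = LayerScale(dim=8, init_values=1e-5)
x = torch.ones(2, 4, 8)
print(ls(x)[0, 0])   # every entry equals 1e-5 before any training
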
ControlNetUnion-space/depth_anything_v2/dinov2_layers/mlp.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+
14
+ from torch import Tensor, nn
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features: int,
21
+ hidden_features: Optional[int] = None,
22
+ out_features: Optional[int] = None,
23
+ act_layer: Callable[..., nn.Module] = nn.GELU,
24
+ drop: float = 0.0,
25
+ bias: bool = True,
26
+ ) -> None:
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x: Tensor) -> Tensor:
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
ControlNetUnion-space/depth_anything_v2/dinov2_layers/patch_embed.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+
16
+
17
+ def make_2tuple(x):
18
+ if isinstance(x, tuple):
19
+ assert len(x) == 2
20
+ return x
21
+
22
+ assert isinstance(x, int)
23
+ return (x, x)
24
+
25
+
26
+ class PatchEmbed(nn.Module):
27
+ """
28
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29
+
30
+ Args:
31
+ img_size: Image size.
32
+ patch_size: Patch token size.
33
+ in_chans: Number of input image channels.
34
+ embed_dim: Number of linear projection output channels.
35
+ norm_layer: Normalization layer.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ img_size: Union[int, Tuple[int, int]] = 224,
41
+ patch_size: Union[int, Tuple[int, int]] = 16,
42
+ in_chans: int = 3,
43
+ embed_dim: int = 768,
44
+ norm_layer: Optional[Callable] = None,
45
+ flatten_embedding: bool = True,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ image_HW = make_2tuple(img_size)
50
+ patch_HW = make_2tuple(patch_size)
51
+ patch_grid_size = (
52
+ image_HW[0] // patch_HW[0],
53
+ image_HW[1] // patch_HW[1],
54
+ )
55
+
56
+ self.img_size = image_HW
57
+ self.patch_size = patch_HW
58
+ self.patches_resolution = patch_grid_size
59
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60
+
61
+ self.in_chans = in_chans
62
+ self.embed_dim = embed_dim
63
+
64
+ self.flatten_embedding = flatten_embedding
65
+
66
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ _, _, H, W = x.shape
71
+ patch_H, patch_W = self.patch_size
72
+
73
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
75
+
76
+ x = self.proj(x) # B C H W
77
+ H, W = x.size(2), x.size(3)
78
+ x = x.flatten(2).transpose(1, 2) # B HW C
79
+ x = self.norm(x)
80
+ if not self.flatten_embedding:
81
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82
+ return x
83
+
84
+ def flops(self) -> float:
85
+ Ho, Wo = self.patches_resolution
86
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87
+ if self.norm is not None:
88
+ flops += Ho * Wo * self.embed_dim
89
+ return flops
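
A short sketch (not part of the upload) of the tokenisation this module performs: a 224x224 image with 16x16 patches becomes a sequence of (224/16)^2 = 196 embedding vectors.

import torch
from depth_anything_v2.dinov2_layers.patch_embed import PatchEmbed

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
img = torch.randn(1, 3, 224, 224)
tokens = embed(img)
print(tokens.shape)       # torch.Size([1, 196, 768])
print(embed.num_patches)  # 196
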
ControlNetUnion-space/depth_anything_v2/dinov2_layers/swiglu_ffn.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+
9
+ from torch import Tensor, nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ class SwiGLUFFN(nn.Module):
14
+ def __init__(
15
+ self,
16
+ in_features: int,
17
+ hidden_features: Optional[int] = None,
18
+ out_features: Optional[int] = None,
19
+ act_layer: Callable[..., nn.Module] = None,
20
+ drop: float = 0.0,
21
+ bias: bool = True,
22
+ ) -> None:
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28
+
29
+ def forward(self, x: Tensor) -> Tensor:
30
+ x12 = self.w12(x)
31
+ x1, x2 = x12.chunk(2, dim=-1)
32
+ hidden = F.silu(x1) * x2
33
+ return self.w3(hidden)
34
+
35
+
36
+ try:
37
+ from xformers.ops import SwiGLU
38
+
39
+ XFORMERS_AVAILABLE = True
40
+ except ImportError:
41
+ SwiGLU = SwiGLUFFN
42
+ XFORMERS_AVAILABLE = False
43
+
44
+
45
+ class SwiGLUFFNFused(SwiGLU):
46
+ def __init__(
47
+ self,
48
+ in_features: int,
49
+ hidden_features: Optional[int] = None,
50
+ out_features: Optional[int] = None,
51
+ act_layer: Callable[..., nn.Module] = None,
52
+ drop: float = 0.0,
53
+ bias: bool = True,
54
+ ) -> None:
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58
+ super().__init__(
59
+ in_features=in_features,
60
+ hidden_features=hidden_features,
61
+ out_features=out_features,
62
+ bias=bias,
63
+ )
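
A short sketch (not part of the upload) of the hidden-width adjustment in SwiGLUFFNFused: a nominal 4x expansion of a 384-dim model is rescaled by 2/3 and rounded to a multiple of 8, giving 1024 hidden units.

import torch
from depth_anything_v2.dinov2_layers.swiglu_ffn import SwiGLUFFNFused

ffn = SwiGLUFFNFused(in_features=384, hidden_features=4 * 384)
x = torch.randn(2, 10, 384)
print(ffn(x).shape)   # torch.Size([2, 10, 384]); uses the pure-PyTorch SwiGLUFFN
                      # fallback when xformers is not installed
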
ControlNetUnion-space/depth_anything_v2/dpt.py ADDED
@@ -0,0 +1,221 @@
1
+ import cv2
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torchvision.transforms import Compose
6
+
7
+ from .dinov2 import DINOv2
8
+ from .util.blocks import FeatureFusionBlock, _make_scratch
9
+ from .util.transform import Resize, NormalizeImage, PrepareForNet
10
+
11
+
12
+ def _make_fusion_block(features, use_bn, size=None):
13
+ return FeatureFusionBlock(
14
+ features,
15
+ nn.ReLU(False),
16
+ deconv=False,
17
+ bn=use_bn,
18
+ expand=False,
19
+ align_corners=True,
20
+ size=size,
21
+ )
22
+
23
+
24
+ class ConvBlock(nn.Module):
25
+ def __init__(self, in_feature, out_feature):
26
+ super().__init__()
27
+
28
+ self.conv_block = nn.Sequential(
29
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
30
+ nn.BatchNorm2d(out_feature),
31
+ nn.ReLU(True)
32
+ )
33
+
34
+ def forward(self, x):
35
+ return self.conv_block(x)
36
+
37
+
38
+ class DPTHead(nn.Module):
39
+ def __init__(
40
+ self,
41
+ in_channels,
42
+ features=256,
43
+ use_bn=False,
44
+ out_channels=[256, 512, 1024, 1024],
45
+ use_clstoken=False
46
+ ):
47
+ super(DPTHead, self).__init__()
48
+
49
+ self.use_clstoken = use_clstoken
50
+
51
+ self.projects = nn.ModuleList([
52
+ nn.Conv2d(
53
+ in_channels=in_channels,
54
+ out_channels=out_channel,
55
+ kernel_size=1,
56
+ stride=1,
57
+ padding=0,
58
+ ) for out_channel in out_channels
59
+ ])
60
+
61
+ self.resize_layers = nn.ModuleList([
62
+ nn.ConvTranspose2d(
63
+ in_channels=out_channels[0],
64
+ out_channels=out_channels[0],
65
+ kernel_size=4,
66
+ stride=4,
67
+ padding=0),
68
+ nn.ConvTranspose2d(
69
+ in_channels=out_channels[1],
70
+ out_channels=out_channels[1],
71
+ kernel_size=2,
72
+ stride=2,
73
+ padding=0),
74
+ nn.Identity(),
75
+ nn.Conv2d(
76
+ in_channels=out_channels[3],
77
+ out_channels=out_channels[3],
78
+ kernel_size=3,
79
+ stride=2,
80
+ padding=1)
81
+ ])
82
+
83
+ if use_clstoken:
84
+ self.readout_projects = nn.ModuleList()
85
+ for _ in range(len(self.projects)):
86
+ self.readout_projects.append(
87
+ nn.Sequential(
88
+ nn.Linear(2 * in_channels, in_channels),
89
+ nn.GELU()))
90
+
91
+ self.scratch = _make_scratch(
92
+ out_channels,
93
+ features,
94
+ groups=1,
95
+ expand=False,
96
+ )
97
+
98
+ self.scratch.stem_transpose = None
99
+
100
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
101
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
102
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
103
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
104
+
105
+ head_features_1 = features
106
+ head_features_2 = 32
107
+
108
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
109
+ self.scratch.output_conv2 = nn.Sequential(
110
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
111
+ nn.ReLU(True),
112
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
113
+ nn.ReLU(True),
114
+ nn.Identity(),
115
+ )
116
+
117
+ def forward(self, out_features, patch_h, patch_w):
118
+ out = []
119
+ for i, x in enumerate(out_features):
120
+ if self.use_clstoken:
121
+ x, cls_token = x[0], x[1]
122
+ readout = cls_token.unsqueeze(1).expand_as(x)
123
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
124
+ else:
125
+ x = x[0]
126
+
127
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
128
+
129
+ x = self.projects[i](x)
130
+ x = self.resize_layers[i](x)
131
+
132
+ out.append(x)
133
+
134
+ layer_1, layer_2, layer_3, layer_4 = out
135
+
136
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
137
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
138
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
139
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
140
+
141
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
142
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
143
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
144
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
145
+
146
+ out = self.scratch.output_conv1(path_1)
147
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
148
+ out = self.scratch.output_conv2(out)
149
+
150
+ return out
151
+
152
+
153
+ class DepthAnythingV2(nn.Module):
154
+ def __init__(
155
+ self,
156
+ encoder='vitl',
157
+ features=256,
158
+ out_channels=[256, 512, 1024, 1024],
159
+ use_bn=False,
160
+ use_clstoken=False
161
+ ):
162
+ super(DepthAnythingV2, self).__init__()
163
+
164
+ self.intermediate_layer_idx = {
165
+ 'vits': [2, 5, 8, 11],
166
+ 'vitb': [2, 5, 8, 11],
167
+ 'vitl': [4, 11, 17, 23],
168
+ 'vitg': [9, 19, 29, 39]
169
+ }
170
+
171
+ self.encoder = encoder
172
+ self.pretrained = DINOv2(model_name=encoder)
173
+
174
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
175
+
176
+ def forward(self, x):
177
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
178
+
179
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)
180
+
181
+ depth = self.depth_head(features, patch_h, patch_w)
182
+ depth = F.relu(depth)
183
+
184
+ return depth.squeeze(1)
185
+
186
+ @torch.no_grad()
187
+ def infer_image(self, raw_image, input_size=518):
188
+ image, (h, w) = self.image2tensor(raw_image, input_size)
189
+
190
+ depth = self.forward(image)
191
+
192
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
193
+
194
+ return depth.cpu().numpy()
195
+
196
+ def image2tensor(self, raw_image, input_size=518):
197
+ transform = Compose([
198
+ Resize(
199
+ width=input_size,
200
+ height=input_size,
201
+ resize_target=False,
202
+ keep_aspect_ratio=True,
203
+ ensure_multiple_of=14,
204
+ resize_method='lower_bound',
205
+ image_interpolation_method=cv2.INTER_CUBIC,
206
+ ),
207
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
208
+ PrepareForNet(),
209
+ ])
210
+
211
+ h, w = raw_image.shape[:2]
212
+
213
+ image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
214
+
215
+ image = transform({'image': image})['image']
216
+ image = torch.from_numpy(image).unsqueeze(0)
217
+
218
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
219
+ image = image.to(DEVICE)
220
+
221
+ return image, (h, w)
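
A minimal inference sketch (not part of the upload): app.py does the same with the ViT-L variant and a checkpoint fetched via hf_hub_download; here the small variant is built with random weights just to show the call pattern.

import numpy as np
import torch
from depth_anything_v2.dpt import DepthAnythingV2

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384]).to(device).eval()

frame = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)   # BGR uint8, as returned by cv2.imread
depth = model.infer_image(frame)                                # float32 relative depth map, shape (480, 640)
print(depth.shape, depth.min(), depth.max())
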
ControlNetUnion-space/depth_anything_v2/util/blocks.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5
+ scratch = nn.Module()
6
+
7
+ out_shape1 = out_shape
8
+ out_shape2 = out_shape
9
+ out_shape3 = out_shape
10
+ if len(in_shape) >= 4:
11
+ out_shape4 = out_shape
12
+
13
+ if expand:
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape * 2
16
+ out_shape3 = out_shape * 4
17
+ if len(in_shape) >= 4:
18
+ out_shape4 = out_shape * 8
19
+
20
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
21
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
22
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
23
+ if len(in_shape) >= 4:
24
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25
+
26
+ return scratch
27
+
28
+
29
+ class ResidualConvUnit(nn.Module):
30
+ """Residual convolution module.
31
+ """
32
+
33
+ def __init__(self, features, activation, bn):
34
+ """Init.
35
+
36
+ Args:
37
+ features (int): number of features
38
+ """
39
+ super().__init__()
40
+
41
+ self.bn = bn
42
+
43
+ self.groups=1
44
+
45
+ self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
46
+
47
+ self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
48
+
49
+ if self.bn == True:
50
+ self.bn1 = nn.BatchNorm2d(features)
51
+ self.bn2 = nn.BatchNorm2d(features)
52
+
53
+ self.activation = activation
54
+
55
+ self.skip_add = nn.quantized.FloatFunctional()
56
+
57
+ def forward(self, x):
58
+ """Forward pass.
59
+
60
+ Args:
61
+ x (tensor): input
62
+
63
+ Returns:
64
+ tensor: output
65
+ """
66
+
67
+ out = self.activation(x)
68
+ out = self.conv1(out)
69
+ if self.bn == True:
70
+ out = self.bn1(out)
71
+
72
+ out = self.activation(out)
73
+ out = self.conv2(out)
74
+ if self.bn == True:
75
+ out = self.bn2(out)
76
+
77
+ if self.groups > 1:
78
+ out = self.conv_merge(out)
79
+
80
+ return self.skip_add.add(out, x)
81
+
82
+
83
+ class FeatureFusionBlock(nn.Module):
84
+ """Feature fusion block.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ features,
90
+ activation,
91
+ deconv=False,
92
+ bn=False,
93
+ expand=False,
94
+ align_corners=True,
95
+ size=None
96
+ ):
97
+ """Init.
98
+
99
+ Args:
100
+ features (int): number of features
101
+ """
102
+ super(FeatureFusionBlock, self).__init__()
103
+
104
+ self.deconv = deconv
105
+ self.align_corners = align_corners
106
+
107
+ self.groups=1
108
+
109
+ self.expand = expand
110
+ out_features = features
111
+ if self.expand == True:
112
+ out_features = features // 2
113
+
114
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
115
+
116
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
117
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
118
+
119
+ self.skip_add = nn.quantized.FloatFunctional()
120
+
121
+ self.size=size
122
+
123
+ def forward(self, *xs, size=None):
124
+ """Forward pass.
125
+
126
+ Returns:
127
+ tensor: output
128
+ """
129
+ output = xs[0]
130
+
131
+ if len(xs) == 2:
132
+ res = self.resConfUnit1(xs[1])
133
+ output = self.skip_add.add(output, res)
134
+
135
+ output = self.resConfUnit2(output)
136
+
137
+ if (size is None) and (self.size is None):
138
+ modifier = {"scale_factor": 2}
139
+ elif size is None:
140
+ modifier = {"size": self.size}
141
+ else:
142
+ modifier = {"size": size}
143
+
144
+ output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
145
+
146
+ output = self.out_conv(output)
147
+
148
+ return output
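
A small sketch (not part of the upload) of a fusion block on its own: with a single input it applies one residual conv unit, upsamples by 2 (since no target size is given) and projects through the 1x1 output conv; DPTHead chains four of these to walk back up the feature pyramid.

import torch
import torch.nn as nn
from depth_anything_v2.util.blocks import FeatureFusionBlock

block = FeatureFusionBlock(features=64, activation=nn.ReLU(False), bn=False)
x = torch.randn(1, 64, 16, 16)
with torch.no_grad():
    out = block(x)
print(out.shape)   # torch.Size([1, 64, 32, 32])
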
ControlNetUnion-space/depth_anything_v2/util/transform.py ADDED
@@ -0,0 +1,158 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+
5
+ class Resize(object):
6
+ """Resize sample to given size (width, height).
7
+ """
8
+
9
+ def __init__(
10
+ self,
11
+ width,
12
+ height,
13
+ resize_target=True,
14
+ keep_aspect_ratio=False,
15
+ ensure_multiple_of=1,
16
+ resize_method="lower_bound",
17
+ image_interpolation_method=cv2.INTER_AREA,
18
+ ):
19
+ """Init.
20
+
21
+ Args:
22
+ width (int): desired output width
23
+ height (int): desired output height
24
+ resize_target (bool, optional):
25
+ True: Resize the full sample (image, mask, target).
26
+ False: Resize image only.
27
+ Defaults to True.
28
+ keep_aspect_ratio (bool, optional):
29
+ True: Keep the aspect ratio of the input sample.
30
+ Output sample might not have the given width and height, and
31
+ resize behaviour depends on the parameter 'resize_method'.
32
+ Defaults to False.
33
+ ensure_multiple_of (int, optional):
34
+ Output width and height is constrained to be multiple of this parameter.
35
+ Defaults to 1.
36
+ resize_method (str, optional):
37
+ "lower_bound": Output will be at least as large as the given size.
38
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
39
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
40
+ Defaults to "lower_bound".
41
+ """
42
+ self.__width = width
43
+ self.__height = height
44
+
45
+ self.__resize_target = resize_target
46
+ self.__keep_aspect_ratio = keep_aspect_ratio
47
+ self.__multiple_of = ensure_multiple_of
48
+ self.__resize_method = resize_method
49
+ self.__image_interpolation_method = image_interpolation_method
50
+
51
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
52
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
53
+
54
+ if max_val is not None and y > max_val:
55
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
56
+
57
+ if y < min_val:
58
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
59
+
60
+ return y
61
+
62
+ def get_size(self, width, height):
63
+ # determine new height and width
64
+ scale_height = self.__height / height
65
+ scale_width = self.__width / width
66
+
67
+ if self.__keep_aspect_ratio:
68
+ if self.__resize_method == "lower_bound":
69
+ # scale such that output size is lower bound
70
+ if scale_width > scale_height:
71
+ # fit width
72
+ scale_height = scale_width
73
+ else:
74
+ # fit height
75
+ scale_width = scale_height
76
+ elif self.__resize_method == "upper_bound":
77
+ # scale such that output size is upper bound
78
+ if scale_width < scale_height:
79
+ # fit width
80
+ scale_height = scale_width
81
+ else:
82
+ # fit height
83
+ scale_width = scale_height
84
+ elif self.__resize_method == "minimal":
85
+ # scale as little as possible
86
+ if abs(1 - scale_width) < abs(1 - scale_height):
87
+ # fit width
88
+ scale_height = scale_width
89
+ else:
90
+ # fit height
91
+ scale_width = scale_height
92
+ else:
93
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
94
+
95
+ if self.__resize_method == "lower_bound":
96
+ new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
97
+ new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
98
+ elif self.__resize_method == "upper_bound":
99
+ new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
100
+ new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
101
+ elif self.__resize_method == "minimal":
102
+ new_height = self.constrain_to_multiple_of(scale_height * height)
103
+ new_width = self.constrain_to_multiple_of(scale_width * width)
104
+ else:
105
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
106
+
107
+ return (new_width, new_height)
108
+
109
+ def __call__(self, sample):
110
+ width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
111
+
112
+ # resize sample
113
+ sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
114
+
115
+ if self.__resize_target:
116
+ if "depth" in sample:
117
+ sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
118
+
119
+ if "mask" in sample:
120
+ sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
121
+
122
+ return sample
123
+
124
+
125
+ class NormalizeImage(object):
126
+ """Normlize image by given mean and std.
127
+ """
128
+
129
+ def __init__(self, mean, std):
130
+ self.__mean = mean
131
+ self.__std = std
132
+
133
+ def __call__(self, sample):
134
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
135
+
136
+ return sample
137
+
138
+
139
+ class PrepareForNet(object):
140
+ """Prepare sample for usage as network input.
141
+ """
142
+
143
+ def __init__(self):
144
+ pass
145
+
146
+ def __call__(self, sample):
147
+ image = np.transpose(sample["image"], (2, 0, 1))
148
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
149
+
150
+ if "depth" in sample:
151
+ depth = sample["depth"].astype(np.float32)
152
+ sample["depth"] = np.ascontiguousarray(depth)
153
+
154
+ if "mask" in sample:
155
+ sample["mask"] = sample["mask"].astype(np.float32)
156
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
157
+
158
+ return sample
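
A short sketch (not part of the upload) of the preprocessing chain exactly as DepthAnythingV2.image2tensor composes it: keep the aspect ratio, snap both sides to multiples of 14 with 518 as the lower bound, normalise, then move channels first.

import numpy as np
from torchvision.transforms import Compose
from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method='lower_bound'),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

sample = {'image': np.random.rand(480, 640, 3).astype(np.float32)}  # HWC, values in [0, 1]
out = transform(sample)['image']
print(out.shape)   # (3, 518, 686): height hits the 518 lower bound, width scales with it
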
ControlNetUnion-space/requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ git+https://github.com/huggingface/diffusers.git
2
+ torch==2.0.1
3
+ torchvision==0.15.2
4
+ transformers==4.43.3
5
+ einops
6
+ onnxruntime-gpu
7
+ spaces
8
+ accelerate
9
+ omegaconf
10
+ huggingface-hub
11
+ opencv-python
12
+ gradio
13
+ xformers
14
+ sentencepiece
15
+ peft
16
+ scipy
17
+ scikit-image
app.py ADDED
@@ -0,0 +1,264 @@
1
+ import sys
2
+ sys.path.append('./')
3
+
4
+ import gradio as gr
5
+ import spaces
6
+ import os
7
+ import sys
8
+ import subprocess
9
+ import numpy as np
10
+ from PIL import Image
11
+ import cv2
12
+ import torch
13
+ import random
14
+
15
+ os.system("pip install -e ./controlnet_aux")
16
+
17
+ from controlnet_aux import OpenposeDetector, CannyDetector
18
+ from depth_anything_v2.dpt import DepthAnythingV2
19
+
20
+ from huggingface_hub import hf_hub_download
21
+
22
+ from huggingface_hub import login
23
+ hf_token = os.environ.get("HF_TOKEN_GATED")
24
+ login(token=hf_token)
25
+
26
+ MAX_SEED = np.iinfo(np.int32).max
27
+
28
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
29
+ if randomize_seed:
30
+ seed = random.randint(0, MAX_SEED)
31
+ return seed
32
+
33
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
34
+ model_configs = {
35
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
36
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
37
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
38
+ 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
39
+ }
40
+
41
+ encoder = 'vitl'
42
+ model = DepthAnythingV2(**model_configs[encoder])
43
+ filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-Large", filename=f"depth_anything_v2_vitl.pth", repo_type="model")
44
+ state_dict = torch.load(filepath, map_location="cpu")
45
+ model.load_state_dict(state_dict)
46
+ model = model.to(DEVICE).eval()
47
+
48
+ import torch
49
+ from diffusers.utils import load_image
50
+ from diffusers import FluxControlNetPipeline, FluxControlNetModel
51
+ from diffusers.models import FluxMultiControlNetModel
52
+
53
+ base_model = 'black-forest-labs/FLUX.1-dev'
54
+ controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'
55
+ controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
56
+ controlnet = FluxMultiControlNetModel([controlnet])
57
+ pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
58
+ pipe.to("cuda")
59
+
60
+ mode_mapping = {"canny":0, "tile":1, "depth":2, "blur":3, "openpose":4, "gray":5, "low quality": 6}
61
+ strength_mapping = {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}
62
+
63
+ canny = CannyDetector()
64
+ open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
65
+
66
+ torch.backends.cuda.matmul.allow_tf32 = True
67
+ pipe.vae.enable_tiling()
68
+ pipe.vae.enable_slicing()
69
+ pipe.enable_model_cpu_offload() # for saving memory
70
+
71
+ def convert_from_image_to_cv2(img: Image) -> np.ndarray:
72
+ return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
73
+
74
+ def convert_from_cv2_to_image(img: np.ndarray) -> Image:
75
+ return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
76
+
77
+ def extract_depth(image):
78
+ image = np.asarray(image)
79
+ depth = model.infer_image(image[:, :, ::-1])
80
+ depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
81
+ depth = depth.astype(np.uint8)
82
+ gray_depth = Image.fromarray(depth).convert('RGB')
83
+ return gray_depth
84
+
85
+ def extract_openpose(img):
86
+ processed_image_open_pose = open_pose(img, hand_and_face=True)
87
+ return processed_image_open_pose
88
+
89
+ def extract_canny(image):
90
+ processed_image_canny = canny(image)
91
+ return processed_image_canny
92
+
93
+ def apply_gaussian_blur(image, kernel_size=(21, 21)):
94
+ image = convert_from_image_to_cv2(image)
95
+ blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
96
+ return blurred_image
97
+
98
+ def convert_to_grayscale(image):
99
+ image = convert_from_image_to_cv2(image)
100
+ gray_image = convert_from_cv2_to_image(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
101
+ return gray_image
102
+
103
+ def add_gaussian_noise(image, mean=0, sigma=10):
104
+ image = convert_from_image_to_cv2(image)
105
+ noise = np.random.normal(mean, sigma, image.shape)
106
+ noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
107
+ return noisy_image
108
+
109
+ def tile(input_image, resolution=768):
110
+ input_image = convert_from_image_to_cv2(input_image)
111
+ H, W, C = input_image.shape
112
+ H = float(H)
113
+ W = float(W)
114
+ k = float(resolution) / min(H, W)
115
+ H *= k
116
+ W *= k
117
+ H = int(np.round(H / 64.0)) * 64
118
+ W = int(np.round(W / 64.0)) * 64
119
+ img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
120
+ img = convert_from_cv2_to_image(img)
121
+ return img
122
+
123
+ def resize_img(input_image, max_side=768, min_side=512, size=None,
124
+ pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
125
+
126
+ w, h = input_image.size
127
+ if size is not None:
128
+ w_resize_new, h_resize_new = size
129
+ else:
130
+ ratio = min_side / min(h, w)
131
+ w, h = round(ratio*w), round(ratio*h)
132
+ ratio = max_side / max(h, w)
133
+ input_image = input_image.resize([round(ratio*w), round(ratio*h)], mode)
134
+ w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
135
+ h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
136
+ input_image = input_image.resize([w_resize_new, h_resize_new], mode)
137
+
138
+ if pad_to_max_side:
139
+ res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
140
+ offset_x = (max_side - w_resize_new) // 2
141
+ offset_y = (max_side - h_resize_new) // 2
142
+ res[offset_y:offset_y+h_resize_new, offset_x:offset_x+w_resize_new] = np.array(input_image)
143
+ input_image = Image.fromarray(res)
144
+ return input_image
145
+
146
+ @spaces.GPU(duration=180)
147
+ def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed, progress=gr.Progress(track_tqdm=True)):
148
+
149
+ control_mode_num = mode_mapping[control_mode]
150
+
151
+ if cond_in is None:
152
+ if image_in is not None:
153
+ image_in = resize_img(load_image(image_in))
154
+ if control_mode == "canny":
155
+ control_image = extract_canny(image_in)
156
+ elif control_mode == "depth":
157
+ control_image = extract_depth(image_in)
158
+ elif control_mode == "openpose":
159
+ control_image = extract_openpose(image_in)
160
+ elif control_mode == "blur":
161
+ control_image = apply_gaussian_blur(image_in)
162
+ elif control_mode == "low quality":
163
+ control_image = add_gaussian_noise(image_in)
164
+ elif control_mode == "gray":
165
+ control_image = convert_to_grayscale(image_in)
166
+ elif control_mode == "tile":
167
+ control_image = tile(image_in)
168
+ else:
169
+ control_image = resize_img(load_image(cond_in))
170
+
171
+ width, height = control_image.size
172
+
173
+ image = pipe(
174
+ prompt,
175
+ control_image=[control_image],
176
+ control_mode=[control_mode_num],
177
+ width=width,
178
+ height=height,
179
+ controlnet_conditioning_scale=[control_strength],
180
+ num_inference_steps=inference_steps,
181
+ guidance_scale=guidance_scale,
182
+ generator=torch.manual_seed(seed),
183
+ ).images[0]
184
+
185
+ torch.cuda.empty_cache()
186
+
187
+ return image, control_image
188
+
189
+
190
+ css="""
191
+ #col-container{
192
+ margin: 0 auto;
193
+ max-width: 1080px;
194
+ }
195
+ """
196
+ with gr.Blocks(css=css) as demo:
197
+ with gr.Column(elem_id="col-container"):
198
+ gr.Markdown("""
199
+ # FLUX.1-dev-ControlNet-Union-Pro
200
+ A unified ControlNet for FLUX.1-dev model from the InstantX team and Shakker Labs. Model card: [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro). <br />
201
+ The recommended strength: {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}. Long prompt is preferred by FLUX.1.
202
+ """)
203
+
204
+ with gr.Column():
205
+
206
+ with gr.Row():
207
+ with gr.Column():
208
+
209
+ with gr.Row(equal_height=True):
210
+ cond_in = gr.Image(label="Upload a processed control image", sources=["upload"], type="filepath")
211
+ image_in = gr.Image(label="Extract condition from a reference image (Optional)", sources=["upload"], type="filepath")
212
+
213
+ prompt = gr.Textbox(label="Prompt", value="best quality")
214
+
215
+ with gr.Accordion("Controlnet"):
216
+ control_mode = gr.Radio(
217
+ ["canny", "depth", "openpose", "gray", "blur", "tile", "low quality"], label="Mode", value="gray",
218
+ info="select the control mode, one for all"
219
+ )
220
+
221
+ control_strength = gr.Slider(
222
+ label="control strength",
223
+ minimum=0,
224
+ maximum=1.0,
225
+ step=0.05,
226
+ value=0.50,
227
+ )
228
+
229
+ seed = gr.Slider(
230
+ label="Seed",
231
+ minimum=0,
232
+ maximum=MAX_SEED,
233
+ step=1,
234
+ value=42,
235
+ )
236
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
237
+
238
+ with gr.Accordion("Advanced settings", open=False):
239
+ with gr.Column():
240
+ with gr.Row():
241
+ inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=50, step=1, value=24)
242
+ guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)
243
+
244
+ submit_btn = gr.Button("Submit")
245
+
246
+ with gr.Column():
247
+ result = gr.Image(label="Result")
248
+ processed_cond = gr.Image(label="Preprocessed Cond")
249
+
250
+ submit_btn.click(
251
+ fn=randomize_seed_fn,
252
+ inputs=[seed, randomize_seed],
253
+ outputs=seed,
254
+ queue=False,
255
+ api_name=False
256
+ ).then(
257
+ fn = infer,
258
+ inputs = [cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
259
+ outputs = [result, processed_cond],
260
+ show_api=False
261
+ )
262
+
263
+ demo.queue(api_open=False)
264
+ demo.launch()
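
A small sketch (not part of the upload) of how a UI mode choice turns into the pipeline arguments used inside infer(); the dictionaries mirror mode_mapping and strength_mapping defined at the top of app.py.

mode_mapping = {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "openpose": 4, "gray": 5, "low quality": 6}
strength_mapping = {"canny": 0.65, "tile": 0.45, "depth": 0.55, "blur": 0.45, "openpose": 0.55, "gray": 0.45, "low quality": 0.4}

control_mode = "depth"
pipe_kwargs = dict(
    control_mode=[mode_mapping[control_mode]],                       # [2] — the index the union ControlNet expects
    controlnet_conditioning_scale=[strength_mapping[control_mode]],  # [0.55] — the recommended strength for this mode
)
print(pipe_kwargs)
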
controlnet_aux/.DS_Store ADDED
Binary file (6.15 kB). View file
 
controlnet_aux/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ __version__ = "0.0.9"
2
+
3
+ from .canny import CannyDetector
4
+ from .open_pose import OpenposeDetector
5
+
controlnet_aux/canny/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ import warnings
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ from ..util import HWC3, resize_image
6
+
7
+ class CannyDetector:
8
+ def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, image_resolution=512, output_type=None, **kwargs):
9
+ if "img" in kwargs:
10
+ warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
11
+ input_image = kwargs.pop("img")
12
+
13
+ if input_image is None:
14
+ raise ValueError("input_image must be defined.")
15
+
16
+ if not isinstance(input_image, np.ndarray):
17
+ input_image = np.array(input_image, dtype=np.uint8)
18
+ output_type = output_type or "pil"
19
+ else:
20
+ output_type = output_type or "np"
21
+
22
+ input_image = HWC3(input_image)
23
+ input_image = resize_image(input_image, detect_resolution)
24
+
25
+ detected_map = cv2.Canny(input_image, low_threshold, high_threshold)
26
+ detected_map = HWC3(detected_map)
27
+
28
+ img = resize_image(input_image, image_resolution)
29
+ H, W, C = img.shape
30
+
31
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
32
+
33
+ if output_type == "pil":
34
+ detected_map = Image.fromarray(detected_map)
35
+
36
+ return detected_map
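
A minimal usage sketch (not part of the upload), matching how app.py calls the detector: a PIL input yields a PIL edge map at image_resolution.

import numpy as np
from PIL import Image
from controlnet_aux import CannyDetector

canny = CannyDetector()
img = Image.fromarray((np.random.rand(512, 512, 3) * 255).astype(np.uint8))
edges = canny(img, low_threshold=100, high_threshold=200)
print(type(edges), edges.size)   # <class 'PIL.Image.Image'> (512, 512)
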
controlnet_aux/open_pose/LICENSE ADDED
@@ -0,0 +1,108 @@
1
+ OPENPOSE: MULTIPERSON KEYPOINT DETECTION
2
+ SOFTWARE LICENSE AGREEMENT
3
+ ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
4
+
5
+ BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
6
+
7
+ This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
8
+
9
+ RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
10
+ Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
11
+ non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
12
+
13
+ CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
14
+
15
+ COPYRIGHT: The Software is owned by Licensor and is protected by United
16
+ States copyright laws and applicable international treaties and/or conventions.
17
+
18
+ PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
19
+
20
+ DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
21
+
22
+ BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
23
+
24
+ USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
25
+
26
+ You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
27
+
28
+ ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
29
+
30
+ TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
31
+
32
+ The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
33
+
34
+ FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
35
+
36
+ DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
37
+
38
+ SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
39
+
40
+ EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
41
+
42
+ EXPORT REGULATION: Licensee agrees to comply with any and all applicable
43
+ U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
44
+
45
+ SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
46
+
47
+ NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
48
+
49
+ GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
50
+
51
+ ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
52
+
53
+
54
+
55
+ ************************************************************************
56
+
57
+ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
58
+
59
+ This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
60
+
61
+ 1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
62
+
63
+ COPYRIGHT
64
+
65
+ All contributions by the University of California:
66
+ Copyright (c) 2014-2017 The Regents of the University of California (Regents)
67
+ All rights reserved.
68
+
69
+ All other contributions:
70
+ Copyright (c) 2014-2017, the respective contributors
71
+ All rights reserved.
72
+
73
+ Caffe uses a shared copyright model: each contributor holds copyright over
74
+ their contributions to Caffe. The project versioning records all such
75
+ contribution and copyright details. If a contributor wants to further mark
76
+ their specific copyright on a particular contribution, they should indicate
77
+ their copyright solely in the commit message of the change when it is
78
+ committed.
79
+
80
+ LICENSE
81
+
82
+ Redistribution and use in source and binary forms, with or without
83
+ modification, are permitted provided that the following conditions are met:
84
+
85
+ 1. Redistributions of source code must retain the above copyright notice, this
86
+ list of conditions and the following disclaimer.
87
+ 2. Redistributions in binary form must reproduce the above copyright notice,
88
+ this list of conditions and the following disclaimer in the documentation
89
+ and/or other materials provided with the distribution.
90
+
91
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
92
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
93
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
94
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
95
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
96
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
97
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
98
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
99
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
100
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
101
+
102
+ CONTRIBUTION AGREEMENT
103
+
104
+ By contributing to the BVLC/caffe repository through pull-request, comment,
105
+ or otherwise, the contributor releases their content to the
106
+ license and copyright terms herein.
107
+
108
+ ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
controlnet_aux/open_pose/__init__.py ADDED
@@ -0,0 +1,234 @@
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+ # 4th Edited by ControlNet (added face and correct hands)
6
+ # 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixes)
7
+ # This preprocessor is licensed by CMU for non-commercial use only.
8
+
9
+
10
+ import os
11
+
12
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
13
+
14
+ import json
15
+ import warnings
16
+ from typing import Callable, List, NamedTuple, Tuple, Union
17
+
18
+ import cv2
19
+ import numpy as np
20
+ import torch
21
+ from huggingface_hub import hf_hub_download
22
+ from PIL import Image
23
+
24
+ from ..util import HWC3, resize_image
25
+ from . import util
26
+ from .body import Body, BodyResult, Keypoint
27
+ from .face import Face
28
+ from .hand import Hand
29
+
30
+ HandResult = List[Keypoint]
31
+ FaceResult = List[Keypoint]
32
+
33
+ class PoseResult(NamedTuple):
34
+ body: BodyResult
35
+ left_hand: Union[HandResult, None]
36
+ right_hand: Union[HandResult, None]
37
+ face: Union[FaceResult, None]
38
+
39
+ def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
40
+ """
41
+ Draw the detected poses on an empty canvas.
42
+
43
+ Args:
44
+ poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
45
+ H (int): The height of the canvas.
46
+ W (int): The width of the canvas.
47
+ draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
48
+ draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
49
+ draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.
50
+
51
+ Returns:
52
+ numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
53
+ """
54
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
55
+
56
+ for pose in poses:
57
+ if draw_body:
58
+ canvas = util.draw_bodypose(canvas, pose.body.keypoints)
59
+
60
+ if draw_hand:
61
+ canvas = util.draw_handpose(canvas, pose.left_hand)
62
+ canvas = util.draw_handpose(canvas, pose.right_hand)
63
+
64
+ if draw_face:
65
+ canvas = util.draw_facepose(canvas, pose.face)
66
+
67
+ return canvas
68
+
69
+
70
+ class OpenposeDetector:
71
+ """
72
+ A class for detecting human poses in images using the Openpose model.
73
+
74
+ Attributes:
75
+ body_estimation, hand_estimation, face_estimation: The underlying body, hand, and face pose estimators.
76
+ """
77
+ def __init__(self, body_estimation, hand_estimation=None, face_estimation=None):
78
+ self.body_estimation = body_estimation
79
+ self.hand_estimation = hand_estimation
80
+ self.face_estimation = face_estimation
81
+
82
+ @classmethod
83
+ def from_pretrained(cls, pretrained_model_or_path, filename=None, hand_filename=None, face_filename=None, cache_dir=None, local_files_only=False):
84
+
85
+ if pretrained_model_or_path == "lllyasviel/ControlNet":
86
+ filename = filename or "annotator/ckpts/body_pose_model.pth"
87
+ hand_filename = hand_filename or "annotator/ckpts/hand_pose_model.pth"
88
+ face_filename = face_filename or "facenet.pth"
89
+
90
+ face_pretrained_model_or_path = "lllyasviel/Annotators"
91
+ else:
92
+ filename = filename or "body_pose_model.pth"
93
+ hand_filename = hand_filename or "hand_pose_model.pth"
94
+ face_filename = face_filename or "facenet.pth"
95
+
96
+ face_pretrained_model_or_path = pretrained_model_or_path
97
+
98
+ if os.path.isdir(pretrained_model_or_path):
99
+ body_model_path = os.path.join(pretrained_model_or_path, filename)
100
+ hand_model_path = os.path.join(pretrained_model_or_path, hand_filename)
101
+ face_model_path = os.path.join(face_pretrained_model_or_path, face_filename)
102
+ else:
103
+ body_model_path = hf_hub_download(pretrained_model_or_path, filename, cache_dir=cache_dir, local_files_only=local_files_only)
104
+ hand_model_path = hf_hub_download(pretrained_model_or_path, hand_filename, cache_dir=cache_dir, local_files_only=local_files_only)
105
+ face_model_path = hf_hub_download(face_pretrained_model_or_path, face_filename, cache_dir=cache_dir, local_files_only=local_files_only)
106
+
107
+ body_estimation = Body(body_model_path)
108
+ hand_estimation = Hand(hand_model_path)
109
+ face_estimation = Face(face_model_path)
110
+
111
+ return cls(body_estimation, hand_estimation, face_estimation)
112
+
113
+ def to(self, device):
114
+ self.body_estimation.to(device)
115
+ self.hand_estimation.to(device)
116
+ self.face_estimation.to(device)
117
+ return self
118
+
119
+ def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
120
+ left_hand = None
121
+ right_hand = None
122
+ H, W, _ = oriImg.shape
123
+ for x, y, w, is_left in util.handDetect(body, oriImg):
124
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
125
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
126
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
127
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
128
+
129
+ hand_result = [
130
+ Keypoint(x=peak[0], y=peak[1])
131
+ for peak in peaks
132
+ ]
133
+
134
+ if is_left:
135
+ left_hand = hand_result
136
+ else:
137
+ right_hand = hand_result
138
+
139
+ return left_hand, right_hand
140
+
141
+ def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
142
+ face = util.faceDetect(body, oriImg)
143
+ if face is None:
144
+ return None
145
+
146
+ x, y, w = face
147
+ H, W, _ = oriImg.shape
148
+ heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
149
+ peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
150
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
151
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
152
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
153
+ return [
154
+ Keypoint(x=peak[0], y=peak[1])
155
+ for peak in peaks
156
+ ]
157
+
158
+ return None
159
+
160
+ def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
161
+ """
162
+ Detect poses in the given image.
163
+ Args:
164
+ oriImg (numpy.ndarray): The input image for pose detection.
165
+ include_hand (bool, optional): Whether to include hand detection. Defaults to False.
166
+ include_face (bool, optional): Whether to include face detection. Defaults to False.
167
+
168
+ Returns:
169
+ List[PoseResult]: A list of PoseResult objects containing the detected poses.
170
+ """
171
+ oriImg = oriImg[:, :, ::-1].copy()
172
+ H, W, C = oriImg.shape
173
+ with torch.no_grad():
174
+ candidate, subset = self.body_estimation(oriImg)
175
+ bodies = self.body_estimation.format_body_result(candidate, subset)
176
+
177
+ results = []
178
+ for body in bodies:
179
+ left_hand, right_hand, face = (None,) * 3
180
+ if include_hand:
181
+ left_hand, right_hand = self.detect_hands(body, oriImg)
182
+ if include_face:
183
+ face = self.detect_face(body, oriImg)
184
+
185
+ results.append(PoseResult(BodyResult(
186
+ keypoints=[
187
+ Keypoint(
188
+ x=keypoint.x / float(W),
189
+ y=keypoint.y / float(H)
190
+ ) if keypoint is not None else None
191
+ for keypoint in body.keypoints
192
+ ],
193
+ total_score=body.total_score,
194
+ total_parts=body.total_parts
195
+ ), left_hand, right_hand, face))
196
+
197
+ return results
198
+
199
+ def __call__(self, input_image, detect_resolution=512, image_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", **kwargs):
200
+ if hand_and_face is not None:
201
+ warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
202
+ include_hand = hand_and_face
203
+ include_face = hand_and_face
204
+
205
+ if "return_pil" in kwargs:
206
+ warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
207
+ output_type = "pil" if kwargs["return_pil"] else "np"
208
+ if type(output_type) is bool:
209
+ warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
210
+ if output_type:
211
+ output_type = "pil"
212
+
213
+ if not isinstance(input_image, np.ndarray):
214
+ input_image = np.array(input_image, dtype=np.uint8)
215
+
216
+ input_image = HWC3(input_image)
217
+ input_image = resize_image(input_image, detect_resolution)
218
+ H, W, C = input_image.shape
219
+
220
+ poses = self.detect_poses(input_image, include_hand, include_face)
221
+ canvas = draw_poses(poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face)
222
+
223
+ detected_map = canvas
224
+ detected_map = HWC3(detected_map)
225
+
226
+ img = resize_image(input_image, image_resolution)
227
+ H, W, C = img.shape
228
+
229
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
230
+
231
+ if output_type == "pil":
232
+ detected_map = Image.fromarray(detected_map)
233
+
234
+ return detected_map
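A minimal usage sketch for the `OpenposeDetector` defined above, assuming the default checkpoint names (`body_pose_model.pth`, `hand_pose_model.pth`, `facenet.pth`) resolve on a Hub repo such as `lllyasviel/Annotators` and that a local `example.jpg` exists; both are illustrative assumptions rather than part of the uploaded files.

```python
# Hypothetical usage of OpenposeDetector; checkpoint repo and image path are placeholders.
import torch
from PIL import Image
from controlnet_aux import OpenposeDetector

detector = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
detector.to("cuda" if torch.cuda.is_available() else "cpu")

image = Image.open("example.jpg")
# Returns a black canvas with the detected body/hand/face keypoints drawn on it.
pose_map = detector(image, detect_resolution=512, image_resolution=512,
                    include_hand=True, include_face=True, output_type="pil")
pose_map.save("pose_map.png")
```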
controlnet_aux/open_pose/body.py ADDED
@@ -0,0 +1,260 @@
1
+ import math
2
+ from typing import List, NamedTuple, Union
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from scipy.ndimage.filters import gaussian_filter
8
+
9
+ from . import util
10
+ from .model import bodypose_model
11
+
12
+
13
+ class Keypoint(NamedTuple):
14
+ x: float
15
+ y: float
16
+ score: float = 1.0
17
+ id: int = -1
18
+
19
+
20
+ class BodyResult(NamedTuple):
21
+ # Note: Using `Union` instead of the `|` operator, as the latter is a Python
22
+ # 3.10 feature.
23
+ # Annotator code should be Python 3.8 Compatible, as controlnet repo uses
24
+ # Python 3.8 environment.
25
+ # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
26
+ keypoints: List[Union[Keypoint, None]]
27
+ total_score: float
28
+ total_parts: int
29
+
30
+
31
+ class Body(object):
32
+ def __init__(self, model_path):
33
+ self.model = bodypose_model()
34
+ model_dict = util.transfer(self.model, torch.load(model_path))
35
+ self.model.load_state_dict(model_dict)
36
+ self.model.eval()
37
+
38
+ def to(self, device):
39
+ self.model.to(device)
40
+ return self
41
+
42
+ def __call__(self, oriImg):
43
+ device = next(iter(self.model.parameters())).device
44
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
45
+ scale_search = [0.5]
46
+ boxsize = 368
47
+ stride = 8
48
+ padValue = 128
49
+ thre1 = 0.1
50
+ thre2 = 0.05
51
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
52
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
53
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
54
+
55
+ for m in range(len(multiplier)):
56
+ scale = multiplier[m]
57
+ imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
58
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
59
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
60
+ im = np.ascontiguousarray(im)
61
+
62
+ data = torch.from_numpy(im).float()
63
+ data = data.to(device)
64
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
65
+ with torch.no_grad():
66
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
67
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
68
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
69
+
70
+ # extract outputs, resize, and remove padding
71
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
72
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
73
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
74
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
75
+ heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
76
+
77
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
78
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
79
+ paf = util.smart_resize_k(paf, fx=stride, fy=stride)
80
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
81
+ paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
82
+
83
+ heatmap_avg = heatmap_avg + heatmap / len(multiplier)
84
+ paf_avg = paf_avg + paf / len(multiplier)
85
+
86
+ all_peaks = []
87
+ peak_counter = 0
88
+
89
+ for part in range(18):
90
+ map_ori = heatmap_avg[:, :, part]
91
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
92
+
93
+ map_left = np.zeros(one_heatmap.shape)
94
+ map_left[1:, :] = one_heatmap[:-1, :]
95
+ map_right = np.zeros(one_heatmap.shape)
96
+ map_right[:-1, :] = one_heatmap[1:, :]
97
+ map_up = np.zeros(one_heatmap.shape)
98
+ map_up[:, 1:] = one_heatmap[:, :-1]
99
+ map_down = np.zeros(one_heatmap.shape)
100
+ map_down[:, :-1] = one_heatmap[:, 1:]
101
+
102
+ peaks_binary = np.logical_and.reduce(
103
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
104
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
105
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
106
+ peak_id = range(peak_counter, peak_counter + len(peaks))
107
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
108
+
109
+ all_peaks.append(peaks_with_score_and_id)
110
+ peak_counter += len(peaks)
111
+
112
+ # find connection in the specified sequence, center 29 is in the position 15
113
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
114
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
115
+ [1, 16], [16, 18], [3, 17], [6, 18]]
116
+ # the middle joints heatmap correspondence
117
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
118
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
119
+ [55, 56], [37, 38], [45, 46]]
120
+
121
+ connection_all = []
122
+ special_k = []
123
+ mid_num = 10
124
+
125
+ for k in range(len(mapIdx)):
126
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
127
+ candA = all_peaks[limbSeq[k][0] - 1]
128
+ candB = all_peaks[limbSeq[k][1] - 1]
129
+ nA = len(candA)
130
+ nB = len(candB)
131
+ indexA, indexB = limbSeq[k]
132
+ if (nA != 0 and nB != 0):
133
+ connection_candidate = []
134
+ for i in range(nA):
135
+ for j in range(nB):
136
+ vec = np.subtract(candB[j][:2], candA[i][:2])
137
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
138
+ norm = max(0.001, norm)
139
+ vec = np.divide(vec, norm)
140
+
141
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
142
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
143
+
144
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
145
+ for I in range(len(startend))])
146
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
147
+ for I in range(len(startend))])
148
+
149
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
150
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
151
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
152
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
153
+ criterion2 = score_with_dist_prior > 0
154
+ if criterion1 and criterion2:
155
+ connection_candidate.append(
156
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
157
+
158
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
159
+ connection = np.zeros((0, 5))
160
+ for c in range(len(connection_candidate)):
161
+ i, j, s = connection_candidate[c][0:3]
162
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
163
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
164
+ if (len(connection) >= min(nA, nB)):
165
+ break
166
+
167
+ connection_all.append(connection)
168
+ else:
169
+ special_k.append(k)
170
+ connection_all.append([])
171
+
172
+ # last number in each row is the total parts number of that person
173
+ # the second last number in each row is the score of the overall configuration
174
+ subset = -1 * np.ones((0, 20))
175
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
176
+
177
+ for k in range(len(mapIdx)):
178
+ if k not in special_k:
179
+ partAs = connection_all[k][:, 0]
180
+ partBs = connection_all[k][:, 1]
181
+ indexA, indexB = np.array(limbSeq[k]) - 1
182
+
183
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
184
+ found = 0
185
+ subset_idx = [-1, -1]
186
+ for j in range(len(subset)): # 1:size(subset,1):
187
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
188
+ subset_idx[found] = j
189
+ found += 1
190
+
191
+ if found == 1:
192
+ j = subset_idx[0]
193
+ if subset[j][indexB] != partBs[i]:
194
+ subset[j][indexB] = partBs[i]
195
+ subset[j][-1] += 1
196
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
197
+ elif found == 2: # if found 2 and disjoint, merge them
198
+ j1, j2 = subset_idx
199
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
200
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
201
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
202
+ subset[j1][-2:] += subset[j2][-2:]
203
+ subset[j1][-2] += connection_all[k][i][2]
204
+ subset = np.delete(subset, j2, 0)
205
+ else: # same as the found == 1 case
206
+ subset[j1][indexB] = partBs[i]
207
+ subset[j1][-1] += 1
208
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
209
+
210
+ # if find no partA in the subset, create a new subset
211
+ elif not found and k < 17:
212
+ row = -1 * np.ones(20)
213
+ row[indexA] = partAs[i]
214
+ row[indexB] = partBs[i]
215
+ row[-1] = 2
216
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
217
+ subset = np.vstack([subset, row])
218
+ # delete some rows of subset which has few parts occur
219
+ deleteIdx = []
220
+ for i in range(len(subset)):
221
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
222
+ deleteIdx.append(i)
223
+ subset = np.delete(subset, deleteIdx, axis=0)
224
+
225
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
226
+ # candidate: x, y, score, id
227
+ return candidate, subset
228
+
229
+ @staticmethod
230
+ def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
231
+ """
232
+ Format the body results from the candidate and subset arrays into a list of BodyResult objects.
233
+
234
+ Args:
235
+ candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
236
+ for each body part.
237
+ subset (np.ndarray): An array of subsets containing indices to the candidate array for each
238
+ person detected. The last two columns of each row hold the total score and total parts
239
+ of the person.
240
+
241
+ Returns:
242
+ List[BodyResult]: A list of BodyResult objects, where each object represents a person with
243
+ detected keypoints, total score, and total parts.
244
+ """
245
+ return [
246
+ BodyResult(
247
+ keypoints=[
248
+ Keypoint(
249
+ x=candidate[candidate_index][0],
250
+ y=candidate[candidate_index][1],
251
+ score=candidate[candidate_index][2],
252
+ id=candidate[candidate_index][3]
253
+ ) if candidate_index != -1 else None
254
+ for candidate_index in person[:18].astype(int)
255
+ ],
256
+ total_score=person[18],
257
+ total_parts=person[19]
258
+ )
259
+ for person in subset
260
+ ]
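For orientation, a small sketch of how the raw `candidate`/`subset` arrays produced by `Body.__call__` map onto `BodyResult` objects via `format_body_result`; the array values below are made up purely for illustration.

```python
# Illustrative only: candidate rows are [x, y, score, id]; subset rows hold 18 candidate
# indices (-1 = part not found), then total_score (col 18) and total_parts (col 19).
import numpy as np
from controlnet_aux.open_pose.body import Body

candidate = np.array([[120.0, 80.0, 0.9, 0.0],
                      [130.0, 150.0, 0.8, 1.0]])
subset = -1.0 * np.ones((1, 20))
subset[0, 0], subset[0, 1] = 0, 1      # nose and neck point at candidate rows 0 and 1
subset[0, 18], subset[0, 19] = 1.7, 2  # total score, total parts

bodies = Body.format_body_result(candidate, subset)
print(bodies[0].keypoints[0])          # Keypoint built from candidate row 0
print(bodies[0].keypoints[2])          # None (part not detected)
```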
controlnet_aux/open_pose/face.py ADDED
@@ -0,0 +1,364 @@
1
+ import logging
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch.nn import Conv2d, MaxPool2d, Module, ReLU, init
7
+ from torchvision.transforms import ToPILImage, ToTensor
8
+
9
+ from . import util
10
+
11
+
12
+ class FaceNet(Module):
13
+ """Model the cascading heatmaps. """
14
+ def __init__(self):
15
+ super(FaceNet, self).__init__()
16
+ # cnn to make feature map
17
+ self.relu = ReLU()
18
+ self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
19
+ self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
20
+ kernel_size=3, stride=1, padding=1)
21
+ self.conv1_2 = Conv2d(
22
+ in_channels=64, out_channels=64, kernel_size=3, stride=1,
23
+ padding=1)
24
+ self.conv2_1 = Conv2d(
25
+ in_channels=64, out_channels=128, kernel_size=3, stride=1,
26
+ padding=1)
27
+ self.conv2_2 = Conv2d(
28
+ in_channels=128, out_channels=128, kernel_size=3, stride=1,
29
+ padding=1)
30
+ self.conv3_1 = Conv2d(
31
+ in_channels=128, out_channels=256, kernel_size=3, stride=1,
32
+ padding=1)
33
+ self.conv3_2 = Conv2d(
34
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
35
+ padding=1)
36
+ self.conv3_3 = Conv2d(
37
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
38
+ padding=1)
39
+ self.conv3_4 = Conv2d(
40
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
41
+ padding=1)
42
+ self.conv4_1 = Conv2d(
43
+ in_channels=256, out_channels=512, kernel_size=3, stride=1,
44
+ padding=1)
45
+ self.conv4_2 = Conv2d(
46
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
47
+ padding=1)
48
+ self.conv4_3 = Conv2d(
49
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
50
+ padding=1)
51
+ self.conv4_4 = Conv2d(
52
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
53
+ padding=1)
54
+ self.conv5_1 = Conv2d(
55
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
56
+ padding=1)
57
+ self.conv5_2 = Conv2d(
58
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
59
+ padding=1)
60
+ self.conv5_3_CPM = Conv2d(
61
+ in_channels=512, out_channels=128, kernel_size=3, stride=1,
62
+ padding=1)
63
+
64
+ # stage1
65
+ self.conv6_1_CPM = Conv2d(
66
+ in_channels=128, out_channels=512, kernel_size=1, stride=1,
67
+ padding=0)
68
+ self.conv6_2_CPM = Conv2d(
69
+ in_channels=512, out_channels=71, kernel_size=1, stride=1,
70
+ padding=0)
71
+
72
+ # stage2
73
+ self.Mconv1_stage2 = Conv2d(
74
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
75
+ padding=3)
76
+ self.Mconv2_stage2 = Conv2d(
77
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
78
+ padding=3)
79
+ self.Mconv3_stage2 = Conv2d(
80
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
81
+ padding=3)
82
+ self.Mconv4_stage2 = Conv2d(
83
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
84
+ padding=3)
85
+ self.Mconv5_stage2 = Conv2d(
86
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
87
+ padding=3)
88
+ self.Mconv6_stage2 = Conv2d(
89
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
90
+ padding=0)
91
+ self.Mconv7_stage2 = Conv2d(
92
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
93
+ padding=0)
94
+
95
+ # stage3
96
+ self.Mconv1_stage3 = Conv2d(
97
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
98
+ padding=3)
99
+ self.Mconv2_stage3 = Conv2d(
100
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
101
+ padding=3)
102
+ self.Mconv3_stage3 = Conv2d(
103
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
104
+ padding=3)
105
+ self.Mconv4_stage3 = Conv2d(
106
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
107
+ padding=3)
108
+ self.Mconv5_stage3 = Conv2d(
109
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
110
+ padding=3)
111
+ self.Mconv6_stage3 = Conv2d(
112
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
113
+ padding=0)
114
+ self.Mconv7_stage3 = Conv2d(
115
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
116
+ padding=0)
117
+
118
+ # stage4
119
+ self.Mconv1_stage4 = Conv2d(
120
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
121
+ padding=3)
122
+ self.Mconv2_stage4 = Conv2d(
123
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
124
+ padding=3)
125
+ self.Mconv3_stage4 = Conv2d(
126
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
127
+ padding=3)
128
+ self.Mconv4_stage4 = Conv2d(
129
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
130
+ padding=3)
131
+ self.Mconv5_stage4 = Conv2d(
132
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
133
+ padding=3)
134
+ self.Mconv6_stage4 = Conv2d(
135
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
136
+ padding=0)
137
+ self.Mconv7_stage4 = Conv2d(
138
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
139
+ padding=0)
140
+
141
+ # stage5
142
+ self.Mconv1_stage5 = Conv2d(
143
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
144
+ padding=3)
145
+ self.Mconv2_stage5 = Conv2d(
146
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
147
+ padding=3)
148
+ self.Mconv3_stage5 = Conv2d(
149
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
150
+ padding=3)
151
+ self.Mconv4_stage5 = Conv2d(
152
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
153
+ padding=3)
154
+ self.Mconv5_stage5 = Conv2d(
155
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
156
+ padding=3)
157
+ self.Mconv6_stage5 = Conv2d(
158
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
159
+ padding=0)
160
+ self.Mconv7_stage5 = Conv2d(
161
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
162
+ padding=0)
163
+
164
+ # stage6
165
+ self.Mconv1_stage6 = Conv2d(
166
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
167
+ padding=3)
168
+ self.Mconv2_stage6 = Conv2d(
169
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
170
+ padding=3)
171
+ self.Mconv3_stage6 = Conv2d(
172
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
173
+ padding=3)
174
+ self.Mconv4_stage6 = Conv2d(
175
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
176
+ padding=3)
177
+ self.Mconv5_stage6 = Conv2d(
178
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
179
+ padding=3)
180
+ self.Mconv6_stage6 = Conv2d(
181
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
182
+ padding=0)
183
+ self.Mconv7_stage6 = Conv2d(
184
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
185
+ padding=0)
186
+
187
+ for m in self.modules():
188
+ if isinstance(m, Conv2d):
189
+ init.constant_(m.bias, 0)
190
+
191
+ def forward(self, x):
192
+ """Return a list of heatmaps."""
193
+ heatmaps = []
194
+
195
+ h = self.relu(self.conv1_1(x))
196
+ h = self.relu(self.conv1_2(h))
197
+ h = self.max_pooling_2d(h)
198
+ h = self.relu(self.conv2_1(h))
199
+ h = self.relu(self.conv2_2(h))
200
+ h = self.max_pooling_2d(h)
201
+ h = self.relu(self.conv3_1(h))
202
+ h = self.relu(self.conv3_2(h))
203
+ h = self.relu(self.conv3_3(h))
204
+ h = self.relu(self.conv3_4(h))
205
+ h = self.max_pooling_2d(h)
206
+ h = self.relu(self.conv4_1(h))
207
+ h = self.relu(self.conv4_2(h))
208
+ h = self.relu(self.conv4_3(h))
209
+ h = self.relu(self.conv4_4(h))
210
+ h = self.relu(self.conv5_1(h))
211
+ h = self.relu(self.conv5_2(h))
212
+ h = self.relu(self.conv5_3_CPM(h))
213
+ feature_map = h
214
+
215
+ # stage1
216
+ h = self.relu(self.conv6_1_CPM(h))
217
+ h = self.conv6_2_CPM(h)
218
+ heatmaps.append(h)
219
+
220
+ # stage2
221
+ h = torch.cat([h, feature_map], dim=1) # channel concat
222
+ h = self.relu(self.Mconv1_stage2(h))
223
+ h = self.relu(self.Mconv2_stage2(h))
224
+ h = self.relu(self.Mconv3_stage2(h))
225
+ h = self.relu(self.Mconv4_stage2(h))
226
+ h = self.relu(self.Mconv5_stage2(h))
227
+ h = self.relu(self.Mconv6_stage2(h))
228
+ h = self.Mconv7_stage2(h)
229
+ heatmaps.append(h)
230
+
231
+ # stage3
232
+ h = torch.cat([h, feature_map], dim=1) # channel concat
233
+ h = self.relu(self.Mconv1_stage3(h))
234
+ h = self.relu(self.Mconv2_stage3(h))
235
+ h = self.relu(self.Mconv3_stage3(h))
236
+ h = self.relu(self.Mconv4_stage3(h))
237
+ h = self.relu(self.Mconv5_stage3(h))
238
+ h = self.relu(self.Mconv6_stage3(h))
239
+ h = self.Mconv7_stage3(h)
240
+ heatmaps.append(h)
241
+
242
+ # stage4
243
+ h = torch.cat([h, feature_map], dim=1) # channel concat
244
+ h = self.relu(self.Mconv1_stage4(h))
245
+ h = self.relu(self.Mconv2_stage4(h))
246
+ h = self.relu(self.Mconv3_stage4(h))
247
+ h = self.relu(self.Mconv4_stage4(h))
248
+ h = self.relu(self.Mconv5_stage4(h))
249
+ h = self.relu(self.Mconv6_stage4(h))
250
+ h = self.Mconv7_stage4(h)
251
+ heatmaps.append(h)
252
+
253
+ # stage5
254
+ h = torch.cat([h, feature_map], dim=1) # channel concat
255
+ h = self.relu(self.Mconv1_stage5(h))
256
+ h = self.relu(self.Mconv2_stage5(h))
257
+ h = self.relu(self.Mconv3_stage5(h))
258
+ h = self.relu(self.Mconv4_stage5(h))
259
+ h = self.relu(self.Mconv5_stage5(h))
260
+ h = self.relu(self.Mconv6_stage5(h))
261
+ h = self.Mconv7_stage5(h)
262
+ heatmaps.append(h)
263
+
264
+ # stage6
265
+ h = torch.cat([h, feature_map], dim=1) # channel concat
266
+ h = self.relu(self.Mconv1_stage6(h))
267
+ h = self.relu(self.Mconv2_stage6(h))
268
+ h = self.relu(self.Mconv3_stage6(h))
269
+ h = self.relu(self.Mconv4_stage6(h))
270
+ h = self.relu(self.Mconv5_stage6(h))
271
+ h = self.relu(self.Mconv6_stage6(h))
272
+ h = self.Mconv7_stage6(h)
273
+ heatmaps.append(h)
274
+
275
+ return heatmaps
276
+
277
+
278
+ LOG = logging.getLogger(__name__)
279
+ TOTEN = ToTensor()
280
+ TOPIL = ToPILImage()
281
+
282
+
283
+ params = {
284
+ 'gaussian_sigma': 2.5,
285
+ 'inference_img_size': 736, # 368, 736, 1312
286
+ 'heatmap_peak_thresh': 0.1,
287
+ 'crop_scale': 1.5,
288
+ 'line_indices': [
289
+ [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
290
+ [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
291
+ [13, 14], [14, 15], [15, 16],
292
+ [17, 18], [18, 19], [19, 20], [20, 21],
293
+ [22, 23], [23, 24], [24, 25], [25, 26],
294
+ [27, 28], [28, 29], [29, 30],
295
+ [31, 32], [32, 33], [33, 34], [34, 35],
296
+ [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
297
+ [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
298
+ [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
299
+ [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
300
+ [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
301
+ [66, 67], [67, 60]
302
+ ],
303
+ }
304
+
305
+
306
+ class Face(object):
307
+ """
308
+ The OpenPose face landmark detector model.
309
+
310
+ Args:
311
+ inference_size: set the size of the inference image size, suggested:
312
+ 368, 736, 1312, default 736
313
+ gaussian_sigma: blur the heatmaps, default 2.5
314
+ heatmap_peak_thresh: return landmark if over threshold, default 0.1
315
+
316
+ """
317
+ def __init__(self, face_model_path,
318
+ inference_size=None,
319
+ gaussian_sigma=None,
320
+ heatmap_peak_thresh=None):
321
+ self.inference_size = inference_size or params["inference_img_size"]
322
+ self.sigma = gaussian_sigma or params['gaussian_sigma']
323
+ self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
324
+ self.model = FaceNet()
325
+ self.model.load_state_dict(torch.load(face_model_path))
326
+ self.model.eval()
327
+
328
+ def to(self, device):
329
+ self.model.to(device)
330
+ return self
331
+
332
+ def __call__(self, face_img):
333
+ device = next(iter(self.model.parameters())).device
334
+ H, W, C = face_img.shape
335
+
336
+ w_size = 384
337
+ x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
338
+
339
+ x_data = x_data.to(device)
340
+
341
+ with torch.no_grad():
342
+ hs = self.model(x_data[None, ...])
343
+ heatmaps = F.interpolate(
344
+ hs[-1],
345
+ (H, W),
346
+ mode='bilinear', align_corners=True).cpu().numpy()[0]
347
+ return heatmaps
348
+
349
+ def compute_peaks_from_heatmaps(self, heatmaps):
350
+ all_peaks = []
351
+ for part in range(heatmaps.shape[0]):
352
+ map_ori = heatmaps[part].copy()
353
+ binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
354
+
355
+ if np.sum(binary) == 0:
356
+ continue
357
+
358
+ positions = np.where(binary > 0.5)
359
+ intensities = map_ori[positions]
360
+ mi = np.argmax(intensities)
361
+ y, x = positions[0][mi], positions[1][mi]
362
+ all_peaks.append([x, y])
363
+
364
+ return np.array(all_peaks)
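A rough sketch of driving the `Face` estimator directly on a square face crop. The checkpoint path `facenet.pth` mirrors the default filename used in `__init__.py`, and the crop here is a dummy array, so treat the whole snippet as illustrative.

```python
# Sketch: run the face landmark network on an (H, W, 3) uint8 crop and extract peak coordinates.
import numpy as np
from controlnet_aux.open_pose.face import Face

face_estimator = Face("facenet.pth").to("cpu")   # assumes the checkpoint file is available locally

face_crop = np.zeros((256, 256, 3), dtype=np.uint8)           # stand-in for a real face crop
heatmaps = face_estimator(face_crop)                          # (71, 256, 256) last-stage heatmaps
peaks = face_estimator.compute_peaks_from_heatmaps(heatmaps)  # array of [x, y] pixel coordinates
print(peaks.shape)
```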
controlnet_aux/open_pose/hand.py ADDED
@@ -0,0 +1,90 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ from scipy.ndimage.filters import gaussian_filter
5
+ from skimage.measure import label
6
+
7
+ from . import util
8
+ from .model import handpose_model
9
+
10
+
11
+ class Hand(object):
12
+ def __init__(self, model_path):
13
+ self.model = handpose_model()
14
+ model_dict = util.transfer(self.model, torch.load(model_path))
15
+ self.model.load_state_dict(model_dict)
16
+ self.model.eval()
17
+
18
+ def to(self, device):
19
+ self.model.to(device)
20
+ return self
21
+
22
+ def __call__(self, oriImgRaw):
23
+ device = next(iter(self.model.parameters())).device
24
+ scale_search = [0.5, 1.0, 1.5, 2.0]
25
+ # scale_search = [0.5]
26
+ boxsize = 368
27
+ stride = 8
28
+ padValue = 128
29
+ thre = 0.05
30
+ multiplier = [x * boxsize for x in scale_search]
31
+
32
+ wsize = 128
33
+ heatmap_avg = np.zeros((wsize, wsize, 22))
34
+
35
+ Hr, Wr, Cr = oriImgRaw.shape
36
+
37
+ oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)
38
+
39
+ for m in range(len(multiplier)):
40
+ scale = multiplier[m]
41
+ imageToTest = util.smart_resize(oriImg, (scale, scale))
42
+
43
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
44
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
45
+ im = np.ascontiguousarray(im)
46
+
47
+ data = torch.from_numpy(im).float()
48
+ data = data.to(device)
49
+
50
+ with torch.no_grad():
51
+ output = self.model(data).cpu().numpy()
52
+
53
+ # extract outputs, resize, and remove padding
54
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
55
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
56
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
57
+ heatmap = util.smart_resize(heatmap, (wsize, wsize))
58
+
59
+ heatmap_avg += heatmap / len(multiplier)
60
+
61
+ all_peaks = []
62
+ for part in range(21):
63
+ map_ori = heatmap_avg[:, :, part]
64
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
65
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
66
+
67
+ if np.sum(binary) == 0:
68
+ all_peaks.append([0, 0])
69
+ continue
70
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
71
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
72
+ label_img[label_img != max_index] = 0
73
+ map_ori[label_img == 0] = 0
74
+
75
+ y, x = util.npmax(map_ori)
76
+ y = int(float(y) * float(Hr) / float(wsize))
77
+ x = int(float(x) * float(Wr) / float(wsize))
78
+ all_peaks.append([x, y])
79
+ return np.array(all_peaks)
80
+
81
+ if __name__ == "__main__":
82
+ hand_estimation = Hand('../model/hand_pose_model.pth')
83
+
84
+ # test_image = '../images/hand.jpg'
85
+ test_image = '../images/hand.jpg'
86
+ oriImg = cv2.imread(test_image) # B,G,R order
87
+ peaks = hand_estimation(oriImg)
88
+ canvas = util.draw_handpose(oriImg, peaks, True)
89
+ cv2.imshow('', canvas)
90
+ cv2.waitKey(0)
controlnet_aux/open_pose/model.py ADDED
@@ -0,0 +1,217 @@
1
+ import torch
2
+ from collections import OrderedDict
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ def make_layers(block, no_relu_layers):
8
+ layers = []
9
+ for layer_name, v in block.items():
10
+ if 'pool' in layer_name:
11
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
12
+ padding=v[2])
13
+ layers.append((layer_name, layer))
14
+ else:
15
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
16
+ kernel_size=v[2], stride=v[3],
17
+ padding=v[4])
18
+ layers.append((layer_name, conv2d))
19
+ if layer_name not in no_relu_layers:
20
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
21
+
22
+ return nn.Sequential(OrderedDict(layers))
23
+
24
+ class bodypose_model(nn.Module):
25
+ def __init__(self):
26
+ super(bodypose_model, self).__init__()
27
+
28
+ # these layers have no relu layer
29
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
30
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
31
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
32
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2']
33
+ blocks = {}
34
+ block0 = OrderedDict([
35
+ ('conv1_1', [3, 64, 3, 1, 1]),
36
+ ('conv1_2', [64, 64, 3, 1, 1]),
37
+ ('pool1_stage1', [2, 2, 0]),
38
+ ('conv2_1', [64, 128, 3, 1, 1]),
39
+ ('conv2_2', [128, 128, 3, 1, 1]),
40
+ ('pool2_stage1', [2, 2, 0]),
41
+ ('conv3_1', [128, 256, 3, 1, 1]),
42
+ ('conv3_2', [256, 256, 3, 1, 1]),
43
+ ('conv3_3', [256, 256, 3, 1, 1]),
44
+ ('conv3_4', [256, 256, 3, 1, 1]),
45
+ ('pool3_stage1', [2, 2, 0]),
46
+ ('conv4_1', [256, 512, 3, 1, 1]),
47
+ ('conv4_2', [512, 512, 3, 1, 1]),
48
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
49
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
50
+ ])
51
+
52
+
53
+ # Stage 1
54
+ block1_1 = OrderedDict([
55
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
56
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
57
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
58
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
59
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
60
+ ])
61
+
62
+ block1_2 = OrderedDict([
63
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
64
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
65
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
66
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
67
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
68
+ ])
69
+ blocks['block1_1'] = block1_1
70
+ blocks['block1_2'] = block1_2
71
+
72
+ self.model0 = make_layers(block0, no_relu_layers)
73
+
74
+ # Stages 2 - 6
75
+ for i in range(2, 7):
76
+ blocks['block%d_1' % i] = OrderedDict([
77
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
78
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
79
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
80
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
81
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
82
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
83
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
84
+ ])
85
+
86
+ blocks['block%d_2' % i] = OrderedDict([
87
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
88
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
89
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
90
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
91
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
92
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
93
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
94
+ ])
95
+
96
+ for k in blocks.keys():
97
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
98
+
99
+ self.model1_1 = blocks['block1_1']
100
+ self.model2_1 = blocks['block2_1']
101
+ self.model3_1 = blocks['block3_1']
102
+ self.model4_1 = blocks['block4_1']
103
+ self.model5_1 = blocks['block5_1']
104
+ self.model6_1 = blocks['block6_1']
105
+
106
+ self.model1_2 = blocks['block1_2']
107
+ self.model2_2 = blocks['block2_2']
108
+ self.model3_2 = blocks['block3_2']
109
+ self.model4_2 = blocks['block4_2']
110
+ self.model5_2 = blocks['block5_2']
111
+ self.model6_2 = blocks['block6_2']
112
+
113
+
114
+ def forward(self, x):
115
+
116
+ out1 = self.model0(x)
117
+
118
+ out1_1 = self.model1_1(out1)
119
+ out1_2 = self.model1_2(out1)
120
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
121
+
122
+ out2_1 = self.model2_1(out2)
123
+ out2_2 = self.model2_2(out2)
124
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
125
+
126
+ out3_1 = self.model3_1(out3)
127
+ out3_2 = self.model3_2(out3)
128
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
129
+
130
+ out4_1 = self.model4_1(out4)
131
+ out4_2 = self.model4_2(out4)
132
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
133
+
134
+ out5_1 = self.model5_1(out5)
135
+ out5_2 = self.model5_2(out5)
136
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
137
+
138
+ out6_1 = self.model6_1(out6)
139
+ out6_2 = self.model6_2(out6)
140
+
141
+ return out6_1, out6_2
142
+
143
+ class handpose_model(nn.Module):
144
+ def __init__(self):
145
+ super(handpose_model, self).__init__()
146
+
147
+ # these layers have no relu layer
148
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
149
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
150
+ # stage 1
151
+ block1_0 = OrderedDict([
152
+ ('conv1_1', [3, 64, 3, 1, 1]),
153
+ ('conv1_2', [64, 64, 3, 1, 1]),
154
+ ('pool1_stage1', [2, 2, 0]),
155
+ ('conv2_1', [64, 128, 3, 1, 1]),
156
+ ('conv2_2', [128, 128, 3, 1, 1]),
157
+ ('pool2_stage1', [2, 2, 0]),
158
+ ('conv3_1', [128, 256, 3, 1, 1]),
159
+ ('conv3_2', [256, 256, 3, 1, 1]),
160
+ ('conv3_3', [256, 256, 3, 1, 1]),
161
+ ('conv3_4', [256, 256, 3, 1, 1]),
162
+ ('pool3_stage1', [2, 2, 0]),
163
+ ('conv4_1', [256, 512, 3, 1, 1]),
164
+ ('conv4_2', [512, 512, 3, 1, 1]),
165
+ ('conv4_3', [512, 512, 3, 1, 1]),
166
+ ('conv4_4', [512, 512, 3, 1, 1]),
167
+ ('conv5_1', [512, 512, 3, 1, 1]),
168
+ ('conv5_2', [512, 512, 3, 1, 1]),
169
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
170
+ ])
171
+
172
+ block1_1 = OrderedDict([
173
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
174
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
175
+ ])
176
+
177
+ blocks = {}
178
+ blocks['block1_0'] = block1_0
179
+ blocks['block1_1'] = block1_1
180
+
181
+ # stage 2-6
182
+ for i in range(2, 7):
183
+ blocks['block%d' % i] = OrderedDict([
184
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
185
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
186
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
187
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
188
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
189
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
190
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
191
+ ])
192
+
193
+ for k in blocks.keys():
194
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
195
+
196
+ self.model1_0 = blocks['block1_0']
197
+ self.model1_1 = blocks['block1_1']
198
+ self.model2 = blocks['block2']
199
+ self.model3 = blocks['block3']
200
+ self.model4 = blocks['block4']
201
+ self.model5 = blocks['block5']
202
+ self.model6 = blocks['block6']
203
+
204
+ def forward(self, x):
205
+ out1_0 = self.model1_0(x)
206
+ out1_1 = self.model1_1(out1_0)
207
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
208
+ out_stage2 = self.model2(concat_stage2)
209
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
210
+ out_stage3 = self.model3(concat_stage3)
211
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
212
+ out_stage4 = self.model4(concat_stage4)
213
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
214
+ out_stage5 = self.model5(concat_stage5)
215
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
216
+ out_stage6 = self.model6(concat_stage6)
217
+ return out_stage6
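As a quick sanity check of the architectures above: three stride-2 poolings give stride-8 feature maps, so a 368x368 input yields 46x46 outputs, with 38 PAF channels and 19 heatmap channels for the body model and 22 channels for the hand model. A shape-only sketch with randomly initialized weights (no pretrained checkpoint is loaded):

```python
# Shape check only; weights are random, so the outputs are not meaningful pose maps.
import torch
from controlnet_aux.open_pose.model import bodypose_model, handpose_model

x = torch.randn(1, 3, 368, 368)

with torch.no_grad():
    pafs, heatmaps = bodypose_model()(x)
    hand_maps = handpose_model()(x)

print(pafs.shape)       # torch.Size([1, 38, 46, 46])
print(heatmaps.shape)   # torch.Size([1, 19, 46, 46])
print(hand_maps.shape)  # torch.Size([1, 22, 46, 46])
```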
controlnet_aux/open_pose/util.py ADDED
@@ -0,0 +1,383 @@
1
+ import math
2
+ import numpy as np
3
+ import cv2
4
+ from typing import List, Tuple, Union
5
+
6
+ from .body import BodyResult, Keypoint
7
+
8
+ eps = 0.01
9
+
10
+
11
+ def smart_resize(x, s):
12
+ Ht, Wt = s
13
+ if x.ndim == 2:
14
+ Ho, Wo = x.shape
15
+ Co = 1
16
+ else:
17
+ Ho, Wo, Co = x.shape
18
+ if Co == 3 or Co == 1:
19
+ k = float(Ht + Wt) / float(Ho + Wo)
20
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
21
+ else:
22
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
23
+
24
+
25
+ def smart_resize_k(x, fx, fy):
26
+ if x.ndim == 2:
27
+ Ho, Wo = x.shape
28
+ Co = 1
29
+ else:
30
+ Ho, Wo, Co = x.shape
31
+ Ht, Wt = Ho * fy, Wo * fx
32
+ if Co == 3 or Co == 1:
33
+ k = float(Ht + Wt) / float(Ho + Wo)
34
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
35
+ else:
36
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
37
+
38
+
39
+ def padRightDownCorner(img, stride, padValue):
40
+ h = img.shape[0]
41
+ w = img.shape[1]
42
+
43
+ pad = 4 * [None]
44
+ pad[0] = 0 # up
45
+ pad[1] = 0 # left
46
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
47
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
48
+
49
+ img_padded = img
50
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
51
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
52
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
53
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
54
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
55
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
56
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
57
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
58
+
59
+ return img_padded, pad
60
+
61
+
62
+ def transfer(model, model_weights):
63
+ transfered_model_weights = {}
64
+ for weights_name in model.state_dict().keys():
65
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
66
+ return transfered_model_weights
67
+
68
+
69
+ def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray:
70
+ """
71
+ Draw keypoints and limbs representing body pose on a given canvas.
72
+
73
+ Args:
74
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
75
+ keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.
76
+
77
+ Returns:
78
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.
79
+
80
+ Note:
81
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
82
+ """
83
+ H, W, C = canvas.shape
84
+ stickwidth = 4
85
+
86
+ limbSeq = [
87
+ [2, 3], [2, 6], [3, 4], [4, 5],
88
+ [6, 7], [7, 8], [2, 9], [9, 10],
89
+ [10, 11], [2, 12], [12, 13], [13, 14],
90
+ [2, 1], [1, 15], [15, 17], [1, 16],
91
+ [16, 18],
92
+ ]
93
+
94
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
95
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
96
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
97
+
98
+ for (k1_index, k2_index), color in zip(limbSeq, colors):
99
+ keypoint1 = keypoints[k1_index - 1]
100
+ keypoint2 = keypoints[k2_index - 1]
101
+
102
+ if keypoint1 is None or keypoint2 is None:
103
+ continue
104
+
105
+ Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
106
+ X = np.array([keypoint1.y, keypoint2.y]) * float(H)
107
+ mX = np.mean(X)
108
+ mY = np.mean(Y)
109
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
110
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
111
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
112
+ cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])
113
+
114
+ for keypoint, color in zip(keypoints, colors):
115
+ if keypoint is None:
116
+ continue
117
+
118
+ x, y = keypoint.x, keypoint.y
119
+ x = int(x * W)
120
+ y = int(y * H)
121
+ cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)
122
+
123
+ return canvas
124
+
125
+
126
+ def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
127
+ import matplotlib
128
+ """
129
+ Draw keypoints and connections representing hand pose on a given canvas.
130
+
131
+ Args:
132
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
133
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
134
+ or None if no keypoints are present.
135
+
136
+ Returns:
137
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.
138
+
139
+ Note:
140
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
141
+ """
142
+ if not keypoints:
143
+ return canvas
144
+
145
+ H, W, C = canvas.shape
146
+
147
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
148
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
149
+
150
+ for ie, (e1, e2) in enumerate(edges):
151
+ k1 = keypoints[e1]
152
+ k2 = keypoints[e2]
153
+ if k1 is None or k2 is None:
154
+ continue
155
+
156
+ x1 = int(k1.x * W)
157
+ y1 = int(k1.y * H)
158
+ x2 = int(k2.x * W)
159
+ y2 = int(k2.y * H)
160
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
161
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
162
+
163
+ for keypoint in keypoints:
164
+ x, y = keypoint.x, keypoint.y
165
+ x = int(x * W)
166
+ y = int(y * H)
167
+ if x > eps and y > eps:
168
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
169
+ return canvas
170
+
171
+
172
+ def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
173
+ """
174
+ Draw keypoints representing face pose on a given canvas.
175
+
176
+ Args:
177
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
178
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
179
+ or None if no keypoints are present.
180
+
181
+ Returns:
182
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.
183
+
184
+ Note:
185
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
186
+ """
187
+ if not keypoints:
188
+ return canvas
189
+
190
+ H, W, C = canvas.shape
191
+ for keypoint in keypoints:
192
+ x, y = keypoint.x, keypoint.y
193
+ x = int(x * W)
194
+ y = int(y * H)
195
+ if x > eps and y > eps:
196
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
197
+ return canvas
198
+
199
+
200
+ # detect hand according to body pose keypoints
201
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
202
+ def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
203
+ """
204
+ Detect hands in the input body pose keypoints and calculate the bounding box for each hand.
205
+
206
+ Args:
207
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
208
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
209
+
210
+ Returns:
211
+ List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
212
+ corner of the bounding box, the width (height) of the bounding box, and
213
+ a boolean flag indicating whether the hand is a left hand (True) or a
214
+ right hand (False).
215
+
216
+ Notes:
217
+ - The width and height of the bounding boxes are equal since the network requires squared input.
218
+ - The minimum bounding box size is 20 pixels.
219
+ """
220
+ ratioWristElbow = 0.33
221
+ detect_result = []
222
+ image_height, image_width = oriImg.shape[0:2]
223
+
224
+ keypoints = body.keypoints
225
+ # right hand: wrist 4, elbow 3, shoulder 2
226
+ # left hand: wrist 7, elbow 6, shoulder 5
227
+ left_shoulder = keypoints[5]
228
+ left_elbow = keypoints[6]
229
+ left_wrist = keypoints[7]
230
+ right_shoulder = keypoints[2]
231
+ right_elbow = keypoints[3]
232
+ right_wrist = keypoints[4]
233
+
234
+ # if any of three not detected
235
+ has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
236
+ has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
237
+ if not (has_left or has_right):
238
+ return []
239
+
240
+ hands = []
241
+ #left hand
242
+ if has_left:
243
+ hands.append([
244
+ left_shoulder.x, left_shoulder.y,
245
+ left_elbow.x, left_elbow.y,
246
+ left_wrist.x, left_wrist.y,
247
+ True
248
+ ])
249
+ # right hand
250
+ if has_right:
251
+ hands.append([
252
+ right_shoulder.x, right_shoulder.y,
253
+ right_elbow.x, right_elbow.y,
254
+ right_wrist.x, right_wrist.y,
255
+ False
256
+ ])
257
+
258
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
259
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
260
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
261
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
262
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
263
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
264
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
265
+ x = x3 + ratioWristElbow * (x3 - x2)
266
+ y = y3 + ratioWristElbow * (y3 - y2)
267
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
268
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
269
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
270
+ # x-y refers to the center --> offset to topLeft point
271
+ # handRectangle.x -= handRectangle.width / 2.f;
272
+ # handRectangle.y -= handRectangle.height / 2.f;
273
+ x -= width / 2
274
+ y -= width / 2 # width = height
275
+ # overflow the image
276
+ if x < 0: x = 0
277
+ if y < 0: y = 0
278
+ width1 = width
279
+ width2 = width
280
+ if x + width > image_width: width1 = image_width - x
281
+ if y + width > image_height: width2 = image_height - y
282
+ width = min(width1, width2)
283
+ # the max hand box value is 20 pixels
284
+ if width >= 20:
285
+ detect_result.append((int(x), int(y), int(width), is_left))
286
+
287
+ '''
288
+ return value: [[x, y, w, True if left hand else False]].
289
+ width=height since the network require squared input.
290
+ x, y is the coordinate of top left
291
+ '''
292
+ return detect_result
293
+
294
+
295
+ # Written by Lvmin
296
+ def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
297
+ """
298
+ Detect the face in the input body pose keypoints and calculate the bounding box for the face.
299
+
300
+ Args:
301
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
302
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
303
+
304
+ Returns:
305
+ Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
306
+ bounding box and the width (height) of the bounding box, or None if the
307
+ face is not detected or the bounding box width is less than 20 pixels.
308
+
309
+ Notes:
310
+ - The width and height of the bounding box are equal.
311
+ - The minimum bounding box size is 20 pixels.
312
+ """
313
+ # left right eye ear 14 15 16 17
314
+ image_height, image_width = oriImg.shape[0:2]
315
+
316
+ keypoints = body.keypoints
317
+ head = keypoints[0]
318
+ left_eye = keypoints[14]
319
+ right_eye = keypoints[15]
320
+ left_ear = keypoints[16]
321
+ right_ear = keypoints[17]
322
+
323
+ if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
324
+ return None
325
+
326
+ width = 0.0
327
+ x0, y0 = head.x, head.y
328
+
329
+ if left_eye is not None:
330
+ x1, y1 = left_eye.x, left_eye.y
331
+ d = max(abs(x0 - x1), abs(y0 - y1))
332
+ width = max(width, d * 3.0)
333
+
334
+ if right_eye is not None:
335
+ x1, y1 = right_eye.x, right_eye.y
336
+ d = max(abs(x0 - x1), abs(y0 - y1))
337
+ width = max(width, d * 3.0)
338
+
339
+ if left_ear is not None:
340
+ x1, y1 = left_ear.x, left_ear.y
341
+ d = max(abs(x0 - x1), abs(y0 - y1))
342
+ width = max(width, d * 1.5)
343
+
344
+ if right_ear is not None:
345
+ x1, y1 = right_ear.x, right_ear.y
346
+ d = max(abs(x0 - x1), abs(y0 - y1))
347
+ width = max(width, d * 1.5)
348
+
349
+ x, y = x0, y0
350
+
351
+ x -= width
352
+ y -= width
353
+
354
+ if x < 0:
355
+ x = 0
356
+
357
+ if y < 0:
358
+ y = 0
359
+
360
+ width1 = width * 2
361
+ width2 = width * 2
362
+
363
+ if x + width > image_width:
364
+ width1 = image_width - x
365
+
366
+ if y + width > image_height:
367
+ width2 = image_height - y
368
+
369
+ width = min(width1, width2)
370
+
371
+ if width >= 20:
372
+ return int(x), int(y), int(width)
373
+ else:
374
+ return None
375
+
376
+
377
+ # get max index of 2d array
378
+ def npmax(array):
379
+ arrayindex = array.argmax(1)
380
+ arrayvalue = array.max(1)
381
+ i = arrayvalue.argmax()
382
+ j = arrayindex[i]
383
+ return i, j
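The two detectors above return square boxes in pixel coordinates: handDetect yields (x, y, w, is_left) tuples and faceDetect yields (x, y, w) or None, with w doubling as the box height. Below is a minimal sketch of how such boxes can be consumed; the stand-in Keypoint/Body namedtuples and all coordinate values are illustrative assumptions (in the app the real BodyResult comes from the body estimator in open_pose/body.py), and the import assumes the repository root is on sys.path with the dependencies installed.

# Sketch only: duck-typed inputs for handDetect/faceDetect (values are made up).
from collections import namedtuple
import numpy as np
from controlnet_aux.open_pose.util import handDetect, faceDetect

Keypoint = namedtuple("Keypoint", ["x", "y"])   # stand-in: only .x and .y are read
Body = namedtuple("Body", ["keypoints"])        # stand-in for BodyResult

oriImg = np.zeros((480, 640, 3), dtype=np.uint8)             # dummy 640x480 image

kpts = [None] * 18                                           # 18 COCO-style keypoints
kpts[0] = Keypoint(320, 100)                                 # nose / head
kpts[14], kpts[15] = Keypoint(310, 90), Keypoint(330, 90)    # eyes
kpts[2], kpts[3], kpts[4] = Keypoint(380, 160), Keypoint(400, 240), Keypoint(410, 320)  # right shoulder/elbow/wrist
body = Body(keypoints=kpts)

for x, y, w, is_left in handDetect(body, oriImg):            # square hand boxes
    hand_patch = oriImg[y:y + w, x:x + w]                    # crop fed to the hand estimator

face = faceDetect(body, oriImg)
if face is not None:
    x, y, w = face
    face_patch = oriImg[y:y + w, x:x + w]                    # square face crop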
controlnet_aux/util.py ADDED
@@ -0,0 +1,146 @@
+ import os
+ import random
+
+ import cv2
+ import numpy as np
+ import torch
+
+ annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
+
+
+ def HWC3(x):
+     assert x.dtype == np.uint8
+     if x.ndim == 2:
+         x = x[:, :, None]
+     assert x.ndim == 3
+     H, W, C = x.shape
+     assert C == 1 or C == 3 or C == 4
+     if C == 3:
+         return x
+     if C == 1:
+         return np.concatenate([x, x, x], axis=2)
+     if C == 4:
+         color = x[:, :, 0:3].astype(np.float32)
+         alpha = x[:, :, 3:4].astype(np.float32) / 255.0
+         y = color * alpha + 255.0 * (1.0 - alpha)
+         y = y.clip(0, 255).astype(np.uint8)
+         return y
+
+
+ def make_noise_disk(H, W, C, F):
+     noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
+     noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
+     noise = noise[F: F + H, F: F + W]
+     noise -= np.min(noise)
+     noise /= np.max(noise)
+     if C == 1:
+         noise = noise[:, :, None]
+     return noise
+
+
+ def nms(x, t, s):
+     x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
+
+     f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
+     f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
+     f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
+     f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
+
+     y = np.zeros_like(x)
+
+     for f in [f1, f2, f3, f4]:
+         np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
+
+     z = np.zeros_like(y, dtype=np.uint8)
+     z[y > t] = 255
+     return z
+
+ def min_max_norm(x):
+     x -= np.min(x)
+     x /= np.maximum(np.max(x), 1e-5)
+     return x
+
+
+ def safe_step(x, step=2):
+     y = x.astype(np.float32) * float(step + 1)
+     y = y.astype(np.int32).astype(np.float32) / float(step)
+     return y
+
+
+ def img2mask(img, H, W, low=10, high=90):
+     assert img.ndim == 3 or img.ndim == 2
+     assert img.dtype == np.uint8
+
+     if img.ndim == 3:
+         y = img[:, :, random.randrange(0, img.shape[2])]
+     else:
+         y = img
+
+     y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
+
+     if random.uniform(0, 1) < 0.5:
+         y = 255 - y
+
+     return y < np.percentile(y, random.randrange(low, high))
+
+
+ def resize_image(input_image, resolution):
+     H, W, C = input_image.shape
+     H = float(H)
+     W = float(W)
+     k = float(resolution) / min(H, W)
+     H *= k
+     W *= k
+     H = int(np.round(H / 64.0)) * 64
+     W = int(np.round(W / 64.0)) * 64
+     img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
+     return img
+
+
+ def torch_gc():
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         torch.cuda.ipc_collect()
+
+
+ def ade_palette():
+     """ADE20K palette that maps each class to RGB values."""
+     return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+             [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+             [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+             [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+             [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+             [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+             [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+             [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+             [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+             [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+             [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+             [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+             [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+             [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+             [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+             [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+             [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+             [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+             [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+             [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+             [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+             [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+             [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+             [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+             [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+             [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+             [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+             [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+             [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+             [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+             [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+             [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+             [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+             [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+             [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+             [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+             [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+             [102, 255, 0], [92, 0, 255]]
+
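HWC3 and resize_image in this file are typically used together to prepare an input image for an annotator: HWC3 normalizes any uint8 grayscale/RGB/RGBA array to 3-channel RGB (compositing alpha over white), and resize_image scales the short side to the requested resolution while rounding both dimensions to multiples of 64. A minimal sketch follows; the array shape and the 512 target are illustrative values only, and the import assumes the repository root is on sys.path with the dependencies installed.

# Sketch only: typical HWC3 -> resize_image pre-processing on a dummy image.
import numpy as np
from controlnet_aux.util import HWC3, resize_image

gray = np.random.randint(0, 256, size=(600, 400), dtype=np.uint8)  # single-channel input
rgb = HWC3(gray)                  # (600, 400) -> (600, 400, 3), still uint8
resized = resize_image(rgb, 512)  # short side ~512, both dims rounded to multiples of 64
print(resized.shape)              # (768, 512, 3) for this input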
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ git+https://github.com/huggingface/diffusers.git
+ torch==2.0.1
+ torchvision==0.15.2
+ transformers==4.43.3
+ einops
+ onnxruntime-gpu
+ spaces
+ accelerate
+ omegaconf
+ huggingface-hub
+ opencv-python
+ gradio
+ xformers
+ sentencepiece
+ peft
+ scipy
+ scikit-image