Spaces: Running on Zero
Upload 40 files
- ControlNetUnion-space/.DS_Store +0 -0
- ControlNetUnion-space/app.py +264 -0
- ControlNetUnion-space/controlnet_aux/.DS_Store +0 -0
- ControlNetUnion-space/controlnet_aux/__init__.py +5 -0
- ControlNetUnion-space/controlnet_aux/canny/__init__.py +36 -0
- ControlNetUnion-space/controlnet_aux/open_pose/LICENSE +108 -0
- ControlNetUnion-space/controlnet_aux/open_pose/__init__.py +234 -0
- ControlNetUnion-space/controlnet_aux/open_pose/body.py +260 -0
- ControlNetUnion-space/controlnet_aux/open_pose/face.py +364 -0
- ControlNetUnion-space/controlnet_aux/open_pose/hand.py +90 -0
- ControlNetUnion-space/controlnet_aux/open_pose/model.py +217 -0
- ControlNetUnion-space/controlnet_aux/open_pose/util.py +383 -0
- ControlNetUnion-space/controlnet_aux/util.py +146 -0
- ControlNetUnion-space/depth_anything_v2/.DS_Store +0 -0
- ControlNetUnion-space/depth_anything_v2/dinov2.py +415 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/__init__.py +11 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/attention.py +83 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/block.py +252 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/mlp.py +41 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/patch_embed.py +89 -0
- ControlNetUnion-space/depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
- ControlNetUnion-space/depth_anything_v2/dpt.py +221 -0
- ControlNetUnion-space/depth_anything_v2/util/blocks.py +148 -0
- ControlNetUnion-space/depth_anything_v2/util/transform.py +158 -0
- ControlNetUnion-space/requirements.txt +17 -0
- app.py +264 -0
- controlnet_aux/.DS_Store +0 -0
- controlnet_aux/__init__.py +5 -0
- controlnet_aux/canny/__init__.py +36 -0
- controlnet_aux/open_pose/LICENSE +108 -0
- controlnet_aux/open_pose/__init__.py +234 -0
- controlnet_aux/open_pose/body.py +260 -0
- controlnet_aux/open_pose/face.py +364 -0
- controlnet_aux/open_pose/hand.py +90 -0
- controlnet_aux/open_pose/model.py +217 -0
- controlnet_aux/open_pose/util.py +383 -0
- controlnet_aux/util.py +146 -0
- requirements.txt +17 -0
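The commit uploads the same Space twice: once under ControlNetUnion-space/ and once at the repository root, and Spaces executes the root-level app.py as the entry point. A minimal local bootstrap of the uploaded files might look like the sketch below (the token value and the use of subprocess are illustrative assumptions; on Spaces the ZeroGPU runtime handles GPU allocation via the @spaces.GPU decorator in app.py):

# Hypothetical local bootstrap; install the pinned dependencies first:
#   pip install -r requirements.txt
import os
import subprocess
import sys

os.environ["HF_TOKEN_GATED"] = "hf_..."  # token with access to the gated black-forest-labs/FLUX.1-dev weights (placeholder)
subprocess.run([sys.executable, "app.py"], check=True)  # runs the Gradio demo defined in app.py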
ControlNetUnion-space/.DS_Store
ADDED
Binary file (6.15 kB)
ControlNetUnion-space/app.py
ADDED
@@ -0,0 +1,264 @@
import sys
sys.path.append('./')

import gradio as gr
import spaces
import os
import sys
import subprocess
import numpy as np
from PIL import Image
import cv2
import torch
import random

os.system("pip install -e ./controlnet_aux")

from controlnet_aux import OpenposeDetector, CannyDetector
from depth_anything_v2.dpt import DepthAnythingV2

from huggingface_hub import hf_hub_download

from huggingface_hub import login
hf_token = os.environ.get("HF_TOKEN_GATED")
login(token=hf_token)

MAX_SEED = np.iinfo(np.int32).max

def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-Large", filename=f"depth_anything_v2_vitl.pth", repo_type="model")
state_dict = torch.load(filepath, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()

import torch
from diffusers.utils import load_image
from diffusers import FluxControlNetPipeline, FluxControlNetModel
from diffusers.models import FluxMultiControlNetModel

base_model = 'black-forest-labs/FLUX.1-dev'
controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'
controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
controlnet = FluxMultiControlNetModel([controlnet])
pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
pipe.to("cuda")

mode_mapping = {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "openpose": 4, "gray": 5, "low quality": 6}
strength_mapping = {"canny": 0.65, "tile": 0.45, "depth": 0.55, "blur": 0.45, "openpose": 0.55, "gray": 0.45, "low quality": 0.4}

canny = CannyDetector()
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")

torch.backends.cuda.matmul.allow_tf32 = True
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()
pipe.enable_model_cpu_offload()  # for saving memory

def convert_from_image_to_cv2(img: Image) -> np.ndarray:
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

def convert_from_cv2_to_image(img: np.ndarray) -> Image:
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

def extract_depth(image):
    image = np.asarray(image)
    depth = model.infer_image(image[:, :, ::-1])
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.astype(np.uint8)
    gray_depth = Image.fromarray(depth).convert('RGB')
    return gray_depth

def extract_openpose(img):
    processed_image_open_pose = open_pose(img, hand_and_face=True)
    return processed_image_open_pose

def extract_canny(image):
    processed_image_canny = canny(image)
    return processed_image_canny

def apply_gaussian_blur(image, kernel_size=(21, 21)):
    image = convert_from_image_to_cv2(image)
    blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
    return blurred_image

def convert_to_grayscale(image):
    image = convert_from_image_to_cv2(image)
    gray_image = convert_from_cv2_to_image(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
    return gray_image

def add_gaussian_noise(image, mean=0, sigma=10):
    image = convert_from_image_to_cv2(image)
    noise = np.random.normal(mean, sigma, image.shape)
    noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
    return noisy_image

def tile(input_image, resolution=768):
    input_image = convert_from_image_to_cv2(input_image)
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    img = convert_from_cv2_to_image(img)
    return img

def resize_img(input_image, max_side=768, min_side=512, size=None,
               pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):

    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[offset_y:offset_y + h_resize_new, offset_x:offset_x + w_resize_new] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image

@spaces.GPU(duration=180)
def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed, progress=gr.Progress(track_tqdm=True)):

    control_mode_num = mode_mapping[control_mode]

    if cond_in is None:
        if image_in is not None:
            image_in = resize_img(load_image(image_in))
            if control_mode == "canny":
                control_image = extract_canny(image_in)
            elif control_mode == "depth":
                control_image = extract_depth(image_in)
            elif control_mode == "openpose":
                control_image = extract_openpose(image_in)
            elif control_mode == "blur":
                control_image = apply_gaussian_blur(image_in)
            elif control_mode == "low quality":
                control_image = add_gaussian_noise(image_in)
            elif control_mode == "gray":
                control_image = convert_to_grayscale(image_in)
            elif control_mode == "tile":
                control_image = tile(image_in)
    else:
        control_image = resize_img(load_image(cond_in))

    width, height = control_image.size

    image = pipe(
        prompt,
        control_image=[control_image],
        control_mode=[control_mode_num],
        width=width,
        height=height,
        controlnet_conditioning_scale=[control_strength],
        num_inference_steps=inference_steps,
        guidance_scale=guidance_scale,
        generator=torch.manual_seed(seed),
    ).images[0]

    torch.cuda.empty_cache()

    return image, control_image  # two values, matching outputs=[result, processed_cond] in the click handler below


css = """
#col-container{
    margin: 0 auto;
    max-width: 1080px;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # FLUX.1-dev-ControlNet-Union-Pro
        A unified ControlNet for the FLUX.1-dev model from the InstantX team and Shakker Labs. Model card: [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro). <br />
        Recommended strengths: {"canny": 0.65, "tile": 0.45, "depth": 0.55, "blur": 0.45, "openpose": 0.55, "gray": 0.45, "low quality": 0.4}. Long prompts are preferred by FLUX.1.
        """)

        with gr.Column():

            with gr.Row():
                with gr.Column():

                    with gr.Row(equal_height=True):
                        cond_in = gr.Image(label="Upload a processed control image", sources=["upload"], type="filepath")
                        image_in = gr.Image(label="Extract condition from a reference image (Optional)", sources=["upload"], type="filepath")

                    prompt = gr.Textbox(label="Prompt", value="best quality")

                    with gr.Accordion("Controlnet"):
                        control_mode = gr.Radio(
                            ["canny", "depth", "openpose", "gray", "blur", "tile", "low quality"], label="Mode", value="gray",
                            info="Select the control mode; one unified model handles all modes"
                        )

                        control_strength = gr.Slider(
                            label="Control strength",
                            minimum=0,
                            maximum=1.0,
                            step=0.05,
                            value=0.50,
                        )

                        seed = gr.Slider(
                            label="Seed",
                            minimum=0,
                            maximum=MAX_SEED,
                            step=1,
                            value=42,
                        )
                        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

                    with gr.Accordion("Advanced settings", open=False):
                        with gr.Column():
                            with gr.Row():
                                inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=50, step=1, value=24)
                                guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)

                    submit_btn = gr.Button("Submit")

                with gr.Column():
                    result = gr.Image(label="Result")
                    processed_cond = gr.Image(label="Preprocessed Cond")

        submit_btn.click(
            fn=randomize_seed_fn,
            inputs=[seed, randomize_seed],
            outputs=seed,
            queue=False,
            api_name=False
        ).then(
            fn=infer,
            inputs=[cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
            outputs=[result, processed_cond],
            show_api=False
        )

demo.queue(api_open=False)
demo.launch()
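Outside Gradio, the generation step in infer reduces to a single pipeline call. The sketch below assumes the `pipe` built above is already loaded and that `pose.png` is an already-preprocessed Openpose map (both placeholders for illustration); mode indices follow `mode_mapping` and strengths follow `strength_mapping`.

from diffusers.utils import load_image
import torch

control_image = load_image("pose.png")       # hypothetical preprocessed control map
width, height = control_image.size
image = pipe(
    "a dancer on a rooftop at golden hour, detailed lighting, best quality",
    control_image=[control_image],
    control_mode=[4],                         # 4 = openpose in mode_mapping
    width=width,
    height=height,
    controlnet_conditioning_scale=[0.55],     # recommended openpose strength
    num_inference_steps=24,
    guidance_scale=3.5,
    generator=torch.manual_seed(42),
).images[0]
image.save("result.png")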
ControlNetUnion-space/controlnet_aux/.DS_Store
ADDED
Binary file (6.15 kB)
ControlNetUnion-space/controlnet_aux/__init__.py
ADDED
@@ -0,0 +1,5 @@
__version__ = "0.0.9"

from .canny import CannyDetector
from .open_pose import OpenposeDetector
ControlNetUnion-space/controlnet_aux/canny/__init__.py
ADDED
@@ -0,0 +1,36 @@
import warnings
import cv2
import numpy as np
from PIL import Image
from ..util import HWC3, resize_image

class CannyDetector:
    def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, image_resolution=512, output_type=None, **kwargs):
        if "img" in kwargs:
            warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
            input_image = kwargs.pop("img")

        if input_image is None:
            raise ValueError("input_image must be defined.")

        if not isinstance(input_image, np.ndarray):
            input_image = np.array(input_image, dtype=np.uint8)
            output_type = output_type or "pil"
        else:
            output_type = output_type or "np"

        input_image = HWC3(input_image)
        input_image = resize_image(input_image, detect_resolution)

        detected_map = cv2.Canny(input_image, low_threshold, high_threshold)
        detected_map = HWC3(detected_map)

        img = resize_image(input_image, image_resolution)
        H, W, C = img.shape

        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)

        return detected_map
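A quick usage sketch for the detector above; the input filename and thresholds are placeholders. With a PIL input the detector returns a PIL edge map sized to `image_resolution`.

from PIL import Image
from controlnet_aux import CannyDetector

canny = CannyDetector()
edge_map = canny(
    Image.open("input.jpg"),      # hypothetical test image
    low_threshold=100,
    high_threshold=200,
    detect_resolution=512,        # resolution used for edge detection
    image_resolution=512,         # resolution of the returned map
)
edge_map.save("canny.png")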
ControlNetUnion-space/controlnet_aux/open_pose/LICENSE
ADDED
@@ -0,0 +1,108 @@
OPENPOSE: MULTIPERSON KEYPOINT DETECTION
SOFTWARE LICENSE AGREEMENT
ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY

BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.

This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.

RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).

CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.

COPYRIGHT: The Software is owned by Licensor and is protected by United
States copyright laws and applicable international treaties and/or conventions.

PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.

DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.

BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.

USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark "OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.

You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.

ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.

TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.

The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.

FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.

DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.

SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.

EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.

EXPORT REGULATION: Licensee agrees to comply with any and all applicable
U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.

SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.

NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.

GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.

ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.


************************************************************************

THIRD-PARTY SOFTWARE NOTICES AND INFORMATION

This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.

1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)

COPYRIGHT

All contributions by the University of California:
Copyright (c) 2014-2017 The Regents of the University of California (Regents)
All rights reserved.

All other contributions:
Copyright (c) 2014-2017, the respective contributors
All rights reserved.

Caffe uses a shared copyright model: each contributor holds copyright over
their contributions to Caffe. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.

LICENSE

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTION AGREEMENT

By contributing to the BVLC/caffe repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.

************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
ControlNetUnion-space/controlnet_aux/open_pose/__init__.py
ADDED
@@ -0,0 +1,234 @@
# Openpose
# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
# 3rd Edited by ControlNet
# 4th Edited by ControlNet (added face and correct hands)
# 5th Edited by ControlNet (improved JSON serialization/deserialization, and lots of bug fixes)
# This preprocessor is licensed by CMU for non-commercial use only.


import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import json
import warnings
from typing import Callable, List, NamedTuple, Tuple, Union

import cv2
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from ..util import HWC3, resize_image
from . import util
from .body import Body, BodyResult, Keypoint
from .face import Face
from .hand import Hand

HandResult = List[Keypoint]
FaceResult = List[Keypoint]

class PoseResult(NamedTuple):
    body: BodyResult
    left_hand: Union[HandResult, None]
    right_hand: Union[HandResult, None]
    face: Union[FaceResult, None]

def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
    """
    Draw the detected poses on an empty canvas.

    Args:
        poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
        H (int): The height of the canvas.
        W (int): The width of the canvas.
        draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
        draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
        draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.

    Returns:
        numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
    """
    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)

    for pose in poses:
        if draw_body:
            canvas = util.draw_bodypose(canvas, pose.body.keypoints)

        if draw_hand:
            canvas = util.draw_handpose(canvas, pose.left_hand)
            canvas = util.draw_handpose(canvas, pose.right_hand)

        if draw_face:
            canvas = util.draw_facepose(canvas, pose.face)

    return canvas


class OpenposeDetector:
    """
    A class for detecting human poses in images using the Openpose model.

    Attributes:
        model_dir (str): Path to the directory where the pose models are stored.
    """
    def __init__(self, body_estimation, hand_estimation=None, face_estimation=None):
        self.body_estimation = body_estimation
        self.hand_estimation = hand_estimation
        self.face_estimation = face_estimation

    @classmethod
    def from_pretrained(cls, pretrained_model_or_path, filename=None, hand_filename=None, face_filename=None, cache_dir=None, local_files_only=False):

        if pretrained_model_or_path == "lllyasviel/ControlNet":
            filename = filename or "annotator/ckpts/body_pose_model.pth"
            hand_filename = hand_filename or "annotator/ckpts/hand_pose_model.pth"
            face_filename = face_filename or "facenet.pth"

            face_pretrained_model_or_path = "lllyasviel/Annotators"
        else:
            filename = filename or "body_pose_model.pth"
            hand_filename = hand_filename or "hand_pose_model.pth"
            face_filename = face_filename or "facenet.pth"

            face_pretrained_model_or_path = pretrained_model_or_path

        if os.path.isdir(pretrained_model_or_path):
            body_model_path = os.path.join(pretrained_model_or_path, filename)
            hand_model_path = os.path.join(pretrained_model_or_path, hand_filename)
            face_model_path = os.path.join(face_pretrained_model_or_path, face_filename)
        else:
            body_model_path = hf_hub_download(pretrained_model_or_path, filename, cache_dir=cache_dir, local_files_only=local_files_only)
            hand_model_path = hf_hub_download(pretrained_model_or_path, hand_filename, cache_dir=cache_dir, local_files_only=local_files_only)
            face_model_path = hf_hub_download(face_pretrained_model_or_path, face_filename, cache_dir=cache_dir, local_files_only=local_files_only)

        body_estimation = Body(body_model_path)
        hand_estimation = Hand(hand_model_path)
        face_estimation = Face(face_model_path)

        return cls(body_estimation, hand_estimation, face_estimation)

    def to(self, device):
        self.body_estimation.to(device)
        self.hand_estimation.to(device)
        self.face_estimation.to(device)
        return self

    def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
        left_hand = None
        right_hand = None
        H, W, _ = oriImg.shape
        for x, y, w, is_left in util.handDetect(body, oriImg):
            peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
            if peaks.ndim == 2 and peaks.shape[1] == 2:
                peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
                peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)

                hand_result = [
                    Keypoint(x=peak[0], y=peak[1])
                    for peak in peaks
                ]

                if is_left:
                    left_hand = hand_result
                else:
                    right_hand = hand_result

        return left_hand, right_hand

    def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
        face = util.faceDetect(body, oriImg)
        if face is None:
            return None

        x, y, w = face
        H, W, _ = oriImg.shape
        heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
        peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
        if peaks.ndim == 2 and peaks.shape[1] == 2:
            peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
            peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
            return [
                Keypoint(x=peak[0], y=peak[1])
                for peak in peaks
            ]

        return None

    def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
        """
        Detect poses in the given image.
        Args:
            oriImg (numpy.ndarray): The input image for pose detection.
            include_hand (bool, optional): Whether to include hand detection. Defaults to False.
            include_face (bool, optional): Whether to include face detection. Defaults to False.

        Returns:
            List[PoseResult]: A list of PoseResult objects containing the detected poses.
        """
        oriImg = oriImg[:, :, ::-1].copy()
        H, W, C = oriImg.shape
        with torch.no_grad():
            candidate, subset = self.body_estimation(oriImg)
            bodies = self.body_estimation.format_body_result(candidate, subset)

            results = []
            for body in bodies:
                left_hand, right_hand, face = (None,) * 3
                if include_hand:
                    left_hand, right_hand = self.detect_hands(body, oriImg)
                if include_face:
                    face = self.detect_face(body, oriImg)

                results.append(PoseResult(BodyResult(
                    keypoints=[
                        Keypoint(
                            x=keypoint.x / float(W),
                            y=keypoint.y / float(H)
                        ) if keypoint is not None else None
                        for keypoint in body.keypoints
                    ],
                    total_score=body.total_score,
                    total_parts=body.total_parts
                ), left_hand, right_hand, face))

            return results

    def __call__(self, input_image, detect_resolution=512, image_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", **kwargs):
        if hand_and_face is not None:
            warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
            include_hand = hand_and_face
            include_face = hand_and_face

        if "return_pil" in kwargs:
            warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
            output_type = "pil" if kwargs["return_pil"] else "np"
        if type(output_type) is bool:
            warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
            if output_type:
                output_type = "pil"

        if not isinstance(input_image, np.ndarray):
            input_image = np.array(input_image, dtype=np.uint8)

        input_image = HWC3(input_image)
        input_image = resize_image(input_image, detect_resolution)
        H, W, C = input_image.shape

        poses = self.detect_poses(input_image, include_hand, include_face)
        canvas = draw_poses(poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face)

        detected_map = canvas
        detected_map = HWC3(detected_map)

        img = resize_image(input_image, image_resolution)
        H, W, C = img.shape

        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)

        return detected_map
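This is the detector app.py drives via extract_openpose; the sketch below shows the same call in isolation (the input filename is a placeholder). from_pretrained("lllyasviel/Annotators") downloads body_pose_model.pth, hand_pose_model.pth and facenet.pth through hf_hub_download.

from PIL import Image
from controlnet_aux import OpenposeDetector

open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
pose_map = open_pose(
    Image.open("person.jpg"),     # hypothetical reference photo
    include_body=True,
    include_hand=True,
    include_face=True,
    detect_resolution=512,
    image_resolution=512,
    output_type="pil",
)
pose_map.save("openpose.png")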
ControlNetUnion-space/controlnet_aux/open_pose/body.py
ADDED
@@ -0,0 +1,260 @@
import math
from typing import List, NamedTuple, Union

import cv2
import numpy as np
import torch
from scipy.ndimage.filters import gaussian_filter

from . import util
from .model import bodypose_model


class Keypoint(NamedTuple):
    x: float
    y: float
    score: float = 1.0
    id: int = -1


class BodyResult(NamedTuple):
    # Note: Using `Union` instead of the `|` operator, as the latter is a Python
    # 3.10 feature.
    # Annotator code should be Python 3.8 compatible, as the controlnet repo uses
    # a Python 3.8 environment.
    # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
    keypoints: List[Union[Keypoint, None]]
    total_score: float
    total_parts: int


class Body(object):
    def __init__(self, model_path):
        self.model = bodypose_model()
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def to(self, device):
        self.model.to(device)
        return self

    def __call__(self, oriImg):
        device = next(iter(self.model.parameters())).device
        # scale_search = [0.5, 1.0, 1.5, 2.0]
        scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre1 = 0.1
        thre2 = 0.05
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            data = data.to(device)
            # data = data.permute([2, 0, 1]).unsqueeze(0).float()
            with torch.no_grad():
                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
            Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
            Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()

            # extract outputs, resize, and remove padding
            # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0))  # output 1 is heatmaps
            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0))  # output 1 is heatmaps
            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))

            # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0))  # output 0 is PAFs
            paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0))  # output 0 is PAFs
            paf = util.smart_resize_k(paf, fx=stride, fy=stride)
            paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))

            heatmap_avg += heatmap_avg + heatmap / len(multiplier)
            paf_avg += + paf / len(multiplier)

        all_peaks = []
        peak_counter = 0

        for part in range(18):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)

            map_left = np.zeros(one_heatmap.shape)
            map_left[1:, :] = one_heatmap[:-1, :]
            map_right = np.zeros(one_heatmap.shape)
            map_right[:-1, :] = one_heatmap[1:, :]
            map_up = np.zeros(one_heatmap.shape)
            map_up[:, 1:] = one_heatmap[:, :-1]
            map_down = np.zeros(one_heatmap.shape)
            map_down[:, :-1] = one_heatmap[:, 1:]

            peaks_binary = np.logical_and.reduce(
                (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
            peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0]))  # note reverse
            peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
            peak_id = range(peak_counter, peak_counter + len(peaks))
            peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]

            all_peaks.append(peaks_with_score_and_id)
            peak_counter += len(peaks)

        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
                   [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
                   [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
                  [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
                  [55, 56], [37, 38], [45, 46]]

        connection_all = []
        special_k = []
        mid_num = 10

        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            indexA, indexB = limbSeq[k]
            if (nA != 0 and nB != 0):
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)
                        vec = np.divide(vec, norm)

                        startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
                                            np.linspace(candA[i][1], candB[j][1], num=mid_num)))

                        vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
                                          for I in range(len(startend))])
                        vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
                                          for I in range(len(startend))])

                        score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                            0.5 * oriImg.shape[0] / norm - 1, 0)
                        criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append(
                                [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])

                connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if (i not in connection[:, 3] and j not in connection[:, 4]):
                        connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
                        if (len(connection) >= min(nA, nB)):
                            break

                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])

        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array([item for sublist in all_peaks for item in sublist])

        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1

                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):  # 1:size(subset,1):
                        if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1

                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # as like found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]

                    # if no partA is found in the subset, create a new subset
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])
        # delete rows of subset that have too few parts
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)

        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        return candidate, subset

    @staticmethod
    def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
        """
        Format the body results from the candidate and subset arrays into a list of BodyResult objects.

        Args:
            candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
                for each body part.
            subset (np.ndarray): An array of subsets containing indices to the candidate array for each
                person detected. The last two columns of each row hold the total score and total parts
                of the person.

        Returns:
            List[BodyResult]: A list of BodyResult objects, where each object represents a person with
                detected keypoints, total score, and total parts.
        """
        return [
            BodyResult(
                keypoints=[
                    Keypoint(
                        x=candidate[candidate_index][0],
                        y=candidate[candidate_index][1],
                        score=candidate[candidate_index][2],
                        id=candidate[candidate_index][3]
                    ) if candidate_index != -1 else None
                    for candidate_index in person[:18].astype(int)
                ],
                total_score=person[18],
                total_parts=person[19]
            )
            for person in subset
        ]
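A small sketch of how the estimator above is consumed (OpenposeDetector.detect_poses does the same internally). The weight path and image path are placeholders; candidate holds (x, y, score, id) rows and each subset row indexes 18 keypoints plus the person's total score and part count, as the comments above describe.

import cv2
from controlnet_aux.open_pose.body import Body

body_estimation = Body("body_pose_model.pth")   # path to downloaded weights (placeholder)
oriImg = cv2.imread("person.jpg")               # BGR image, as expected by __call__
candidate, subset = body_estimation(oriImg)
for person in Body.format_body_result(candidate, subset):
    detected = [kp for kp in person.keypoints if kp is not None]
    print(f"{len(detected)} keypoints, total score {person.total_score:.2f}")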
ControlNetUnion-space/controlnet_aux/open_pose/face.py
ADDED
@@ -0,0 +1,364 @@
import logging

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn import Conv2d, MaxPool2d, Module, ReLU, init
from torchvision.transforms import ToPILImage, ToTensor

from . import util


class FaceNet(Module):
    """Model the cascading heatmaps. """
    def __init__(self):
        super(FaceNet, self).__init__()
        # cnn to make feature map
        self.relu = ReLU()
        self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
        self.conv1_1 = Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv1_2 = Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2_1 = Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv2_2 = Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3_1 = Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_2 = Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_3 = Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_4 = Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv4_1 = Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_2 = Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_3 = Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_4 = Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_1 = Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_2 = Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_3_CPM = Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1)

        # stage1
        self.conv6_1_CPM = Conv2d(in_channels=128, out_channels=512, kernel_size=1, stride=1, padding=0)
        self.conv6_2_CPM = Conv2d(in_channels=512, out_channels=71, kernel_size=1, stride=1, padding=0)

        # stage2
        self.Mconv1_stage2 = Conv2d(in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv2_stage2 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv3_stage2 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv4_stage2 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv5_stage2 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv6_stage2 = Conv2d(in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0)
        self.Mconv7_stage2 = Conv2d(in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0)

        # stage3
        self.Mconv1_stage3 = Conv2d(in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv2_stage3 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv3_stage3 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv4_stage3 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv5_stage3 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv6_stage3 = Conv2d(in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0)
        self.Mconv7_stage3 = Conv2d(in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0)

        # stage4
        self.Mconv1_stage4 = Conv2d(in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv2_stage4 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv3_stage4 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv4_stage4 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv5_stage4 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv6_stage4 = Conv2d(in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0)
        self.Mconv7_stage4 = Conv2d(in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0)

        # stage5
        self.Mconv1_stage5 = Conv2d(in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv2_stage5 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv3_stage5 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv4_stage5 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv5_stage5 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv6_stage5 = Conv2d(in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0)
        self.Mconv7_stage5 = Conv2d(in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0)

        # stage6
        self.Mconv1_stage6 = Conv2d(in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv2_stage6 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv3_stage6 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv4_stage6 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv5_stage6 = Conv2d(in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.Mconv6_stage6 = Conv2d(in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0)
        self.Mconv7_stage6 = Conv2d(in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0)

        for m in self.modules():
            if isinstance(m, Conv2d):
                init.constant_(m.bias, 0)

    def forward(self, x):
        """Return a list of heatmaps."""
|
193 |
+
heatmaps = []
|
194 |
+
|
195 |
+
h = self.relu(self.conv1_1(x))
|
196 |
+
h = self.relu(self.conv1_2(h))
|
197 |
+
h = self.max_pooling_2d(h)
|
198 |
+
h = self.relu(self.conv2_1(h))
|
199 |
+
h = self.relu(self.conv2_2(h))
|
200 |
+
h = self.max_pooling_2d(h)
|
201 |
+
h = self.relu(self.conv3_1(h))
|
202 |
+
h = self.relu(self.conv3_2(h))
|
203 |
+
h = self.relu(self.conv3_3(h))
|
204 |
+
h = self.relu(self.conv3_4(h))
|
205 |
+
h = self.max_pooling_2d(h)
|
206 |
+
h = self.relu(self.conv4_1(h))
|
207 |
+
h = self.relu(self.conv4_2(h))
|
208 |
+
h = self.relu(self.conv4_3(h))
|
209 |
+
h = self.relu(self.conv4_4(h))
|
210 |
+
h = self.relu(self.conv5_1(h))
|
211 |
+
h = self.relu(self.conv5_2(h))
|
212 |
+
h = self.relu(self.conv5_3_CPM(h))
|
213 |
+
feature_map = h
|
214 |
+
|
215 |
+
# stage1
|
216 |
+
h = self.relu(self.conv6_1_CPM(h))
|
217 |
+
h = self.conv6_2_CPM(h)
|
218 |
+
heatmaps.append(h)
|
219 |
+
|
220 |
+
# stage2
|
221 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
222 |
+
h = self.relu(self.Mconv1_stage2(h))
|
223 |
+
h = self.relu(self.Mconv2_stage2(h))
|
224 |
+
h = self.relu(self.Mconv3_stage2(h))
|
225 |
+
h = self.relu(self.Mconv4_stage2(h))
|
226 |
+
h = self.relu(self.Mconv5_stage2(h))
|
227 |
+
h = self.relu(self.Mconv6_stage2(h))
|
228 |
+
h = self.Mconv7_stage2(h)
|
229 |
+
heatmaps.append(h)
|
230 |
+
|
231 |
+
# stage3
|
232 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
233 |
+
h = self.relu(self.Mconv1_stage3(h))
|
234 |
+
h = self.relu(self.Mconv2_stage3(h))
|
235 |
+
h = self.relu(self.Mconv3_stage3(h))
|
236 |
+
h = self.relu(self.Mconv4_stage3(h))
|
237 |
+
h = self.relu(self.Mconv5_stage3(h))
|
238 |
+
h = self.relu(self.Mconv6_stage3(h))
|
239 |
+
h = self.Mconv7_stage3(h)
|
240 |
+
heatmaps.append(h)
|
241 |
+
|
242 |
+
# stage4
|
243 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
244 |
+
h = self.relu(self.Mconv1_stage4(h))
|
245 |
+
h = self.relu(self.Mconv2_stage4(h))
|
246 |
+
h = self.relu(self.Mconv3_stage4(h))
|
247 |
+
h = self.relu(self.Mconv4_stage4(h))
|
248 |
+
h = self.relu(self.Mconv5_stage4(h))
|
249 |
+
h = self.relu(self.Mconv6_stage4(h))
|
250 |
+
h = self.Mconv7_stage4(h)
|
251 |
+
heatmaps.append(h)
|
252 |
+
|
253 |
+
# stage5
|
254 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
255 |
+
h = self.relu(self.Mconv1_stage5(h))
|
256 |
+
h = self.relu(self.Mconv2_stage5(h))
|
257 |
+
h = self.relu(self.Mconv3_stage5(h))
|
258 |
+
h = self.relu(self.Mconv4_stage5(h))
|
259 |
+
h = self.relu(self.Mconv5_stage5(h))
|
260 |
+
h = self.relu(self.Mconv6_stage5(h))
|
261 |
+
h = self.Mconv7_stage5(h)
|
262 |
+
heatmaps.append(h)
|
263 |
+
|
264 |
+
# stage6
|
265 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
266 |
+
h = self.relu(self.Mconv1_stage6(h))
|
267 |
+
h = self.relu(self.Mconv2_stage6(h))
|
268 |
+
h = self.relu(self.Mconv3_stage6(h))
|
269 |
+
h = self.relu(self.Mconv4_stage6(h))
|
270 |
+
h = self.relu(self.Mconv5_stage6(h))
|
271 |
+
h = self.relu(self.Mconv6_stage6(h))
|
272 |
+
h = self.Mconv7_stage6(h)
|
273 |
+
heatmaps.append(h)
|
274 |
+
|
275 |
+
return heatmaps
|
276 |
+
|
277 |
+
|
278 |
+
LOG = logging.getLogger(__name__)
|
279 |
+
TOTEN = ToTensor()
|
280 |
+
TOPIL = ToPILImage()
|
281 |
+
|
282 |
+
|
283 |
+
params = {
|
284 |
+
'gaussian_sigma': 2.5,
|
285 |
+
'inference_img_size': 736, # 368, 736, 1312
|
286 |
+
'heatmap_peak_thresh': 0.1,
|
287 |
+
'crop_scale': 1.5,
|
288 |
+
'line_indices': [
|
289 |
+
[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
|
290 |
+
[6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
|
291 |
+
[13, 14], [14, 15], [15, 16],
|
292 |
+
[17, 18], [18, 19], [19, 20], [20, 21],
|
293 |
+
[22, 23], [23, 24], [24, 25], [25, 26],
|
294 |
+
[27, 28], [28, 29], [29, 30],
|
295 |
+
[31, 32], [32, 33], [33, 34], [34, 35],
|
296 |
+
[36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
|
297 |
+
[42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
|
298 |
+
[48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
|
299 |
+
[54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
|
300 |
+
[60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
|
301 |
+
[66, 67], [67, 60]
|
302 |
+
],
|
303 |
+
}
|
304 |
+
|
305 |
+
|
306 |
+
class Face(object):
|
307 |
+
"""
|
308 |
+
The OpenPose face landmark detector model.
|
309 |
+
|
310 |
+
Args:
|
311 |
+
inference_size: set the size of the inference image size, suggested:
|
312 |
+
368, 736, 1312, default 736
|
313 |
+
gaussian_sigma: blur the heatmaps, default 2.5
|
314 |
+
heatmap_peak_thresh: return landmark if over threshold, default 0.1
|
315 |
+
|
316 |
+
"""
|
317 |
+
def __init__(self, face_model_path,
|
318 |
+
inference_size=None,
|
319 |
+
gaussian_sigma=None,
|
320 |
+
heatmap_peak_thresh=None):
|
321 |
+
self.inference_size = inference_size or params["inference_img_size"]
|
322 |
+
self.sigma = gaussian_sigma or params['gaussian_sigma']
|
323 |
+
self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
|
324 |
+
self.model = FaceNet()
|
325 |
+
self.model.load_state_dict(torch.load(face_model_path))
|
326 |
+
self.model.eval()
|
327 |
+
|
328 |
+
def to(self, device):
|
329 |
+
self.model.to(device)
|
330 |
+
return self
|
331 |
+
|
332 |
+
def __call__(self, face_img):
|
333 |
+
device = next(iter(self.model.parameters())).device
|
334 |
+
H, W, C = face_img.shape
|
335 |
+
|
336 |
+
w_size = 384
|
337 |
+
x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
|
338 |
+
|
339 |
+
x_data = x_data.to(device)
|
340 |
+
|
341 |
+
with torch.no_grad():
|
342 |
+
hs = self.model(x_data[None, ...])
|
343 |
+
heatmaps = F.interpolate(
|
344 |
+
hs[-1],
|
345 |
+
(H, W),
|
346 |
+
mode='bilinear', align_corners=True).cpu().numpy()[0]
|
347 |
+
return heatmaps
|
348 |
+
|
349 |
+
def compute_peaks_from_heatmaps(self, heatmaps):
|
350 |
+
all_peaks = []
|
351 |
+
for part in range(heatmaps.shape[0]):
|
352 |
+
map_ori = heatmaps[part].copy()
|
353 |
+
binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
|
354 |
+
|
355 |
+
if np.sum(binary) == 0:
|
356 |
+
continue
|
357 |
+
|
358 |
+
positions = np.where(binary > 0.5)
|
359 |
+
intensities = map_ori[positions]
|
360 |
+
mi = np.argmax(intensities)
|
361 |
+
y, x = positions[0][mi], positions[1][mi]
|
362 |
+
all_peaks.append([x, y])
|
363 |
+
|
364 |
+
return np.array(all_peaks)
|
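
For orientation, a minimal usage sketch of the Face detector above (not part of the upload; the checkpoint path and image file name are placeholders):

    # Sketch only: load the face landmark model and extract peaks from a face crop.
    import cv2
    face_estimator = Face('facenet.pth').to('cuda')        # 'facenet.pth' is a placeholder path
    face_crop = cv2.imread('face_crop.png')                 # H x W x 3 uint8 crop, placeholder file
    heatmaps = face_estimator(face_crop)                    # 71 heatmaps at the crop resolution
    peaks = face_estimator.compute_peaks_from_heatmaps(heatmaps)  # array of [x, y] peak coordinates
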
ControlNetUnion-space/controlnet_aux/open_pose/hand.py
ADDED
@@ -0,0 +1,90 @@
import cv2
import numpy as np
import torch
from scipy.ndimage.filters import gaussian_filter
from skimage.measure import label

from . import util
from .model import handpose_model


class Hand(object):
    def __init__(self, model_path):
        self.model = handpose_model()
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def to(self, device):
        self.model.to(device)
        return self

    def __call__(self, oriImgRaw):
        device = next(iter(self.model.parameters())).device
        scale_search = [0.5, 1.0, 1.5, 2.0]
        # scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre = 0.05
        multiplier = [x * boxsize for x in scale_search]

        wsize = 128
        heatmap_avg = np.zeros((wsize, wsize, 22))

        Hr, Wr, Cr = oriImgRaw.shape

        oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = util.smart_resize(oriImg, (scale, scale))

            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            data = data.to(device)

            with torch.no_grad():
                output = self.model(data).cpu().numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))  # output 1 is heatmaps
            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = util.smart_resize(heatmap, (wsize, wsize))

            heatmap_avg += heatmap / len(multiplier)

        all_peaks = []
        for part in range(21):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)
            binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)

            if np.sum(binary) == 0:
                all_peaks.append([0, 0])
                continue
            label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
            max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
            label_img[label_img != max_index] = 0
            map_ori[label_img == 0] = 0

            y, x = util.npmax(map_ori)
            y = int(float(y) * float(Hr) / float(wsize))
            x = int(float(x) * float(Wr) / float(wsize))
            all_peaks.append([x, y])
        return np.array(all_peaks)

if __name__ == "__main__":
    hand_estimation = Hand('../model/hand_pose_model.pth')

    # test_image = '../images/hand.jpg'
    test_image = '../images/hand.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    peaks = hand_estimation(oriImg)
    canvas = util.draw_handpose(oriImg, peaks, True)
    cv2.imshow('', canvas)
    cv2.waitKey(0)
ControlNetUnion-space/controlnet_aux/open_pose/model.py
ADDED
@@ -0,0 +1,217 @@
import torch
from collections import OrderedDict

import torch
import torch.nn as nn

def make_layers(block, no_relu_layers):
    layers = []
    for layer_name, v in block.items():
        if 'pool' in layer_name:
            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
                                 padding=v[2])
            layers.append((layer_name, layer))
        else:
            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
                               kernel_size=v[2], stride=v[3],
                               padding=v[4])
            layers.append((layer_name, conv2d))
            if layer_name not in no_relu_layers:
                layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))

class bodypose_model(nn.Module):
    def __init__(self):
        super(bodypose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
        blocks = {}
        block0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3_CPM', [512, 256, 3, 1, 1]),
            ('conv4_4_CPM', [256, 128, 3, 1, 1])
        ])


        # Stage 1
        block1_1 = OrderedDict([
            ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
        ])

        block1_2 = OrderedDict([
            ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
        ])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2

        self.model0 = make_layers(block0, no_relu_layers)

        # Stages 2 - 6
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])

            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']

        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']


    def forward(self, x):

        out1 = self.model0(x)

        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        out2 = torch.cat([out1_1, out1_2, out1], 1)

        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)

        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)

        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)

        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)

        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)

        return out6_1, out6_2

class handpose_model(nn.Module):
    def __init__(self):
        super(handpose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
        # stage 1
        block1_0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3', [512, 512, 3, 1, 1]),
            ('conv4_4', [512, 512, 3, 1, 1]),
            ('conv5_1', [512, 512, 3, 1, 1]),
            ('conv5_2', [512, 512, 3, 1, 1]),
            ('conv5_3_CPM', [512, 128, 3, 1, 1])
        ])

        block1_1 = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0])
        ])

        blocks = {}
        blocks['block1_0'] = block1_0
        blocks['block1_1'] = block1_1

        # stage 2-6
        for i in range(2, 7):
            blocks['block%d' % i] = OrderedDict([
                ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
                ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_0 = blocks['block1_0']
        self.model1_1 = blocks['block1_1']
        self.model2 = blocks['block2']
        self.model3 = blocks['block3']
        self.model4 = blocks['block4']
        self.model5 = blocks['block5']
        self.model6 = blocks['block6']

    def forward(self, x):
        out1_0 = self.model1_0(x)
        out1_1 = self.model1_1(out1_0)
        concat_stage2 = torch.cat([out1_1, out1_0], 1)
        out_stage2 = self.model2(concat_stage2)
        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
        out_stage3 = self.model3(concat_stage3)
        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
        out_stage4 = self.model4(concat_stage4)
        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
        out_stage5 = self.model5(concat_stage5)
        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
        out_stage6 = self.model6(concat_stage6)
        return out_stage6
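
As a quick sanity check (not part of the upload), the two heads of bodypose_model return 38 part-affinity-field channels and 19 heatmap channels at 1/8 of the input resolution, since block0 applies three stride-2 poolings; a minimal sketch with random weights:

    # Sketch only: verify output shapes of the body model.
    model = bodypose_model().eval()
    with torch.no_grad():
        pafs, heatmaps = model(torch.zeros(1, 3, 368, 368))
    print(pafs.shape, heatmaps.shape)  # torch.Size([1, 38, 46, 46]) torch.Size([1, 19, 46, 46])
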
ControlNetUnion-space/controlnet_aux/open_pose/util.py
ADDED
@@ -0,0 +1,383 @@
import math
import numpy as np
import cv2
from typing import List, Tuple, Union

from .body import BodyResult, Keypoint

eps = 0.01


def smart_resize(x, s):
    Ht, Wt = s
    if x.ndim == 2:
        Ho, Wo = x.shape
        Co = 1
    else:
        Ho, Wo, Co = x.shape
    if Co == 3 or Co == 1:
        k = float(Ht + Wt) / float(Ho + Wo)
        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
    else:
        return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)


def smart_resize_k(x, fx, fy):
    if x.ndim == 2:
        Ho, Wo = x.shape
        Co = 1
    else:
        Ho, Wo, Co = x.shape
    Ht, Wt = Ho * fy, Wo * fx
    if Co == 3 or Co == 1:
        k = float(Ht + Wt) / float(Ho + Wo)
        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
    else:
        return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)


def padRightDownCorner(img, stride, padValue):
    h = img.shape[0]
    w = img.shape[1]

    pad = 4 * [None]
    pad[0] = 0  # up
    pad[1] = 0  # left
    pad[2] = 0 if (h % stride == 0) else stride - (h % stride)  # down
    pad[3] = 0 if (w % stride == 0) else stride - (w % stride)  # right

    img_padded = img
    pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
    img_padded = np.concatenate((pad_up, img_padded), axis=0)
    pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
    img_padded = np.concatenate((pad_left, img_padded), axis=1)
    pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
    img_padded = np.concatenate((img_padded, pad_down), axis=0)
    pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
    img_padded = np.concatenate((img_padded, pad_right), axis=1)

    return img_padded, pad


def transfer(model, model_weights):
    transfered_model_weights = {}
    for weights_name in model.state_dict().keys():
        transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
    return transfered_model_weights


def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray:
    """
    Draw keypoints and limbs representing body pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
        keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    H, W, C = canvas.shape
    stickwidth = 4

    limbSeq = [
        [2, 3], [2, 6], [3, 4], [4, 5],
        [6, 7], [7, 8], [2, 9], [9, 10],
        [10, 11], [2, 12], [12, 13], [13, 14],
        [2, 1], [1, 15], [15, 17], [1, 16],
        [16, 18],
    ]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    for (k1_index, k2_index), color in zip(limbSeq, colors):
        keypoint1 = keypoints[k1_index - 1]
        keypoint2 = keypoints[k2_index - 1]

        if keypoint1 is None or keypoint2 is None:
            continue

        Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
        X = np.array([keypoint1.y, keypoint2.y]) * float(H)
        mX = np.mean(X)
        mY = np.mean(Y)
        length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
        polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
        cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])

    for keypoint, color in zip(keypoints, colors):
        if keypoint is None:
            continue

        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)

    return canvas


def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
    import matplotlib
    """
    Draw keypoints and connections representing hand pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
        keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
                                          or None if no keypoints are present.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    if not keypoints:
        return canvas

    H, W, C = canvas.shape

    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for ie, (e1, e2) in enumerate(edges):
        k1 = keypoints[e1]
        k2 = keypoints[e2]
        if k1 is None or k2 is None:
            continue

        x1 = int(k1.x * W)
        y1 = int(k1.y * H)
        x2 = int(k2.x * W)
        y2 = int(k2.y * H)
        if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
            cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)

    for keypoint in keypoints:
        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        if x > eps and y > eps:
            cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
    return canvas


def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
    """
    Draw keypoints representing face pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
        keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
                                          or None if no keypoints are present.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    if not keypoints:
        return canvas

    H, W, C = canvas.shape
    for keypoint in keypoints:
        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        if x > eps and y > eps:
            cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
    return canvas


# detect hand according to body pose keypoints
# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
    """
    Detect hands in the input body pose keypoints and calculate the bounding box for each hand.

    Args:
        body (BodyResult): A BodyResult object containing the detected body pose keypoints.
        oriImg (numpy.ndarray): A 3D numpy array representing the original input image.

    Returns:
        List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
                                          corner of the bounding box, the width (height) of the bounding box, and
                                          a boolean flag indicating whether the hand is a left hand (True) or a
                                          right hand (False).

    Notes:
        - The width and height of the bounding boxes are equal since the network requires squared input.
        - The minimum bounding box size is 20 pixels.
    """
    ratioWristElbow = 0.33
    detect_result = []
    image_height, image_width = oriImg.shape[0:2]

    keypoints = body.keypoints
    # right hand: wrist 4, elbow 3, shoulder 2
    # left hand: wrist 7, elbow 6, shoulder 5
    left_shoulder = keypoints[5]
    left_elbow = keypoints[6]
    left_wrist = keypoints[7]
    right_shoulder = keypoints[2]
    right_elbow = keypoints[3]
    right_wrist = keypoints[4]

    # if any of three not detected
    has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
    has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
    if not (has_left or has_right):
        return []

    hands = []
    # left hand
    if has_left:
        hands.append([
            left_shoulder.x, left_shoulder.y,
            left_elbow.x, left_elbow.y,
            left_wrist.x, left_wrist.y,
            True
        ])
    # right hand
    if has_right:
        hands.append([
            right_shoulder.x, right_shoulder.y,
            right_elbow.x, right_elbow.y,
            right_wrist.x, right_wrist.y,
            False
        ])

    for x1, y1, x2, y2, x3, y3, is_left in hands:
        # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
        # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
        # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
        # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
        # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
        # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
        x = x3 + ratioWristElbow * (x3 - x2)
        y = y3 + ratioWristElbow * (y3 - y2)
        distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
        distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
        width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
        # x-y refers to the center --> offset to topLeft point
        # handRectangle.x -= handRectangle.width / 2.f;
        # handRectangle.y -= handRectangle.height / 2.f;
        x -= width / 2
        y -= width / 2  # width = height
        # overflow the image
        if x < 0: x = 0
        if y < 0: y = 0
        width1 = width
        width2 = width
        if x + width > image_width: width1 = image_width - x
        if y + width > image_height: width2 = image_height - y
        width = min(width1, width2)
        # the max hand box value is 20 pixels
        if width >= 20:
            detect_result.append((int(x), int(y), int(width), is_left))

    '''
    return value: [[x, y, w, True if left hand else False]].
    width=height since the network require squared input.
    x, y is the coordinate of top left
    '''
    return detect_result


# Written by Lvmin
def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
    """
    Detect the face in the input body pose keypoints and calculate the bounding box for the face.

    Args:
        body (BodyResult): A BodyResult object containing the detected body pose keypoints.
        oriImg (numpy.ndarray): A 3D numpy array representing the original input image.

    Returns:
        Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
                                     bounding box and the width (height) of the bounding box, or None if the
                                     face is not detected or the bounding box width is less than 20 pixels.

    Notes:
        - The width and height of the bounding box are equal.
        - The minimum bounding box size is 20 pixels.
    """
    # left right eye ear 14 15 16 17
    image_height, image_width = oriImg.shape[0:2]

    keypoints = body.keypoints
    head = keypoints[0]
    left_eye = keypoints[14]
    right_eye = keypoints[15]
    left_ear = keypoints[16]
    right_ear = keypoints[17]

    if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
        return None

    width = 0.0
    x0, y0 = head.x, head.y

    if left_eye is not None:
        x1, y1 = left_eye.x, left_eye.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 3.0)

    if right_eye is not None:
        x1, y1 = right_eye.x, right_eye.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 3.0)

    if left_ear is not None:
        x1, y1 = left_ear.x, left_ear.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 1.5)

    if right_ear is not None:
        x1, y1 = right_ear.x, right_ear.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 1.5)

    x, y = x0, y0

    x -= width
    y -= width

    if x < 0:
        x = 0

    if y < 0:
        y = 0

    width1 = width * 2
    width2 = width * 2

    if x + width > image_width:
        width1 = image_width - x

    if y + width > image_height:
        width2 = image_height - y

    width = min(width1, width2)

    if width >= 20:
        return int(x), int(y), int(width)
    else:
        return None


# get max index of 2d array
def npmax(array):
    arrayindex = array.argmax(1)
    arrayvalue = array.max(1)
    i = arrayvalue.argmax()
    j = arrayindex[i]
    return i, j
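
A small illustration of how padRightDownCorner is used by hand.py above (the array here is synthetic, not part of the upload): it pads only the bottom and right edges so both sides become multiples of `stride`, and returns the per-side padding so it can be cropped off after inference.

    # Sketch only: pad a 100x150 image to multiples of stride=8, then undo it.
    img = np.zeros((100, 150, 3), dtype=np.uint8)
    padded, pad = padRightDownCorner(img, stride=8, padValue=128)
    print(padded.shape, pad)   # (104, 152, 3) [0, 0, 4, 2]
    unpadded = padded[:padded.shape[0] - pad[2], :padded.shape[1] - pad[3], :]
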
ControlNetUnion-space/controlnet_aux/util.py
ADDED
@@ -0,0 +1,146 @@
import os
import random

import cv2
import numpy as np
import torch

annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')


def HWC3(x):
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y


def make_noise_disk(H, W, C, F):
    noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
    noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
    noise = noise[F: F + H, F: F + W]
    noise -= np.min(noise)
    noise /= np.max(noise)
    if C == 1:
        noise = noise[:, :, None]
    return noise


def nms(x, t, s):
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)

    y = np.zeros_like(x)

    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)

    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z

def min_max_norm(x):
    x -= np.min(x)
    x /= np.maximum(np.max(x), 1e-5)
    return x


def safe_step(x, step=2):
    y = x.astype(np.float32) * float(step + 1)
    y = y.astype(np.int32).astype(np.float32) / float(step)
    return y


def img2mask(img, H, W, low=10, high=90):
    assert img.ndim == 3 or img.ndim == 2
    assert img.dtype == np.uint8

    if img.ndim == 3:
        y = img[:, :, random.randrange(0, img.shape[2])]
    else:
        y = img

    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)

    if random.uniform(0, 1) < 0.5:
        y = 255 - y

    return y < np.percentile(y, random.randrange(low, high))


def resize_image(input_image, resolution):
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    return img


def torch_gc():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def ade_palette():
    """ADE20K palette that maps each class to RGB values."""
    return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
            [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
            [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
            [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
            [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
            [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
            [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
            [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
            [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
            [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
            [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
            [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
            [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
            [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
            [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
            [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
            [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
            [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
            [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
            [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
            [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
            [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
            [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
            [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
            [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
            [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
            [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
            [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
            [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
            [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
            [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
            [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
            [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
            [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
            [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
            [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
            [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
            [102, 255, 0], [92, 0, 255]]
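
For context, HWC3 and resize_image are typically chained when preparing a control image before it is handed to an annotator; a minimal sketch under that assumption (the input file name is a placeholder, and the module's own cv2 import is reused):

    # Sketch only: normalize an arbitrary image to 3-channel uint8 and snap it
    # to a multiple-of-64 resolution.
    raw = cv2.imread('input.png', cv2.IMREAD_UNCHANGED)   # may be grayscale, RGB, or RGBA
    img = HWC3(raw)                                       # always H x W x 3 uint8
    img = resize_image(img, resolution=512)               # short side ~512, both sides multiples of 64
    print(img.shape)
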
ControlNetUnion-space/depth_anything_v2/.DS_Store
ADDED
Binary file (6.15 kB).
ControlNetUnion-space/depth_anything_v2/dinov2.py
ADDED
@@ -0,0 +1,415 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

from functools import partial
import math
import logging
from typing import Sequence, Tuple, Union, Callable

import torch
import torch.nn as nn
import torch.utils.checkpoint
from torch.nn.init import trunc_normal_

from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block


logger = logging.getLogger("dinov2")


def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child_module in module.named_children():
        child_name = ".".join((name, child_name)) if name else child_name
        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
    if depth_first and include_root:
        fn(module=module, name=name)
    return module


class BlockChunk(nn.ModuleList):
    def forward(self, x):
        for b in self:
            x = b(x)
        return x


class DinoVisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    def init_weights(self):
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, w, h):
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
        w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
        # w0, h0 = w0 + 0.1, h0 + 0.1

        sqrt_N = math.sqrt(N)
        sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
            scale_factor=(sx, sy),
            # (int(w0), int(h0)), # to solve the upsampling shape issue
            mode="bicubic",
            antialias=self.interpolate_antialias
        )

        assert int(w0) == patch_pos_embed.shape[-2]
        assert int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens_with_masks(self, x, masks=None):
        B, nc, w, h = x.shape
        x = self.patch_embed(x)
        if masks is not None:
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)

        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)

        if self.register_tokens is not None:
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )

        return x

    def forward_features_list(self, x_list, masks_list):
        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
        for blk in self.blocks:
            x = blk(x)

        all_x = x
        output = []
        for x, masks in zip(all_x, masks_list):
            x_norm = self.norm(x)
            output.append(
                {
                    "x_norm_clstoken": x_norm[:, 0],
                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
                    "x_prenorm": x,
                    "masks": masks,
                }
            )
        return output

    def forward_features(self, x, masks=None):
        if isinstance(x, list):
            return self.forward_features_list(x, masks)

        x = self.prepare_tokens_with_masks(x, masks)

        for blk in self.blocks:
            x = blk(x)

        x_norm = self.norm(x)
        return {
            "x_norm_clstoken": x_norm[:, 0],
            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
            "x_prenorm": x,
            "masks": masks,
        }

    def _get_intermediate_layers_not_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def _get_intermediate_layers_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
295 |
+
return output
|
296 |
+
|
297 |
+
def get_intermediate_layers(
|
298 |
+
self,
|
299 |
+
x: torch.Tensor,
|
300 |
+
n: Union[int, Sequence] = 1, # Layers or n last layers to take
|
301 |
+
reshape: bool = False,
|
302 |
+
return_class_token: bool = False,
|
303 |
+
norm=True
|
304 |
+
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
|
305 |
+
if self.chunked_blocks:
|
306 |
+
outputs = self._get_intermediate_layers_chunked(x, n)
|
307 |
+
else:
|
308 |
+
outputs = self._get_intermediate_layers_not_chunked(x, n)
|
309 |
+
if norm:
|
310 |
+
outputs = [self.norm(out) for out in outputs]
|
311 |
+
class_tokens = [out[:, 0] for out in outputs]
|
312 |
+
outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
|
313 |
+
if reshape:
|
314 |
+
B, _, w, h = x.shape
|
315 |
+
outputs = [
|
316 |
+
out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
|
317 |
+
for out in outputs
|
318 |
+
]
|
319 |
+
if return_class_token:
|
320 |
+
return tuple(zip(outputs, class_tokens))
|
321 |
+
return tuple(outputs)
|
322 |
+
|
323 |
+
def forward(self, *args, is_training=False, **kwargs):
|
324 |
+
ret = self.forward_features(*args, **kwargs)
|
325 |
+
if is_training:
|
326 |
+
return ret
|
327 |
+
else:
|
328 |
+
return self.head(ret["x_norm_clstoken"])
|
329 |
+
|
330 |
+
|
331 |
+
def init_weights_vit_timm(module: nn.Module, name: str = ""):
|
332 |
+
"""ViT weight initialization, original timm impl (for reproducibility)"""
|
333 |
+
if isinstance(module, nn.Linear):
|
334 |
+
trunc_normal_(module.weight, std=0.02)
|
335 |
+
if module.bias is not None:
|
336 |
+
nn.init.zeros_(module.bias)
|
337 |
+
|
338 |
+
|
339 |
+
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
|
340 |
+
model = DinoVisionTransformer(
|
341 |
+
patch_size=patch_size,
|
342 |
+
embed_dim=384,
|
343 |
+
depth=12,
|
344 |
+
num_heads=6,
|
345 |
+
mlp_ratio=4,
|
346 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
347 |
+
num_register_tokens=num_register_tokens,
|
348 |
+
**kwargs,
|
349 |
+
)
|
350 |
+
return model
|
351 |
+
|
352 |
+
|
353 |
+
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
|
354 |
+
model = DinoVisionTransformer(
|
355 |
+
patch_size=patch_size,
|
356 |
+
embed_dim=768,
|
357 |
+
depth=12,
|
358 |
+
num_heads=12,
|
359 |
+
mlp_ratio=4,
|
360 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
361 |
+
num_register_tokens=num_register_tokens,
|
362 |
+
**kwargs,
|
363 |
+
)
|
364 |
+
return model
|
365 |
+
|
366 |
+
|
367 |
+
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
|
368 |
+
model = DinoVisionTransformer(
|
369 |
+
patch_size=patch_size,
|
370 |
+
embed_dim=1024,
|
371 |
+
depth=24,
|
372 |
+
num_heads=16,
|
373 |
+
mlp_ratio=4,
|
374 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
375 |
+
num_register_tokens=num_register_tokens,
|
376 |
+
**kwargs,
|
377 |
+
)
|
378 |
+
return model
|
379 |
+
|
380 |
+
|
381 |
+
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
|
382 |
+
"""
|
383 |
+
Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
|
384 |
+
"""
|
385 |
+
model = DinoVisionTransformer(
|
386 |
+
patch_size=patch_size,
|
387 |
+
embed_dim=1536,
|
388 |
+
depth=40,
|
389 |
+
num_heads=24,
|
390 |
+
mlp_ratio=4,
|
391 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
392 |
+
num_register_tokens=num_register_tokens,
|
393 |
+
**kwargs,
|
394 |
+
)
|
395 |
+
return model
|
396 |
+
|
397 |
+
|
398 |
+
def DINOv2(model_name):
|
399 |
+
model_zoo = {
|
400 |
+
"vits": vit_small,
|
401 |
+
"vitb": vit_base,
|
402 |
+
"vitl": vit_large,
|
403 |
+
"vitg": vit_giant2
|
404 |
+
}
|
405 |
+
|
406 |
+
return model_zoo[model_name](
|
407 |
+
img_size=518,
|
408 |
+
patch_size=14,
|
409 |
+
init_values=1.0,
|
410 |
+
ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
|
411 |
+
block_chunks=0,
|
412 |
+
num_register_tokens=0,
|
413 |
+
interpolate_antialias=False,
|
414 |
+
interpolate_offset=0.1
|
415 |
+
)
|
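For orientation, here is a minimal sketch of how this backbone is typically driven by the DPT head further down (not part of the uploaded files; the layer indices and input size mirror the vitl settings used in dpt.py, and the sketch assumes an environment where the MemEffAttention path runs, e.g. GPU with xFormers or CPU without it):

import torch

# Build the ViT-L/14 backbone defined above and pull the four intermediate
# feature maps that the DPT decoder consumes.
backbone = DINOv2("vitl")               # patch_size=14, img_size=518
dummy = torch.randn(1, 3, 518, 518)     # H and W must be multiples of 14
feats = backbone.get_intermediate_layers(
    dummy, n=[4, 11, 17, 23], return_class_token=True
)
# feats is a tuple of (patch_tokens, cls_token) pairs, one per requested layer;
# each patch_tokens tensor has shape (1, (518 // 14) ** 2, 1024) = (1, 1369, 1024).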
ControlNetUnion-space/depth_anything_v2/dinov2_layers/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from .mlp import Mlp
from .patch_embed import PatchEmbed
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
from .block import NestedTensorBlock
from .attention import MemEffAttention
ControlNetUnion-space/depth_anything_v2/dinov2_layers/attention.py
ADDED
@@ -0,0 +1,83 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging

from torch import Tensor
from torch import nn


logger = logging.getLogger("dinov2")


try:
    from xformers.ops import memory_efficient_attention, unbind, fmha

    XFORMERS_AVAILABLE = True
except ImportError:
    logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
        attn = q @ k.transpose(-2, -1)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
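MemEffAttention is a drop-in subclass that swaps in xFormers' memory_efficient_attention when the import above succeeds and otherwise falls back to the plain softmax attention. A small shape-check sketch using the base class (illustrative only, not part of the upload):

import torch

attn = Attention(dim=384, num_heads=6, qkv_bias=True)
tokens = torch.randn(2, 1370, 384)   # ViT-S/14 at 518 px: 1369 patch tokens + 1 cls token
out = attn(tokens)                   # same (batch, tokens, dim) shape as the
assert out.shape == tokens.shape     # fused xFormers path would return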
ControlNetUnion-space/depth_anything_v2/dinov2_layers/block.py
ADDED
@@ -0,0 +1,252 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

import logging
from typing import Callable, List, Any, Tuple, Dict

import torch
from torch import nn, Tensor

from .attention import Attention, MemEffAttention
from .drop_path import DropPath
from .layer_scale import LayerScale
from .mlp import Mlp


logger = logging.getLogger("dinov2")


try:
    from xformers.ops import fmha
    from xformers.ops import scaled_index_add, index_select_cat

    XFORMERS_AVAILABLE = True
except ImportError:
    logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False


class Block(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x


def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    # 1) extract subset using permutation
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    x_subset = x[brange]

    # 2) apply residual_func to get residual
    residual = residual_func(x_subset)

    x_flat = x.flatten(1)
    residual = residual.flatten(1)

    residual_scale_factor = b / sample_subset_size

    # 3) add the residual
    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    return x_plus_residual.view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlock(Block):
    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
ControlNetUnion-space/depth_anything_v2/dinov2_layers/drop_path.py
ADDED
@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py


from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
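As a quick sanity check of the 1/keep_prob rescaling above: the expected value of the output matches the input even though roughly drop_prob of the samples are zeroed. An illustrative sketch (not part of the upload):

import torch

x = torch.ones(10000, 8, 16)
y = drop_path(x, drop_prob=0.1, training=True)
print(y.mean().item())  # close to 1.0: ~90% of samples survive, each scaled by 1/0.9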
ControlNetUnion-space/depth_anything_v2/dinov2_layers/layer_scale.py
ADDED
@@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110

from typing import Union

import torch
from torch import Tensor
from torch import nn


class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma
ControlNetUnion-space/depth_anything_v2/dinov2_layers/mlp.py
ADDED
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py


from typing import Callable, Optional

from torch import Tensor, nn


class Mlp(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
ControlNetUnion-space/depth_anything_v2/dinov2_layers/patch_embed.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

from typing import Callable, Optional, Tuple, Union

from torch import Tensor
import torch.nn as nn


def make_2tuple(x):
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops
ControlNetUnion-space/depth_anything_v2/dinov2_layers/swiglu_ffn.py
ADDED
@@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Callable, Optional

from torch import Tensor, nn
import torch.nn.functional as F


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


try:
    from xformers.ops import SwiGLU

    XFORMERS_AVAILABLE = True
except ImportError:
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False


class SwiGLUFFNFused(SwiGLU):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )
ControlNetUnion-space/depth_anything_v2/dpt.py
ADDED
@@ -0,0 +1,221 @@
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import Compose

from .dinov2 import DINOv2
from .util.blocks import FeatureFusionBlock, _make_scratch
from .util.transform import Resize, NormalizeImage, PrepareForNet


def _make_fusion_block(features, use_bn, size=None):
    return FeatureFusionBlock(
        features,
        nn.ReLU(False),
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
        size=size,
    )


class ConvBlock(nn.Module):
    def __init__(self, in_feature, out_feature):
        super().__init__()

        self.conv_block = nn.Sequential(
            nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_feature),
            nn.ReLU(True)
        )

    def forward(self, x):
        return self.conv_block(x)


class DPTHead(nn.Module):
    def __init__(
        self,
        in_channels,
        features=256,
        use_bn=False,
        out_channels=[256, 512, 1024, 1024],
        use_clstoken=False
    ):
        super(DPTHead, self).__init__()

        self.use_clstoken = use_clstoken

        self.projects = nn.ModuleList([
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channel,
                kernel_size=1,
                stride=1,
                padding=0,
            ) for out_channel in out_channels
        ])

        self.resize_layers = nn.ModuleList([
            nn.ConvTranspose2d(
                in_channels=out_channels[0],
                out_channels=out_channels[0],
                kernel_size=4,
                stride=4,
                padding=0),
            nn.ConvTranspose2d(
                in_channels=out_channels[1],
                out_channels=out_channels[1],
                kernel_size=2,
                stride=2,
                padding=0),
            nn.Identity(),
            nn.Conv2d(
                in_channels=out_channels[3],
                out_channels=out_channels[3],
                kernel_size=3,
                stride=2,
                padding=1)
        ])

        if use_clstoken:
            self.readout_projects = nn.ModuleList()
            for _ in range(len(self.projects)):
                self.readout_projects.append(
                    nn.Sequential(
                        nn.Linear(2 * in_channels, in_channels),
                        nn.GELU()))

        self.scratch = _make_scratch(
            out_channels,
            features,
            groups=1,
            expand=False,
        )

        self.scratch.stem_transpose = None

        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

        head_features_1 = features
        head_features_2 = 32

        self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
        self.scratch.output_conv2 = nn.Sequential(
            nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True),
            nn.Identity(),
        )

    def forward(self, out_features, patch_h, patch_w):
        out = []
        for i, x in enumerate(out_features):
            if self.use_clstoken:
                x, cls_token = x[0], x[1]
                readout = cls_token.unsqueeze(1).expand_as(x)
                x = self.readout_projects[i](torch.cat((x, readout), -1))
            else:
                x = x[0]

            x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))

            x = self.projects[i](x)
            x = self.resize_layers[i](x)

            out.append(x)

        layer_1, layer_2, layer_3, layer_4 = out

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv1(path_1)
        out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
        out = self.scratch.output_conv2(out)

        return out


class DepthAnythingV2(nn.Module):
    def __init__(
        self,
        encoder='vitl',
        features=256,
        out_channels=[256, 512, 1024, 1024],
        use_bn=False,
        use_clstoken=False
    ):
        super(DepthAnythingV2, self).__init__()

        self.intermediate_layer_idx = {
            'vits': [2, 5, 8, 11],
            'vitb': [2, 5, 8, 11],
            'vitl': [4, 11, 17, 23],
            'vitg': [9, 19, 29, 39]
        }

        self.encoder = encoder
        self.pretrained = DINOv2(model_name=encoder)

        self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)

    def forward(self, x):
        patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14

        features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)

        depth = self.depth_head(features, patch_h, patch_w)
        depth = F.relu(depth)

        return depth.squeeze(1)

    @torch.no_grad()
    def infer_image(self, raw_image, input_size=518):
        image, (h, w) = self.image2tensor(raw_image, input_size)

        depth = self.forward(image)

        depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]

        return depth.cpu().numpy()

    def image2tensor(self, raw_image, input_size=518):
        transform = Compose([
            Resize(
                width=input_size,
                height=input_size,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method='lower_bound',
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])

        h, w = raw_image.shape[:2]

        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0

        image = transform({'image': image})['image']
        image = torch.from_numpy(image).unsqueeze(0)

        DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
        image = image.to(DEVICE)

        return image, (h, w)
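A minimal inference sketch for DepthAnythingV2 (not part of the upload; the checkpoint filename mirrors the one downloaded in app.py, and the image path is an assumption):

import cv2
import torch

model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024])
state_dict = torch.load("depth_anything_v2_vitl.pth", map_location="cpu")  # assumed local path
model.load_state_dict(state_dict)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

raw_image = cv2.imread("example.jpg")   # BGR uint8 array, as infer_image expects
depth = model.infer_image(raw_image)    # float32 numpy array with the input's H x W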
ControlNetUnion-space/depth_anything_v2/util/blocks.py
ADDED
@@ -0,0 +1,148 @@
import torch.nn as nn


def _make_scratch(in_shape, out_shape, groups=1, expand=False):
    scratch = nn.Module()

    out_shape1 = out_shape
    out_shape2 = out_shape
    out_shape3 = out_shape
    if len(in_shape) >= 4:
        out_shape4 = out_shape

    if expand:
        out_shape1 = out_shape
        out_shape2 = out_shape * 2
        out_shape3 = out_shape * 4
        if len(in_shape) >= 4:
            out_shape4 = out_shape * 8

    scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
    scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
    scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
    if len(in_shape) >= 4:
        scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)

    return scratch


class ResidualConvUnit(nn.Module):
    """Residual convolution module.
    """

    def __init__(self, features, activation, bn):
        """Init.

        Args:
            features (int): number of features
        """
        super().__init__()

        self.bn = bn

        self.groups = 1

        self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)

        self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)

        if self.bn == True:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)

        self.activation = activation

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """

        out = self.activation(x)
        out = self.conv1(out)
        if self.bn == True:
            out = self.bn1(out)

        out = self.activation(out)
        out = self.conv2(out)
        if self.bn == True:
            out = self.bn2(out)

        if self.groups > 1:
            out = self.conv_merge(out)

        return self.skip_add.add(out, x)


class FeatureFusionBlock(nn.Module):
    """Feature fusion block.
    """

    def __init__(
        self,
        features,
        activation,
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
        size=None
    ):
        """Init.

        Args:
            features (int): number of features
        """
        super(FeatureFusionBlock, self).__init__()

        self.deconv = deconv
        self.align_corners = align_corners

        self.groups = 1

        self.expand = expand
        out_features = features
        if self.expand == True:
            out_features = features // 2

        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)

        self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit(features, activation, bn)

        self.skip_add = nn.quantized.FloatFunctional()

        self.size = size

    def forward(self, *xs, size=None):
        """Forward pass.

        Returns:
            tensor: output
        """
        output = xs[0]

        if len(xs) == 2:
            res = self.resConfUnit1(xs[1])
            output = self.skip_add.add(output, res)

        output = self.resConfUnit2(output)

        if (size is None) and (self.size is None):
            modifier = {"scale_factor": 2}
        elif size is None:
            modifier = {"size": self.size}
        else:
            modifier = {"size": size}

        output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)

        output = self.out_conv(output)

        return output
ControlNetUnion-space/depth_anything_v2/util/transform.py
ADDED
@@ -0,0 +1,158 @@
import numpy as np
import cv2


class Resize(object):
    """Resize sample to given size (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as least as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(f"resize_method {self.__resize_method} not implemented")

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])

        # resize sample
        sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)

        if self.__resize_target:
            if "depth" in sample:
                sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)

            if "mask" in sample:
                sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)

        return sample


class NormalizeImage(object):
    """Normalize image by given mean and std.
    """

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input.
    """

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        return sample
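To make the lower_bound behaviour concrete, a short worked example with assumed numbers (not part of the upload):

# 1280x720 input, target 518x518, keep_aspect_ratio=True, ensure_multiple_of=14:
resize = Resize(width=518, height=518, resize_target=False,
                keep_aspect_ratio=True, ensure_multiple_of=14,
                resize_method="lower_bound")
print(resize.get_size(1280, 720))
# The short side is scaled to 518 (scale ~0.719), the long side to ~921, and both
# are rounded to multiples of 14, giving (924, 518) as (width, height).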
ControlNetUnion-space/requirements.txt
ADDED
@@ -0,0 +1,17 @@
git+https://github.com/huggingface/diffusers.git
torch==2.0.1
torchvision==0.15.2
transformers==4.43.3
einops
onnxruntime-gpu
spaces
accelerate
omegaconf
huggingface-hub
opencv-python
gradio
xformers
sentencepiece
peft
scipy
scikit-image
app.py
ADDED
@@ -0,0 +1,264 @@
import sys
sys.path.append('./')

import gradio as gr
import spaces
import os
import sys
import subprocess
import numpy as np
from PIL import Image
import cv2
import torch
import random

os.system("pip install -e ./controlnet_aux")

from controlnet_aux import OpenposeDetector, CannyDetector
from depth_anything_v2.dpt import DepthAnythingV2

from huggingface_hub import hf_hub_download

from huggingface_hub import login
hf_token = os.environ.get("HF_TOKEN_GATED")
login(token=hf_token)

MAX_SEED = np.iinfo(np.int32).max

def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-Large", filename=f"depth_anything_v2_vitl.pth", repo_type="model")
state_dict = torch.load(filepath, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()

import torch
from diffusers.utils import load_image
from diffusers import FluxControlNetPipeline, FluxControlNetModel
from diffusers.models import FluxMultiControlNetModel

base_model = 'black-forest-labs/FLUX.1-dev'
controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'
controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
controlnet = FluxMultiControlNetModel([controlnet])
pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
pipe.to("cuda")

mode_mapping = {"canny":0, "tile":1, "depth":2, "blur":3, "openpose":4, "gray":5, "low quality": 6}
strength_mapping = {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}

canny = CannyDetector()
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")

torch.backends.cuda.matmul.allow_tf32 = True
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()
pipe.enable_model_cpu_offload()  # for saving memory

def convert_from_image_to_cv2(img: Image) -> np.ndarray:
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

def convert_from_cv2_to_image(img: np.ndarray) -> Image:
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

def extract_depth(image):
    image = np.asarray(image)
    depth = model.infer_image(image[:, :, ::-1])
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.astype(np.uint8)
    gray_depth = Image.fromarray(depth).convert('RGB')
    return gray_depth

def extract_openpose(img):
    processed_image_open_pose = open_pose(img, hand_and_face=True)
    return processed_image_open_pose

def extract_canny(image):
    processed_image_canny = canny(image)
    return processed_image_canny

def apply_gaussian_blur(image, kernel_size=(21, 21)):
    image = convert_from_image_to_cv2(image)
    blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
    return blurred_image

def convert_to_grayscale(image):
    image = convert_from_image_to_cv2(image)
    gray_image = convert_from_cv2_to_image(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
    return gray_image

def add_gaussian_noise(image, mean=0, sigma=10):
    image = convert_from_image_to_cv2(image)
    noise = np.random.normal(mean, sigma, image.shape)
    noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
    return noisy_image

def tile(input_image, resolution=768):
    input_image = convert_from_image_to_cv2(input_image)
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
120 |
+
img = convert_from_cv2_to_image(img)
|
121 |
+
return img
|
122 |
+
|
123 |
+
def resize_img(input_image, max_side=768, min_side=512, size=None,
|
124 |
+
pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
|
125 |
+
|
126 |
+
w, h = input_image.size
|
127 |
+
if size is not None:
|
128 |
+
w_resize_new, h_resize_new = size
|
129 |
+
else:
|
130 |
+
ratio = min_side / min(h, w)
|
131 |
+
w, h = round(ratio*w), round(ratio*h)
|
132 |
+
ratio = max_side / max(h, w)
|
133 |
+
input_image = input_image.resize([round(ratio*w), round(ratio*h)], mode)
|
134 |
+
w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
|
135 |
+
h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
|
136 |
+
input_image = input_image.resize([w_resize_new, h_resize_new], mode)
|
137 |
+
|
138 |
+
if pad_to_max_side:
|
139 |
+
res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
|
140 |
+
offset_x = (max_side - w_resize_new) // 2
|
141 |
+
offset_y = (max_side - h_resize_new) // 2
|
142 |
+
res[offset_y:offset_y+h_resize_new, offset_x:offset_x+w_resize_new] = np.array(input_image)
|
143 |
+
input_image = Image.fromarray(res)
|
144 |
+
return input_image
|
145 |
+
|
146 |
+
@spaces.GPU(duration=180)
|
147 |
+
def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed, progress=gr.Progress(track_tqdm=True)):
|
148 |
+
|
149 |
+
control_mode_num = mode_mapping[control_mode]
|
150 |
+
|
151 |
+
if cond_in is None:
|
152 |
+
if image_in is not None:
|
153 |
+
image_in = resize_img(load_image(image_in))
|
154 |
+
if control_mode == "canny":
|
155 |
+
control_image = extract_canny(image_in)
|
156 |
+
elif control_mode == "depth":
|
157 |
+
control_image = extract_depth(image_in)
|
158 |
+
elif control_mode == "openpose":
|
159 |
+
control_image = extract_openpose(image_in)
|
160 |
+
elif control_mode == "blur":
|
161 |
+
control_image = apply_gaussian_blur(image_in)
|
162 |
+
elif control_mode == "low quality":
|
163 |
+
control_image = add_gaussian_noise(image_in)
|
164 |
+
elif control_mode == "gray":
|
165 |
+
control_image = convert_to_grayscale(image_in)
|
166 |
+
elif control_mode == "tile":
|
167 |
+
control_image = tile(image_in)
|
168 |
+
else:
|
169 |
+
control_image = resize_img(load_image(cond_in))
|
170 |
+
|
171 |
+
width, height = control_image.size
|
172 |
+
|
173 |
+
image = pipe(
|
174 |
+
prompt,
|
175 |
+
control_image=[control_image],
|
176 |
+
control_mode=[control_mode_num],
|
177 |
+
width=width,
|
178 |
+
height=height,
|
179 |
+
controlnet_conditioning_scale=[control_strength],
|
180 |
+
num_inference_steps=inference_steps,
|
181 |
+
guidance_scale=guidance_scale,
|
182 |
+
generator=torch.manual_seed(seed),
|
183 |
+
).images[0]
|
184 |
+
|
185 |
+
torch.cuda.empty_cache()
|
186 |
+
|
187 |
+
return image, control_image, gr.update(visible=True)
|
188 |
+
|
189 |
+
|
190 |
+
css="""
|
191 |
+
#col-container{
|
192 |
+
margin: 0 auto;
|
193 |
+
max-width: 1080px;
|
194 |
+
}
|
195 |
+
"""
|
196 |
+
with gr.Blocks(css=css) as demo:
|
197 |
+
with gr.Column(elem_id="col-container"):
|
198 |
+
gr.Markdown("""
|
199 |
+
# FLUX.1-dev-ControlNet-Union-Pro
|
200 |
+
A unified ControlNet for FLUX.1-dev model from the InstantX team and Shakker Labs. Model card: [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro). <br />
|
201 |
+
The recommended strength: {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}. Long prompt is preferred by FLUX.1.
|
202 |
+
""")
|
203 |
+
|
204 |
+
with gr.Column():
|
205 |
+
|
206 |
+
with gr.Row():
|
207 |
+
with gr.Column():
|
208 |
+
|
209 |
+
with gr.Row(equal_height=True):
|
210 |
+
cond_in = gr.Image(label="Upload a processed control image", sources=["upload"], type="filepath")
|
211 |
+
image_in = gr.Image(label="Extract condition from a reference image (Optional)", sources=["upload"], type="filepath")
|
212 |
+
|
213 |
+
prompt = gr.Textbox(label="Prompt", value="best quality")
|
214 |
+
|
215 |
+
with gr.Accordion("Controlnet"):
|
216 |
+
control_mode = gr.Radio(
|
217 |
+
["canny", "depth", "openpose", "gray", "blur", "tile", "low quality"], label="Mode", value="gray",
|
218 |
+
info="select the control mode, one for all"
|
219 |
+
)
|
220 |
+
|
221 |
+
control_strength = gr.Slider(
|
222 |
+
label="control strength",
|
223 |
+
minimum=0,
|
224 |
+
maximum=1.0,
|
225 |
+
step=0.05,
|
226 |
+
value=0.50,
|
227 |
+
)
|
228 |
+
|
229 |
+
seed = gr.Slider(
|
230 |
+
label="Seed",
|
231 |
+
minimum=0,
|
232 |
+
maximum=MAX_SEED,
|
233 |
+
step=1,
|
234 |
+
value=42,
|
235 |
+
)
|
236 |
+
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
|
237 |
+
|
238 |
+
with gr.Accordion("Advanced settings", open=False):
|
239 |
+
with gr.Column():
|
240 |
+
with gr.Row():
|
241 |
+
inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=50, step=1, value=24)
|
242 |
+
guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)
|
243 |
+
|
244 |
+
submit_btn = gr.Button("Submit")
|
245 |
+
|
246 |
+
with gr.Column():
|
247 |
+
result = gr.Image(label="Result")
|
248 |
+
processed_cond = gr.Image(label="Preprocessed Cond")
|
249 |
+
|
250 |
+
submit_btn.click(
|
251 |
+
fn=randomize_seed_fn,
|
252 |
+
inputs=[seed, randomize_seed],
|
253 |
+
outputs=seed,
|
254 |
+
queue=False,
|
255 |
+
api_name=False
|
256 |
+
).then(
|
257 |
+
fn = infer,
|
258 |
+
inputs = [cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
|
259 |
+
outputs = [result, processed_cond],
|
260 |
+
show_api=False
|
261 |
+
)
|
262 |
+
|
263 |
+
demo.queue(api_open=False)
|
264 |
+
demo.launch()
|
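Not part of the uploaded files: a minimal sketch of driving the same pipeline outside Gradio, reusing the objects defined in app.py above (`pipe`, `extract_depth`, `resize_img`, `mode_mapping`, `strength_mapping`). The image path is a placeholder.

from diffusers.utils import load_image
import torch

ref = resize_img(load_image("example.jpg"))    # placeholder path to a reference photo
control_image = extract_depth(ref)             # depth condition; union mode index 2
width, height = control_image.size

result = pipe(
    "best quality",                            # FLUX.1 reportedly prefers longer prompts
    control_image=[control_image],
    control_mode=[mode_mapping["depth"]],
    width=width,
    height=height,
    controlnet_conditioning_scale=[strength_mapping["depth"]],
    num_inference_steps=24,
    guidance_scale=3.5,
    generator=torch.manual_seed(42),
).images[0]
result.save("depth_controlled.png")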
controlnet_aux/.DS_Store
ADDED
Binary file (6.15 kB). View file
controlnet_aux/__init__.py
ADDED
@@ -0,0 +1,5 @@
__version__ = "0.0.9"

from .canny import CannyDetector
from .open_pose import OpenposeDetector
controlnet_aux/canny/__init__.py
ADDED
@@ -0,0 +1,36 @@
import warnings
import cv2
import numpy as np
from PIL import Image
from ..util import HWC3, resize_image

class CannyDetector:
    def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, image_resolution=512, output_type=None, **kwargs):
        if "img" in kwargs:
            warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
            input_image = kwargs.pop("img")

        if input_image is None:
            raise ValueError("input_image must be defined.")

        if not isinstance(input_image, np.ndarray):
            input_image = np.array(input_image, dtype=np.uint8)
            output_type = output_type or "pil"
        else:
            output_type = output_type or "np"

        input_image = HWC3(input_image)
        input_image = resize_image(input_image, detect_resolution)

        detected_map = cv2.Canny(input_image, low_threshold, high_threshold)
        detected_map = HWC3(detected_map)

        img = resize_image(input_image, image_resolution)
        H, W, C = img.shape

        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)

        return detected_map
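Not part of the uploaded files: a minimal usage sketch of the detector above, mirroring the defaults in its `__call__` signature. The input path is a placeholder.

from PIL import Image
from controlnet_aux import CannyDetector

canny = CannyDetector()
image = Image.open("example.jpg")                 # placeholder path
edges = canny(image, low_threshold=100, high_threshold=200,
              detect_resolution=512, image_resolution=512)
edges.save("canny_edges.png")                     # PIL input yields a PIL output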
controlnet_aux/open_pose/LICENSE
ADDED
@@ -0,0 +1,108 @@
1 |
+
OPENPOSE: MULTIPERSON KEYPOINT DETECTION
|
2 |
+
SOFTWARE LICENSE AGREEMENT
|
3 |
+
ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
|
4 |
+
|
5 |
+
BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
|
6 |
+
|
7 |
+
This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
|
8 |
+
|
9 |
+
RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
|
10 |
+
Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
|
11 |
+
non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
|
12 |
+
|
13 |
+
CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
|
14 |
+
|
15 |
+
COPYRIGHT: The Software is owned by Licensor and is protected by United
|
16 |
+
States copyright laws and applicable international treaties and/or conventions.
|
17 |
+
|
18 |
+
PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
|
19 |
+
|
20 |
+
DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
|
21 |
+
|
22 |
+
BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
|
23 |
+
|
24 |
+
USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
|
25 |
+
|
26 |
+
You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
|
27 |
+
|
28 |
+
ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
|
29 |
+
|
30 |
+
TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
|
31 |
+
|
32 |
+
The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
|
33 |
+
|
34 |
+
FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
|
35 |
+
|
36 |
+
DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
|
37 |
+
|
38 |
+
SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
|
39 |
+
|
40 |
+
EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
|
41 |
+
|
42 |
+
EXPORT REGULATION: Licensee agrees to comply with any and all applicable
|
43 |
+
U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
|
44 |
+
|
45 |
+
SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
|
46 |
+
|
47 |
+
NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
|
48 |
+
|
49 |
+
GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
|
50 |
+
|
51 |
+
ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
************************************************************************
|
56 |
+
|
57 |
+
THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
|
58 |
+
|
59 |
+
This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
|
60 |
+
|
61 |
+
1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
|
62 |
+
|
63 |
+
COPYRIGHT
|
64 |
+
|
65 |
+
All contributions by the University of California:
|
66 |
+
Copyright (c) 2014-2017 The Regents of the University of California (Regents)
|
67 |
+
All rights reserved.
|
68 |
+
|
69 |
+
All other contributions:
|
70 |
+
Copyright (c) 2014-2017, the respective contributors
|
71 |
+
All rights reserved.
|
72 |
+
|
73 |
+
Caffe uses a shared copyright model: each contributor holds copyright over
|
74 |
+
their contributions to Caffe. The project versioning records all such
|
75 |
+
contribution and copyright details. If a contributor wants to further mark
|
76 |
+
their specific copyright on a particular contribution, they should indicate
|
77 |
+
their copyright solely in the commit message of the change when it is
|
78 |
+
committed.
|
79 |
+
|
80 |
+
LICENSE
|
81 |
+
|
82 |
+
Redistribution and use in source and binary forms, with or without
|
83 |
+
modification, are permitted provided that the following conditions are met:
|
84 |
+
|
85 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
86 |
+
list of conditions and the following disclaimer.
|
87 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
88 |
+
this list of conditions and the following disclaimer in the documentation
|
89 |
+
and/or other materials provided with the distribution.
|
90 |
+
|
91 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
92 |
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
93 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
94 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
95 |
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
96 |
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
97 |
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
98 |
+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
99 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
100 |
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
101 |
+
|
102 |
+
CONTRIBUTION AGREEMENT
|
103 |
+
|
104 |
+
By contributing to the BVLC/caffe repository through pull-request, comment,
|
105 |
+
or otherwise, the contributor releases their content to the
|
106 |
+
license and copyright terms herein.
|
107 |
+
|
108 |
+
************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
|
controlnet_aux/open_pose/__init__.py
ADDED
@@ -0,0 +1,234 @@
# Openpose
# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
# 3rd Edited by ControlNet
# 4th Edited by ControlNet (added face and correct hands)
# 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixs)
# This preprocessor is licensed by CMU for non-commercial use only.


import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import json
import warnings
from typing import Callable, List, NamedTuple, Tuple, Union

import cv2
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from ..util import HWC3, resize_image
from . import util
from .body import Body, BodyResult, Keypoint
from .face import Face
from .hand import Hand

HandResult = List[Keypoint]
FaceResult = List[Keypoint]

class PoseResult(NamedTuple):
    body: BodyResult
    left_hand: Union[HandResult, None]
    right_hand: Union[HandResult, None]
    face: Union[FaceResult, None]

def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
    """
    Draw the detected poses on an empty canvas.

    Args:
        poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
        H (int): The height of the canvas.
        W (int): The width of the canvas.
        draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
        draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
        draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.

    Returns:
        numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
    """
    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)

    for pose in poses:
        if draw_body:
            canvas = util.draw_bodypose(canvas, pose.body.keypoints)

        if draw_hand:
            canvas = util.draw_handpose(canvas, pose.left_hand)
            canvas = util.draw_handpose(canvas, pose.right_hand)

        if draw_face:
            canvas = util.draw_facepose(canvas, pose.face)

    return canvas


class OpenposeDetector:
    """
    A class for detecting human poses in images using the Openpose model.

    Attributes:
        model_dir (str): Path to the directory where the pose models are stored.
    """
    def __init__(self, body_estimation, hand_estimation=None, face_estimation=None):
        self.body_estimation = body_estimation
        self.hand_estimation = hand_estimation
        self.face_estimation = face_estimation

    @classmethod
    def from_pretrained(cls, pretrained_model_or_path, filename=None, hand_filename=None, face_filename=None, cache_dir=None, local_files_only=False):

        if pretrained_model_or_path == "lllyasviel/ControlNet":
            filename = filename or "annotator/ckpts/body_pose_model.pth"
            hand_filename = hand_filename or "annotator/ckpts/hand_pose_model.pth"
            face_filename = face_filename or "facenet.pth"

            face_pretrained_model_or_path = "lllyasviel/Annotators"
        else:
            filename = filename or "body_pose_model.pth"
            hand_filename = hand_filename or "hand_pose_model.pth"
            face_filename = face_filename or "facenet.pth"

            face_pretrained_model_or_path = pretrained_model_or_path

        if os.path.isdir(pretrained_model_or_path):
            body_model_path = os.path.join(pretrained_model_or_path, filename)
            hand_model_path = os.path.join(pretrained_model_or_path, hand_filename)
            face_model_path = os.path.join(face_pretrained_model_or_path, face_filename)
        else:
            body_model_path = hf_hub_download(pretrained_model_or_path, filename, cache_dir=cache_dir, local_files_only=local_files_only)
            hand_model_path = hf_hub_download(pretrained_model_or_path, hand_filename, cache_dir=cache_dir, local_files_only=local_files_only)
            face_model_path = hf_hub_download(face_pretrained_model_or_path, face_filename, cache_dir=cache_dir, local_files_only=local_files_only)

        body_estimation = Body(body_model_path)
        hand_estimation = Hand(hand_model_path)
        face_estimation = Face(face_model_path)

        return cls(body_estimation, hand_estimation, face_estimation)

    def to(self, device):
        self.body_estimation.to(device)
        self.hand_estimation.to(device)
        self.face_estimation.to(device)
        return self

    def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
        left_hand = None
        right_hand = None
        H, W, _ = oriImg.shape
        for x, y, w, is_left in util.handDetect(body, oriImg):
            peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
            if peaks.ndim == 2 and peaks.shape[1] == 2:
                peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
                peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)

                hand_result = [
                    Keypoint(x=peak[0], y=peak[1])
                    for peak in peaks
                ]

                if is_left:
                    left_hand = hand_result
                else:
                    right_hand = hand_result

        return left_hand, right_hand

    def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
        face = util.faceDetect(body, oriImg)
        if face is None:
            return None

        x, y, w = face
        H, W, _ = oriImg.shape
        heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
        peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
        if peaks.ndim == 2 and peaks.shape[1] == 2:
            peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
            peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
            return [
                Keypoint(x=peak[0], y=peak[1])
                for peak in peaks
            ]

        return None

    def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
        """
        Detect poses in the given image.
        Args:
            oriImg (numpy.ndarray): The input image for pose detection.
            include_hand (bool, optional): Whether to include hand detection. Defaults to False.
            include_face (bool, optional): Whether to include face detection. Defaults to False.

        Returns:
            List[PoseResult]: A list of PoseResult objects containing the detected poses.
        """
        oriImg = oriImg[:, :, ::-1].copy()
        H, W, C = oriImg.shape
        with torch.no_grad():
            candidate, subset = self.body_estimation(oriImg)
            bodies = self.body_estimation.format_body_result(candidate, subset)

            results = []
            for body in bodies:
                left_hand, right_hand, face = (None,) * 3
                if include_hand:
                    left_hand, right_hand = self.detect_hands(body, oriImg)
                if include_face:
                    face = self.detect_face(body, oriImg)

                results.append(PoseResult(BodyResult(
                    keypoints=[
                        Keypoint(
                            x=keypoint.x / float(W),
                            y=keypoint.y / float(H)
                        ) if keypoint is not None else None
                        for keypoint in body.keypoints
                    ],
                    total_score=body.total_score,
                    total_parts=body.total_parts
                ), left_hand, right_hand, face))

            return results

    def __call__(self, input_image, detect_resolution=512, image_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", **kwargs):
        if hand_and_face is not None:
            warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
            include_hand = hand_and_face
            include_face = hand_and_face

        if "return_pil" in kwargs:
            warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
            output_type = "pil" if kwargs["return_pil"] else "np"
        if type(output_type) is bool:
            warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
            if output_type:
                output_type = "pil"

        if not isinstance(input_image, np.ndarray):
            input_image = np.array(input_image, dtype=np.uint8)

        input_image = HWC3(input_image)
        input_image = resize_image(input_image, detect_resolution)
        H, W, C = input_image.shape

        poses = self.detect_poses(input_image, include_hand, include_face)
        canvas = draw_poses(poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face)

        detected_map = canvas
        detected_map = HWC3(detected_map)

        img = resize_image(input_image, image_resolution)
        H, W, C = img.shape

        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)

        return detected_map
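Not part of the uploaded files: a minimal usage sketch of the detector above, loading the checkpoints from lllyasviel/Annotators as app.py does. The input path is a placeholder.

from PIL import Image
from controlnet_aux import OpenposeDetector

open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
image = Image.open("person.jpg")                  # placeholder path
pose_map = open_pose(image, include_body=True, include_hand=True, include_face=True)
pose_map.save("openpose_map.png")                 # rendered keypoint canvas as a PIL image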
controlnet_aux/open_pose/body.py
ADDED
@@ -0,0 +1,260 @@
1 |
+
import math
|
2 |
+
from typing import List, NamedTuple, Union
|
3 |
+
|
4 |
+
import cv2
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from scipy.ndimage.filters import gaussian_filter
|
8 |
+
|
9 |
+
from . import util
|
10 |
+
from .model import bodypose_model
|
11 |
+
|
12 |
+
|
13 |
+
class Keypoint(NamedTuple):
|
14 |
+
x: float
|
15 |
+
y: float
|
16 |
+
score: float = 1.0
|
17 |
+
id: int = -1
|
18 |
+
|
19 |
+
|
20 |
+
class BodyResult(NamedTuple):
|
21 |
+
# Note: Using `Union` instead of `|` operator as the ladder is a Python
|
22 |
+
# 3.10 feature.
|
23 |
+
# Annotator code should be Python 3.8 Compatible, as controlnet repo uses
|
24 |
+
# Python 3.8 environment.
|
25 |
+
# https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
|
26 |
+
keypoints: List[Union[Keypoint, None]]
|
27 |
+
total_score: float
|
28 |
+
total_parts: int
|
29 |
+
|
30 |
+
|
31 |
+
class Body(object):
|
32 |
+
def __init__(self, model_path):
|
33 |
+
self.model = bodypose_model()
|
34 |
+
model_dict = util.transfer(self.model, torch.load(model_path))
|
35 |
+
self.model.load_state_dict(model_dict)
|
36 |
+
self.model.eval()
|
37 |
+
|
38 |
+
def to(self, device):
|
39 |
+
self.model.to(device)
|
40 |
+
return self
|
41 |
+
|
42 |
+
def __call__(self, oriImg):
|
43 |
+
device = next(iter(self.model.parameters())).device
|
44 |
+
# scale_search = [0.5, 1.0, 1.5, 2.0]
|
45 |
+
scale_search = [0.5]
|
46 |
+
boxsize = 368
|
47 |
+
stride = 8
|
48 |
+
padValue = 128
|
49 |
+
thre1 = 0.1
|
50 |
+
thre2 = 0.05
|
51 |
+
multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
|
52 |
+
heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
|
53 |
+
paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
|
54 |
+
|
55 |
+
for m in range(len(multiplier)):
|
56 |
+
scale = multiplier[m]
|
57 |
+
imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
|
58 |
+
imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
|
59 |
+
im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
|
60 |
+
im = np.ascontiguousarray(im)
|
61 |
+
|
62 |
+
data = torch.from_numpy(im).float()
|
63 |
+
data = data.to(device)
|
64 |
+
# data = data.permute([2, 0, 1]).unsqueeze(0).float()
|
65 |
+
with torch.no_grad():
|
66 |
+
Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
|
67 |
+
Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
|
68 |
+
Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
|
69 |
+
|
70 |
+
# extract outputs, resize, and remove padding
|
71 |
+
# heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
|
72 |
+
heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
|
73 |
+
heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
|
74 |
+
heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
|
75 |
+
heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
|
76 |
+
|
77 |
+
# paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
|
78 |
+
paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
|
79 |
+
paf = util.smart_resize_k(paf, fx=stride, fy=stride)
|
80 |
+
paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
|
81 |
+
paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
|
82 |
+
|
83 |
+
heatmap_avg += heatmap_avg + heatmap / len(multiplier)
|
84 |
+
paf_avg += + paf / len(multiplier)
|
85 |
+
|
86 |
+
all_peaks = []
|
87 |
+
peak_counter = 0
|
88 |
+
|
89 |
+
for part in range(18):
|
90 |
+
map_ori = heatmap_avg[:, :, part]
|
91 |
+
one_heatmap = gaussian_filter(map_ori, sigma=3)
|
92 |
+
|
93 |
+
map_left = np.zeros(one_heatmap.shape)
|
94 |
+
map_left[1:, :] = one_heatmap[:-1, :]
|
95 |
+
map_right = np.zeros(one_heatmap.shape)
|
96 |
+
map_right[:-1, :] = one_heatmap[1:, :]
|
97 |
+
map_up = np.zeros(one_heatmap.shape)
|
98 |
+
map_up[:, 1:] = one_heatmap[:, :-1]
|
99 |
+
map_down = np.zeros(one_heatmap.shape)
|
100 |
+
map_down[:, :-1] = one_heatmap[:, 1:]
|
101 |
+
|
102 |
+
peaks_binary = np.logical_and.reduce(
|
103 |
+
(one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
|
104 |
+
peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
|
105 |
+
peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
|
106 |
+
peak_id = range(peak_counter, peak_counter + len(peaks))
|
107 |
+
peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
|
108 |
+
|
109 |
+
all_peaks.append(peaks_with_score_and_id)
|
110 |
+
peak_counter += len(peaks)
|
111 |
+
|
112 |
+
# find connection in the specified sequence, center 29 is in the position 15
|
113 |
+
limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
|
114 |
+
[10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
|
115 |
+
[1, 16], [16, 18], [3, 17], [6, 18]]
|
116 |
+
# the middle joints heatmap correpondence
|
117 |
+
mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
|
118 |
+
[23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
|
119 |
+
[55, 56], [37, 38], [45, 46]]
|
120 |
+
|
121 |
+
connection_all = []
|
122 |
+
special_k = []
|
123 |
+
mid_num = 10
|
124 |
+
|
125 |
+
for k in range(len(mapIdx)):
|
126 |
+
score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
|
127 |
+
candA = all_peaks[limbSeq[k][0] - 1]
|
128 |
+
candB = all_peaks[limbSeq[k][1] - 1]
|
129 |
+
nA = len(candA)
|
130 |
+
nB = len(candB)
|
131 |
+
indexA, indexB = limbSeq[k]
|
132 |
+
if (nA != 0 and nB != 0):
|
133 |
+
connection_candidate = []
|
134 |
+
for i in range(nA):
|
135 |
+
for j in range(nB):
|
136 |
+
vec = np.subtract(candB[j][:2], candA[i][:2])
|
137 |
+
norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
|
138 |
+
norm = max(0.001, norm)
|
139 |
+
vec = np.divide(vec, norm)
|
140 |
+
|
141 |
+
startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
|
142 |
+
np.linspace(candA[i][1], candB[j][1], num=mid_num)))
|
143 |
+
|
144 |
+
vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
|
145 |
+
for I in range(len(startend))])
|
146 |
+
vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
|
147 |
+
for I in range(len(startend))])
|
148 |
+
|
149 |
+
score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
|
150 |
+
score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
|
151 |
+
0.5 * oriImg.shape[0] / norm - 1, 0)
|
152 |
+
criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
|
153 |
+
criterion2 = score_with_dist_prior > 0
|
154 |
+
if criterion1 and criterion2:
|
155 |
+
connection_candidate.append(
|
156 |
+
[i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
|
157 |
+
|
158 |
+
connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
|
159 |
+
connection = np.zeros((0, 5))
|
160 |
+
for c in range(len(connection_candidate)):
|
161 |
+
i, j, s = connection_candidate[c][0:3]
|
162 |
+
if (i not in connection[:, 3] and j not in connection[:, 4]):
|
163 |
+
connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
|
164 |
+
if (len(connection) >= min(nA, nB)):
|
165 |
+
break
|
166 |
+
|
167 |
+
connection_all.append(connection)
|
168 |
+
else:
|
169 |
+
special_k.append(k)
|
170 |
+
connection_all.append([])
|
171 |
+
|
172 |
+
# last number in each row is the total parts number of that person
|
173 |
+
# the second last number in each row is the score of the overall configuration
|
174 |
+
subset = -1 * np.ones((0, 20))
|
175 |
+
candidate = np.array([item for sublist in all_peaks for item in sublist])
|
176 |
+
|
177 |
+
for k in range(len(mapIdx)):
|
178 |
+
if k not in special_k:
|
179 |
+
partAs = connection_all[k][:, 0]
|
180 |
+
partBs = connection_all[k][:, 1]
|
181 |
+
indexA, indexB = np.array(limbSeq[k]) - 1
|
182 |
+
|
183 |
+
for i in range(len(connection_all[k])): # = 1:size(temp,1)
|
184 |
+
found = 0
|
185 |
+
subset_idx = [-1, -1]
|
186 |
+
for j in range(len(subset)): # 1:size(subset,1):
|
187 |
+
if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
|
188 |
+
subset_idx[found] = j
|
189 |
+
found += 1
|
190 |
+
|
191 |
+
if found == 1:
|
192 |
+
j = subset_idx[0]
|
193 |
+
if subset[j][indexB] != partBs[i]:
|
194 |
+
subset[j][indexB] = partBs[i]
|
195 |
+
subset[j][-1] += 1
|
196 |
+
subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
|
197 |
+
elif found == 2: # if found 2 and disjoint, merge them
|
198 |
+
j1, j2 = subset_idx
|
199 |
+
membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
|
200 |
+
if len(np.nonzero(membership == 2)[0]) == 0: # merge
|
201 |
+
subset[j1][:-2] += (subset[j2][:-2] + 1)
|
202 |
+
subset[j1][-2:] += subset[j2][-2:]
|
203 |
+
subset[j1][-2] += connection_all[k][i][2]
|
204 |
+
subset = np.delete(subset, j2, 0)
|
205 |
+
else: # as like found == 1
|
206 |
+
subset[j1][indexB] = partBs[i]
|
207 |
+
subset[j1][-1] += 1
|
208 |
+
subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
|
209 |
+
|
210 |
+
# if find no partA in the subset, create a new subset
|
211 |
+
elif not found and k < 17:
|
212 |
+
row = -1 * np.ones(20)
|
213 |
+
row[indexA] = partAs[i]
|
214 |
+
row[indexB] = partBs[i]
|
215 |
+
row[-1] = 2
|
216 |
+
row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
|
217 |
+
subset = np.vstack([subset, row])
|
218 |
+
# delete some rows of subset which has few parts occur
|
219 |
+
deleteIdx = []
|
220 |
+
for i in range(len(subset)):
|
221 |
+
if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
|
222 |
+
deleteIdx.append(i)
|
223 |
+
subset = np.delete(subset, deleteIdx, axis=0)
|
224 |
+
|
225 |
+
# subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
|
226 |
+
# candidate: x, y, score, id
|
227 |
+
return candidate, subset
|
228 |
+
|
229 |
+
@staticmethod
|
230 |
+
def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
|
231 |
+
"""
|
232 |
+
Format the body results from the candidate and subset arrays into a list of BodyResult objects.
|
233 |
+
|
234 |
+
Args:
|
235 |
+
candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
|
236 |
+
for each body part.
|
237 |
+
subset (np.ndarray): An array of subsets containing indices to the candidate array for each
|
238 |
+
person detected. The last two columns of each row hold the total score and total parts
|
239 |
+
of the person.
|
240 |
+
|
241 |
+
Returns:
|
242 |
+
List[BodyResult]: A list of BodyResult objects, where each object represents a person with
|
243 |
+
detected keypoints, total score, and total parts.
|
244 |
+
"""
|
245 |
+
return [
|
246 |
+
BodyResult(
|
247 |
+
keypoints=[
|
248 |
+
Keypoint(
|
249 |
+
x=candidate[candidate_index][0],
|
250 |
+
y=candidate[candidate_index][1],
|
251 |
+
score=candidate[candidate_index][2],
|
252 |
+
id=candidate[candidate_index][3]
|
253 |
+
) if candidate_index != -1 else None
|
254 |
+
for candidate_index in person[:18].astype(int)
|
255 |
+
],
|
256 |
+
total_score=person[18],
|
257 |
+
total_parts=person[19]
|
258 |
+
)
|
259 |
+
for person in subset
|
260 |
+
]
|
controlnet_aux/open_pose/face.py
ADDED
@@ -0,0 +1,364 @@
1 |
+
import logging
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from torch.nn import Conv2d, MaxPool2d, Module, ReLU, init
|
7 |
+
from torchvision.transforms import ToPILImage, ToTensor
|
8 |
+
|
9 |
+
from . import util
|
10 |
+
|
11 |
+
|
12 |
+
class FaceNet(Module):
|
13 |
+
"""Model the cascading heatmaps. """
|
14 |
+
def __init__(self):
|
15 |
+
super(FaceNet, self).__init__()
|
16 |
+
# cnn to make feature map
|
17 |
+
self.relu = ReLU()
|
18 |
+
self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
|
19 |
+
self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
|
20 |
+
kernel_size=3, stride=1, padding=1)
|
21 |
+
self.conv1_2 = Conv2d(
|
22 |
+
in_channels=64, out_channels=64, kernel_size=3, stride=1,
|
23 |
+
padding=1)
|
24 |
+
self.conv2_1 = Conv2d(
|
25 |
+
in_channels=64, out_channels=128, kernel_size=3, stride=1,
|
26 |
+
padding=1)
|
27 |
+
self.conv2_2 = Conv2d(
|
28 |
+
in_channels=128, out_channels=128, kernel_size=3, stride=1,
|
29 |
+
padding=1)
|
30 |
+
self.conv3_1 = Conv2d(
|
31 |
+
in_channels=128, out_channels=256, kernel_size=3, stride=1,
|
32 |
+
padding=1)
|
33 |
+
self.conv3_2 = Conv2d(
|
34 |
+
in_channels=256, out_channels=256, kernel_size=3, stride=1,
|
35 |
+
padding=1)
|
36 |
+
self.conv3_3 = Conv2d(
|
37 |
+
in_channels=256, out_channels=256, kernel_size=3, stride=1,
|
38 |
+
padding=1)
|
39 |
+
self.conv3_4 = Conv2d(
|
40 |
+
in_channels=256, out_channels=256, kernel_size=3, stride=1,
|
41 |
+
padding=1)
|
42 |
+
self.conv4_1 = Conv2d(
|
43 |
+
in_channels=256, out_channels=512, kernel_size=3, stride=1,
|
44 |
+
padding=1)
|
45 |
+
self.conv4_2 = Conv2d(
|
46 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
47 |
+
padding=1)
|
48 |
+
self.conv4_3 = Conv2d(
|
49 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
50 |
+
padding=1)
|
51 |
+
self.conv4_4 = Conv2d(
|
52 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
53 |
+
padding=1)
|
54 |
+
self.conv5_1 = Conv2d(
|
55 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
56 |
+
padding=1)
|
57 |
+
self.conv5_2 = Conv2d(
|
58 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
59 |
+
padding=1)
|
60 |
+
self.conv5_3_CPM = Conv2d(
|
61 |
+
in_channels=512, out_channels=128, kernel_size=3, stride=1,
|
62 |
+
padding=1)
|
63 |
+
|
64 |
+
# stage1
|
65 |
+
self.conv6_1_CPM = Conv2d(
|
66 |
+
in_channels=128, out_channels=512, kernel_size=1, stride=1,
|
67 |
+
padding=0)
|
68 |
+
self.conv6_2_CPM = Conv2d(
|
69 |
+
in_channels=512, out_channels=71, kernel_size=1, stride=1,
|
70 |
+
padding=0)
|
71 |
+
|
72 |
+
# stage2
|
73 |
+
self.Mconv1_stage2 = Conv2d(
|
74 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
75 |
+
padding=3)
|
76 |
+
self.Mconv2_stage2 = Conv2d(
|
77 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
78 |
+
padding=3)
|
79 |
+
self.Mconv3_stage2 = Conv2d(
|
80 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
81 |
+
padding=3)
|
82 |
+
self.Mconv4_stage2 = Conv2d(
|
83 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
84 |
+
padding=3)
|
85 |
+
self.Mconv5_stage2 = Conv2d(
|
86 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
87 |
+
padding=3)
|
88 |
+
self.Mconv6_stage2 = Conv2d(
|
89 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
90 |
+
padding=0)
|
91 |
+
self.Mconv7_stage2 = Conv2d(
|
92 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
93 |
+
padding=0)
|
94 |
+
|
95 |
+
# stage3
|
96 |
+
self.Mconv1_stage3 = Conv2d(
|
97 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
98 |
+
padding=3)
|
99 |
+
self.Mconv2_stage3 = Conv2d(
|
100 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
101 |
+
padding=3)
|
102 |
+
self.Mconv3_stage3 = Conv2d(
|
103 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
104 |
+
padding=3)
|
105 |
+
self.Mconv4_stage3 = Conv2d(
|
106 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
107 |
+
padding=3)
|
108 |
+
self.Mconv5_stage3 = Conv2d(
|
109 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
110 |
+
padding=3)
|
111 |
+
self.Mconv6_stage3 = Conv2d(
|
112 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
113 |
+
padding=0)
|
114 |
+
self.Mconv7_stage3 = Conv2d(
|
115 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
116 |
+
padding=0)
|
117 |
+
|
118 |
+
# stage4
|
119 |
+
self.Mconv1_stage4 = Conv2d(
|
120 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
121 |
+
padding=3)
|
122 |
+
self.Mconv2_stage4 = Conv2d(
|
123 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
124 |
+
padding=3)
|
125 |
+
self.Mconv3_stage4 = Conv2d(
|
126 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
127 |
+
padding=3)
|
128 |
+
self.Mconv4_stage4 = Conv2d(
|
129 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
130 |
+
padding=3)
|
131 |
+
self.Mconv5_stage4 = Conv2d(
|
132 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
133 |
+
padding=3)
|
134 |
+
self.Mconv6_stage4 = Conv2d(
|
135 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
136 |
+
padding=0)
|
137 |
+
self.Mconv7_stage4 = Conv2d(
|
138 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
139 |
+
padding=0)
|
140 |
+
|
141 |
+
# stage5
|
142 |
+
self.Mconv1_stage5 = Conv2d(
|
143 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
144 |
+
padding=3)
|
145 |
+
self.Mconv2_stage5 = Conv2d(
|
146 |
+
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv3_stage5 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv4_stage5 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv5_stage5 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv6_stage5 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=1, stride=1,
            padding=0)
        self.Mconv7_stage5 = Conv2d(
            in_channels=128, out_channels=71, kernel_size=1, stride=1,
            padding=0)

        # stage6
        self.Mconv1_stage6 = Conv2d(
            in_channels=199, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv2_stage6 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv3_stage6 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv4_stage6 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv5_stage6 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=7, stride=1,
            padding=3)
        self.Mconv6_stage6 = Conv2d(
            in_channels=128, out_channels=128, kernel_size=1, stride=1,
            padding=0)
        self.Mconv7_stage6 = Conv2d(
            in_channels=128, out_channels=71, kernel_size=1, stride=1,
            padding=0)

        for m in self.modules():
            if isinstance(m, Conv2d):
                init.constant_(m.bias, 0)

    def forward(self, x):
        """Return a list of heatmaps."""
        heatmaps = []

        h = self.relu(self.conv1_1(x))
        h = self.relu(self.conv1_2(h))
        h = self.max_pooling_2d(h)
        h = self.relu(self.conv2_1(h))
        h = self.relu(self.conv2_2(h))
        h = self.max_pooling_2d(h)
        h = self.relu(self.conv3_1(h))
        h = self.relu(self.conv3_2(h))
        h = self.relu(self.conv3_3(h))
        h = self.relu(self.conv3_4(h))
        h = self.max_pooling_2d(h)
        h = self.relu(self.conv4_1(h))
        h = self.relu(self.conv4_2(h))
        h = self.relu(self.conv4_3(h))
        h = self.relu(self.conv4_4(h))
        h = self.relu(self.conv5_1(h))
        h = self.relu(self.conv5_2(h))
        h = self.relu(self.conv5_3_CPM(h))
        feature_map = h

        # stage1
        h = self.relu(self.conv6_1_CPM(h))
        h = self.conv6_2_CPM(h)
        heatmaps.append(h)

        # stage2
        h = torch.cat([h, feature_map], dim=1)  # channel concat
        h = self.relu(self.Mconv1_stage2(h))
        h = self.relu(self.Mconv2_stage2(h))
        h = self.relu(self.Mconv3_stage2(h))
        h = self.relu(self.Mconv4_stage2(h))
        h = self.relu(self.Mconv5_stage2(h))
        h = self.relu(self.Mconv6_stage2(h))
        h = self.Mconv7_stage2(h)
        heatmaps.append(h)

        # stage3
        h = torch.cat([h, feature_map], dim=1)  # channel concat
        h = self.relu(self.Mconv1_stage3(h))
        h = self.relu(self.Mconv2_stage3(h))
        h = self.relu(self.Mconv3_stage3(h))
        h = self.relu(self.Mconv4_stage3(h))
        h = self.relu(self.Mconv5_stage3(h))
        h = self.relu(self.Mconv6_stage3(h))
        h = self.Mconv7_stage3(h)
        heatmaps.append(h)

        # stage4
        h = torch.cat([h, feature_map], dim=1)  # channel concat
        h = self.relu(self.Mconv1_stage4(h))
        h = self.relu(self.Mconv2_stage4(h))
        h = self.relu(self.Mconv3_stage4(h))
        h = self.relu(self.Mconv4_stage4(h))
        h = self.relu(self.Mconv5_stage4(h))
        h = self.relu(self.Mconv6_stage4(h))
        h = self.Mconv7_stage4(h)
        heatmaps.append(h)

        # stage5
        h = torch.cat([h, feature_map], dim=1)  # channel concat
        h = self.relu(self.Mconv1_stage5(h))
        h = self.relu(self.Mconv2_stage5(h))
        h = self.relu(self.Mconv3_stage5(h))
        h = self.relu(self.Mconv4_stage5(h))
        h = self.relu(self.Mconv5_stage5(h))
        h = self.relu(self.Mconv6_stage5(h))
        h = self.Mconv7_stage5(h)
        heatmaps.append(h)

        # stage6
        h = torch.cat([h, feature_map], dim=1)  # channel concat
        h = self.relu(self.Mconv1_stage6(h))
        h = self.relu(self.Mconv2_stage6(h))
        h = self.relu(self.Mconv3_stage6(h))
        h = self.relu(self.Mconv4_stage6(h))
        h = self.relu(self.Mconv5_stage6(h))
        h = self.relu(self.Mconv6_stage6(h))
        h = self.Mconv7_stage6(h)
        heatmaps.append(h)

        return heatmaps


LOG = logging.getLogger(__name__)
TOTEN = ToTensor()
TOPIL = ToPILImage()


params = {
    'gaussian_sigma': 2.5,
    'inference_img_size': 736,  # 368, 736, 1312
    'heatmap_peak_thresh': 0.1,
    'crop_scale': 1.5,
    'line_indices': [
        [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
        [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
        [13, 14], [14, 15], [15, 16],
        [17, 18], [18, 19], [19, 20], [20, 21],
        [22, 23], [23, 24], [24, 25], [25, 26],
        [27, 28], [28, 29], [29, 30],
        [31, 32], [32, 33], [33, 34], [34, 35],
        [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
        [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
        [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
        [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
        [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
        [66, 67], [67, 60]
    ],
}


class Face(object):
    """
    The OpenPose face landmark detector model.

    Args:
        inference_size: set the size of the inference image size, suggested:
            368, 736, 1312, default 736
        gaussian_sigma: blur the heatmaps, default 2.5
        heatmap_peak_thresh: return landmark if over threshold, default 0.1

    """
    def __init__(self, face_model_path,
                 inference_size=None,
                 gaussian_sigma=None,
                 heatmap_peak_thresh=None):
        self.inference_size = inference_size or params["inference_img_size"]
        self.sigma = gaussian_sigma or params['gaussian_sigma']
        self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
        self.model = FaceNet()
        self.model.load_state_dict(torch.load(face_model_path))
        self.model.eval()

    def to(self, device):
        self.model.to(device)
        return self

    def __call__(self, face_img):
        device = next(iter(self.model.parameters())).device
        H, W, C = face_img.shape

        w_size = 384
        x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5

        x_data = x_data.to(device)

        with torch.no_grad():
            hs = self.model(x_data[None, ...])
            heatmaps = F.interpolate(
                hs[-1],
                (H, W),
                mode='bilinear', align_corners=True).cpu().numpy()[0]
        return heatmaps

    def compute_peaks_from_heatmaps(self, heatmaps):
        all_peaks = []
        for part in range(heatmaps.shape[0]):
            map_ori = heatmaps[part].copy()
            binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)

            if np.sum(binary) == 0:
                continue

            positions = np.where(binary > 0.5)
            intensities = map_ori[positions]
            mi = np.argmax(intensities)
            y, x = positions[0][mi], positions[1][mi]
            all_peaks.append([x, y])

        return np.array(all_peaks)
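A minimal usage sketch for the Face detector above (not part of the uploaded files; the checkpoint and image paths are placeholders, and the module path assumes the package layout shown in this upload):

# Hypothetical usage sketch; 'facenet.pth' and 'face_crop.jpg' are placeholder paths.
import cv2
from controlnet_aux.open_pose.face import Face

face_estimator = Face('facenet.pth').to('cpu')
face_img = cv2.imread('face_crop.jpg')                         # HWC array, BGR order from OpenCV
heatmaps = face_estimator(face_img)                            # (71, H, W) heatmaps at input resolution
peaks = face_estimator.compute_peaks_from_heatmaps(heatmaps)   # (N, 2) array of [x, y] pixel peaks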
controlnet_aux/open_pose/hand.py
ADDED
@@ -0,0 +1,90 @@
import cv2
import numpy as np
import torch
from scipy.ndimage.filters import gaussian_filter
from skimage.measure import label

from . import util
from .model import handpose_model


class Hand(object):
    def __init__(self, model_path):
        self.model = handpose_model()
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def to(self, device):
        self.model.to(device)
        return self

    def __call__(self, oriImgRaw):
        device = next(iter(self.model.parameters())).device
        scale_search = [0.5, 1.0, 1.5, 2.0]
        # scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre = 0.05
        multiplier = [x * boxsize for x in scale_search]

        wsize = 128
        heatmap_avg = np.zeros((wsize, wsize, 22))

        Hr, Wr, Cr = oriImgRaw.shape

        oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = util.smart_resize(oriImg, (scale, scale))

            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            data = data.to(device)

            with torch.no_grad():
                output = self.model(data).cpu().numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))  # output 1 is heatmaps
            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = util.smart_resize(heatmap, (wsize, wsize))

            heatmap_avg += heatmap / len(multiplier)

        all_peaks = []
        for part in range(21):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)
            binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)

            if np.sum(binary) == 0:
                all_peaks.append([0, 0])
                continue
            label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
            max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
            label_img[label_img != max_index] = 0
            map_ori[label_img == 0] = 0

            y, x = util.npmax(map_ori)
            y = int(float(y) * float(Hr) / float(wsize))
            x = int(float(x) * float(Wr) / float(wsize))
            all_peaks.append([x, y])
        return np.array(all_peaks)


if __name__ == "__main__":
    hand_estimation = Hand('../model/hand_pose_model.pth')

    # test_image = '../images/hand.jpg'
    test_image = '../images/hand.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    peaks = hand_estimation(oriImg)
    canvas = util.draw_handpose(oriImg, peaks, True)
    cv2.imshow('', canvas)
    cv2.waitKey(0)
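A minimal usage sketch for the Hand estimator above, mirroring the `__main__` block (not part of the uploaded files; the checkpoint and image paths are placeholders and the module path assumes this package layout):

# Hypothetical usage sketch; 'hand_pose_model.pth' and 'hand.jpg' are placeholder paths.
import cv2
from controlnet_aux.open_pose.hand import Hand

hand_estimation = Hand('hand_pose_model.pth').to('cpu')
oriImg = cv2.imread('hand.jpg')     # HWC array, B,G,R order
peaks = hand_estimation(oriImg)     # (21, 2) array of [x, y] keypoints in image pixels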
controlnet_aux/open_pose/model.py
ADDED
@@ -0,0 +1,217 @@
import torch
from collections import OrderedDict

import torch
import torch.nn as nn

def make_layers(block, no_relu_layers):
    layers = []
    for layer_name, v in block.items():
        if 'pool' in layer_name:
            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
                                 padding=v[2])
            layers.append((layer_name, layer))
        else:
            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
                               kernel_size=v[2], stride=v[3],
                               padding=v[4])
            layers.append((layer_name, conv2d))
            if layer_name not in no_relu_layers:
                layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))

class bodypose_model(nn.Module):
    def __init__(self):
        super(bodypose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
        blocks = {}
        block0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3_CPM', [512, 256, 3, 1, 1]),
            ('conv4_4_CPM', [256, 128, 3, 1, 1])
        ])


        # Stage 1
        block1_1 = OrderedDict([
            ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
        ])

        block1_2 = OrderedDict([
            ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
        ])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2

        self.model0 = make_layers(block0, no_relu_layers)

        # Stages 2 - 6
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])

            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']

        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']


    def forward(self, x):

        out1 = self.model0(x)

        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        out2 = torch.cat([out1_1, out1_2, out1], 1)

        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)

        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)

        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)

        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)

        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)

        return out6_1, out6_2

class handpose_model(nn.Module):
    def __init__(self):
        super(handpose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
        # stage 1
        block1_0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3', [512, 512, 3, 1, 1]),
            ('conv4_4', [512, 512, 3, 1, 1]),
            ('conv5_1', [512, 512, 3, 1, 1]),
            ('conv5_2', [512, 512, 3, 1, 1]),
            ('conv5_3_CPM', [512, 128, 3, 1, 1])
        ])

        block1_1 = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0])
        ])

        blocks = {}
        blocks['block1_0'] = block1_0
        blocks['block1_1'] = block1_1

        # stage 2-6
        for i in range(2, 7):
            blocks['block%d' % i] = OrderedDict([
                ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
                ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_0 = blocks['block1_0']
        self.model1_1 = blocks['block1_1']
        self.model2 = blocks['block2']
        self.model3 = blocks['block3']
        self.model4 = blocks['block4']
        self.model5 = blocks['block5']
        self.model6 = blocks['block6']

    def forward(self, x):
        out1_0 = self.model1_0(x)
        out1_1 = self.model1_1(out1_0)
        concat_stage2 = torch.cat([out1_1, out1_0], 1)
        out_stage2 = self.model2(concat_stage2)
        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
        out_stage3 = self.model3(concat_stage3)
        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
        out_stage4 = self.model4(concat_stage4)
        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
        out_stage5 = self.model5(concat_stage5)
        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
        out_stage6 = self.model6(concat_stage6)
        return out_stage6
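Both networks above follow the CPM pattern: a VGG-style backbone (`model0` / `model1_0`) produces a shared feature map, and each later stage concatenates the previous stage's prediction with that feature map before refining it. A short shape-check sketch on a dummy input (not part of the uploaded files; the 368x368 size and the 46 = 368/8 output size just reflect the three pooling layers):

# Hypothetical shape check with randomly initialised weights.
import torch
from controlnet_aux.open_pose.model import bodypose_model, handpose_model

body = bodypose_model().eval()
paf, heat = body(torch.zeros(1, 3, 368, 368))
print(paf.shape, heat.shape)    # torch.Size([1, 38, 46, 46]) torch.Size([1, 19, 46, 46])

hand = handpose_model().eval()
print(hand(torch.zeros(1, 3, 368, 368)).shape)    # torch.Size([1, 22, 46, 46])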
controlnet_aux/open_pose/util.py
ADDED
@@ -0,0 +1,383 @@
import math
import numpy as np
import cv2
from typing import List, Tuple, Union

from .body import BodyResult, Keypoint

eps = 0.01


def smart_resize(x, s):
    Ht, Wt = s
    if x.ndim == 2:
        Ho, Wo = x.shape
        Co = 1
    else:
        Ho, Wo, Co = x.shape
    if Co == 3 or Co == 1:
        k = float(Ht + Wt) / float(Ho + Wo)
        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
    else:
        return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)


def smart_resize_k(x, fx, fy):
    if x.ndim == 2:
        Ho, Wo = x.shape
        Co = 1
    else:
        Ho, Wo, Co = x.shape
    Ht, Wt = Ho * fy, Wo * fx
    if Co == 3 or Co == 1:
        k = float(Ht + Wt) / float(Ho + Wo)
        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
    else:
        return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)


def padRightDownCorner(img, stride, padValue):
    h = img.shape[0]
    w = img.shape[1]

    pad = 4 * [None]
    pad[0] = 0  # up
    pad[1] = 0  # left
    pad[2] = 0 if (h % stride == 0) else stride - (h % stride)  # down
    pad[3] = 0 if (w % stride == 0) else stride - (w % stride)  # right

    img_padded = img
    pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
    img_padded = np.concatenate((pad_up, img_padded), axis=0)
    pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
    img_padded = np.concatenate((pad_left, img_padded), axis=1)
    pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
    img_padded = np.concatenate((img_padded, pad_down), axis=0)
    pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
    img_padded = np.concatenate((img_padded, pad_right), axis=1)

    return img_padded, pad


def transfer(model, model_weights):
    transfered_model_weights = {}
    for weights_name in model.state_dict().keys():
        transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
    return transfered_model_weights


def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray:
    """
    Draw keypoints and limbs representing body pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
        keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    H, W, C = canvas.shape
    stickwidth = 4

    limbSeq = [
        [2, 3], [2, 6], [3, 4], [4, 5],
        [6, 7], [7, 8], [2, 9], [9, 10],
        [10, 11], [2, 12], [12, 13], [13, 14],
        [2, 1], [1, 15], [15, 17], [1, 16],
        [16, 18],
    ]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    for (k1_index, k2_index), color in zip(limbSeq, colors):
        keypoint1 = keypoints[k1_index - 1]
        keypoint2 = keypoints[k2_index - 1]

        if keypoint1 is None or keypoint2 is None:
            continue

        Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
        X = np.array([keypoint1.y, keypoint2.y]) * float(H)
        mX = np.mean(X)
        mY = np.mean(Y)
        length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
        polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
        cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])

    for keypoint, color in zip(keypoints, colors):
        if keypoint is None:
            continue

        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)

    return canvas


def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
    import matplotlib
    """
    Draw keypoints and connections representing hand pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
        keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
                                          or None if no keypoints are present.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    if not keypoints:
        return canvas

    H, W, C = canvas.shape

    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for ie, (e1, e2) in enumerate(edges):
        k1 = keypoints[e1]
        k2 = keypoints[e2]
        if k1 is None or k2 is None:
            continue

        x1 = int(k1.x * W)
        y1 = int(k1.y * H)
        x2 = int(k2.x * W)
        y2 = int(k2.y * H)
        if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
            cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)

    for keypoint in keypoints:
        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        if x > eps and y > eps:
            cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
    return canvas


def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
    """
    Draw keypoints representing face pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
        keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
                                          or None if no keypoints are present.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    if not keypoints:
        return canvas

    H, W, C = canvas.shape
    for keypoint in keypoints:
        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        if x > eps and y > eps:
            cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
    return canvas


# detect hand according to body pose keypoints
# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
    """
    Detect hands in the input body pose keypoints and calculate the bounding box for each hand.

    Args:
        body (BodyResult): A BodyResult object containing the detected body pose keypoints.
        oriImg (numpy.ndarray): A 3D numpy array representing the original input image.

    Returns:
        List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
                                          corner of the bounding box, the width (height) of the bounding box, and
                                          a boolean flag indicating whether the hand is a left hand (True) or a
                                          right hand (False).

    Notes:
        - The width and height of the bounding boxes are equal since the network requires squared input.
        - The minimum bounding box size is 20 pixels.
    """
    ratioWristElbow = 0.33
    detect_result = []
    image_height, image_width = oriImg.shape[0:2]

    keypoints = body.keypoints
    # right hand: wrist 4, elbow 3, shoulder 2
    # left hand: wrist 7, elbow 6, shoulder 5
    left_shoulder = keypoints[5]
    left_elbow = keypoints[6]
    left_wrist = keypoints[7]
    right_shoulder = keypoints[2]
    right_elbow = keypoints[3]
    right_wrist = keypoints[4]

    # if any of three not detected
    has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
    has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
    if not (has_left or has_right):
        return []

    hands = []
    # left hand
    if has_left:
        hands.append([
            left_shoulder.x, left_shoulder.y,
            left_elbow.x, left_elbow.y,
            left_wrist.x, left_wrist.y,
            True
        ])
    # right hand
    if has_right:
        hands.append([
            right_shoulder.x, right_shoulder.y,
            right_elbow.x, right_elbow.y,
            right_wrist.x, right_wrist.y,
            False
        ])

    for x1, y1, x2, y2, x3, y3, is_left in hands:
        # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
        # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
        # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
        # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
        # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
        # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
        x = x3 + ratioWristElbow * (x3 - x2)
        y = y3 + ratioWristElbow * (y3 - y2)
        distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
        distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
        width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
        # x-y refers to the center --> offset to topLeft point
        # handRectangle.x -= handRectangle.width / 2.f;
        # handRectangle.y -= handRectangle.height / 2.f;
        x -= width / 2
        y -= width / 2  # width = height
        # overflow the image
        if x < 0: x = 0
        if y < 0: y = 0
        width1 = width
        width2 = width
        if x + width > image_width: width1 = image_width - x
        if y + width > image_height: width2 = image_height - y
        width = min(width1, width2)
        # the max hand box value is 20 pixels
        if width >= 20:
            detect_result.append((int(x), int(y), int(width), is_left))

    '''
    return value: [[x, y, w, True if left hand else False]].
    width=height since the network require squared input.
    x, y is the coordinate of top left
    '''
    return detect_result


# Written by Lvmin
def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
    """
    Detect the face in the input body pose keypoints and calculate the bounding box for the face.

    Args:
        body (BodyResult): A BodyResult object containing the detected body pose keypoints.
        oriImg (numpy.ndarray): A 3D numpy array representing the original input image.

    Returns:
        Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
                                     bounding box and the width (height) of the bounding box, or None if the
                                     face is not detected or the bounding box width is less than 20 pixels.

    Notes:
        - The width and height of the bounding box are equal.
        - The minimum bounding box size is 20 pixels.
    """
    # left right eye ear 14 15 16 17
    image_height, image_width = oriImg.shape[0:2]

    keypoints = body.keypoints
    head = keypoints[0]
    left_eye = keypoints[14]
    right_eye = keypoints[15]
    left_ear = keypoints[16]
    right_ear = keypoints[17]

    if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
        return None

    width = 0.0
    x0, y0 = head.x, head.y

    if left_eye is not None:
        x1, y1 = left_eye.x, left_eye.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 3.0)

    if right_eye is not None:
        x1, y1 = right_eye.x, right_eye.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 3.0)

    if left_ear is not None:
        x1, y1 = left_ear.x, left_ear.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 1.5)

    if right_ear is not None:
        x1, y1 = right_ear.x, right_ear.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 1.5)

    x, y = x0, y0

    x -= width
    y -= width

    if x < 0:
        x = 0

    if y < 0:
        y = 0

    width1 = width * 2
    width2 = width * 2

    if x + width > image_width:
        width1 = image_width - x

    if y + width > image_height:
        width2 = image_height - y

    width = min(width1, width2)

    if width >= 20:
        return int(x), int(y), int(width)
    else:
        return None


# get max index of 2d array
def npmax(array):
    arrayindex = array.argmax(1)
    arrayvalue = array.max(1)
    i = arrayvalue.argmax()
    j = arrayindex[i]
    return i, j
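A small sketch exercising the resize and padding helpers above (not part of the uploaded files; the 100x150 input size is arbitrary):

# Hypothetical sketch of smart_resize / padRightDownCorner behaviour.
import numpy as np
from controlnet_aux.open_pose import util

img = np.zeros((100, 150, 3), dtype=np.uint8)
resized = util.smart_resize(img, (368, 368))         # resize to the target (H, W)
padded, pad = util.padRightDownCorner(img, 8, 128)   # pad bottom/right up to multiples of stride 8
print(resized.shape, padded.shape, pad)              # (368, 368, 3) (104, 152, 3) [0, 0, 4, 2]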
controlnet_aux/util.py
ADDED
@@ -0,0 +1,146 @@
import os
import random

import cv2
import numpy as np
import torch

annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')


def HWC3(x):
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y


def make_noise_disk(H, W, C, F):
    noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
    noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
    noise = noise[F: F + H, F: F + W]
    noise -= np.min(noise)
    noise /= np.max(noise)
    if C == 1:
        noise = noise[:, :, None]
    return noise


def nms(x, t, s):
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)

    y = np.zeros_like(x)

    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)

    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z

def min_max_norm(x):
    x -= np.min(x)
    x /= np.maximum(np.max(x), 1e-5)
    return x


def safe_step(x, step=2):
    y = x.astype(np.float32) * float(step + 1)
    y = y.astype(np.int32).astype(np.float32) / float(step)
    return y


def img2mask(img, H, W, low=10, high=90):
    assert img.ndim == 3 or img.ndim == 2
    assert img.dtype == np.uint8

    if img.ndim == 3:
        y = img[:, :, random.randrange(0, img.shape[2])]
    else:
        y = img

    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)

    if random.uniform(0, 1) < 0.5:
        y = 255 - y

    return y < np.percentile(y, random.randrange(low, high))


def resize_image(input_image, resolution):
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    return img


def torch_gc():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def ade_palette():
    """ADE20K palette that maps each class to RGB values."""
    return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
            [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
            [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
            [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
            [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
            [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
            [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
            [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
            [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
            [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
            [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
            [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
            [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
            [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
            [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
            [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
            [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
            [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
            [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
            [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
            [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
            [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
            [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
            [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
            [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
            [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
            [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
            [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
            [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
            [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
            [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
            [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
            [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
            [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
            [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
            [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
            [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
            [102, 255, 0], [92, 0, 255]]
requirements.txt
ADDED
@@ -0,0 +1,17 @@
git+https://github.com/huggingface/diffusers.git
torch==2.0.1
torchvision==0.15.2
transformers==4.43.3
einops
onnxruntime-gpu
spaces
accelerate
omegaconf
huggingface-hub
opencv-python
gradio
xformers
sentencepiece
peft
scipy
scikit-image