Commit ac336de · zejunyang committed · 1 parent: 4f3b622

update

Files changed:
- app.py (+6, -25)
- src/utils/crop_face_single.py (+0, -3)
- src/utils/mp_utils.py (+0, -2)
app.py CHANGED

@@ -99,9 +99,7 @@ pipe = pipe.to("cuda", dtype=weight_dtype)
 frame_inter_model = init_frame_interpolation_model()
 
 @spaces.GPU
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
-    print('=====Start processing======')
-
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
     fi_step = 3
@@ -121,11 +119,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     while os.path.exists(save_dir):
         save_dir = Path(f"a2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
     save_dir.mkdir(exist_ok=True, parents=True)
-
-    print('=====1======')
 
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
-    print('=====1======', ref_img.shape, ref_image_np.shape)
     ref_image_np = crop_face(ref_image_np, lmk_extractor)
     if ref_image_np is None:
         return None, Image.fromarray(ref_img)
@@ -133,22 +128,16 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
 
-    print('=====2======')
-
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
         return None, ref_image_pil
-
-    print('=====3======')
-
+
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
 
     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
     sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
-
-    print('=====4======')
 
     # inference
     pred = a2m_model.infer(sample['audio_feature'], sample['seq_len'])
@@ -156,8 +145,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pred = pred.reshape(pred.shape[0], -1, 3)
     pred = pred + face_result['lmks3d']
 
-    print('=====5======')
-
     if headpose_video is not None:
         pose_seq = get_headpose_temp(headpose_video)
     else:
@@ -172,8 +159,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     for i, verts in enumerate(projected_vertices):
         lmk_img = vis.draw_landmarks((width, height), verts, normed=False)
         pose_images.append(lmk_img)
-
-    print('=====6======')
 
     pose_list = []
     # pose_tensor_list = []
@@ -182,7 +167,7 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     # [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L,
+    args_L = min(args_L, 90)
     for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
@@ -192,8 +177,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pose_list = np.array(pose_list)
 
     video_length = len(pose_list)
-
-    print('=====7======')
 
     video = pipe(
         ref_image_pil,
@@ -231,8 +214,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
 @spaces.GPU
 def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
-    print('=====Start processing======')
-
     cfg = 3.5
     fi_step = 3
 
@@ -282,7 +263,7 @@ def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     verts_list = []
     bs_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L,
+    args_L = min(args_L, 90*step)
     for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
@@ -408,7 +389,7 @@ with gr.Blocks() as demo:
         a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
         with gr.Row():
-            a2v_length = gr.Slider(minimum=0, maximum=
+            a2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
            a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
         a2v_botton = gr.Button("Generate", variant="primary")
@@ -436,7 +417,7 @@ with gr.Blocks() as demo:
         v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
         with gr.Row():
-            v2v_length = gr.Slider(minimum=0, maximum=
+            v2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
            v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
         v2v_botton = gr.Button("Generate", variant="primary")
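For readers skimming the diff above, here is a minimal standalone sketch of the frame-capping and striding logic that the changed app.py lines implement. The helper name `select_pose_frames` and the demo values are hypothetical; `length`, `fi_step`, and the 90-frame cap come from the diff itself.

```python
# Minimal sketch, not the app's code: mirrors the new
# `args_L = min(args_L, 90)` cap and the `[: args_L : fi_step]` stride.
def select_pose_frames(pose_images, length, fi_step=3, cap=90):
    # length == 0 (or larger than the sequence) means "use every available frame"
    args_L = len(pose_images) if length == 0 or length > len(pose_images) else length
    args_L = min(args_L, cap)              # hard limit introduced by this commit
    return pose_images[:args_L:fi_step]    # keep every fi_step-th frame up to the cap

# Hypothetical usage: 200 frames, user asks for 120 -> capped to 90, stride 3 keeps 30.
frames = list(range(200))
assert len(select_pose_frames(frames, length=120)) == 30
```

This matches the new Gradio sliders, whose maximum is likewise 90, so the UI can no longer request more frames than the cap allows.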
src/utils/crop_face_single.py CHANGED

@@ -3,10 +3,7 @@ import cv2
 
 
 def crop_face(img, lmk_extractor, expand=1.5):
-    print('****=====1======')
     result = lmk_extractor(img) # cv2 BGR
-
-    print('****=====2======')
 
     if result is None:
         return None
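As a usage note for the file above: crop_face returns None when no face is found, and app.py treats that as the signal to bail out. A hedged sketch of that calling pattern follows; the helper below and its `size` default are illustrative, not part of the repository.

```python
import cv2
from src.utils.crop_face_single import crop_face

def load_and_crop(path, lmk_extractor, size=512):
    """Illustrative helper mirroring how app.py calls crop_face."""
    img = cv2.imread(path)                            # BGR image, as crop_face expects
    cropped = crop_face(img, lmk_extractor, expand=1.5)
    if cropped is None:                               # no face detected -> caller gives up
        return None
    return cv2.resize(cropped, (size, size))          # app.py resizes the crop to (size, size)
```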
src/utils/mp_utils.py CHANGED

@@ -38,7 +38,6 @@ class LMKExtractor():
 
 
     def __call__(self, img):
-        print('///=====1======')
         frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
         # t0 = time.time()
@@ -61,7 +60,6 @@ class LMKExtractor():
         except:
             return None
 
-        print('///=====2======')
         bs_list = detection_result.face_blendshapes
         if len(bs_list) == 1:
             bs = bs_list[0]
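For context on the lines touched above, here is a minimal sketch of the preprocessing that LMKExtractor.__call__ performs before running the MediaPipe face-landmark task. Detector creation and the landmark/blendshape extraction are omitted because they are not part of this diff; only the conversion steps shown here appear in it.

```python
import cv2
import mediapipe as mp

def to_mp_image(img_bgr):
    # cv2 decodes to BGR; MediaPipe expects RGB.
    frame = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    # Wrap the array as a MediaPipe Image, as __call__ does before detection.
    return mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
```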