Spaces:

YaohuiW
/

LIA-X

Running on Zero

App Files Files Community

YaohuiW committed 21 days ago

Commit

6b0ef0f

1 Parent(s): 01e0491

update

Browse files

Files changed (7) hide show

app.py +4 -0
assets/instruction.md +4 -4
assets/title.md +1 -0
gradio_tabs/animation.py +154 -53
gradio_tabs/img_edit.py +87 -27
gradio_tabs/vid_edit.py +157 -64
networks/generator.py +40 -1

app.py CHANGED Viewed

@@ -7,6 +7,10 @@ from gradio_tabs.vid_edit import vid_edit
 from gradio_tabs.img_edit import img_edit
 from networks.generator import Generator
 device = torch.device("cuda")
 gen = Generator(size=512, motion_dim=40, scale=2).to(device)
 ckpt_path = hf_hub_download(repo_id="YaohuiW/LIA-X", filename="lia-x.pt")

 from gradio_tabs.img_edit import img_edit
 from networks.generator import Generator
+# Global PyTorch tuning, applied once at import time before the model is built.
+# 'high' allows float32 matmuls to use reduced-precision internals
+# (e.g. TensorFloat32), trading some precision for speed.
+torch.set_float32_matmul_precision('high')
+# Raise TorchDynamo's cache size limit to reduce recompilations.
+# NOTE(review): torch._dynamo.config is a private namespace - re-check on torch upgrades.
+torch._dynamo.config.cache_size_limit = 64
 device = torch.device("cuda")
 gen = Generator(size=512, motion_dim=40, scale=2).to(device)
 ckpt_path = hf_hub_download(repo_id="YaohuiW/LIA-X", filename="lia-x.pt")

assets/instruction.md CHANGED Viewed

@@ -3,18 +3,18 @@
 * **Image Animation**
     - Upload `Source Image` and `Driving Video`
-    - Use `Control Panel` to edit source image and `Edit` button to display the `Edited Image`
-    - Use `Animate` button to obtained `Animated Video`
 * **Image Editing**
     - Upload `Source Image`
-    - Use `Control Panel` to edit source image and `Edit` button to display the `Edited Image`
 * **Video Editing**
     - Upload `Video`
-    - Use `Control Panel` to edit first frame of video and `Edit` button to display the `Edited Image`
     - Use `Generate` button to obtain `Edited Video`
 **NOTE: we recommend cropping both input images and videos using the provided [tools](https://github.com/wyhsirius/LIA-X/tree/main) for better results**

 * **Image Animation**
     - Upload `Source Image` and `Driving Video`
+    - Use the sliders in the `Control Panel` to edit the image
+    - Use `Animate` button to obtain `Animated Video`
 * **Image Editing**
     - Upload `Source Image`
+    - Use the sliders in the `Control Panel` to edit the image
 * **Video Editing**
     - Upload `Video`
+    - Use the sliders in the `Control Panel` to edit the first frame of the video
     - Use `Generate` button to obtain `Edited Video`
 **NOTE: we recommend cropping both input images and videos using the provided [tools](https://github.com/wyhsirius/LIA-X/tree/main) for better results**

assets/title.md CHANGED Viewed

@@ -1,4 +1,5 @@
 <font size=7><center>LIA-X: Interpretable Latent Portrait Animator</center></font>
 <div style="display: flex;align-items: center;justify-content: center">
 [<a href="https://arxiv.org/abs/2508.09959">Technical Report</a>] | [<a href="https://wyhsirius.github.io/LIA-X-project/">Project Page</a>] | [<a href="https://github.com/wyhsirius/LIA-X">Code</a>]
 </div>

 <font size=7><center>LIA-X: Interpretable Latent Portrait Animator</center></font>
+<font size=5><center>Toward Interactive Portrait Animation and Editing</center></font>
 <div style="display: flex;align-items: center;justify-content: center">
 [<a href="https://arxiv.org/abs/2508.09959">Technical Report</a>] | [<a href="https://wyhsirius.github.io/LIA-X-project/">Project Page</a>] | [<a href="https://github.com/wyhsirius/LIA-X">Code</a>]
 </div>

gradio_tabs/animation.py CHANGED Viewed

@@ -36,64 +36,78 @@ labels_v = [
 	13, 24, 17, 26
 ]
 def load_image(img, size):
 	img = Image.open(img).convert('RGB')
 	w, h = img.size
 	img = img.resize((size, size))
 	img = np.asarray(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x 256 x 256
 	return img / 255.0, w, h
 def img_preprocessing(img_path, size):
-	img, w, h = load_image(img_path, size)  # [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
 	return imgs_norm, w, h
-def resize(img, size):
-	transform = torchvision.transforms.Compose([
-		torchvision.transforms.Resize((size, size), antialias=True),
-	])
 	return transform(img)
 def resize_back(img, w, h):
-	transform = torchvision.transforms.Compose([
-		torchvision.transforms.Resize((h, w), antialias=True),
-	])
 	return transform(img)
 def vid_preprocessing(vid_path, size):
 	vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
-	vid = vid_dict[0].permute(0, 3, 1, 2).unsqueeze(0)	# btchw
 	fps = vid_dict[2]['video_fps']
 	vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
-	vid_norm = torch.cat([
-		resize(vid_norm[:, i, :, :, :], size).unsqueeze(1) for i in range(vid.size(1))
-	], dim=1)
 	return vid_norm, fps
 def img_denorm(img):
-	img = img.clamp(-1, 1).cpu()
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
 def vid_denorm(vid):
-	vid = vid.clamp(-1, 1).cpu()
 	vid = (vid - vid.min()) / (vid.max() - vid.min())
 	return vid
@@ -101,24 +115,32 @@ def vid_denorm(vid):
 def img_postprocessing(image, w, h):
-	image = resize_back(image, w, h)
-	image = image.permute(0, 2, 3, 1)
-	edited_image = img_denorm(image)
-	img_output = (edited_image[0].numpy() * 255).astype(np.uint8)
-	with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
-		imageio.imwrite(temp_file.name, img_output, quality=8)
-		return temp_file.name
 def vid_postprocessing(video, w, h, fps):
-	# video: BCTHW
-	b,c,t,_,_ = video.size()
-	vid_batch = resize_back(rearrange(video, "b c t h w -> (b t) c h w"), w, h)
-	vid = rearrange(vid_batch, "(b t) c h w -> b t h w c", b=b)	# B T H W C
-	vid_np = (vid_denorm(vid[0]).numpy() * 255).astype('uint8')
 	with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
 		imageio.mimwrite(temp_file.name, vid_np, fps=fps, codec='libx264', quality=8)
@@ -126,15 +148,59 @@ def vid_postprocessing(video, w, h, fps):
 def animation(gen, chunk_size, device):
 	@spaces.GPU
-	@torch.no_grad()
 	def edit_media(image, *selected_s):
 		image_tensor, w, h = img_preprocessing(image, 512)
 		image_tensor = image_tensor.to(device)
-		edited_image_tensor = gen.edit_img(image_tensor, labels_v, selected_s)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
@@ -142,16 +208,38 @@ def animation(gen, chunk_size, device):
 		return edited_image
 	@spaces.GPU
-	@torch.no_grad()
 	def animate_media(image, video, *selected_s):
 		image_tensor, w, h = img_preprocessing(image, 512)
 		vid_target_tensor, fps = vid_preprocessing(video, 512)
 		image_tensor = image_tensor.to(device)
-		video_target_tensor = vid_target_tensor.to(device)
-		animated_video = gen.animate_batch(image_tensor, video_target_tensor, labels_v, selected_s, chunk_size)
-		edited_image = animated_video[:,:,0,:,:]
 		# postprocessing
 		animated_video = vid_postprocessing(animated_video, w, h, fps)
@@ -162,7 +250,7 @@ def animation(gen, chunk_size, device):
 	def clear_media():
 		return None, None, *([0] * len(labels_k))
 	with gr.Tab("Image Animation"):
 		inputs_s = []
@@ -202,11 +290,10 @@ def animation(gen, chunk_size, device):
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
-							edit_btn = gr.Button("Edit", elem_id="button_edit",)
-							clear_btn = gr.Button("Clear", elem_id="button_clear")
-						with gr.Row():
 							animate_btn = gr.Button("Animate", elem_id="button_animate")
 			with gr.Column(scale=1):
@@ -221,7 +308,7 @@ def animation(gen, chunk_size, device):
 						#video_output.render()
 						video_output = gr.Video(label="Output Video", elem_id="output_vid", width=512)#.render()
-				with gr.Accordion("Control Panel", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
@@ -251,20 +338,34 @@ def animation(gen, chunk_size, device):
 							for k in labels_k[12:14]:
 								slider = gr.Slider(minimum=-0.2, maximum=0.2, value=0, label=k, elem_id="slider_"+str(k))
 								inputs_s.append(slider)
-		edit_btn.click(
-			fn=edit_media,
-			inputs=[image_input] + inputs_s,
-			outputs=[image_output],
-			show_progress=True
-		)
 		animate_btn.click(
 			fn=animate_media,
 			inputs=[image_input, video_input] + inputs_s,
 			outputs=[image_output, video_output],
-            show_progress=True
 		)
 		clear_btn.click(
@@ -280,14 +381,14 @@ def animation(gen, chunk_size, device):
 				['./data/source/macron.png', './data/driving/driving1.mp4', 0.14,0,-0.26,-0.29,-0.11,0,-0.13,-0.18,0,0,0,0,-0.02,0.07],
 				['./data/source/portrait3.png', './data/driving/driving1.mp4', -0.03,0.21,-0.31,-0.12,-0.11,0,-0.05,-0.16,0,0,0,0,-0.02,0.07],
 				['./data/source/einstein.png','./data/driving/driving2.mp4',-0.31,0,0,0.16,0.08,0,-0.07,0,0.13,0,0,0,0,0],
-                ['./data/source/portrait1.png', './data/driving/driving4.mp4', 0, 0, -0.17, -0.19, 0.25, 0, 0, -0.086,
 				 0.087, 0, 0, 0, 0, 0],
 				['./data/source/portrait2.png','./data/driving/driving8.mp4',0,0,-0.25,0,0,0,0,0,0,0.126,0,0,0,0],
 			],
-            fn=animate_media,
 			inputs=[image_input, video_input] + inputs_s,
-            outputs=[image_output, video_output],
 		)

 	13, 24, 17, 26
 ]
+# NOTE(review): this function does file I/O and is not called from any
+# torch.compile region visible in this diff - confirm the decorator is needed.
+@torch.compiler.allow_in_graph
 def load_image(img, size):
+	"""Open an image, resize it to ``size`` x ``size`` and return it channel-first.
+
+	Returns ``(array, w, h)``: ``array`` holds the RGB pixels scaled to [0, 1]
+	with shape 3 x size x size, and ``w``/``h`` are the image's width and
+	height before resizing.
+	"""
 	img = Image.open(img).convert('RGB')
+	# remember the original size; callers use it to resize the result back
 	w, h = img.size
 	img = img.resize((size, size))
 	img = np.asarray(img)
+	img = np.copy(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x size x size
 	return img / 255.0, w, h
+@torch.compiler.allow_in_graph
 def img_preprocessing(img_path, size):
+	"""Load an image as a 1 x 3 x size x size float tensor normalised to [-1, 1].
+
+	Returns ``(tensor, w, h)`` where ``w``/``h`` are the original image
+	width and height reported by ``load_image``.
+	"""
+	img, w, h = load_image(img_path, size)	# [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
 	return imgs_norm, w, h
+# Module-level cache of torchvision Resize transforms, keyed by target size
+resize_transform_cache = {}
+def get_resize_transform(size):
+	"""Return the cached bilinear, antialiased ``Resize`` transform for ``size``.
+
+	``size`` is used as the dictionary key and is passed unchanged to
+	``torchvision.transforms.Resize``; the callers in this file pass a tuple.
+	The transform is built on first request and reused afterwards.
+	"""
+	if size not in resize_transform_cache:
+		# NOTE(review): entries are never evicted, so the cache gains one
+		# entry for every distinct size requested
+		resize_transform_cache[size] = torchvision.transforms.Resize(
+			size,
+			interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+			antialias=True
+		)
+	return resize_transform_cache[size]
+def resize(img, size):
+	"""Resize ``img`` to ``size`` x ``size`` with the cached transform."""
+	transform = get_resize_transform((size, size))
 	return transform(img)
 def resize_back(img, w, h):
+	"""Resize ``img`` to height ``h`` and width ``w`` with the cached transform."""
+	# the transform is keyed and built as (h, w), not (w, h)
+	transform = get_resize_transform((h, w))
 	return transform(img)
 def vid_preprocessing(vid_path, size):
+	"""Read a video and return ``(frames, fps)``.
+
+	``frames`` is the video permuted to T x C x H x W, normalised to
+	[-1, 1] and resized to ``size`` x ``size``; ``fps`` is the
+	``'video_fps'`` entry of the metadata returned by ``read_video``.
+	"""
 	vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
+	vid = vid_dict[0].permute(0, 3, 1, 2) # tchw
 	fps = vid_dict[2]['video_fps']
 	vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
+	# all frames are resized in one batched call instead of frame by frame
+	vid_norm = resize(vid_norm, size) # tchw
 	return vid_norm, fps
 def img_denorm(img):
+	"""Clamp ``img`` to [-1, 1] and min-max rescale it to [0, 1].
+
+	The tensor stays on whatever device it arrived on.
+	"""
+	img = img.clamp(-1, 1)
+	# NOTE(review): if every element is equal after clamping, max == min and
+	# this division produces NaN
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
 def vid_denorm(vid):
+	"""Clamp ``vid`` to [-1, 1] and min-max rescale it to [0, 1].
+
+	The min and max are taken over the whole tensor, not per frame.
+	"""
+	vid = vid.clamp(-1, 1)
+	# NOTE(review): if every element is equal after clamping, max == min and
+	# this division produces NaN
 	vid = (vid - vid.min()) / (vid.max() - vid.min())
 	return vid
 def img_postprocessing(image, w, h):
+	"""Turn a model output image into an H x W x 3 ``uint8`` NumPy array.
+
+	The image is resized back to height ``h`` / width ``w``, clamped to
+	[-1, 1], min-max rescaled to [0, 1] and scaled by 255.
+	"""
+	img = resize_back(image, w, h)
+	# Denormalize before the .cpu() call below, i.e. on the tensor's own device
+	img = img.clamp(-1, 1)
+	# NOTE(review): a constant image makes max == min and yields NaN here
+	img = (img - img.min()) / (img.max() - img.min())
+	# drop dim 0 (assumes a batch of one - TODO confirm) and go channel-last
+	img = img.squeeze(0).permute(1, 2, 0).contiguous()	# contiguous() for fast transfer
+	img_output = (img.cpu().numpy() * 255).astype(np.uint8)  # Single CPU transfer
+	# return the Numpy array directly, since Gradio supports it
+	return img_output
 def vid_postprocessing(video, w, h, fps):
+	# video: TCHW
+	t,c,_,_ = video.size()
+	vid = resize_back(video, w, h)
+	vid = vid.clamp(-1, 1)
+	vid = (vid - vid.min()) / (vid.max() - vid.min())
+	vid = rearrange(vid, "t c h w -> t h w c")	# T H W C
+	vid_np = (vid.cpu().numpy() * 255).astype('uint8')
 	with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
 		imageio.mimwrite(temp_file.name, vid_np, fps=fps, codec='libx264', quality=8)
 def animation(gen, chunk_size, device):
+	@torch.compile
+	def compiled_enc_img(image_tensor, selected_s):
+		"""torch.compile wrapper around ``gen.enc_img`` with the module-level ``labels_v``."""
+		return gen.enc_img(image_tensor, labels_v, selected_s)
+	@torch.compile
+	def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
+		"""torch.compile wrapper around ``gen.dec_img``; takes the three values returned by ``compiled_enc_img``."""
+		return gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
+	@torch.compile
+	def compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
+		"""torch.compile wrapper around ``gen.dec_vid`` for one batch of driving frames."""
+		return gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
+	# Pre-warm the compiled model with dummy data to reduce first-run compilation time
+	def _warmup_model():
+		"""Pre-warm the model compilation with representative shapes"""
+		print("[img_edit] Pre-warming model compilation...")
+		dummy_image = torch.randn(1, 3, 512, 512, device=device)
+		dummy_video = torch.randn(chunk_size, 3, 512, 512, device=device)
+		dummy_selected_s = [0.0] * len(labels_v)
+		try:
+			with torch.inference_mode():
+				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
+				_ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
+			print("[img_edit] Model pre-warming completed successfully")
+		except Exception as e:
+			print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
+		try:
+			with torch.inference_mode():
+				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
+				_ = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, dummy_video[0], dummy_video)
+			print("[img_animation] Model pre-warming completed successfully")
+		except Exception as e:
+			print(f"[img_animation] Model pre-warming failed (will compile on first use): {e}")
+	# Pre-warm the model
+	_warmup_model()
 	@spaces.GPU
+	@torch.inference_mode()
 	def edit_media(image, *selected_s):
+		# Edit a single image: preprocess to 512 x 512, run the compiled
+		# encoder and image decoder with the slider values in selected_s,
+		# then resize back to the original size and return a uint8 array.
 		image_tensor, w, h = img_preprocessing(image, 512)
 		image_tensor = image_tensor.to(device)
+		z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
+		edited_image_tensor = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
 		return edited_image
 	@spaces.GPU
+	@torch.inference_mode()
 	def animate_media(image, video, *selected_s):
 		image_tensor, w, h = img_preprocessing(image, 512)
 		vid_target_tensor, fps = vid_preprocessing(video, 512)
 		image_tensor = image_tensor.to(device)
+		video_target_tensor = vid_target_tensor.to(device) #tchw
+		#animated_video = gen.animate_batch(image_tensor, video_target_tensor, labels_v, selected_s, chunk_size)
+		#edited_image = animated_video[:,:,0,:,:]
+		img_start = video_target_tensor[0:1,:,:,:]
+		#vid_target_tensor_batch = rearrange(video_target_tensor, 'b t c h w -> (b t) c h w')
+		res = []
+		t = video_target_tensor.size(1)
+		chunks = t // chunk_size
+		z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
+		#z_s2r, alpha_r2s, feat_rgb = gen.enc_img(image_tensor, labels_v, selected_s)
+		for i in range(chunks+1):
+			if i == chunks:
+				img_target = vid_target_tensor[i*chunk_size:, :, :, :]
+				img_animated = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target)
+				#img_animated_batch = gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
+			else:
+				img_target = vid_target_tensor[i*chunk_size:(i+1)*chunk_size, :, :, :]
+				img_animated = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target)
+				#img_animated_batch = gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
+			res.append(img_animated)
+		animated_video = torch.cat(res, dim=0) # TCHW
+		edited_image = animated_video[0:1,:,:,:]
 		# postprocessing
 		animated_video = vid_postprocessing(animated_video, w, h, fps)
 	def clear_media():
+		# Returns two None values followed by one 0 per entry in labels_k.
+		# presumably this clears two media outputs and resets every slider -
+		# the outputs wiring is outside this hunk, verify against clear_btn.click
 		return None, None, *([0] * len(labels_k))
 	with gr.Tab("Image Animation"):
 		inputs_s = []
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
+							#edit_btn = gr.Button("Edit", elem_id="button_edit",)
 							animate_btn = gr.Button("Animate", elem_id="button_animate")
+						with gr.Row():
+							clear_btn = gr.Button("Clear", elem_id="button_clear")
 			with gr.Column(scale=1):
 						#video_output.render()
 						video_output = gr.Video(label="Output Video", elem_id="output_vid", width=512)#.render()
+				with gr.Accordion("Control Panel (Using Sliders to Edit Image)", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
 							for k in labels_k[12:14]:
 								slider = gr.Slider(minimum=-0.2, maximum=0.2, value=0, label=k, elem_id="slider_"+str(k))
 								inputs_s.append(slider)
+		for slider in inputs_s:
+			slider.change(
+				fn=edit_media,
+				inputs=[image_input] + inputs_s,
+				outputs=[image_output],
+				show_progress='hidden',
+				trigger_mode='always_last',
+				# currently we have a latency around 450ms
+				stream_every=0.5
+			)
+		#edit_btn.click(
+		#	fn=edit_media,
+		#	inputs=[image_input] + inputs_s,
+		#	outputs=[image_output],
+		#	show_progress=True
+		#)
 		animate_btn.click(
 			fn=animate_media,
 			inputs=[image_input, video_input] + inputs_s,
 			outputs=[image_output, video_output],
+			show_progress=True
 		)
 		clear_btn.click(
 				['./data/source/macron.png', './data/driving/driving1.mp4', 0.14,0,-0.26,-0.29,-0.11,0,-0.13,-0.18,0,0,0,0,-0.02,0.07],
 				['./data/source/portrait3.png', './data/driving/driving1.mp4', -0.03,0.21,-0.31,-0.12,-0.11,0,-0.05,-0.16,0,0,0,0,-0.02,0.07],
 				['./data/source/einstein.png','./data/driving/driving2.mp4',-0.31,0,0,0.16,0.08,0,-0.07,0,0.13,0,0,0,0,0],
+				['./data/source/portrait1.png', './data/driving/driving4.mp4', 0, 0, -0.17, -0.19, 0.25, 0, 0, -0.086,
 				 0.087, 0, 0, 0, 0, 0],
 				['./data/source/portrait2.png','./data/driving/driving8.mp4',0,0,-0.25,0,0,0,0,0,0,0.126,0,0,0,0],
 			],
+			fn=animate_media,
 			inputs=[image_input, video_input] + inputs_s,
+			outputs=[image_output, video_output],
 		)

gradio_tabs/img_edit.py CHANGED Viewed

@@ -37,69 +37,115 @@ labels_v = [
 ]
 def load_image(img, size):
 	img = Image.open(img).convert('RGB')
 	w, h = img.size
 	img = img.resize((size, size))
 	img = np.asarray(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x 256 x 256
 	return img / 255.0, w, h
 def img_preprocessing(img_path, size):
-	img, w, h = load_image(img_path, size)  # [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
 	return imgs_norm, w, h
-def resize(img, size):
-	transform = torchvision.transforms.Compose([
-		torchvision.transforms.Resize((size,size), antialias=True),
-	])
 	return transform(img)
 def resize_back(img, w, h):
-	transform = torchvision.transforms.Compose([
-		torchvision.transforms.Resize((h, w), antialias=True),
-	])
 	return transform(img)
 def img_denorm(img):
-	img = img.clamp(-1, 1).cpu()
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
-def img_postprocessing(image, w, h):
-	image = resize_back(image, w, h)
-	image = image.permute(0, 2, 3, 1)
-	edited_image = img_denorm(image)
-	img_output = (edited_image[0].numpy() * 255).astype(np.uint8)
-	with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
-		imageio.imwrite(temp_file.name, img_output, quality=8)
-		return temp_file.name
 def img_edit(gen, device):
 	@spaces.GPU
-	@torch.no_grad()
 	def edit_img(image, *selected_s):
 		image_tensor, w, h = img_preprocessing(image, 512)
 		image_tensor = image_tensor.to(device)
-		edited_image_tensor = gen.edit_img(image_tensor, labels_v, selected_s)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
@@ -136,7 +182,7 @@ def img_edit(gen, device):
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
-							edit_btn = gr.Button("Edit")
 							clear_btn = gr.Button("Clear")
 						#with gr.Row():
 						#	animate_btn = gr.Button("Generate")
@@ -150,7 +196,7 @@ def img_edit(gen, device):
 						image_output = gr.Image(label="Output Image", type='numpy', interactive=False, width=512)
-				with gr.Accordion("Control Panel", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
@@ -181,15 +227,29 @@ def img_edit(gen, device):
 								slider = gr.Slider(minimum=-0.2, maximum=0.2, value=0, label=k)
 								inputs_s.append(slider)
-		edit_btn.click(
 			fn=edit_img,
 			inputs=[image_input] + inputs_s,
 			outputs=[image_output],
-			show_progress=True
-		)
 		clear_btn.click(
 			fn=clear_media,
 			outputs=[image_output] + inputs_s
-		)

 ]
+# NOTE(review): this function does file I/O and is not called from any
+# torch.compile region visible in this diff - confirm the decorator is needed.
+@torch.compiler.allow_in_graph
 def load_image(img, size):
+	"""Open an image, resize it to ``size`` x ``size`` and return it channel-first.
+
+	Returns ``(array, w, h)``: ``array`` holds the RGB pixels scaled to [0, 1]
+	with shape 3 x size x size, and ``w``/``h`` are the image's width and
+	height before resizing.
+	"""
 	img = Image.open(img).convert('RGB')
+	# remember the original size; callers use it to resize the result back
 	w, h = img.size
 	img = img.resize((size, size))
 	img = np.asarray(img)
+	img = np.copy(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x size x size
 	return img / 255.0, w, h
+@torch.compiler.allow_in_graph
 def img_preprocessing(img_path, size):
+	"""Load an image as a 1 x 3 x size x size float tensor normalised to [-1, 1].
+
+	Returns ``(tensor, w, h)`` where ``w``/``h`` are the original image
+	width and height reported by ``load_image``.
+	"""
+	img, w, h = load_image(img_path, size)	# [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
 	return imgs_norm, w, h
+# Module-level cache of torchvision Resize transforms, keyed by target size
+resize_transform_cache = {}
+def get_resize_transform(size):
+	"""Return the cached bilinear, antialiased ``Resize`` transform for ``size``.
+
+	``size`` is used as the dictionary key and is passed unchanged to
+	``torchvision.transforms.Resize``; the callers in this file pass a tuple.
+	The transform is built on first request and reused afterwards.
+	"""
+	if size not in resize_transform_cache:
+		# NOTE(review): entries are never evicted, so the cache gains one
+		# entry for every distinct size requested
+		resize_transform_cache[size] = torchvision.transforms.Resize(
+			size,
+			interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+			antialias=True
+		)
+	return resize_transform_cache[size]
+def resize(img, size):
+	"""Resize ``img`` to ``size`` x ``size`` with the cached transform."""
+	transform = get_resize_transform((size, size))
 	return transform(img)
 def resize_back(img, w, h):
+	"""Resize ``img`` to height ``h`` and width ``w`` with the cached transform."""
+	# the transform is keyed and built as (h, w), not (w, h)
+	transform = get_resize_transform((h, w))
 	return transform(img)
 def img_denorm(img):
+	"""Clamp ``img`` to [-1, 1] and min-max rescale it to [0, 1].
+
+	The tensor stays on whatever device it arrived on.
+	"""
+	img = img.clamp(-1, 1)
+	# NOTE(review): if every element is equal after clamping, max == min and
+	# this division produces NaN
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
+def img_postprocessing(img, w, h):
+	"""Turn a model output image into an H x W x 3 ``uint8`` NumPy array.
+
+	The image is resized back to height ``h`` / width ``w``, denormalised
+	to [0, 1] by ``img_denorm`` and scaled by 255. The array is returned
+	directly rather than written to a temporary file.
+	"""
+	img = resize_back(img, w, h)
+	img = img_denorm(img)
+	# drop dim 0 (assumes a batch of one - TODO confirm) and go channel-last
+	img = img.squeeze(0).permute(1, 2, 0).contiguous()	# contiguous() for fast transfer
+	img_output = (img.cpu().numpy() * 255).astype(np.uint8)
+	return img_output
 def img_edit(gen, device):
+	@torch.compile
+	def compiled_enc_img(image_tensor, selected_s):
+		"""torch.compile wrapper around ``gen.enc_img`` with the module-level ``labels_v``."""
+		return gen.enc_img(image_tensor, labels_v, selected_s)
+	@torch.compile
+	def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
+		"""torch.compile wrapper around ``gen.dec_img``; takes the three values returned by ``compiled_enc_img``."""
+		return gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
+	# Pre-warm the compiled model with dummy data to reduce first-run compilation time
+	def _warmup_model():
+		"""Run the compiled encoder and image decoder once on a random 1 x 3 x 512 x 512 tensor.
+
+		Best effort: any exception is caught and logged, in which case
+		compilation happens on first real use instead.
+		"""
+		print("[img_edit] Pre-warming model compilation...")
+		dummy_image = torch.randn(1, 3, 512, 512, device=device)
+		# NOTE(review): one value per labels_v entry; confirm this matches the
+		# number of slider values passed at run time
+		dummy_selected_s = [0.0] * len(labels_v)
+		try:
+			with torch.inference_mode():
+				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
+				_ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
+			print("[img_edit] Model pre-warming completed successfully")
+		except Exception as e:
+			print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
+	# Pre-warm the model
+	_warmup_model()
 	@spaces.GPU
+	@torch.inference_mode()
 	def edit_img(image, *selected_s):
 		image_tensor, w, h = img_preprocessing(image, 512)
 		image_tensor = image_tensor.to(device)
+		z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
+		edited_image_tensor = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
+							#edit_btn = gr.Button("Edit")
 							clear_btn = gr.Button("Clear")
 						#with gr.Row():
 						#	animate_btn = gr.Button("Generate")
 						image_output = gr.Image(label="Output Image", type='numpy', interactive=False, width=512)
+				with gr.Accordion("Control Panel (Using Sliders to Edit Image)", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
 								slider = gr.Slider(minimum=-0.2, maximum=0.2, value=0, label=k)
 								inputs_s.append(slider)
+		for slider in inputs_s:
+			slider.change(
 			fn=edit_img,
 			inputs=[image_input] + inputs_s,
 			outputs=[image_output],
+			show_progress='hidden',
+			trigger_mode='always_last',
+			# currently we have a latency around 450ms
+			stream_every=0.5
+		)
+		#edit_btn.click(
+		#	fn=edit_img,
+		#	inputs=[image_input] + inputs_s,
+		#	outputs=[image_output],
+		#	show_progress=True
+		#)
 		clear_btn.click(
 			fn=clear_media,
 			outputs=[image_output] + inputs_s
+		)

gradio_tabs/vid_edit.py CHANGED Viewed

@@ -37,92 +37,118 @@ labels_v = [
 ]
 def load_image(img, size):
-	# img = Image.open(filename).convert('RGB')
-	if not isinstance(img, np.ndarray):
-		img = Image.open(img).convert('RGB')
-		img = img.resize((size, size))
-		img = np.asarray(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x 256 x 256
-	return img / 255.0
 def img_preprocessing(img_path, size):
-	img = load_image(img_path, size)  # [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
-	return imgs_norm
-def resize(img, size):
-	transform = torchvision.transforms.Compose([
-		torchvision.transforms.Resize((size, size), antialias=True),
-	])
 	return transform(img)
 def resize_back(img, w, h):
-	transform = torchvision.transforms.Compose([
-		torchvision.transforms.Resize((h, w), antialias=True),
-	])
 	return transform(img)
 def vid_preprocessing(vid_path, size):
 	vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
-	vid = vid_dict[0].permute(0, 3, 1, 2).unsqueeze(0)	# btchw
 	_,_,_,h,w = vid.size()
 	fps = vid_dict[2]['video_fps']
 	vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
-	vid_norm = torch.cat([
-		resize(vid_norm[:, i, :, :, :], size).unsqueeze(1) for i in range(vid.size(1))
-	], dim=1)
 	return vid_norm, fps, w, h
 def img_denorm(img):
-	img = img.clamp(-1, 1).cpu()
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
 def vid_denorm(vid):
-	vid = vid.clamp(-1, 1).cpu()
 	vid = (vid - vid.min()) / (vid.max() - vid.min())
 	return vid
 def img_postprocessing(image, w, h):
-	image = resize_back(image, w, h)
-	image = image.permute(0, 2, 3, 1)
-	edited_image = img_denorm(image)
-	img_output = (edited_image[0].numpy() * 255).astype(np.uint8)
-	with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
-		imageio.imwrite(temp_file.name, img_output, quality=8)
-		return temp_file.name
 def vid_all_save(vid_d, vid_a, w, h, fps):
-	b,t,c,_,_ = vid_d.size()
-	vid_d_batch = resize_back(rearrange(vid_d, "b t c h w -> (b t) c h w"), w, h)
-	vid_a_batch = resize_back(rearrange(vid_a, "b c t h w -> (b t) c h w"), w, h)
-	vid_d = rearrange(vid_d_batch, "(b t) c h w -> b t h w c", b=b) # B T H W C
-	vid_a = rearrange(vid_a_batch, "(b t) c h w -> b t h w c", b=b) # B T H W C
-	vid_all = torch.cat([vid_d, vid_a], dim=3)
-	vid_a_np = (vid_denorm(vid_a[0]).numpy() * 255).astype('uint8')
-	vid_all_np = (vid_denorm(vid_all[0]).numpy() * 255).astype('uint8')
 	with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as output_path:
 		imageio.mimwrite(output_path.name, vid_a_np, fps=fps, codec='libx264', quality=8)
@@ -134,16 +160,59 @@ def vid_all_save(vid_d, vid_a, w, h, fps):
 def vid_edit(gen, chunk_size, device):
 	@spaces.GPU
-	@torch.no_grad()
 	def edit_img(video, *selected_s):
-		vid_target_tensor, fps, w, h = vid_preprocessing(video, 512)
-		video_target_tensor = vid_target_tensor.to(device)
-		image_tensor = video_target_tensor[:,0,:,:,:]
-		edited_image_tensor = gen.edit_img(image_tensor, labels_v, selected_s)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
@@ -151,21 +220,35 @@ def vid_edit(gen, chunk_size, device):
 		return edited_image
 	@spaces.GPU
-	@torch.no_grad()
 	def edit_vid(video, *selected_s):
 		video_target_tensor, fps, w, h = vid_preprocessing(video, 512)
 		video_target_tensor = video_target_tensor.to(device)
-		edited_video_tensor = gen.edit_vid_batch(video_target_tensor, labels_v, selected_s, chunk_size)
-		edited_image_tensor = edited_video_tensor[:,:,0,:,:]
 		# de-norm
-		animated_video, animated_all_video = vid_all_save(video_target_tensor, edited_video_tensor, w, h, fps)
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
-		return edited_image, animated_video, animated_all_video
 	def clear_media():
 		return None, None, None, *([0] * len(labels_k))
@@ -210,7 +293,7 @@ def vid_edit(gen, chunk_size, device):
 						video_all_output = gr.Video(label="Videos", elem_id="output_vid_all")
 			with gr.Column(scale=1):
-				with gr.Accordion("Control Panel", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
@@ -244,17 +327,27 @@ def vid_edit(gen, chunk_size, device):
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
-							edit_btn = gr.Button("Edit",elem_id="button_edit")
-							clear_btn = gr.Button("Clear",elem_id="button_clear")
-						with gr.Row():
 							animate_btn = gr.Button("Generate",elem_id="button_generate")
-		edit_btn.click(
-			fn=edit_img,
-			inputs=[video_input] + inputs_s,
-			outputs=[image_output],
-			show_progress=True
-		)
 		animate_btn.click(
 			fn=edit_vid,
@@ -280,9 +373,9 @@ def vid_edit(gen, chunk_size, device):
 				['./data/driving/driving9.mp4', 0, 0, 0, 0, 0, 0, 0,
 				 0, 0, 0, 0, 0, -0.1, 0.07],
 			],
-            fn=edit_vid,
 			inputs=[video_input] + inputs_s,
-            outputs=[image_output, video_output, video_all_output],
 		)

 ]
+@torch.compiler.allow_in_graph
 def load_image(img, size):
+	"""Load an image as a channel-first float array in [0, 1].
+
+	Args:
+		img: Path or file object accepted by ``Image.open``.
+		size: Side length of the square the image is resized to.
+
+	Returns:
+		Tuple ``(array, w, h)``: ``array`` is the RGB image resized to
+		``size`` x ``size``, transposed to channel-first and divided by 255;
+		``w`` and ``h`` are the width and height before resizing.
+	"""
+	# Close the underlying file deterministically instead of waiting for GC;
+	# convert() returns an independent, fully loaded image.
+	with Image.open(img) as src:
+		rgb = src.convert('RGB')
+	w, h = rgb.size
+	rgb = rgb.resize((size, size))
+	# np.array copies, so the result is writable (same effect as asarray + copy)
+	arr = np.array(rgb)
+	arr = np.transpose(arr, (2, 0, 1))	# 3 x size x size
+	return arr / 255.0, w, h
+@torch.compiler.allow_in_graph
 def img_preprocessing(img_path, size):
+	"""Load an image and return it as a batched float tensor in [-1, 1].
+
+	Returns ``(tensor, w, h)`` where ``w`` and ``h`` are the values reported
+	by ``load_image`` and ``tensor`` has a leading batch dimension of 1.
+	"""
+	pixels, w, h = load_image(img_path, size)	# [0, 1]
+	batch = torch.from_numpy(pixels)[None].float()	# [0, 1]
+	return (batch - 0.5) * 2.0, w, h	# [-1, 1]
+# Resize transforms keyed by target size, built once and shared across calls
+resize_transform_cache = {}
+def get_resize_transform(size):
+	"""Return the Resize transform for ``size``, creating it on first request."""
+	try:
+		return resize_transform_cache[size]
+	except KeyError:
+		pass
+	bilinear = torchvision.transforms.InterpolationMode.BILINEAR
+	transform = torchvision.transforms.Resize(size, interpolation=bilinear, antialias=True)
+	resize_transform_cache[size] = transform
+	return transform
+def resize(img, size):
+	"""Resize ``img`` to a ``size`` x ``size`` square with the cached transform."""
+	square = (size, size)
+	return get_resize_transform(square)(img)
 def resize_back(img, w, h):
+	"""Resize ``img`` to height ``h`` and width ``w`` with the cached transform."""
+	target_hw = (h, w)
+	return get_resize_transform(target_hw)(img)
 def vid_preprocessing(vid_path, size):
+	"""Read a video and return its frames normalized to [-1, 1] and resized.
+
+	Args:
+		vid_path: Path passed to ``torchvision.io.read_video``.
+		size: Side length of the square the frames are resized to.
+
+	Returns:
+		Tuple ``(vid_norm, fps, w, h)``: the TCHW frames after normalization
+		and resizing, the ``video_fps`` metadata entry, and the frame width
+		and height before resizing.
+	"""
 	vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
+	vid = vid_dict[0].permute(0, 3, 1, 2)	# tchw
+	# vid is 4-D after the permute, so unpack four dims; a five-way unpack raises ValueError
+	_, _, h, w = vid.size()
 	fps = vid_dict[2]['video_fps']
 	vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
+	vid_norm = resize(vid_norm, size)
 	return vid_norm, fps, w, h
 def img_denorm(img):
+	"""Clamp ``img`` to [-1, 1] and min-max rescale it to [0, 1].
+
+	A constant input (max == min) yields all zeros instead of NaN from 0/0.
+	"""
+	img = img.clamp(-1, 1)
+	lo = img.min()
+	span = img.max() - lo
+	# Replace a zero range by 1 so a flat image does not divide by zero
+	span = torch.where(span > 0, span, torch.ones_like(span))
+	return (img - lo) / span
 def vid_denorm(vid):
+	"""Clamp ``vid`` to [-1, 1] and min-max rescale it to [0, 1].
+
+	A constant input (max == min) yields all zeros instead of NaN from 0/0.
+	"""
+	vid = vid.clamp(-1, 1)
+	lo = vid.min()
+	span = vid.max() - lo
+	# Replace a zero range by 1 so a flat clip does not divide by zero
+	span = torch.where(span > 0, span, torch.ones_like(span))
+	return (vid - lo) / span
 def img_postprocessing(image, w, h):
+	"""Turn a model output tensor into a uint8 NumPy image resized to (h, w).
+
+	The tensor is resized and de-normalized first, then its leading dimension
+	is squeezed and the channels are moved last before the copy to host.
+	"""
+	restored = img_denorm(resize_back(image, w, h))
+	# Reorder while still on the tensor's device, then copy to host once
+	hwc = restored.squeeze(0).permute(1, 2, 0).contiguous()
+	pixels = hwc.cpu().numpy() * 255
+	# Gradio accepts NumPy arrays directly
+	return pixels.astype(np.uint8)
+def process_first_frame(vid_path, size):
+	"""Read the video at timestamp 0 only and return it normalized and resized.
+
+	Returns ``(img_norm, w, h)``: the frames in [-1, 1] resized to
+	``size`` x ``size``, plus the width and height before resizing.
+	"""
+	frames = torchvision.io.read_video(vid_path, start_pts=0, end_pts=0, pts_unit='sec')[0]
+	frames = frames.permute(0, 3, 1, 2)  # bchw
+	h, w = frames.shape[-2:]
+	normalized = resize((frames / 255.0 - 0.5) * 2.0, size)  # [-1, 1]
+	return normalized, w, h
 def vid_all_save(vid_d, vid_a, w, h, fps):
+	# vid_d: tchw
+	# vid_a: tchw
+	t, c, _, _ = vid_d.size()
+	vid_d_batch = resize_back(vid_d, w, h)
+	vid_a_batch = resize_back(vid_a, w, h)
+	vid_d = rearrange(vid_d_batch, "t c h w -> t h w c")  # T H W C
+	vid_a = rearrange(vid_a_batch, "t c h w -> t h w c")  # T H W C
+	vid_all = torch.cat([vid_d, vid_a], dim=2)
+	vid_a_np = (vid_denorm(vid_a).cpu().numpy() * 255).astype('uint8')
+	vid_all_np = (vid_denorm(vid_all).cpu().numpy() * 255).astype('uint8')
 	with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as output_path:
 		imageio.mimwrite(output_path.name, vid_a_np, fps=fps, codec='libx264', quality=8)
 def vid_edit(gen, chunk_size, device):
+	@torch.compile
+	def compiled_enc_img(image_tensor, selected_s):
+		"""torch.compile wrapper around ``gen.enc_img``, called with ``labels_v`` from outside this function."""
+		encoded = gen.enc_img(image_tensor, labels_v, selected_s)
+		return encoded
+	@torch.compile
+	def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
+		"""torch.compile wrapper around ``gen.dec_img``."""
+		decoded = gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
+		return decoded
+	@torch.compile
+	def compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
+		"""torch.compile wrapper around ``gen.dec_vid`` for one batch of target frames."""
+		decoded = gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
+		return decoded
+	# Pre-warm the compiled model with dummy data to reduce first-run compilation time
+	def _warmup_model():
+		"""Run the compiled encode/decode paths once on random inputs.
+
+		Failures are printed and swallowed on purpose: warm-up is best effort,
+		and compilation then happens on first real use instead.
+		"""
+		print("[img_edit] Pre-warming model compilation...")
+		dummy_image = torch.randn(1, 3, 512, 512, device=device)
+		dummy_video = torch.randn(chunk_size, 3, 512, 512, device=device)
+		dummy_selected_s = [0.0] * len(labels_v)
+		try:
+			with torch.inference_mode():
+				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
+				_ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
+			print("[img_edit] Model pre-warming completed successfully")
+		except Exception as e:
+			print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
+		try:
+			with torch.inference_mode():
+				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
+				# Keep the batch dim (1xCxHxW) so the warm-up input has the same
+				# rank as the [0:1] slice edit_vid passes as img_start
+				_ = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, dummy_video[0:1], dummy_video)
+			print("[img_animation] Model pre-warming completed successfully")
+		except Exception as e:
+			print(f"[img_animation] Model pre-warming failed (will compile on first use): {e}")
+	# Pre-warm the model
+	_warmup_model()
 	@spaces.GPU
+	@torch.inference_mode()
 	def edit_img(video, *selected_s):
+		"""Apply the slider values to the video's first frame and return the edited image."""
+		frame, w, h = process_first_frame(video, 512)
+		latents = compiled_enc_img(frame.to(device), selected_s)
+		edited = compiled_dec_img(*latents)
+		# de-norm
+		return img_postprocessing(edited, w, h)
 	@spaces.GPU
+	@torch.inference_mode()
 	def edit_vid(video, *selected_s):
+		"""Apply the slider values to every frame of ``video``.
+
+		The first frame is encoded once; the remaining work is decoded in
+		chunks of at most ``chunk_size`` frames.
+
+		Returns:
+			``(edited_image, animated_video, animated_all_video)``: the
+			post-processed first edited frame and the two values returned by
+			``vid_all_save``.
+		"""
 		video_target_tensor, fps, w, h = vid_preprocessing(video, 512)
 		video_target_tensor = video_target_tensor.to(device)
+		img_start = video_target_tensor[0:1, :, :, :]
+		# Frames are on dim 0 (TCHW); dim 1 is the channel axis
+		t = video_target_tensor.size(0)
+		z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(img_start, selected_s)
+		res = []
+		# Step by chunk_size so the last chunk holds the remainder and is never empty
+		for start in range(0, t, chunk_size):
+			img_target_batch = video_target_tensor[start:start + chunk_size, :, :, :]
+			img_animated_batch = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
+			res.append(img_animated_batch)
+		edited_video_tensor = torch.cat(res, dim=0)  # TCHW
+		edited_image_tensor = edited_video_tensor[0:1,:,:,:]
 		# de-norm
+		animated_video, animated_all_video = vid_all_save(video_target_tensor, edited_video_tensor, w, h, fps)
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
+		return edited_image, animated_video, animated_all_video
 	def clear_media():
+		"""Return reset values: three ``None`` entries followed by one 0 per entry of ``labels_k``."""
+		slider_resets = [0 for _ in labels_k]
+		return (None, None, None, *slider_resets)
 						video_all_output = gr.Video(label="Videos", elem_id="output_vid_all")
 			with gr.Column(scale=1):
+				with gr.Accordion("Control Panel (Using Sliders to Edit Image)", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
+							#edit_btn = gr.Button("Edit",elem_id="button_edit")
 							animate_btn = gr.Button("Generate",elem_id="button_generate")
+							clear_btn = gr.Button("Clear",elem_id="button_clear")
+		for slider in inputs_s:
+			slider.change(
+				fn=edit_img,
+				inputs=[video_input] + inputs_s,
+				outputs=[image_output],
+				show_progress='hidden',
+				trigger_mode='always_last',
+				# currently we have a latency around 450ms
+				stream_every=0.5
+			)
+		#edit_btn.click(
+		#	fn=edit_img,
+		#	inputs=[video_input] + inputs_s,
+		#	outputs=[image_output],
+		#	show_progress=True
+		#)
 		animate_btn.click(
 			fn=edit_vid,
 				['./data/driving/driving9.mp4', 0, 0, 0, 0, 0, 0, 0,
 				 0, 0, 0, 0, 0, -0.1, 0.07],
 			],
+			fn=edit_vid,
 			inputs=[video_input] + inputs_s,
+			outputs=[image_output, video_output, video_all_output],
 		)

networks/generator.py CHANGED Viewed

@@ -17,6 +17,12 @@ class Generator(nn.Module):
 		self.enc = Encoder(style_dim, motion_dim, scale)
 		self.dec = Decoder(style_dim, motion_dim, scale)
 	def get_alpha(self, x):
 		return self.enc.enc_motion(x)
@@ -83,7 +89,7 @@ class Generator(nn.Module):
 		vid_target_recon = rearrange(vid_target_recon, 'b t c h w -> b c t h w')
 		return vid_target_recon # BCTHW
 	def edit_vid(self, vid_target, d_l, v_l):
 		img_source = vid_target[:, 0, :, :, :]
@@ -195,3 +201,36 @@ class Generator(nn.Module):
 		return vid_target_recon

 		self.enc = Encoder(style_dim, motion_dim, scale)
 		self.dec = Decoder(style_dim, motion_dim, scale)
+	@property
+	def device(self):
+		"""Device of the module's first parameter, cached in ``_device``.
+
+		NOTE(review): the cached value is not refreshed if the module is
+		moved to another device after the first read.
+		"""
+		# Read through __dict__: nothing visible initializes _device, and a
+		# plain read of a missing attribute would raise AttributeError
+		cached = self.__dict__.get('_device')
+		if cached is None:
+			cached = next(self.parameters()).device
+			self._device = cached
+		return cached
 	def get_alpha(self, x):
+		"""Return ``self.enc.enc_motion(x)``."""
+		motion = self.enc.enc_motion(x)
+		return motion
 		vid_target_recon = rearrange(vid_target_recon, 'b t c h w -> b c t h w')
 		return vid_target_recon # BCTHW
 	def edit_vid(self, vid_target, d_l, v_l):
 		img_source = vid_target[:, 0, :, :, :]
 		return vid_target_recon
+	def enc_img(self, img_source, d_l, v_l):
+		"""Encode ``img_source`` and add the offsets ``v_l`` to columns ``d_l`` of its motion code.
+
+		Returns ``(z_s2r, alpha_r2s, feat_rgb)``; ``alpha_r2s`` is the tensor
+		returned by ``self.enc.enc_r2t``, edited in place.
+		"""
+		z_s2r, feat_rgb = self.enc.enc_2r(img_source)
+		alpha_r2s = self.enc.enc_r2t(z_s2r)
+		# Build the offsets with alpha_r2s's device and dtype, plus a leading batch dim
+		offsets = torch.tensor(v_l, device=alpha_r2s.device, dtype=alpha_r2s.dtype)[None]
+		alpha_r2s[:, d_l] += offsets
+		return z_s2r, alpha_r2s, feat_rgb
+	def dec_img(self, z_s2r, alpha_r2s, feat_rgb):
+		"""Call ``self.dec`` with ``alpha_r2s`` wrapped in a single-element list."""
+		alphas = [alpha_r2s]
+		return self.dec(z_s2r, alphas, feat_rgb)
+	def dec_vid(self, z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
+		"""Decode a batch of target frames from a single source encoding.
+
+		``z_s2r``, ``alpha_r2s``, each entry of ``feat_rgb`` and the motion code
+		of ``img_start`` are repeated along the batch axis to the number of
+		frames in ``img_target_batch`` before the transfer and decode steps.
+		"""
+		# z_s2r: BC
+		# alpha_r2s: BC
+		# feat: BCHW
+		n_frames = img_target_batch.size(0)
+		def tile(code):
+			# Repeat a BC tensor n_frames times along the batch axis
+			return repeat(code, 'b c -> (repeat b) c', repeat=n_frames)
+		start_alpha = tile(self.get_alpha(img_start))
+		edit_alpha = tile(alpha_r2s)
+		feats = [repeat(feat, 'b c h w -> (repeat b) c h w', repeat=n_frames) for feat in feat_rgb]
+		source_code = tile(z_s2r)
+		alpha = self.enc.enc_transfer_vid(edit_alpha, img_target_batch, start_alpha)
+		return self.dec(source_code, alpha, feats)  # bs x 3 x h x w