Spaces:

faraday
/

V-Express

Paused

App Files Files Community

faraday committed on May 31, 2024

Commit

86716b3

1 Parent(s): 88590fc

just cuda

Browse files

Files changed (4) hide show

app.py +3 -4
inference.py +17 -22
requirements.txt +0 -1
scripts/extract_kps_sequence_and_audio.py +0 -2

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import spaces
 import gradio as gr
 import shutil
 import subprocess
@@ -42,12 +41,13 @@ DEFAULT_MODEL_ARGS = {
     #'audio_attention_weight': 3.0
 }
-@spaces.GPU(duration=600)
 def infer(reference_image, audio_path, kps_sequence_save_path,
         output_path,
         retarget_strategy,
         reference_attention_weight, audio_attention_weight):
-    INFERENCE_ENGINE = InferenceEngine(DEFAULT_MODEL_ARGS)
     INFERENCE_ENGINE.infer(
         reference_image, audio_path, kps_sequence_save_path,
         output_path,
@@ -57,7 +57,6 @@ def infer(reference_image, audio_path, kps_sequence_save_path,
     return output_path, kps_sequence_save_path
 # Function to run V-Express demo
-@spaces.GPU(duration=600)
 def run_demo(
         reference_image, audio, video,
         kps_path, output_path, retarget_strategy,

 import gradio as gr
 import shutil
 import subprocess
     #'audio_attention_weight': 3.0
 }
+INFERENCE_ENGINE = InferenceEngine(DEFAULT_MODEL_ARGS)
 def infer(reference_image, audio_path, kps_sequence_save_path,
         output_path,
         retarget_strategy,
         reference_attention_weight, audio_attention_weight):
+    global INFERENCE_ENGINE
     INFERENCE_ENGINE.infer(
         reference_image, audio_path, kps_sequence_save_path,
         output_path,
     return output_path, kps_sequence_save_path
 # Function to run V-Express demo
 def run_demo(
         reference_image, audio, video,
         kps_path, output_path, retarget_strategy,

inference.py CHANGED Viewed

@@ -1,6 +1,3 @@
-import spaces
-import argparse
 import os
 import cv2
 import numpy as np
@@ -20,14 +17,14 @@ from pipelines import VExpressPipeline
 from pipelines.utils import draw_kps_image, save_video
 from pipelines.utils import retarget_kps
-@spaces.GPU
 def load_reference_net(unet_config_path, reference_net_path, dtype, device):
     reference_net = UNet2DConditionModel.from_config(unet_config_path).to(dtype=dtype, device=device)
     reference_net.load_state_dict(torch.load(reference_net_path, map_location="cpu"), strict=False)
     print(f'Loaded weights of Reference Net from {reference_net_path}.')
     return reference_net
-@spaces.GPU
 def load_denoising_unet(unet_config_path, denoising_unet_path, motion_module_path, dtype, device):
     inference_config_path = './inference_v2.yaml'
     inference_config = OmegaConf.load(inference_config_path)
@@ -43,14 +40,14 @@ def load_denoising_unet(unet_config_path, denoising_unet_path, motion_module_pat
     return denoising_unet
-@spaces.GPU
 def load_v_kps_guider(v_kps_guider_path, dtype, device):
     v_kps_guider = VKpsGuider(320, block_out_channels=(16, 32, 96, 256)).to(dtype=dtype, device=device)
     v_kps_guider.load_state_dict(torch.load(v_kps_guider_path, map_location="cpu"))
     print(f'Loaded weights of V-Kps Guider from {v_kps_guider_path}.')
     return v_kps_guider
-@spaces.GPU
 def load_audio_projection(
         audio_projection_path,
         dtype,
@@ -76,7 +73,7 @@ def load_audio_projection(
     print(f'Loaded weights of Audio Projection from {audio_projection_path}.')
     return audio_projection
-@spaces.GPU
 def get_scheduler():
     inference_config_path = './inference_v2.yaml'
     inference_config = OmegaConf.load(inference_config_path)
@@ -86,7 +83,7 @@ def get_scheduler():
 class InferenceEngine(object):
-    @spaces.GPU
     def __init__(self, args):
         self.init_params(args)
         self.load_models()
@@ -94,7 +91,7 @@ class InferenceEngine(object):
         self.set_vexpress_pipeline()
         self.set_face_analysis_app()
-    @spaces.GPU
     def init_params(self, args):
         for key, value in args.items():
             setattr(self, key, value)
@@ -103,7 +100,7 @@ class InferenceEngine(object):
         print("Image height: ", self.image_height)
-    @spaces.GPU
     def load_models(self):
         self.device = torch.device(f'cuda:{self.gpu_id}')
         self.dtype = torch.float16 if self.dtype == 'fp16' else torch.float32
@@ -134,11 +131,11 @@ class InferenceEngine(object):
         else:
             raise ValueError("xformers is not available. Make sure it is installed correctly")
-    @spaces.GPU
     def set_generator(self):
         self.generator = torch.manual_seed(self.seed)
-    @spaces.GPU
     def set_vexpress_pipeline(self):
         print("VAE exists (2): ", self.vae)
         self.pipeline = VExpressPipeline(
@@ -152,7 +149,7 @@ class InferenceEngine(object):
             scheduler=self.scheduler,
         ).to(dtype=self.dtype, device=self.device)
-    @spaces.GPU
     def set_face_analysis_app(self):
         self.app = FaceAnalysis(
             providers=['CUDAExecutionProvider'],
@@ -161,7 +158,7 @@ class InferenceEngine(object):
         )
         self.app.prepare(ctx_id=0, det_size=(self.image_height, self.image_width))
-    @spaces.GPU
     def get_reference_image_for_kps(self, reference_image_path):
         reference_image = Image.open(reference_image_path).convert('RGB')
         print("Image width ???", self.image_width)
@@ -172,7 +169,7 @@ class InferenceEngine(object):
         reference_kps = self.app.get(reference_image_for_kps)[0].kps[:3]
         return reference_image, reference_image_for_kps, reference_kps
-    @spaces.GPU
     def get_waveform_video_length(self, audio_path):
         _, audio_waveform, meta_info = torchvision.io.read_video(audio_path, pts_unit='sec')
         audio_sampling_rate = meta_info['audio_fps']
@@ -190,7 +187,7 @@ class InferenceEngine(object):
         print(f'The corresponding video length is {video_length}.')
         return audio_waveform, video_length
-    @spaces.GPU
     def get_kps_sequence(self, kps_path, reference_kps, video_length, retarget_strategy):
         if kps_path != "":
             assert os.path.exists(kps_path), f'{kps_path} does not exist'
@@ -213,7 +210,7 @@ class InferenceEngine(object):
         return kps_sequence
-    @spaces.GPU
     def get_kps_images(self, kps_sequence, reference_image_for_kps, video_length):
         kps_images = []
         for i in range(video_length):
@@ -222,7 +219,6 @@ class InferenceEngine(object):
             kps_images.append(Image.fromarray(kps_image))
         return kps_images
-    @spaces.GPU(duration=600)
     def get_video_latents(self, reference_image, kps_images, audio_waveform, video_length, reference_attention_weight, audio_attention_weight):
         vae_scale_factor = 8
         latent_height = self.image_height // vae_scale_factor
@@ -252,19 +248,18 @@ class InferenceEngine(object):
         return video_latents
-    @spaces.GPU
     def get_video_tensor(self, video_latents):
         video_tensor = self.pipeline.decode_latents(video_latents)
         if isinstance(video_tensor, np.ndarray):
             video_tensor = torch.from_numpy(video_tensor)
         return video_tensor
-    @spaces.GPU
     def save_video_tensor(self, video_tensor, audio_path, output_path):
         save_video(video_tensor, audio_path, output_path, self.fps)
         print(f'The generated video has been saved at {output_path}.')
-    @spaces.GPU(duration=600)
     def infer(
             self,
             reference_image_path, audio_path, kps_path,

 import os
 import cv2
 import numpy as np
 from pipelines.utils import draw_kps_image, save_video
 from pipelines.utils import retarget_kps
 def load_reference_net(unet_config_path, reference_net_path, dtype, device):
     """Build the Reference Net from a UNet config and load its checkpoint.

     Args:
         unet_config_path: Config handed to ``UNet2DConditionModel.from_config``.
         reference_net_path: Checkpoint path read with ``torch.load``.
         dtype: Dtype the newly built model is converted to.
         device: Device the newly built model is moved to.

     Returns:
         The ``UNet2DConditionModel`` instance after the checkpoint is applied.
     """
     # The model is instantiated from config only, then moved to the target dtype/device.
     reference_net = UNet2DConditionModel.from_config(unet_config_path).to(dtype=dtype, device=device)
     # The checkpoint is deserialized on the CPU; strict=False means missing or
     # unexpected keys in the state dict do not raise.
     reference_net.load_state_dict(torch.load(reference_net_path, map_location="cpu"), strict=False)
     print(f'Loaded weights of Reference Net from {reference_net_path}.')
     return reference_net
 def load_denoising_unet(unet_config_path, denoising_unet_path, motion_module_path, dtype, device):
     inference_config_path = './inference_v2.yaml'
     inference_config = OmegaConf.load(inference_config_path)
     return denoising_unet
 def load_v_kps_guider(v_kps_guider_path, dtype, device):
     """Build the V-Kps Guider and load its checkpoint.

     Args:
         v_kps_guider_path: Checkpoint path read with ``torch.load``.
         dtype: Dtype the newly built guider is converted to.
         device: Device the newly built guider is moved to.

     Returns:
         The ``VKpsGuider`` instance after the checkpoint is applied.
     """
     # Constructor arguments are hard-coded here; NOTE(review): they presumably
     # have to match the architecture the checkpoint was trained with - confirm.
     v_kps_guider = VKpsGuider(320, block_out_channels=(16, 32, 96, 256)).to(dtype=dtype, device=device)
     # The checkpoint is deserialized on the CPU; load_state_dict runs with its
     # default strict matching, so a key mismatch raises.
     v_kps_guider.load_state_dict(torch.load(v_kps_guider_path, map_location="cpu"))
     print(f'Loaded weights of V-Kps Guider from {v_kps_guider_path}.')
     return v_kps_guider
 def load_audio_projection(
         audio_projection_path,
         dtype,
     print(f'Loaded weights of Audio Projection from {audio_projection_path}.')
     return audio_projection
 def get_scheduler():
     inference_config_path = './inference_v2.yaml'
     inference_config = OmegaConf.load(inference_config_path)
 class InferenceEngine(object):
     def __init__(self, args):
         self.init_params(args)
         self.load_models()
         self.set_vexpress_pipeline()
         self.set_face_analysis_app()
     def init_params(self, args):
         for key, value in args.items():
             setattr(self, key, value)
         print("Image height: ", self.image_height)
     def load_models(self):
         self.device = torch.device(f'cuda:{self.gpu_id}')
         self.dtype = torch.float16 if self.dtype == 'fp16' else torch.float32
         else:
             raise ValueError("xformers is not available. Make sure it is installed correctly")
     def set_generator(self):
         """Seed PyTorch with ``self.seed`` and store the generator on ``self.generator``."""
         # torch.manual_seed seeds the global default RNG and returns that
         # default generator, so self.generator is not an independent generator.
         self.generator = torch.manual_seed(self.seed)
     def set_vexpress_pipeline(self):
         print("VAE exists (2): ", self.vae)
         self.pipeline = VExpressPipeline(
             scheduler=self.scheduler,
         ).to(dtype=self.dtype, device=self.device)
     def set_face_analysis_app(self):
         self.app = FaceAnalysis(
             providers=['CUDAExecutionProvider'],
         )
         self.app.prepare(ctx_id=0, det_size=(self.image_height, self.image_width))
     def get_reference_image_for_kps(self, reference_image_path):
         reference_image = Image.open(reference_image_path).convert('RGB')
         print("Image width ???", self.image_width)
         reference_kps = self.app.get(reference_image_for_kps)[0].kps[:3]
         return reference_image, reference_image_for_kps, reference_kps
     def get_waveform_video_length(self, audio_path):
         _, audio_waveform, meta_info = torchvision.io.read_video(audio_path, pts_unit='sec')
         audio_sampling_rate = meta_info['audio_fps']
         print(f'The corresponding video length is {video_length}.')
         return audio_waveform, video_length
     def get_kps_sequence(self, kps_path, reference_kps, video_length, retarget_strategy):
         if kps_path != "":
             assert os.path.exists(kps_path), f'{kps_path} does not exist'
         return kps_sequence
     def get_kps_images(self, kps_sequence, reference_image_for_kps, video_length):
         kps_images = []
         for i in range(video_length):
             kps_images.append(Image.fromarray(kps_image))
         return kps_images
     def get_video_latents(self, reference_image, kps_images, audio_waveform, video_length, reference_attention_weight, audio_attention_weight):
         vae_scale_factor = 8
         latent_height = self.image_height // vae_scale_factor
         return video_latents
     def get_video_tensor(self, video_latents):
         """Decode video latents with the pipeline and return the result as a tensor.

         Args:
             video_latents: Latents passed to ``self.pipeline.decode_latents``.

         Returns:
             The decoded video; a NumPy array result is converted with
             ``torch.from_numpy``, anything else is returned unchanged.
         """
         video_tensor = self.pipeline.decode_latents(video_latents)
         # Normalise the return type: wrap NumPy output as a torch tensor.
         if isinstance(video_tensor, np.ndarray):
             video_tensor = torch.from_numpy(video_tensor)
         return video_tensor
     def save_video_tensor(self, video_tensor, audio_path, output_path):
         """Write the generated video by delegating to ``save_video``.

         Args:
             video_tensor: Video data forwarded to ``save_video``.
             audio_path: Audio path forwarded to ``save_video``.
             output_path: Destination path forwarded to ``save_video`` and
                 echoed in the log message.
         """
         # The frame rate comes from the engine's configured self.fps.
         save_video(video_tensor, audio_path, output_path, self.fps)
         print(f'The generated video has been saved at {output_path}.')
     def infer(
             self,
             reference_image_path, audio_path, kps_path,

requirements.txt CHANGED Viewed

@@ -15,4 +15,3 @@ tqdm==4.66.1
 xformers==0.0.20
 accelerate==0.19.0
 gitpython==3.1.31
-spaces==0.28.3

 xformers==0.0.20
 accelerate==0.19.0
 gitpython==3.1.31

scripts/extract_kps_sequence_and_audio.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import spaces
 import argparse
 import os
@@ -7,7 +6,6 @@ import torch
 from insightface.app import FaceAnalysis
 from imageio_ffmpeg import get_ffmpeg_exe
-@spaces.GPU
 def main(args):
     app = FaceAnalysis(
         providers=['CUDAExecutionProvider'],

 import argparse
 import os
 from insightface.app import FaceAnalysis
 from imageio_ffmpeg import get_ffmpeg_exe
 def main(args):
     app = FaceAnalysis(
         providers=['CUDAExecutionProvider'],