Spaces: Running on L40S
Update app.py
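Inline the GPU cleanup and disable fixed seeding: remove the cleanup_resources() helper, comment out seed = 43 and its CUDA generator, wrap the pipe(...) call in torch.no_grad(), and run torch.cuda.empty_cache() / gc.collect() right before returning the results.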
app.py CHANGED
@@ -65,12 +65,6 @@ from diffusers.models import AutoencoderKLCogVideoX
 from transformers import SiglipImageProcessor, SiglipVisionModel
 from diffposetalk.diffposetalk import DiffPoseTalk
 
-def cleanup_resources():
-    """Clear CUDA cache and garbage collect"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-
 # Helper functions from the original script
 def parse_video(driving_frames, max_frame_num, fps=25):
     video_length = len(driving_frames)
@@ -180,8 +174,8 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     final_output_path = temp_output_file.name
 
     # Set seed
-    seed = 43
-    generator = torch.Generator(device="cuda").manual_seed(seed)
+    # seed = 43
+    # generator = torch.Generator(device="cuda").manual_seed(seed)
 
     progress(0.2, desc="Processing image...")
     # Load and process image
@@ -244,20 +238,20 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
 
     progress(0.6, desc="Generating animation (this may take a while)...")
     # Generate video
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with torch.no_grad():
+        sample = pipe(
+            image=image,
+            image_face=image_face,
+            control_video=input_video,
+            prompt="",
+            negative_prompt="",
+            height=480,
+            width=720,
+            num_frames=49,
+            # generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=steps,
+        )
     out_samples = sample.frames[0]
 
     out_samples = out_samples[2:] # Skip first two frames
@@ -290,7 +284,10 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     comparison_with_audio = save_video_with_audio(comparison_path, audio_path, comparison_with_audio)
 
     progress(1.0, desc="Done!")
-
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
     return result_path, comparison_with_audio
 
 # Create Gradio interface
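Aside: the generation hunk above follows the standard inference-time pattern for a diffusers-style pipeline: no autograd graph during generation, then an explicit memory release once the frames are off the pipeline. Below is a minimal standalone sketch of that pattern, assuming pipe returns an object with a .frames attribute as in the diff; run_inference and pipe_kwargs are illustrative names, not code from app.py.

import gc

import torch

def run_inference(pipe, **pipe_kwargs):
    """Run one generation pass without autograd, then free GPU memory."""
    with torch.no_grad():              # inference only: skip building the autograd graph
        sample = pipe(**pipe_kwargs)
    frames = sample.frames[0]          # grab the decoded frames before cleanup
    if torch.cuda.is_available():      # guard added here so the sketch also runs on CPU
        torch.cuda.empty_cache()       # hand cached CUDA blocks back to the allocator
    gc.collect()                       # drop lingering Python references
    return frames

The commit itself calls empty_cache() unguarded, which is fine on a GPU Space like this one; the is_available() check only matters if the same code ever runs on CPU.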
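On the seeding change: with the generator commented out, every run draws fresh noise, so outputs vary between runs. If reproducibility is wanted back, the old lines can return behind an optional seed instead of the hard-coded 43. A hedged sketch follows; make_generator is a hypothetical helper, not part of app.py.

import torch

def make_generator(seed=None, device="cuda"):
    # None lets the pipeline draw fresh noise; an int pins it for reproducible runs
    if seed is None:
        return None
    return torch.Generator(device=device).manual_seed(seed)

# e.g. restore the old deterministic behaviour with:
#   sample = pipe(..., generator=make_generator(43), ...)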