app.py
CHANGED
@@ -12,6 +12,7 @@ import einops
 import safetensors.torch as sf
 import numpy as np
 import random
+import time
 import math
 # 20250506 pftq: Added for video input loading
 import decord
@@ -422,6 +423,37 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
         clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
         clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
 
+        def post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
+            total_generated_latent_frames += int(generated_latents.shape[2])
+            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
+
+            if not high_vram:
+                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+                load_model_as_complete(vae, target_device=gpu)
+
+            if history_pixels is None:
+                real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
+                history_pixels = vae_decode(real_history_latents, vae).cpu()
+            else:
+                section_latent_frames = latent_window_size * 2
+                overlapped_frames = latent_window_size * 4 - 3
+
+                real_history_latents = history_latents[:, :, max(-section_latent_frames, -total_generated_latent_frames):, :, :]
+                history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
+
+            if not high_vram:
+                unload_complete_models()
+
+            if enable_preview or section_index == total_latent_sections - 1:
+                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
+
+                save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
+
+                print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
+
+                stream.output_queue.push(('file', output_filename))
+            return [total_generated_latent_frames, history_latents, history_pixels]
+
         for section_index in range(total_latent_sections):
             if stream.input_queue.top() == 'end':
                 stream.output_queue.push(('end', None))
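Note: this hunk moves the per-section decode-and-append logic into a post_process helper, which blends each newly decoded chunk into the running pixel history via soft_append_bcthw. The snippet below is only a rough sketch of what such an overlap-blend append typically does (a linear cross-fade over the shared frames, BCTHW layout assumed); the real helper ships with the repo and its exact weighting may differ.

    import torch

    def soft_append_sketch(history, current, overlap):
        # history, current: (B, C, T, H, W) pixel tensors; cross-fade the last
        # `overlap` frames of history into the first `overlap` frames of current.
        w = torch.linspace(1, 0, overlap, device=history.device).view(1, 1, overlap, 1, 1)
        blended = w * history[:, :, -overlap:] + (1 - w) * current[:, :, :overlap]
        return torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)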
@@ -475,35 +507,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
                 callback=callback,
             )
 
-            total_generated_latent_frames += int(generated_latents.shape[2])
-            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
-
-            if not high_vram:
-                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                load_model_as_complete(vae, target_device=gpu)
-
-            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
-
-            if history_pixels is None:
-                history_pixels = vae_decode(real_history_latents, vae).cpu()
-            else:
-                section_latent_frames = latent_window_size * 2
-                overlapped_frames = latent_window_size * 4 - 3
-
-                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
-                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
-
-            if not high_vram:
-                unload_complete_models()
-
-            if enable_preview or section_index == total_latent_sections - 1:
-                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-
-                save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
-
-                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-
-                stream.output_queue.push(('file', output_filename))
+            [total_generated_latent_frames, history_latents, history_pixels] = post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
         except:
             traceback.print_exc()
@@ -516,8 +520,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
         return
 
 def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
-    return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (
-
+    return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
 
 @spaces.GPU(duration=get_duration)
 def process(input_image, prompt,
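Note: the GPU duration estimate now scales with the step count. A quick check of the arithmetic, with parameter values chosen only for illustration:

    def estimate_seconds(total_second_length, use_teacache, steps):
        # Same formula as get_duration above.
        return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))

    estimate_seconds(2, False, 25)  # 2 * 60 * 1.5 * 1.00 = 180.0 s
    estimate_seconds(2, True, 50)   # 2 * 60 * 0.9 * 1.25 = 135.0 s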
@@ -537,6 +540,7 @@ def process(input_image, prompt,
             use_teacache=False,
             mp4_crf=16
             ):
+    start = time.time()
     global stream
 
     if torch.cuda.device_count() == 0:
@@ -575,7 +579,17 @@ def process(input_image, prompt,
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
 
         if flag == 'end':
-
+            end = time.time()
+            secondes = int(end - start)
+            minutes = math.floor(secondes / 60)
+            secondes = secondes - (minutes * 60)
+            hours = math.floor(minutes / 60)
+            minutes = minutes - (hours * 60)
+            yield output_filename, gr.update(visible=False), gr.update(), "The video has been generated in " + \
+                ((str(hours) + " h, ") if hours != 0 else "") + \
+                ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
+                str(secondes) + " sec. " + \
+                "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", gr.update(interactive=True), gr.update(interactive=False)
             break
 
 # 20250506 pftq: Modified worker to accept video input and clean frame count
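Note: the elapsed-time split above works, but Python's divmod expresses the same conversion more compactly; an equivalent sketch (hypothetical helper, not part of the diff):

    def split_elapsed(total_seconds):
        # divmod returns (quotient, remainder) in one step.
        minutes, secondes = divmod(int(total_seconds), 60)
        hours, minutes = divmod(minutes, 60)
        return hours, minutes, secondes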
@@ -663,6 +677,63 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
     def callback(d):
         return
 
+    def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
+        # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
+        available_frames = history_latents.shape[2]  # Number of latent frames
+        max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
+        adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
+        # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
+        effective_clean_frames = max(0, num_clean_frames - 1)
+        effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
+        num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
+        num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec
+
+        total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
+        total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
+
+        indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
+        clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
+            [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
+        )
+        clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
+
+        # 20250506 pftq: Split history_latents dynamically based on available frames
+        fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
+        context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
+
+        if total_context_frames > 0:
+            context_frames = history_latents[:, :, -total_context_frames:, :, :]
+            split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
+            split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
+            if split_sizes:
+                splits = context_frames.split(split_sizes, dim=2)
+                split_idx = 0
+
+                if num_4x_frames > 0:
+                    clean_latents_4x = splits[split_idx]
+                    split_idx = 1
+                    if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                        print("Edge case for <=1 sec videos 4x")
+                        clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
+
+                if num_2x_frames > 0 and split_idx < len(splits):
+                    clean_latents_2x = splits[split_idx]
+                    if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                        print("Edge case for <=1 sec videos 2x")
+                        clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
+                    split_idx += 1
+                elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                    clean_latents_2x = clean_latents_4x
+
+                if effective_clean_frames > 0 and split_idx < len(splits):
+                    clean_latents_1x = splits[split_idx]
+
+        clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
+
+        # 20250507 pftq: Fix for <=1 sec videos.
+        max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
+        return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
+
     for idx in range(batch):
         if batch > 1:
             print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
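Note: compute_latent budgets how much temporal context is fed back into the sampler. A worked pass through the allocation, assuming latent_window_size=9, num_clean_frames=5, and 30 latent frames of history (numbers only, for illustration):

    available_frames = 30
    effective_clean_frames = max(0, 5 - 1)        # 4 (then capped at available_frames - 2)
    num_2x_frames = min(2, max(1, 30 - 4 - 1))    # 2
    num_4x_frames = min(16, max(1, 30 - 4 - 2))   # 16
    total_context_frames = 16 + 2 + 4             # 22 <= 30, so no clamping
    max_pixel_frames = min(9 * 4 - 3, 30 * 4)     # 33 pixel frames
    adjusted_latent_frames = (33 + 3) // 4        # 9 latent frames
    # indices tensor length: 1 + 16 + 2 + 4 + 9 = 32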
@@ -701,60 +772,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
             else:
                 transformer.initialize_teacache(enable_teacache=False)
 
-
-            available_frames = history_latents.shape[2]  # Number of latent frames
-            max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
-            adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
-            # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
-            effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
-            effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
-            num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
-            num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec
-
-            total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
-            total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
-
-            indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
-                [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            )
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-
-            # 20250506 pftq: Split history_latents dynamically based on available frames
-            fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
-            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
-
-            if total_context_frames > 0:
-                context_frames = history_latents[:, :, -total_context_frames:, :, :]
-                split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
-                split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
-                if split_sizes:
-                    splits = context_frames.split(split_sizes, dim=2)
-                    split_idx = 0
-
-                    if num_4x_frames > 0:
-                        clean_latents_4x = splits[split_idx]
-                        split_idx = 1
-                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 4x")
-                            clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
-
-                    if num_2x_frames > 0 and split_idx < len(splits):
-                        clean_latents_2x = splits[split_idx]
-                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 2x")
-                            clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
-                        split_idx += 1
-                    elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        clean_latents_2x = clean_latents_4x
-
-                    if effective_clean_frames > 0 and split_idx < len(splits):
-                        clean_latents_1x = splits[split_idx]
-
-            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
-
-            # 20250507 pftq: Fix for <=1 sec videos.
-            max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
+            [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
 
             generated_latents = sample_hunyuan(
                 transformer=transformer,
|
|
801 |
section_latent_frames = latent_window_size * 2
|
802 |
overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
|
803 |
|
804 |
-
|
805 |
-
history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
|
806 |
|
807 |
if not high_vram:
|
808 |
unload_complete_models()
|
@@ -844,11 +861,12 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         return
 
 def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (
+    return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
 
 # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 @spaces.GPU(duration=get_duration_video)
 def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+    start = time.time()
     global stream, high_vram
 
     if torch.cuda.device_count() == 0:
@@ -899,7 +917,18 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
 
         if flag == 'end':
-
+            end = time.time()
+            secondes = int(end - start)
+            minutes = math.floor(secondes / 60)
+            secondes = secondes - (minutes * 60)
+            hours = math.floor(minutes / 60)
+            minutes = minutes - (hours * 60)
+            yield output_filename, gr.update(visible=False), desc + \
+                " The video has been generated in " + \
+                ((str(hours) + " h, ") if hours != 0 else "") + \
+                ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
+                str(secondes) + " sec. " + \
+                " Video complete. You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", '', gr.update(interactive=True), gr.update(interactive=False)
             break
 
 def end_process():
@@ -985,6 +1014,7 @@ with block:
     timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
 
     final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
+    prompt_hint = gr.HTML("Video extension barely follows the prompt; to force to follow the prompt, you have to set the Distilled CFG Scale to 3.0 and the Context Frames to 2 but the video quality will be poor.")
     total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
 
     with gr.Row():
@@ -994,24 +1024,24 @@ with block:
 
     with gr.Accordion("Advanced settings", open=False):
         enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
-        use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
+        use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
 
-        n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
+        n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
 
         latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
-        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG.')
+        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very few motion, you may have brutal brightness change; this can be fixed increasing the steps.')
 
         with gr.Row():
             no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
             resolution = gr.Dropdown([
-                640,
-                672,
-                704,
-                768,
-                832,
-                864,
-                960
-            ], value=
+                ["409,600 px (working)", 640],
+                ["451,584 px (working)", 672],
+                ["495,616 px (VRAM pb on HF)", 704],
+                ["589,824 px (not tested)", 768],
+                ["692,224 px (not tested)", 832],
+                ["746,496 px (not tested)", 864],
+                ["921,600 px (not tested)", 960]
+            ], value=672, label="Resolution (width x height)", info="Do not affect the generation time")
 
     # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
     cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
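Note: the dropdown choices change from bare integers to [label, value] pairs. Gradio renders the label in the UI and passes the underlying value to callbacks; a minimal sketch of that mechanism (choice labels borrowed from the diff, component names assumed):

    import gradio as gr

    resolution = gr.Dropdown(
        choices=[("451,584 px (working)", 672), ("409,600 px (working)", 640)],
        value=672,  # the stored value, not the display label
        label="Resolution (width x height)",
    )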
@@ -1049,157 +1079,74 @@ with block:
     ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
     ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
 
-    def save_preferences(preferences, value):
-        preferences["generation-mode"] = value
-        return preferences
-
-    def load_preferences(saved_prefs):
-        saved_prefs = init_preferences(saved_prefs)
-        return saved_prefs["generation-mode"]
-
-    def init_preferences(saved_prefs):
-        if saved_prefs is None:
-            saved_prefs = default_local_storage
-        return saved_prefs
-
-    def check_parameters(generation_mode, input_image, input_video):
-        if generation_mode == "image" and input_image is None:
-            raise gr.Error("Please provide an image to extend.")
-        if generation_mode == "video" and input_video is None:
-            raise gr.Error("Please provide a video to extend.")
-        return gr.update(interactive=True)
-
-    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
-    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
-    start_button.click(fn = check_parameters, inputs = [
-        generation_mode, input_image, input_video
-    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
-    start_button_video.click(fn = check_parameters, inputs = [
-        generation_mode, input_image, input_video
-    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
-    end_button.click(fn=end_process)
-
-    generation_mode.change(fn = save_preferences, inputs = [
-        local_storage,
-        generation_mode,
-    ], outputs = [
-        local_storage
-    ])
-
-    with gr.Row(elem_id="image_examples", visible=False):
-        gr.Examples(
+    gr.Examples(
+        label = "Examples from image",
         examples = [
            [
                "./img_examples/Example1.png", # input_image
                "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                True, # randomize_seed
                42, # seed
                672, # resolution
                1, # total_second_length
                9, # latent_window_size
-
+                25, # steps
                1.0, # cfg
                10.0, # gs
                0.0, # rs
                6, # gpu_memory_preservation
                False, # enable_preview
-
+                True, # use_teacache
                16 # mp4_crf
            ],
            [
-                "./img_examples/
-                "
+                "./img_examples/Example2.webp", # input_image
+                "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                True, # randomize_seed
                42, # seed
                672, # resolution
-
+                2, # total_second_length
                9, # latent_window_size
-
+                25, # steps
                1.0, # cfg
                10.0, # gs
                0.0, # rs
                6, # gpu_memory_preservation
                False, # enable_preview
-
+                True, # use_teacache
                16 # mp4_crf
            ],
-            ],
-            run_on_click = True,
-            fn = process,
-            inputs = ips,
-            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
-            cache_examples = torch.cuda.device_count() > 0,
-        )
-
-    with gr.Row(elem_id="video_examples", visible=False):
-        gr.Examples(
-            examples = [
            [
-                "./img_examples/
-                "
-                "
+                "./img_examples/Example2.webp", # input_image
+                "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
+                "image", # generation_mode
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                True, # randomize_seed
                42, # seed
-                1, # batch
                672, # resolution
-
+                2, # total_second_length
                9, # latent_window_size
-
+                25, # steps
                1.0, # cfg
                10.0, # gs
                0.0, # rs
                6, # gpu_memory_preservation
                False, # enable_preview
-
-
-                16, # mp4_crf
-                5, # num_clean_frames
-                default_vae
+                True, # use_teacache
+                16 # mp4_crf
            ],
            [
-                "./img_examples/
-                "
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                True, # randomize_seed
-                42, # seed
-                1, # batch
-                640, # resolution
-                1, # total_second_length
-                9, # latent_window_size
-                35, # steps
-                1.0, # cfg
-                10.0, # gs
-                0.0, # rs
-                6, # gpu_memory_preservation
-                False, # enable_preview
-                False, # use_teacache
-                False, # no_resize
-                16, # mp4_crf
-                5, # num_clean_frames
-                default_vae
-            ],
-            ],
-            run_on_click = True,
-            fn = process_video,
-            inputs = ips_video,
-            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
-            cache_examples = torch.cuda.device_count() > 0,
-        )
-
-    gr.Examples(
-        examples = [
-        [
-            "./img_examples/Example1.png", # input_image
-            "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                "./img_examples/Example3.jpg", # input_image
+                "A boy is walking to the right, full view, full-length view, cartoon",
                "image", # generation_mode
-            "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                True, # randomize_seed
                42, # seed
-
+                672, # resolution
                1, # total_second_length
                9, # latent_window_size
                25, # steps
@@ -1208,7 +1155,7 @@ with block:
                0.0, # rs
                6, # gpu_memory_preservation
                False, # enable_preview
-
+                True, # use_teacache
                16 # mp4_crf
            ]
        ],
@@ -1220,15 +1167,16 @@ with block:
    )
 
    gr.Examples(
+        label = "Examples from video",
        examples = [
            [
                "./img_examples/Example1.mp4", # input_video
                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                True, # randomize_seed
                42, # seed
                1, # batch
-
+                672, # resolution
                1, # total_second_length
                9, # latent_window_size
                25, # steps
@@ -1237,7 +1185,7 @@ with block:
                0.0, # rs
                6, # gpu_memory_preservation
                False, # enable_preview
-
+                True, # use_teacache
                False, # no_resize
                16, # mp4_crf
                5, # num_clean_frames
@@ -1250,20 +1198,57 @@ with block:
            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
            cache_examples = False,
        )
+
+    def save_preferences(preferences, value):
+        preferences["generation-mode"] = value
+        return preferences
+
+    def load_preferences(saved_prefs):
+        saved_prefs = init_preferences(saved_prefs)
+        return saved_prefs["generation-mode"]
+
+    def init_preferences(saved_prefs):
+        if saved_prefs is None:
+            saved_prefs = default_local_storage
+        return saved_prefs
+
+    def check_parameters(generation_mode, input_image, input_video):
+        if generation_mode == "image" and input_image is None:
+            raise gr.Error("Please provide an image to extend.")
+        if generation_mode == "video" and input_video is None:
+            raise gr.Error("Please provide a video to extend.")
+        return gr.update(interactive=True)
 
    def handle_generation_mode_change(generation_mode_data):
        if generation_mode_data == "text":
-            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
        elif generation_mode_data == "image":
-            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
        elif generation_mode_data == "video":
-            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
+            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
 
 
+    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
+    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
+    start_button.click(fn = check_parameters, inputs = [
+        generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+    start_button_video.click(fn = check_parameters, inputs = [
+        generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
+    end_button.click(fn=end_process)
+
+    generation_mode.change(fn = save_preferences, inputs = [
+        local_storage,
+        generation_mode,
+    ], outputs = [
+        local_storage
+    ])
+
    generation_mode.change(
        fn=handle_generation_mode_change,
        inputs=[generation_mode],
-        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch]
+        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
    )
 
    # Update display when the page loads
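Note: a Gradio event handler must return exactly one update per component listed in outputs, in the same order; that is why adding prompt_hint to the outputs list forces every branch of handle_generation_mode_change to grow from nine to ten gr.update() entries. A self-contained sketch of that contract (hypothetical two-component demo, not from the diff):

    import gradio as gr

    with gr.Blocks() as demo:
        mode = gr.Radio(["image", "video"], value="image")
        image_box = gr.Image(visible=True)
        video_box = gr.Video(visible=False)

        def toggle(m):
            # One gr.update per output component, in the order of `outputs`.
            return [gr.update(visible=(m == "image")), gr.update(visible=(m == "video"))]

        mode.change(fn=toggle, inputs=[mode], outputs=[image_box, video_box])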
@@ -1271,7 +1256,7 @@ with block:
        fn=handle_generation_mode_change, inputs = [
            generation_mode
        ], outputs = [
-            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch
+            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
        ]
    )
 