Fabrice-TIERCELIN committed
Commit e00725a · verified · 1 Parent(s): cdb2539

Generation time

Files changed (1):
    app.py (+201 -216)

app.py CHANGED
@@ -12,6 +12,7 @@ import einops
 import safetensors.torch as sf
 import numpy as np
 import random
+import time
 import math
 # 20250506 pftq: Added for video input loading
 import decord
@@ -422,6 +423,37 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
     clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
     clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
 
+    def post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
+        total_generated_latent_frames += int(generated_latents.shape[2])
+        history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
+
+        if not high_vram:
+            offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+            load_model_as_complete(vae, target_device=gpu)
+
+        if history_pixels is None:
+            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
+            history_pixels = vae_decode(real_history_latents, vae).cpu()
+        else:
+            section_latent_frames = latent_window_size * 2
+            overlapped_frames = latent_window_size * 4 - 3
+
+            real_history_latents = history_latents[:, :, max(-section_latent_frames, -total_generated_latent_frames):, :, :]
+            history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
+
+        if not high_vram:
+            unload_complete_models()
+
+        if enable_preview or section_index == total_latent_sections - 1:
+            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
+
+            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
+
+            print(f'Decoded. Current pixel shape {history_pixels.shape}')
+
+            stream.output_queue.push(('file', output_filename))
+        return [total_generated_latent_frames, history_latents, history_pixels]
+
     for section_index in range(total_latent_sections):
         if stream.input_queue.top() == 'end':
             stream.output_queue.push(('end', None))
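Note: soft_append_bcthw comes from the Space's helper utilities and cross-fades the frames where consecutive sections overlap instead of hard-cutting between them. A minimal sketch of that blend, assuming BCTHW tensors and a linear ramp (the real helper may differ in detail):

    import torch

    def soft_append_bcthw_sketch(history, current, overlap):
        # history, current: (batch, channels, time, height, width) pixel tensors
        # Fade history out and current in across the overlapping frames.
        weights = torch.linspace(1, 0, overlap).view(1, 1, overlap, 1, 1)
        blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
        return torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)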
@@ -475,35 +507,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
                 callback=callback,
             )
 
-            total_generated_latent_frames += int(generated_latents.shape[2])
-            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
-
-            if not high_vram:
-                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                load_model_as_complete(vae, target_device=gpu)
-
-            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
-
-            if history_pixels is None:
-                history_pixels = vae_decode(real_history_latents, vae).cpu()
-            else:
-                section_latent_frames = latent_window_size * 2
-                overlapped_frames = latent_window_size * 4 - 3
-
-                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
-                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
-
-            if not high_vram:
-                unload_complete_models()
-
-            if enable_preview or section_index == total_latent_sections - 1:
-                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-
-                save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
-
-                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-
-                stream.output_queue.push(('file', output_filename))
+            [total_generated_latent_frames, history_latents, history_pixels] = post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
     except:
         traceback.print_exc()
 
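Note: the bracketed assignment at the call site is ordinary Python sequence unpacking; post_process returns a 3-element list and the worker rebinds its decode state from it. A trivial illustration:

    def returns_three():
        return [1, 'two', 3.0]

    [a, b, c] = returns_three()  # identical in effect to: a, b, c = returns_three()
    print(a, b, c)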
@@ -516,8 +520,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
         return
 
 def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
-    return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (2**((resolution - 640) / 640)) * (1 + ((steps - 25) / 100))
-
+    return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
 
 @spaces.GPU(duration=get_duration)
 def process(input_image, prompt,
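Note: get_duration is the dynamic ZeroGPU time estimate, and the commit drops the 2**((resolution - 640) / 640) factor, so the requested quota now depends only on clip length, TeaCache, and step count (consistent with the new resolution dropdown's claim that resolution does not affect generation time). A quick check of the revised estimator, with illustrative inputs:

    def estimated_gpu_seconds(total_second_length, use_teacache, steps):
        # Mirrors the revised get_duration above.
        return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))

    print(estimated_gpu_seconds(2, False, 25))  # 180.0 s requested for a 2 s clip
    print(estimated_gpu_seconds(2, True, 25))   # 108.0 s with TeaCache enabled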
@@ -537,6 +540,7 @@ def process(input_image, prompt,
                         use_teacache=False,
                         mp4_crf=16
                         ):
+    start = time.time()
     global stream
 
     if torch.cuda.device_count() == 0:
@@ -575,7 +579,17 @@ def process(input_image, prompt,
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
 
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), gr.update(), 'To make all your generated scenes consistent, you can then apply a face swap on the main character.', gr.update(interactive=True), gr.update(interactive=False)
+            end = time.time()
+            secondes = int(end - start)
+            minutes = math.floor(secondes / 60)
+            secondes = secondes - (minutes * 60)
+            hours = math.floor(minutes / 60)
+            minutes = minutes - (hours * 60)
+            yield output_filename, gr.update(visible=False), gr.update(), "The video has been generated in " + \
+                ((str(hours) + " h, ") if hours != 0 else "") + \
+                ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
+                str(secondes) + " sec. " + \
+                "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", gr.update(interactive=True), gr.update(interactive=False)
             break
 
 # 20250506 pftq: Modified worker to accept video input and clean frame count
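Note: the hours/minutes/seconds arithmetic above can be written more compactly with divmod; an equivalent helper, shown only for comparison:

    def format_elapsed(total_seconds: int) -> str:
        minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        parts = []
        if hours:
            parts.append(f"{hours} h")
        if hours or minutes:
            parts.append(f"{minutes} min")
        parts.append(f"{seconds} sec")
        return ", ".join(parts)

    print(format_elapsed(3725))  # 1 h, 2 min, 5 sec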
@@ -663,6 +677,63 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         def callback(d):
             return
 
+        def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
+            # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
+            available_frames = history_latents.shape[2]  # Number of latent frames
+            max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
+            adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
+            # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
+            effective_clean_frames = max(0, num_clean_frames - 1)
+            effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
+            num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
+            num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec
+
+            total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
+            total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
+
+            indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
+            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
+                [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
+            )
+            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
+
+            # 20250506 pftq: Split history_latents dynamically based on available frames
+            fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
+            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
+
+            if total_context_frames > 0:
+                context_frames = history_latents[:, :, -total_context_frames:, :, :]
+                split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
+                split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
+                if split_sizes:
+                    splits = context_frames.split(split_sizes, dim=2)
+                    split_idx = 0
+
+                    if num_4x_frames > 0:
+                        clean_latents_4x = splits[split_idx]
+                        split_idx = 1
+                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                            print("Edge case for <=1 sec videos 4x")
+                            clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
+
+                    if num_2x_frames > 0 and split_idx < len(splits):
+                        clean_latents_2x = splits[split_idx]
+                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                            print("Edge case for <=1 sec videos 2x")
+                            clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
+                        split_idx += 1
+                    elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                        clean_latents_2x = clean_latents_4x
+
+                    if effective_clean_frames > 0 and split_idx < len(splits):
+                        clean_latents_1x = splits[split_idx]
+
+            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
+
+            # 20250507 pftq: Fix for <=1 sec videos.
+            max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
+            return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
+
         for idx in range(batch):
             if batch > 1:
                 print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
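Note: compute_latent budgets the latent history into 1x, 2x, and 4x context bands for sample_hunyuan, clamping each band so very short (<= 1 sec) histories still yield valid splits. A worked pass of the allocation arithmetic with illustrative values (30 latent frames available, num_clean_frames = 5):

    available_frames, num_clean_frames = 30, 5
    effective_clean_frames = min(max(0, num_clean_frames - 1), available_frames - 2)            # 4
    num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1))               # 2
    num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames))  # 16
    total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames               # 22 <= 30
    print(effective_clean_frames, num_2x_frames, num_4x_frames, total_context_frames)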
@@ -701,60 +772,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
             else:
                 transformer.initialize_teacache(enable_teacache=False)
 
-            # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
-            available_frames = history_latents.shape[2]  # Number of latent frames
-            max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
-            adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
-            # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
-            effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
-            effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
-            num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
-            num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec
-
-            total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
-            total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
-
-            indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
-                [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            )
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-
-            # 20250506 pftq: Split history_latents dynamically based on available frames
-            fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
-            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
-
-            if total_context_frames > 0:
-                context_frames = history_latents[:, :, -total_context_frames:, :, :]
-                split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
-                split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
-                if split_sizes:
-                    splits = context_frames.split(split_sizes, dim=2)
-                    split_idx = 0
-
-                    if num_4x_frames > 0:
-                        clean_latents_4x = splits[split_idx]
-                        split_idx = 1
-                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 4x")
-                            clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
-
-                    if num_2x_frames > 0 and split_idx < len(splits):
-                        clean_latents_2x = splits[split_idx]
-                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 2x")
-                            clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
-                        split_idx += 1
-                    elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        clean_latents_2x = clean_latents_4x
-
-                    if effective_clean_frames > 0 and split_idx < len(splits):
-                        clean_latents_1x = splits[split_idx]
-
-            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
-
-            # 20250507 pftq: Fix for <=1 sec videos.
-            max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
+            [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
 
             generated_latents = sample_hunyuan(
                 transformer=transformer,
@@ -801,8 +819,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
                 section_latent_frames = latent_window_size * 2
                 overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
 
-                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
-                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
+                history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu(), overlapped_frames)
 
             if not high_vram:
                 unload_complete_models()
@@ -844,11 +861,12 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         return
 
 def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (2**((resolution - 640) / 640)) * (1 + ((steps - 25) / 100))
+    return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
 
 # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 @spaces.GPU(duration=get_duration_video)
 def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+    start = time.time()
     global stream, high_vram
 
     if torch.cuda.device_count() == 0:
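Note: @spaces.GPU accepts a callable for duration; ZeroGPU invokes it with the same arguments as the decorated function to size the GPU reservation per request, which is how get_duration and get_duration_video are wired here. Minimal sketch of the pattern (assumes the Hugging Face spaces package):

    import spaces

    def my_duration(total_second_length, use_teacache):
        # Receives the same arguments as the decorated function below.
        return total_second_length * 60 * (0.9 if use_teacache else 2.3)

    @spaces.GPU(duration=my_duration)
    def generate(total_second_length, use_teacache):
        ...  # GPU-bound work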
@@ -899,7 +917,18 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)  # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
 
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), desc+' Video complete. To make all your generated scenes consistent, you can then apply a face swap on the main character.', '', gr.update(interactive=True), gr.update(interactive=False)
+            end = time.time()
+            secondes = int(end - start)
+            minutes = math.floor(secondes / 60)
+            secondes = secondes - (minutes * 60)
+            hours = math.floor(minutes / 60)
+            minutes = minutes - (hours * 60)
+            yield output_filename, gr.update(visible=False), desc + \
+                " The video has been generated in " + \
+                ((str(hours) + " h, ") if hours != 0 else "") + \
+                ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
+                str(secondes) + " sec. " + \
+                " Video complete. You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", '', gr.update(interactive=True), gr.update(interactive=False)
             break
 
 def end_process():
@@ -985,6 +1014,7 @@ with block:
     timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
 
     final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
+    prompt_hint = gr.HTML("Video extension barely follows the prompt; to force it to follow the prompt, you have to set the Distilled CFG Scale to 3.0 and the Context Frames to 2, but the video quality will be poor.")
    total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
 
     with gr.Row():
@@ -994,24 +1024,24 @@ with block:
 
     with gr.Accordion("Advanced settings", open=False):
         enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
-        use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
+        use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
 
-        n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
+        n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
 
         latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
-        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. Changing this value is not recommended.')
+        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very little motion, you may get an abrupt brightness change; this can be fixed by increasing the steps.')
 
         with gr.Row():
             no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
             resolution = gr.Dropdown([
-                640,
-                672,
-                704,
-                768,
-                832,
-                864,
-                960
-            ], value=640, label="Resolution (max width or height)")
+                ["409,600 px (working)", 640],
+                ["451,584 px (working)", 672],
+                ["495,616 px (VRAM issue on HF)", 704],
+                ["589,824 px (not tested)", 768],
+                ["692,224 px (not tested)", 832],
+                ["746,496 px (not tested)", 864],
+                ["921,600 px (not tested)", 960]
+            ], value=672, label="Resolution (width x height)", info="Does not affect the generation time")
 
         # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
         cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
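Note: gr.Dropdown accepts (display label, returned value) pairs, so the list above can show pixel counts and test status in the UI while the callbacks still receive the plain numeric size. Sketch of the mechanism:

    import gradio as gr

    resolution = gr.Dropdown(
        choices=[("409,600 px", 640), ("451,584 px", 672)],  # (label shown, value returned)
        value=672,
        label="Resolution (width x height)",
    )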
@@ -1049,157 +1079,74 @@ with block:
     ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
     ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
 
-    def save_preferences(preferences, value):
-        preferences["generation-mode"] = value
-        return preferences
-
-    def load_preferences(saved_prefs):
-        saved_prefs = init_preferences(saved_prefs)
-        return saved_prefs["generation-mode"]
-
-    def init_preferences(saved_prefs):
-        if saved_prefs is None:
-            saved_prefs = default_local_storage
-        return saved_prefs
-
-    def check_parameters(generation_mode, input_image, input_video):
-        if generation_mode == "image" and input_image is None:
-            raise gr.Error("Please provide an image to extend.")
-        if generation_mode == "video" and input_video is None:
-            raise gr.Error("Please provide a video to extend.")
-        return gr.update(interactive=True)
-
-    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
-    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
-    start_button.click(fn = check_parameters, inputs = [
-        generation_mode, input_image, input_video
-    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
-    start_button_video.click(fn = check_parameters, inputs = [
-        generation_mode, input_image, input_video
-    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
-    end_button.click(fn=end_process)
-
-    generation_mode.change(fn = save_preferences, inputs = [
-        local_storage,
-        generation_mode,
-    ], outputs = [
-        local_storage
-    ])
-
-    with gr.Row(elem_id="image_examples", visible=False):
-        gr.Examples(
+    gr.Examples(
+        label = "Examples from image",
         examples = [
             [
                 "./img_examples/Example1.png", # input_image
                 "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
                 672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
-                50, # steps
+                25, # steps
                 1.0, # cfg
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 16 # mp4_crf
             ],
             [
-                "./img_examples/Example1.png", # input_image
-                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                "./img_examples/Example2.webp", # input_image
+                "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
                 672, # resolution
-                1, # total_second_length
+                2, # total_second_length
                 9, # latent_window_size
-                35, # steps
+                25, # steps
                 1.0, # cfg
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 16 # mp4_crf
             ],
-        ],
-        run_on_click = True,
-        fn = process,
-        inputs = ips,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
-        cache_examples = torch.cuda.device_count() > 0,
-        )
-
-    with gr.Row(elem_id="video_examples", visible=False):
-        gr.Examples(
-            examples = [
             [
-                "./img_examples/Example1.mp4", # input_video
-                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                "./img_examples/Example2.webp", # input_image
+                "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
+                "image", # generation_mode
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
-                1, # batch
                 672, # resolution
-                1, # total_second_length
+                2, # total_second_length
                 9, # latent_window_size
-                50, # steps
+                25, # steps
                 1.0, # cfg
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
-                False, # no_resize
-                16, # mp4_crf
-                5, # num_clean_frames
-                default_vae
+                True, # use_teacache
+                16 # mp4_crf
             ],
             [
-                "./img_examples/Example1.mp4", # input_video
-                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                True, # randomize_seed
-                42, # seed
-                1, # batch
-                640, # resolution
-                1, # total_second_length
-                9, # latent_window_size
-                35, # steps
-                1.0, # cfg
-                10.0, # gs
-                0.0, # rs
-                6, # gpu_memory_preservation
-                False, # enable_preview
-                False, # use_teacache
-                False, # no_resize
-                16, # mp4_crf
-                5, # num_clean_frames
-                default_vae
-            ],
-        ],
-        run_on_click = True,
-        fn = process_video,
-        inputs = ips_video,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
-        cache_examples = torch.cuda.device_count() > 0,
-    )
-
-    gr.Examples(
-        examples = [
-            [
-                "./img_examples/Example1.png", # input_image
-                "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                "./img_examples/Example3.jpg", # input_image
+                "A boy is walking to the right, full view, full-length view, cartoon",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
-                640, # resolution
+                672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
                 25, # steps
@@ -1208,7 +1155,7 @@ with block:
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 16 # mp4_crf
             ]
         ],
@@ -1220,15 +1167,16 @@ with block:
     )
 
     gr.Examples(
+        label = "Examples from video",
         examples = [
             [
                 "./img_examples/Example1.mp4", # input_video
                 "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
                 1, # batch
-                640, # resolution
+                672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
                 25, # steps
@@ -1237,7 +1185,7 @@ with block:
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 False, # no_resize
                 16, # mp4_crf
                 5, # num_clean_frames
@@ -1250,20 +1198,57 @@ with block:
         outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
         cache_examples = False,
     )
+
+    def save_preferences(preferences, value):
+        preferences["generation-mode"] = value
+        return preferences
+
+    def load_preferences(saved_prefs):
+        saved_prefs = init_preferences(saved_prefs)
+        return saved_prefs["generation-mode"]
+
+    def init_preferences(saved_prefs):
+        if saved_prefs is None:
+            saved_prefs = default_local_storage
+        return saved_prefs
+
+    def check_parameters(generation_mode, input_image, input_video):
+        if generation_mode == "image" and input_image is None:
+            raise gr.Error("Please provide an image to extend.")
+        if generation_mode == "video" and input_video is None:
+            raise gr.Error("Please provide a video to extend.")
+        return gr.update(interactive=True)
 
     def handle_generation_mode_change(generation_mode_data):
         if generation_mode_data == "text":
-            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "image":
-            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "video":
-            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
+            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
 
 
+    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
+    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
+    start_button.click(fn = check_parameters, inputs = [
+        generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+    start_button_video.click(fn = check_parameters, inputs = [
+        generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
+    end_button.click(fn=end_process)
+
+    generation_mode.change(fn = save_preferences, inputs = [
+        local_storage,
+        generation_mode,
+    ], outputs = [
+        local_storage
+    ])
+
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
-        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch]
+        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
     )
 
     # Update display when the page loads
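Note: the preference helpers just round-trip a small dict through the browser-side storage component. A plain-Python usage sketch of the helpers defined in this hunk (default_local_storage is assumed to be the app's default dict):

    default_local_storage = {"generation-mode": "image"}  # assumed default shape

    prefs = init_preferences(None)            # falls back to the default dict
    prefs = save_preferences(prefs, "video")  # persist the chosen mode
    print(load_preferences(prefs))            # -> video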
@@ -1271,7 +1256,7 @@ with block:
         fn=handle_generation_mode_change, inputs = [
             generation_mode
         ], outputs = [
-            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch
+            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
         ]
     )
 