Fabrice-TIERCELIN committed
Commit ae205be · verified · 1 Parent(s): 1e823d2

Start & end frame mode

Files changed (1)
  1. app.py +418 -35
app.py CHANGED
@@ -7,7 +7,14 @@ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.di
7
  try:
8
  import spaces
9
  except:
10
- print("Not on HuggingFace")
11
  import gradio as gr
12
  import torch
13
  import traceback
@@ -198,9 +205,6 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
198
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
199
  #print(f"Tensor shape: {frames_pt.shape}")
200
 
201
- # 20250507 pftq: Save pixel frames for use in worker
202
- input_video_pixels = frames_pt.cpu()
203
-
204
  # 20250506 pftq: Move to device
205
  #print(f"Moving tensor to device: {device}")
206
  frames_pt = frames_pt.to(device)
@@ -252,7 +256,7 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
252
  torch.cuda.empty_cache()
253
  #print("VAE moved back to CPU, CUDA cache cleared")
254
 
255
- return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
256
 
257
  except Exception as e:
258
  print(f"Error in video_encode: {str(e)}")
@@ -306,7 +310,7 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
306
  return False
307
 
308
  @torch.no_grad()
309
- def worker(input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
310
  def encode_prompt(prompt, n_prompt):
311
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
312
 
@@ -412,7 +416,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
412
 
413
  rnd = torch.Generator("cpu").manual_seed(seed)
414
 
415
- history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
416
  start_latent = start_latent.to(history_latents)
417
  history_pixels = None
418
 
@@ -575,6 +579,285 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
575
  stream.output_queue.push(('end', None))
576
  return
577
 
578
  # 20250506 pftq: Modified worker to accept video input and clean frame count
579
  @torch.no_grad()
580
  def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
@@ -602,8 +885,8 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
602
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
603
 
604
  # 20250506 pftq: Encode video
605
- start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)[:6]
606
- start_latent = start_latent.to(dtype=torch.float32).cpu()
607
  video_latents = video_latents.cpu()
608
 
609
  total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
@@ -855,18 +1138,17 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
855
  stream.output_queue.push(('end', None))
856
  return
857
 
858
- def get_duration(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
859
  return allocation_time
860
 
861
- # Remove this decorator if you run on local
862
  @spaces.GPU(duration=get_duration)
863
- def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
864
  ):
865
  start = time.time()
866
  global stream
867
  stream = AsyncStream()
868
 
869
- async_run(worker, input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
870
 
871
  output_filename = None
872
 
@@ -896,6 +1178,7 @@ def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prom
896
  break
897
 
898
  def process(input_image,
 
899
  image_position=0,
900
  prompt="",
901
  generation_mode="image",
@@ -907,18 +1190,18 @@ def process(input_image,
907
  resolution=640,
908
  total_second_length=5,
909
  latent_window_size=9,
910
- steps=25,
911
  cfg=1.0,
912
  gs=10.0,
913
  rs=0.0,
914
  gpu_memory_preservation=6,
915
- enable_preview=True,
916
  use_teacache=False,
917
  mp4_crf=16,
918
  fps_number=30
919
  ):
920
  if auto_allocation:
921
- allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
922
 
923
  if torch.cuda.device_count() == 0:
924
  gr.Warning('Set this space to GPU config to make it work.')
@@ -939,6 +1222,7 @@ def process(input_image,
939
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
940
 
941
  yield from process_on_gpu(input_image,
 
942
  image_position,
943
  prompts,
944
  generation_mode,
@@ -962,7 +1246,6 @@ def process(input_image,
962
  def get_duration_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
963
  return allocation_time
964
 
965
- # Remove this decorator if you run on local
966
  @spaces.GPU(duration=get_duration_video)
967
  def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
968
  start = time.time()
@@ -1003,7 +1286,7 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
1003
  def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1004
  global high_vram
1005
  if auto_allocation:
1006
- allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25)), 600)
1007
 
1008
  if torch.cuda.device_count() == 0:
1009
  gr.Warning('Set this space to GPU config to make it work.')
@@ -1103,9 +1386,10 @@ with block:
1103
  local_storage = gr.BrowserState(default_local_storage)
1104
  with gr.Row():
1105
  with gr.Column():
1106
- generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1107
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1108
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
 
1109
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1110
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1111
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
@@ -1131,7 +1415,7 @@ with block:
1131
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1132
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1133
 
1134
- n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1135
 
1136
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1137
 
@@ -1186,8 +1470,7 @@ with block:
1186
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1187
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1188
 
1189
- # 20250506 pftq: Updated inputs to include num_clean_frames
1190
- ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1191
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1192
 
1193
  gr.Examples(
@@ -1195,10 +1478,11 @@ with block:
1195
  examples = [
1196
  [
1197
  None, # input_image
 
1198
  0, # image_position
1199
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1200
  "text", # generation_mode
1201
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1202
  True, # randomize_seed
1203
  42, # seed
1204
  True, # auto_allocation
@@ -1229,10 +1513,11 @@ with block:
1229
  examples = [
1230
  [
1231
  "./img_examples/Example1.png", # input_image
 
1232
  0, # image_position
1233
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1234
  "image", # generation_mode
1235
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1236
  True, # randomize_seed
1237
  42, # seed
1238
  True, # auto_allocation
@@ -1252,10 +1537,11 @@ with block:
1252
  ],
1253
  [
1254
  "./img_examples/Example2.webp", # input_image
 
1255
  0, # image_position
1256
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1257
  "image", # generation_mode
1258
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1259
  True, # randomize_seed
1260
  42, # seed
1261
  True, # auto_allocation
@@ -1275,10 +1561,11 @@ with block:
1275
  ],
1276
  [
1277
  "./img_examples/Example2.webp", # input_image
 
1278
  0, # image_position
1279
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1280
  "image", # generation_mode
1281
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1282
  True, # randomize_seed
1283
  42, # seed
1284
  True, # auto_allocation
@@ -1298,10 +1585,11 @@ with block:
1298
  ],
1299
  [
1300
  "./img_examples/Example3.jpg", # input_image
 
1301
  0, # image_position
1302
- "A boy is walking to the right, full view, full-length view, cartoon",
1303
  "image", # generation_mode
1304
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1305
  True, # randomize_seed
1306
  42, # seed
1307
  True, # auto_allocation
@@ -1321,10 +1609,11 @@ with block:
1321
  ],
1322
  [
1323
  "./img_examples/Example4.webp", # input_image
 
1324
  100, # image_position
1325
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1326
  "image", # generation_mode
1327
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1328
  True, # randomize_seed
1329
  42, # seed
1330
  True, # auto_allocation
@@ -1350,13 +1639,47 @@ with block:
1350
  cache_examples = False,
1351
  )
1352
 
1353
  gr.Examples(
1354
  label = "🎥 Examples from video",
1355
  examples = [
1356
  [
1357
  "./img_examples/Example1.mp4", # input_video
1358
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1359
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1360
  True, # randomize_seed
1361
  42, # seed
1362
  True, # auto_allocation
@@ -1401,17 +1724,77 @@ with block:
1401
  def check_parameters(generation_mode, input_image, input_video):
1402
  if generation_mode == "image" and input_image is None:
1403
  raise gr.Error("Please provide an image to extend.")
1404
  if generation_mode == "video" and input_video is None:
1405
  raise gr.Error("Please provide a video to extend.")
1406
  return [gr.update(interactive=True), gr.update(visible = True)]
1407
 
1408
  def handle_generation_mode_change(generation_mode_data):
1409
  if generation_mode_data == "text":
1410
- return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
1411
  elif generation_mode_data == "image":
1412
- return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
1413
  elif generation_mode_data == "video":
1414
- return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)]
1415
 
1416
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1417
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
@@ -1433,7 +1816,7 @@ with block:
1433
  generation_mode.change(
1434
  fn=handle_generation_mode_change,
1435
  inputs=[generation_mode],
1436
- outputs=[text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1437
  )
1438
 
1439
  # Update display when the page loads
@@ -1441,7 +1824,7 @@ with block:
1441
  fn=handle_generation_mode_change, inputs = [
1442
  generation_mode
1443
  ], outputs = [
1444
- text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1445
  ]
1446
  )
1447
 
 
7
  try:
8
  import spaces
9
  except:
10
+ class spaces():
11
+ def GPU(*args, **kwargs):
12
+ def decorator(function):
13
+ def new_function(*dummy_args, **dummy_kwargs):
14
+ return function(*dummy_args, **dummy_kwargs)
15
+ return new_function
16
+ return decorator
17
+
18
  import gradio as gr
19
  import torch
20
  import traceback
 
205
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
206
  #print(f"Tensor shape: {frames_pt.shape}")
207
 
208
  # 20250506 pftq: Move to device
209
  #print(f"Moving tensor to device: {device}")
210
  frames_pt = frames_pt.to(device)
 
256
  torch.cuda.empty_cache()
257
  #print("VAE moved back to CPU, CUDA cache cleared")
258
 
259
+ return start_latent, input_image_np, history_latents, fps, target_height, target_width
260
 
261
  except Exception as e:
262
  print(f"Error in video_encode: {str(e)}")
 
310
  return False
311
 
312
  @torch.no_grad()
313
+ def worker(input_image, end_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
314
  def encode_prompt(prompt, n_prompt):
315
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
316
 
 
416
 
417
  rnd = torch.Generator("cpu").manual_seed(seed)
418
 
419
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
420
  start_latent = start_latent.to(history_latents)
421
  history_pixels = None
422
 
 
579
  stream.output_queue.push(('end', None))
580
  return
581
 
582
+ @torch.no_grad()
583
+ def worker_start_end(input_image, end_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
584
+ def encode_prompt(prompt, n_prompt):
585
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
586
+
587
+ if cfg == 1:
588
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
589
+ else:
590
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
591
+
592
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
593
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
594
+
595
+ llama_vec = llama_vec.to(transformer.dtype)
596
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
597
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
598
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
599
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
600
+
601
+ total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
602
+ total_latent_sections = int(max(round(total_latent_sections), 1))
603
+
604
+ job_id = generate_timestamp()
605
+
606
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
607
+
608
+ try:
609
+ # Clean GPU
610
+ if not high_vram:
611
+ unload_complete_models(
612
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
613
+ )
614
+
615
+ # Text encoding
616
+
617
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
618
+
619
+ if not high_vram:
620
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
621
+ load_model_as_complete(text_encoder_2, target_device=gpu)
622
+
623
+
624
+ prompt_parameters = []
625
+
626
+ for prompt_part in prompts[:total_latent_sections]:
627
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
628
+
629
+ # Clean GPU
630
+ if not high_vram:
631
+ unload_complete_models(
632
+ text_encoder, text_encoder_2
633
+ )
634
+
635
+ # Processing input image (start frame)
636
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing start frame ...'))))
637
+
638
+ H, W, C = input_image.shape
639
+ height, width = find_nearest_bucket(H, W, resolution=resolution)
640
+ has_end_image = end_image is not None
641
+
642
+ def get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram):
643
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
644
+
645
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}_start.png'))
646
+
647
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
648
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
649
+
650
+ # Processing end image (if provided)
651
+ if has_end_image:
652
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing end frame ...'))))
653
+
654
+ end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height)
655
+
656
+ Image.fromarray(end_image_np).save(os.path.join(outputs_folder, f'{job_id}_end.png'))
657
+
658
+ end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1
659
+ end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None]
660
+
661
+ # VAE encoding
662
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
663
+
664
+ if not high_vram:
665
+ load_model_as_complete(vae, target_device=gpu)
666
+
667
+ start_latent = vae_encode(input_image_pt, vae)
668
+
669
+ if has_end_image:
670
+ end_latent = vae_encode(end_image_pt, vae)
671
+
672
+ # CLIP Vision
673
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
674
+
675
+ if not high_vram:
676
+ load_model_as_complete(image_encoder, target_device=gpu)
677
+
678
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
679
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
680
+
681
+ if has_end_image:
682
+ end_image_encoder_output = hf_clip_vision_encode(end_image_np, feature_extractor, image_encoder)
683
+ end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state
684
+ # Combine both image embeddings or use a weighted approach
685
+ image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2
686
+
687
+ # Clean GPU
688
+ if not high_vram:
689
+ unload_complete_models(
690
+ image_encoder
691
+ )
692
+
693
+ return [start_latent, end_latent, image_encoder_last_hidden_state]
694
+
695
+ [start_latent, end_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram)
696
+
697
+ # Dtype
698
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
699
+
700
+ # Sampling
701
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
702
+
703
+ rnd = torch.Generator("cpu").manual_seed(seed)
704
+ num_frames = latent_window_size * 4 - 3
705
+
706
+ history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
707
+ start_latent = start_latent.to(history_latents)
708
+ if has_end_image:
709
+ end_latent = end_latent.to(history_latents)
710
+
711
+ history_pixels = None
712
+ total_generated_latent_frames = 0
713
+
714
+ if total_latent_sections > 4:
715
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
716
+ # items looks better than expanding it when total_latent_sections > 4
717
+ # One can try to remove below trick and just
718
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
719
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
720
+ else:
721
+ # Convert an iterator to a list
722
+ latent_paddings = list(range(total_latent_sections - 1, -1, -1))
723
+
724
+ if enable_preview:
725
+ def callback(d):
726
+ preview = d['denoised']
727
+ preview = vae_decode_fake(preview)
728
+
729
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
730
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
731
+
732
+ if stream.input_queue.top() == 'end':
733
+ stream.output_queue.push(('end', None))
734
+ raise KeyboardInterrupt('User ends the task.')
735
+
736
+ current_step = d['i'] + 1
737
+ percentage = int(100.0 * current_step / steps)
738
+ hint = f'Sampling {current_step}/{steps}'
739
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
740
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
741
+ return
742
+ else:
743
+ def callback(d):
744
+ return
745
+
746
+ def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
747
+ if is_last_section:
748
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
749
+
750
+ total_generated_latent_frames += int(generated_latents.shape[2])
751
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
752
+
753
+ if not high_vram:
754
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
755
+ load_model_as_complete(vae, target_device=gpu)
756
+
757
+ if history_pixels is None:
758
+ history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
759
+ else:
760
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
761
+ overlapped_frames = latent_window_size * 4 - 3
762
+
763
+ current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()
764
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
765
+
766
+ if not high_vram:
767
+ unload_complete_models(vae)
768
+
769
+ if enable_preview or is_last_section:
770
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
771
+
772
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
773
+
774
+ print(f'Decoded. Pixel shape {history_pixels.shape}')
775
+
776
+ stream.output_queue.push(('file', output_filename))
777
+
778
+ return [total_generated_latent_frames, history_latents, history_pixels]
779
+
780
+ for latent_padding in latent_paddings:
781
+ is_last_section = latent_padding == 0
782
+ is_first_section = latent_padding == latent_paddings[0]
783
+ latent_padding_size = latent_padding * latent_window_size
784
+
785
+ if stream.input_queue.top() == 'end':
786
+ stream.output_queue.push(('end', None))
787
+ return
788
+
789
+ print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
790
+
791
+ if len(prompt_parameters) > 0:
792
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
793
+
794
+ indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + 2 + 16).unsqueeze(0)
795
+ clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
796
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
797
+
798
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
799
+
800
+ # Use end image latent for the first section if provided
801
+ if has_end_image and is_first_section:
802
+ clean_latents_post = end_latent
803
+
804
+ clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
805
+
806
+ if not high_vram:
807
+ unload_complete_models()
808
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
809
+
810
+ if use_teacache:
811
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
812
+ else:
813
+ transformer.initialize_teacache(enable_teacache=False)
814
+
815
+ generated_latents = sample_hunyuan(
816
+ transformer=transformer,
817
+ sampler='unipc',
818
+ width=width,
819
+ height=height,
820
+ frames=num_frames,
821
+ real_guidance_scale=cfg,
822
+ distilled_guidance_scale=gs,
823
+ guidance_rescale=rs,
824
+ # shift=3.0,
825
+ num_inference_steps=steps,
826
+ generator=rnd,
827
+ prompt_embeds=llama_vec,
828
+ prompt_embeds_mask=llama_attention_mask,
829
+ prompt_poolers=clip_l_pooler,
830
+ negative_prompt_embeds=llama_vec_n,
831
+ negative_prompt_embeds_mask=llama_attention_mask_n,
832
+ negative_prompt_poolers=clip_l_pooler_n,
833
+ device=gpu,
834
+ dtype=torch.bfloat16,
835
+ image_embeddings=image_encoder_last_hidden_state,
836
+ latent_indices=latent_indices,
837
+ clean_latents=clean_latents,
838
+ clean_latent_indices=clean_latent_indices,
839
+ clean_latents_2x=clean_latents_2x,
840
+ clean_latent_2x_indices=clean_latent_2x_indices,
841
+ clean_latents_4x=clean_latents_4x,
842
+ clean_latent_4x_indices=clean_latent_4x_indices,
843
+ callback=callback,
844
+ )
845
+
846
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
847
+
848
+ if is_last_section:
849
+ break
850
+ except:
851
+ traceback.print_exc()
852
+
853
+ if not high_vram:
854
+ unload_complete_models(
855
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
856
+ )
857
+
858
+ stream.output_queue.push(('end', None))
859
+ return
860
+
861
  # 20250506 pftq: Modified worker to accept video input and clean frame count
862
  @torch.no_grad()
863
  def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
 
885
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
886
 
887
  # 20250506 pftq: Encode video
888
+ start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
889
+ start_latent = start_latent.to(dtype=torch.float32, device=cpu)
890
  video_latents = video_latents.cpu()
891
 
892
  total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
 
1138
  stream.output_queue.push(('end', None))
1139
  return
1140
 
1141
+ def get_duration(input_image, end_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1142
  return allocation_time
1143
 
 
1144
  @spaces.GPU(duration=get_duration)
1145
+ def process_on_gpu(input_image, end_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1146
  ):
1147
  start = time.time()
1148
  global stream
1149
  stream = AsyncStream()
1150
 
1151
+ async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
1152
 
1153
  output_filename = None
1154
 
 
1178
  break
1179
 
1180
  def process(input_image,
1181
+ end_image,
1182
  image_position=0,
1183
  prompt="",
1184
  generation_mode="image",
 
1190
  resolution=640,
1191
  total_second_length=5,
1192
  latent_window_size=9,
1193
+ steps=30,
1194
  cfg=1.0,
1195
  gs=10.0,
1196
  rs=0.0,
1197
  gpu_memory_preservation=6,
1198
+ enable_preview=False,
1199
  use_teacache=False,
1200
  mp4_crf=16,
1201
  fps_number=30
1202
  ):
1203
  if auto_allocation:
1204
+ allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25))**2, 600)
1205
 
1206
  if torch.cuda.device_count() == 0:
1207
  gr.Warning('Set this space to GPU config to make it work.')
 
1222
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1223
 
1224
  yield from process_on_gpu(input_image,
1225
+ end_image,
1226
  image_position,
1227
  prompts,
1228
  generation_mode,
 
1246
  def get_duration_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1247
  return allocation_time
1248
 
 
1249
  @spaces.GPU(duration=get_duration_video)
1250
  def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1251
  start = time.time()
 
1286
  def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1287
  global high_vram
1288
  if auto_allocation:
1289
+ allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25))**2, 600)
1290
 
1291
  if torch.cuda.device_count() == 0:
1292
  gr.Warning('Set this space to GPU config to make it work.')
 
1386
  local_storage = gr.BrowserState(default_local_storage)
1387
  with gr.Row():
1388
  with gr.Column():
1389
+ generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start & end frames", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1390
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1391
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1392
+ end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
1393
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1394
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1395
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
 
1415
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1416
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1417
 
1418
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1419
 
1420
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1421
 
 
1470
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1471
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1472
 
1473
+ ips = [input_image, end_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
 
1474
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1475
 
1476
  gr.Examples(
 
1478
  examples = [
1479
  [
1480
  None, # input_image
1481
+ None, # end_image
1482
  0, # image_position
1483
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1484
  "text", # generation_mode
1485
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1486
  True, # randomize_seed
1487
  42, # seed
1488
  True, # auto_allocation
 
1513
  examples = [
1514
  [
1515
  "./img_examples/Example1.png", # input_image
1516
+ None, # end_image
1517
  0, # image_position
1518
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1519
  "image", # generation_mode
1520
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1521
  True, # randomize_seed
1522
  42, # seed
1523
  True, # auto_allocation
 
1537
  ],
1538
  [
1539
  "./img_examples/Example2.webp", # input_image
1540
+ None, # end_image
1541
  0, # image_position
1542
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1543
  "image", # generation_mode
1544
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1545
  True, # randomize_seed
1546
  42, # seed
1547
  True, # auto_allocation
 
1561
  ],
1562
  [
1563
  "./img_examples/Example2.webp", # input_image
1564
+ None, # end_image
1565
  0, # image_position
1566
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1567
  "image", # generation_mode
1568
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1569
  True, # randomize_seed
1570
  42, # seed
1571
  True, # auto_allocation
 
1585
  ],
1586
  [
1587
  "./img_examples/Example3.jpg", # input_image
1588
+ None, # end_image
1589
  0, # image_position
1590
+ "एउटा केटा दायाँतिर हिँडिरहेको छ, पूर्ण दृश्य, पूर्ण-लम्बाइको दृश्य, कार्टुन",
1591
  "image", # generation_mode
1592
+ "हात छुटेको, लामो हात, अवास्तविक स्थिति, असम्भव विकृति, देखिने हड्डी, मांसपेशी संकुचन, कमजोर फ्रेम, धमिलो, धमिलो, अत्यधिक चिल्लो", # n_prompt
1593
  True, # randomize_seed
1594
  42, # seed
1595
  True, # auto_allocation
 
1609
  ],
1610
  [
1611
  "./img_examples/Example4.webp", # input_image
1612
+ None, # end_image
1613
  100, # image_position
1614
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1615
  "image", # generation_mode
1616
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1617
  True, # randomize_seed
1618
  42, # seed
1619
  True, # auto_allocation
 
1639
  cache_examples = False,
1640
  )
1641
 
1642
+ gr.Examples(
1643
+ label = "🖼️ Examples from start and end frames",
1644
+ examples = [
1645
+ [
1646
+ "./img_examples/Example5.png", # input_image
1647
+ "./img_examples/Example6.png", # end_image
1648
+ 0, # image_position
1649
+ "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # generation_mode
1650
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1651
+ True, # randomize_seed
1652
+ 42, # seed
1653
+ True, # auto_allocation
1654
+ 180, # allocation_time
1655
+ 672, # resolution
1656
+ 1, # total_second_length
1657
+ 9, # latent_window_size
1658
+ 30, # steps
1659
+ 1.0, # cfg
1660
+ 10.0, # gs
1661
+ 0.0, # rs
1662
+ 6, # gpu_memory_preservation
1663
+ False, # enable_preview
1664
+ True, # use_teacache
1665
+ 16, # mp4_crf
1666
+ 30 # fps_number
1667
+ ],
1668
+ ],
1669
+ run_on_click = True,
1670
+ fn = process,
1671
+ inputs = ips,
1672
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1673
+ cache_examples = False,
1674
+ )
1675
+
1676
  gr.Examples(
1677
  label = "🎥 Examples from video",
1678
  examples = [
1679
  [
1680
  "./img_examples/Example1.mp4", # input_video
1681
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1682
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1683
  True, # randomize_seed
1684
  42, # seed
1685
  True, # auto_allocation
 
1724
  def check_parameters(generation_mode, input_image, input_video):
1725
  if generation_mode == "image" and input_image is None:
1726
  raise gr.Error("Please provide an image to extend.")
1727
+ if generation_mode == "start_end" and input_image is None:
1728
+ raise gr.Error("Please provide an image to extend.")
1729
  if generation_mode == "video" and input_video is None:
1730
  raise gr.Error("Please provide a video to extend.")
1731
  return [gr.update(interactive=True), gr.update(visible = True)]
1732
 
1733
  def handle_generation_mode_change(generation_mode_data):
1734
  if generation_mode_data == "text":
1735
+ return [
1736
+ gr.update(visible = True), # text_to_video_hint
1737
+ gr.update(visible = False), # image_position
1738
+ gr.update(visible = False), # input_image
1739
+ gr.update(visible = False), # end_image
1740
+ gr.update(visible = False), # input_video
1741
+ gr.update(visible = True), # start_button
1742
+ gr.update(visible = False), # start_button_video
1743
+ gr.update(visible = False), # no_resize
1744
+ gr.update(visible = False), # batch
1745
+ gr.update(visible = False), # num_clean_frames
1746
+ gr.update(visible = False), # vae_batch
1747
+ gr.update(visible = False), # prompt_hint
1748
+ gr.update(visible = True) # fps_number
1749
+ ]
1750
  elif generation_mode_data == "image":
1751
+ return [
1752
+ gr.update(visible = False), # text_to_video_hint
1753
+ gr.update(visible = True), # image_position
1754
+ gr.update(visible = True), # input_image
1755
+ gr.update(visible = False), # end_image
1756
+ gr.update(visible = False), # input_video
1757
+ gr.update(visible = True), # start_button
1758
+ gr.update(visible = False), # start_button_video
1759
+ gr.update(visible = False), # no_resize
1760
+ gr.update(visible = False), # batch
1761
+ gr.update(visible = False), # num_clean_frames
1762
+ gr.update(visible = False), # vae_batch
1763
+ gr.update(visible = False), # prompt_hint
1764
+ gr.update(visible = True) # fps_number
1765
+ ]
1766
+ elif generation_mode_data == "start_end":
1767
+ return [
1768
+ gr.update(visible = False), # text_to_video_hint
1769
+ gr.update(visible = False), # image_position
1770
+ gr.update(visible = True), # input_image
1771
+ gr.update(visible = True), # end_image
1772
+ gr.update(visible = False), # input_video
1773
+ gr.update(visible = True), # start_button
1774
+ gr.update(visible = False), # start_button_video
1775
+ gr.update(visible = False), # no_resize
1776
+ gr.update(visible = False), # batch
1777
+ gr.update(visible = False), # num_clean_frames
1778
+ gr.update(visible = False), # vae_batch
1779
+ gr.update(visible = False), # prompt_hint
1780
+ gr.update(visible = True) # fps_number
1781
+ ]
1782
  elif generation_mode_data == "video":
1783
+ return [
1784
+ gr.update(visible = False), # text_to_video_hint
1785
+ gr.update(visible = False), # image_position
1786
+ gr.update(visible = False), # input_image
1787
+ gr.update(visible = False), # end_image
1788
+ gr.update(visible = True), # input_video
1789
+ gr.update(visible = False), # start_button
1790
+ gr.update(visible = True), # start_button_video
1791
+ gr.update(visible = True), # no_resize
1792
+ gr.update(visible = True), # batch
1793
+ gr.update(visible = True), # num_clean_frames
1794
+ gr.update(visible = True), # vae_batch
1795
+ gr.update(visible = True), # prompt_hint
1796
+ gr.update(visible = False) # fps_number
1797
+ ]
1798
 
1799
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1800
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
 
1816
  generation_mode.change(
1817
  fn=handle_generation_mode_change,
1818
  inputs=[generation_mode],
1819
+ outputs=[text_to_video_hint, image_position, input_image, end_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1820
  )
1821
 
1822
  # Update display when the page loads
 
1824
  fn=handle_generation_mode_change, inputs = [
1825
  generation_mode
1826
  ], outputs = [
1827
+ text_to_video_hint, image_position, input_image, end_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1828
  ]
1829
  )
1830