Fabrice-TIERCELIN committed on
Commit 50254df · verified · 1 Parent(s): c28f966

Last frame

Files changed (1)
  1. app.py +330 -94
app.py CHANGED
@@ -355,31 +355,36 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
355
 
356
  H, W, C = input_image.shape
357
  height, width = find_nearest_bucket(H, W, resolution=resolution)
358
- input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
359
-
360
- Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
361
-
362
- input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
363
- input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
364
-
365
- # VAE encoding
366
-
367
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
368
-
369
- if not high_vram:
370
- load_model_as_complete(vae, target_device=gpu)
371
-
372
- start_latent = vae_encode(input_image_pt, vae)
373
-
374
- # CLIP Vision
375
-
376
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
377
-
378
- if not high_vram:
379
- load_model_as_complete(image_encoder, target_device=gpu)
380
 
381
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
382
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
383
 
384
  # Dtype
385
 
@@ -438,7 +443,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
438
  section_latent_frames = latent_window_size * 2
439
  overlapped_frames = latent_window_size * 4 - 3
440
 
441
- real_history_latents = history_latents[:, :, max(-section_latent_frames, -total_generated_latent_frames):, :, :]
442
  history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
443
 
444
  if not high_vram:
@@ -519,78 +524,226 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
519
  stream.output_queue.push(('end', None))
520
  return
521
 
522
- def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
523
- return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
524
 
525
- @spaces.GPU(duration=get_duration)
526
- def process(input_image, prompt,
527
- generation_mode="image",
528
- n_prompt="",
529
- randomize_seed=True,
530
- seed=31337,
531
- resolution=640,
532
- total_second_length=5,
533
- latent_window_size=9,
534
- steps=25,
535
- cfg=1.0,
536
- gs=10.0,
537
- rs=0.0,
538
- gpu_memory_preservation=6,
539
- enable_preview=True,
540
- use_teacache=False,
541
- mp4_crf=16
542
- ):
543
- start = time.time()
544
- global stream
545
 
546
- if torch.cuda.device_count() == 0:
547
- gr.Warning('Set this space to GPU config to make it work.')
548
- yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
549
- return
550
 
551
- if randomize_seed:
552
- seed = random.randint(0, np.iinfo(np.int32).max)
553
 
554
- prompts = prompt.split(";")
 
555
 
556
- # assert input_image is not None, 'No input image!'
557
- if generation_mode == "text":
558
- default_height, default_width = 640, 640
559
- input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
560
- print("No input image provided. Using a blank white image.")
561
 
562
- yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
563
 
564
- stream = AsyncStream()
565
 
566
- async_run(worker, input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf)
567
 
568
- output_filename = None
569
 
570
- while True:
571
- flag, data = stream.output_queue.next()
 
572
 
573
- if flag == 'file':
574
- output_filename = data
575
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
576
 
577
- if flag == 'progress':
578
- preview, desc, html = data
579
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
580
 
581
- if flag == 'end':
582
- end = time.time()
583
- secondes = int(end - start)
584
- minutes = math.floor(secondes / 60)
585
- secondes = secondes - (minutes * 60)
586
- hours = math.floor(minutes / 60)
587
- minutes = minutes - (hours * 60)
588
- yield output_filename, gr.update(visible=False), gr.update(), "The video has been generated in " + \
589
- ((str(hours) + " h, ") if hours != 0 else "") + \
590
- ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
591
- str(secondes) + " sec. " + \
592
- "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", gr.update(interactive=True), gr.update(interactive=False)
593
- break
594
 
595
  # 20250506 pftq: Modified worker to accept video input and clean frame count
596
  @spaces.GPU()
@@ -860,12 +1013,90 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
860
  stream.output_queue.push(('end', None))
861
  return
862
 
863
  def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
864
  return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
865
 
866
  # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
867
  @spaces.GPU(duration=get_duration_video)
868
- def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
 
869
  start = time.time()
870
  global stream, high_vram
871
 
@@ -913,6 +1144,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
913
 
914
  if flag == 'progress':
915
  preview, desc, html = data
 
916
  #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
917
  yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
918
 
@@ -1002,6 +1234,7 @@ with block:
1002
  generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1003
  text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1004
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
 
1005
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1006
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1007
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
@@ -1076,7 +1309,7 @@ with block:
1076
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1077
 
1078
  # 20250506 pftq: Updated inputs to include num_clean_frames
1079
- ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
1080
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1081
 
1082
  gr.Examples(
@@ -1084,6 +1317,7 @@ with block:
1084
  examples = [
1085
  [
1086
  "./img_examples/Example1.png", # input_image
 
1087
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1088
  "image", # generation_mode
1089
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
@@ -1103,7 +1337,8 @@ with block:
1103
  ],
1104
  [
1105
  "./img_examples/Example2.webp", # input_image
1106
- "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
 
1107
  "image", # generation_mode
1108
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1109
  True, # randomize_seed
@@ -1122,7 +1357,8 @@ with block:
1122
  ],
1123
  [
1124
  "./img_examples/Example2.webp", # input_image
1125
- "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
 
1126
  "image", # generation_mode
1127
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1128
  True, # randomize_seed
@@ -1141,6 +1377,7 @@ with block:
1141
  ],
1142
  [
1143
  "./img_examples/Example3.jpg", # input_image
 
1144
  "A boy is walking to the right, full view, full-length view, cartoon",
1145
  "image", # generation_mode
1146
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
@@ -1221,13 +1458,12 @@ with block:
1221
 
1222
  def handle_generation_mode_change(generation_mode_data):
1223
  if generation_mode_data == "text":
1224
- return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1225
  elif generation_mode_data == "image":
1226
- return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1227
  elif generation_mode_data == "video":
1228
- return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
1229
 
1230
-
1231
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1232
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1233
  start_button.click(fn = check_parameters, inputs = [
@@ -1248,7 +1484,7 @@ with block:
1248
  generation_mode.change(
1249
  fn=handle_generation_mode_change,
1250
  inputs=[generation_mode],
1251
- outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
1252
  )
1253
 
1254
  # Update display when the page loads
@@ -1256,7 +1492,7 @@ with block:
1256
  fn=handle_generation_mode_change, inputs = [
1257
  generation_mode
1258
  ], outputs = [
1259
- text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
1260
  ]
1261
  )
1262
 
 
355
 
356
  H, W, C = input_image.shape
357
  height, width = find_nearest_bucket(H, W, resolution=resolution)
358
+
359
+ def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
360
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
361
+
362
+ #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
363
+
364
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
365
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
366
+
367
+ # VAE encoding
368
+
369
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
370
+
371
+ if not high_vram:
372
+ load_model_as_complete(vae, target_device=gpu)
373
+
374
+ start_latent = vae_encode(input_image_pt, vae)
375
+
376
+ # CLIP Vision
377
+
378
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
379
+
380
+ if not high_vram:
381
+ load_model_as_complete(image_encoder, target_device=gpu)
382
 
383
+ image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
384
+
385
+ return [start_latent, image_encoder_last_hidden_state]
386
+
387
+ [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
388
 
389
  # Dtype
390
 
 
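Note on the refactor above: get_start_latent keeps the original preprocessing and simply wraps it in a helper that returns the VAE latent and the CLIP-Vision hidden state together. The tensor plumbing it relies on is the usual uint8-to-[-1, 1] normalization followed by a reshape to the 5-D (batch, channel, time, height, width) layout. A minimal stand-alone sketch (shapes only; the helper name below is illustrative, not from the repo):

    import numpy as np
    import torch

    def to_bcthw(image_np: np.ndarray) -> torch.Tensor:
        x = torch.from_numpy(image_np).float() / 127.5 - 1.0   # uint8 [0, 255] -> float [-1, 1]
        x = x.permute(2, 0, 1)                                  # (H, W, C) -> (C, H, W)
        return x[None, :, None]                                 # -> (1, C, 1, H, W)

    frame = np.zeros((640, 640, 3), dtype=np.uint8)
    print(to_bcthw(frame).shape)                                # torch.Size([1, 3, 1, 640, 640])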
443
  section_latent_frames = latent_window_size * 2
444
  overlapped_frames = latent_window_size * 4 - 3
445
 
446
+ real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
447
  history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
448
 
449
  if not high_vram:
 
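The slice rewrite in this hunk is behavior-preserving: for positive counts, max(-a, -b) equals -min(a, b), so both versions keep the last min(section_latent_frames, total_generated_latent_frames) latent frames. A toy check:

    import torch

    a, b = 18, 7                                   # section_latent_frames, total_generated_latent_frames
    h = torch.arange(30).view(1, 1, 30, 1, 1)      # stand-in for history_latents
    old = h[:, :, max(-a, -b):, :, :]              # previous expression
    new = h[:, :, -min(a, b):, :, :]               # rewritten expression
    assert torch.equal(old, new)                   # same trailing frames either way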
524
  stream.output_queue.push(('end', None))
525
  return
526
 
527
+ @torch.no_grad()
528
+ def worker_last_frame(input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
529
+ def encode_prompt(prompt, n_prompt):
530
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
531
 
532
+ if cfg == 1:
533
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
534
+ else:
535
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
536
 
537
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
538
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
539
 
540
+ llama_vec = llama_vec.to(transformer.dtype)
541
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
542
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
543
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
544
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
545
 
546
+ total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
547
+ total_latent_sections = int(max(round(total_latent_sections), 1))
548
 
549
+ job_id = generate_timestamp()
550
 
551
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
552
 
553
+ try:
554
+ # Clean GPU
555
+ if not high_vram:
556
+ unload_complete_models(
557
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
558
+ )
559
 
560
+ # Text encoding
561
 
562
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
563
 
564
+ if not high_vram:
565
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
566
+ load_model_as_complete(text_encoder_2, target_device=gpu)
567
 
568
+ prompt_parameters = []
569
 
570
+ for prompt_part in prompts:
571
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
 
572
 
573
+ # Processing input image
574
+
575
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
576
+
577
+ H, W, C = input_image.shape
578
+ height, width = find_nearest_bucket(H, W, resolution=resolution)
579
+
580
+ def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
581
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
582
+
583
+ #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
584
+
585
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
586
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
587
+
588
+ # VAE encoding
589
+
590
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
591
+
592
+ if not high_vram:
593
+ load_model_as_complete(vae, target_device=gpu)
594
+
595
+ start_latent = vae_encode(input_image_pt, vae)
596
+
597
+ # CLIP Vision
598
+
599
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
600
+
601
+ if not high_vram:
602
+ load_model_as_complete(image_encoder, target_device=gpu)
603
+
604
+ image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
605
+
606
+ return [start_latent, image_encoder_last_hidden_state]
607
+
608
+ [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
609
+
610
+ # Dtype
611
+
612
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
613
+
614
+ # Sampling
615
+
616
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
617
+
618
+ rnd = torch.Generator("cpu").manual_seed(seed)
619
+
620
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
621
+ history_pixels = None
622
+
623
+ history_latents = torch.cat([start_latent.to(history_latents), history_latents], dim=2)
624
+ total_generated_latent_frames = 1
625
+
626
+ if enable_preview:
627
+ def callback(d):
628
+ preview = d['denoised']
629
+ preview = vae_decode_fake(preview)
630
+
631
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
632
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
633
+
634
+ if stream.input_queue.top() == 'end':
635
+ stream.output_queue.push(('end', None))
636
+ raise KeyboardInterrupt('User ends the task.')
637
+
638
+ current_step = d['i'] + 1
639
+ percentage = int(100.0 * current_step / steps)
640
+ hint = f'Sampling {current_step}/{steps}'
641
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
642
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
643
+ return
644
+ else:
645
+ def callback(d):
646
+ return
647
+
648
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
649
+ latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
650
+ clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
651
+
652
+ def post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
653
+ total_generated_latent_frames += int(generated_latents.shape[2])
654
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
655
+
656
+ if not high_vram:
657
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
658
+ load_model_as_complete(vae, target_device=gpu)
659
+
660
+ if history_pixels is None:
661
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
662
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
663
+ else:
664
+ section_latent_frames = latent_window_size * 2
665
+ overlapped_frames = latent_window_size * 4 - 3
666
+
667
+ real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
668
+ history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
669
+
670
+ if not high_vram:
671
+ unload_complete_models()
672
+
673
+ if enable_preview or section_index == 0:
674
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
675
+
676
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
677
+
678
+ print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
679
+
680
+ stream.output_queue.push(('file', output_filename))
681
+ return [total_generated_latent_frames, history_latents, history_pixels]
682
+
683
+ for section_index in range(total_latent_sections - 1, -1, -1):
684
+ if stream.input_queue.top() == 'end':
685
+ stream.output_queue.push(('end', None))
686
+ return
687
+
688
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
689
+
690
+ if len(prompt_parameters) > 0:
691
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
692
+
693
+ if not high_vram:
694
+ unload_complete_models()
695
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
696
+
697
+ if use_teacache:
698
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
699
+ else:
700
+ transformer.initialize_teacache(enable_teacache=False)
701
+
702
+ clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :sum([1, 2, 16]), :, :].split([1, 2, 16], dim=2)
703
+ clean_latents = torch.cat([clean_latents_1x, start_latent.to(history_latents)], dim=2)
704
+
705
+ generated_latents = sample_hunyuan(
706
+ transformer=transformer,
707
+ sampler='unipc',
708
+ width=width,
709
+ height=height,
710
+ frames=latent_window_size * 4 - 3,
711
+ real_guidance_scale=cfg,
712
+ distilled_guidance_scale=gs,
713
+ guidance_rescale=rs,
714
+ # shift=3.0,
715
+ num_inference_steps=steps,
716
+ generator=rnd,
717
+ prompt_embeds=llama_vec,
718
+ prompt_embeds_mask=llama_attention_mask,
719
+ prompt_poolers=clip_l_pooler,
720
+ negative_prompt_embeds=llama_vec_n,
721
+ negative_prompt_embeds_mask=llama_attention_mask_n,
722
+ negative_prompt_poolers=clip_l_pooler_n,
723
+ device=gpu,
724
+ dtype=torch.bfloat16,
725
+ image_embeddings=image_encoder_last_hidden_state,
726
+ latent_indices=latent_indices,
727
+ clean_latents=clean_latents,
728
+ clean_latent_indices=clean_latent_indices,
729
+ clean_latents_2x=clean_latents_2x,
730
+ clean_latent_2x_indices=clean_latent_2x_indices,
731
+ clean_latents_4x=clean_latents_4x,
732
+ clean_latent_4x_indices=clean_latent_4x_indices,
733
+ callback=callback,
734
+ )
735
+
736
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
737
+ except:
738
+ traceback.print_exc()
739
+
740
+ if not high_vram:
741
+ unload_complete_models(
742
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
743
+ )
744
+
745
+ stream.output_queue.push(('end', None))
746
+ return
747
 
748
  # 20250506 pftq: Modified worker to accept video input and clean frame count
749
  @spaces.GPU()
 
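The new worker_last_frame duplicates worker's structure but assembles the clip backwards so the uploaded picture becomes the final frame: the section loop counts down, each generated chunk is concatenated in front of history_latents, the per-section prompts are consumed from the end of the list, and soft_append_bcthw receives the freshly decoded pixels first. A list-based schematic of that reversed assembly (not the real tensors):

    prompts = ["intro", "middle", "finale"]     # one timed prompt per section, in playback order
    history = ["<input image>"]                 # the conditioning frame seeds the history

    for section_index in range(2, -1, -1):      # 2, 1, 0 - the last section is generated first
        prompt = prompts.pop()                  # consume prompts from the end, like prompt_parameters.pop(...)
        chunk = [f"frames for {prompt!r}"]
        history = chunk + history               # prepend, so the input image stays the final frame
    print(history)                              # intro, middle, finale frames, then '<input image>'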
1013
  stream.output_queue.push(('end', None))
1014
  return
1015
 
1016
+ def get_duration(input_image, image_position, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
1017
+ return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
1018
+
1019
+ @spaces.GPU(duration=get_duration)
1020
+ def process(input_image,
1021
+ image_position=0,
1022
+ prompt="",
1023
+ generation_mode="image",
1024
+ n_prompt="",
1025
+ randomize_seed=True,
1026
+ seed=31337,
1027
+ resolution=640,
1028
+ total_second_length=5,
1029
+ latent_window_size=9,
1030
+ steps=25,
1031
+ cfg=1.0,
1032
+ gs=10.0,
1033
+ rs=0.0,
1034
+ gpu_memory_preservation=6,
1035
+ enable_preview=True,
1036
+ use_teacache=False,
1037
+ mp4_crf=16,
1038
+ progress = gr.Progress()
1039
+ ):
1040
+ start = time.time()
1041
+ global stream
1042
+
1043
+ if torch.cuda.device_count() == 0:
1044
+ gr.Warning('Set this space to GPU config to make it work.')
1045
+ yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
1046
+ return
1047
+
1048
+ if randomize_seed:
1049
+ seed = random.randint(0, np.iinfo(np.int32).max)
1050
+
1051
+ prompts = prompt.split(";")
1052
+
1053
+ # assert input_image is not None, 'No input image!'
1054
+ if generation_mode == "text":
1055
+ default_height, default_width = 640, 640
1056
+ input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
1057
+ print("No input image provided. Using a blank white image.")
1058
+
1059
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
1060
+
1061
+ stream = AsyncStream()
1062
+
1063
+ async_run(worker_last_frame if image_position == 100 else worker, input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf)
1064
+
1065
+ output_filename = None
1066
+
1067
+ while True:
1068
+ flag, data = stream.output_queue.next()
1069
+
1070
+ if flag == 'file':
1071
+ output_filename = data
1072
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
1073
+
1074
+ if flag == 'progress':
1075
+ preview, desc, html = data
1076
+ progress(None, desc = desc)
1077
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
1078
+
1079
+ if flag == 'end':
1080
+ end = time.time()
1081
+ secondes = int(end - start)
1082
+ minutes = math.floor(secondes / 60)
1083
+ secondes = secondes - (minutes * 60)
1084
+ hours = math.floor(minutes / 60)
1085
+ minutes = minutes - (hours * 60)
1086
+ yield output_filename, gr.update(visible=False), gr.update(), "The video has been generated in " + \
1087
+ ((str(hours) + " h, ") if hours != 0 else "") + \
1088
+ ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1089
+ str(secondes) + " sec. " + \
1090
+ "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", gr.update(interactive=True), gr.update(interactive=False)
1091
+ break
1092
+
1093
  def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1094
  return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
1095
 
1096
  # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
1097
  @spaces.GPU(duration=get_duration_video)
1098
+ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch,
1099
+ progress = gr.Progress()):
1100
  start = time.time()
1101
  global stream, high_vram
1102
 
 
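For reference, the duration callbacks above translate the UI settings into the ZeroGPU time budget requested by @spaces.GPU. A simplified stand-in (the real callbacks take the full argument list) evaluated with this app's defaults, total_second_length=5 and steps=25:

    def estimated_duration(total_second_length, steps, use_teacache, base_factor):
        return total_second_length * 60 * (0.9 if use_teacache else base_factor) * (1 + (steps - 25) / 100)

    print(estimated_duration(5, 25, False, 1.5))   # image/text worker: 450.0 s
    print(estimated_duration(5, 25, True,  1.5))   # with TeaCache:     270.0 s
    print(estimated_duration(5, 25, False, 2.3))   # video worker:      690.0 s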
1144
 
1145
  if flag == 'progress':
1146
  preview, desc, html = data
1147
+ progress(None, desc = desc)
1148
  #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
1149
  yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
1150
 
 
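Both process functions also gain a trailing progress=gr.Progress() parameter; Gradio injects the tracker when the function runs as an event handler, and progress(None, desc=desc) appears to update only the textual label without asserting a completion fraction. A minimal sketch of that pattern (the app's queue/worker wiring is omitted):

    import gradio as gr

    def process(prompt, progress=gr.Progress()):
        for desc in ("Text encoding ...", "VAE encoding ...", "Sampling ..."):
            progress(None, desc=desc)   # label-only update, mirroring the calls added above
        return "done"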
1234
  generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1235
  text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1236
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1237
+ image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=100, info='0=Video start; 100=Video end')
1238
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1239
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1240
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
 
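Since the new image_position slider runs from 0 to 100 with step=100, it effectively offers only two positions: 0 keeps the uploaded image as the first frame (handled by worker), while 100 routes the request to worker_last_frame so the image becomes the final frame, as wired in process() above.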
1309
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1310
 
1311
  # 20250506 pftq: Updated inputs to include num_clean_frames
1312
+ ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
1313
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1314
 
1315
  gr.Examples(
 
1317
  examples = [
1318
  [
1319
  "./img_examples/Example1.png", # input_image
1320
+ 0, # image_position
1321
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1322
  "image", # generation_mode
1323
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
 
1337
  ],
1338
  [
1339
  "./img_examples/Example2.webp", # input_image
1340
+ 0, # image_position
1341
+ "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1342
  "image", # generation_mode
1343
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1344
  True, # randomize_seed
 
1357
  ],
1358
  [
1359
  "./img_examples/Example2.webp", # input_image
1360
+ 0, # image_position
1361
+ "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1362
  "image", # generation_mode
1363
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1364
  True, # randomize_seed
 
1377
  ],
1378
  [
1379
  "./img_examples/Example3.jpg", # input_image
1380
+ 0, # image_position
1381
  "A boy is walking to the right, full view, full-length view, cartoon",
1382
  "image", # generation_mode
1383
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
 
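The extra 0, # image_position entries in the example rows above are needed because gr.Examples matches each row to the ips components positionally; once image_position is the second input, every example must supply a value in that slot. A small illustration of the pairing (names only, not the Gradio objects):

    ips_order   = ["input_image", "image_position", "final_prompt", "generation_mode"]      # leading inputs
    example_row = ["./img_examples/Example1.png", 0, "A dolphin emerges from the water, ...", "image"]

    for name, value in zip(ips_order, example_row):
        print(f"{name:16} <- {value}")          # each example value pre-fills the input at the same index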
1458
 
1459
  def handle_generation_mode_change(generation_mode_data):
1460
  if generation_mode_data == "text":
1461
+ return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1462
  elif generation_mode_data == "image":
1463
+ return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1464
  elif generation_mode_data == "video":
1465
+ return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
1466
 
 
1467
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1468
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1469
  start_button.click(fn = check_parameters, inputs = [
 
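handle_generation_mode_change now returns eleven gr.update entries because generation_mode.change lists eleven output components; the pairing is purely positional. Checking the image-mode row against the outputs order declared below:

    outputs = ["text_to_video_hint", "image_position", "input_image", "input_video",
               "start_button", "start_button_video", "no_resize", "batch",
               "num_clean_frames", "vae_batch", "prompt_hint"]
    image_mode = [False, True, True, False, True, False, False, False, False, False, False]

    assert len(outputs) == len(image_mode)
    for name, visible in zip(outputs, image_mode):
        print(f"{name:20} {'visible' if visible else 'hidden'}")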
1484
  generation_mode.change(
1485
  fn=handle_generation_mode_change,
1486
  inputs=[generation_mode],
1487
+ outputs=[text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
1488
  )
1489
 
1490
  # Update display when the page loads
 
1492
  fn=handle_generation_mode_change, inputs = [
1493
  generation_mode
1494
  ], outputs = [
1495
+ text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
1496
  ]
1497
  )
1498