Fabrice-TIERCELIN committed
Commit 27a6551 · verified · 1 Parent(s): d24caab

New features

Files changed (1)
  1. app.py +68 -83
app.py CHANGED
@@ -42,6 +42,9 @@ from transformers import SiglipImageProcessor, SiglipVisionModel
 from diffusers_helper.clip_vision import hf_clip_vision_encode
 from diffusers_helper.bucket_tools import find_nearest_bucket
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
+import pillow_heif
+
+pillow_heif.register_heif_opener()
 
 high_vram = False
 free_mem_gb = 0
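
The newly registered pillow_heif opener lets Pillow decode HEIF/HEIC uploads (the common iPhone photo format) alongside JPEG and PNG. A minimal sketch of the effect, assuming a hypothetical load_rgb helper that is not part of this commit:

import numpy as np
import pillow_heif
from PIL import Image

pillow_heif.register_heif_opener()  # after this call, PIL.Image.open() also accepts .heic/.heif files

def load_rgb(path):
    # Hypothetical helper: decode any Pillow-supported image (now including HEIC) to an RGB numpy array.
    with Image.open(path) as img:
        return np.array(img.convert("RGB"))
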
@@ -110,7 +113,7 @@ def check_parameters(generation_mode, input_image, input_video):
         raise gr.Error("Please provide an image to extend.")
     if generation_mode == "video" and input_video is None:
         raise gr.Error("Please provide a video to extend.")
-    return []
+    return [gr.update(interactive=True)]
 
 @spaces.GPU()
 @torch.no_grad()
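
check_parameters now ends by returning a single gr.update(interactive=True). As wired further down in this commit, that update is routed to the Stop button through outputs = [end_button], so the button becomes clickable once validation passes and before the chained .success() handler starts generating. A minimal sketch of the same wiring pattern, with toy components rather than the app's real layout:

import gradio as gr

def check(value):
    if not value:
        raise gr.Error("Please provide a value.")
    return [gr.update(interactive=True)]  # re-enable the stop button before generation starts

def run(value):
    return f"processed: {value}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    start = gr.Button("Start")
    stop = gr.Button("Stop", interactive=False)
    start.click(fn=check, inputs=[box], outputs=[stop], queue=False, show_progress=False).success(fn=run, inputs=[box], outputs=[out])
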
@@ -414,6 +417,10 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
         stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
         return
 
+    indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
+    clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
+    clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
+
     for section_index in range(total_latent_sections):
         if stream.input_queue.top() == 'end':
             stream.output_queue.push(('end', None))
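
The index bookkeeping above was hoisted out of the per-section loop (it is deleted from the loop body in the next hunk). It only depends on latent_window_size, so computing it once before the loop is equivalent. A quick sketch of the resulting shapes with the UI default latent_window_size = 9:

import torch

latent_window_size = 9
indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)  # shape (1, 29)
start, idx_4x, idx_2x, idx_1x, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
clean_latent_indices = torch.cat([start, idx_1x], dim=1)
print(indices.shape, clean_latent_indices.shape)  # torch.Size([1, 29]) torch.Size([1, 2])
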
@@ -433,10 +440,6 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
         else:
             transformer.initialize_teacache(enable_teacache=False)
 
-        indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
-        clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
-        clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-
         clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
         clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
@@ -567,13 +570,28 @@ def process(input_image, prompt,
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
 
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
-            break
+            return output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
 
 # 20250506 pftq: Modified worker to accept video input and clean frame count
 @spaces.GPU()
 @torch.no_grad()
-def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+    def encode_prompt(prompt, n_prompt):
+        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
+
+        if cfg == 1:
+            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
+        else:
+            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
+
+        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
+        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
+
+        llama_vec = llama_vec.to(transformer.dtype)
+        llama_vec_n = llama_vec_n.to(transformer.dtype)
+        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
+        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
+        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
 
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
@@ -591,15 +609,10 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
         fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
         load_model_as_complete(text_encoder_2, target_device=gpu)
 
-    llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-
-    if cfg == 1:
-        llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
-    else:
-        llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
+    prompt_parameters = []
 
-    llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
-    llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
+    for prompt_part in prompts:
+        prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
 
     # 20250506 pftq: Processing input video instead of image
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
@@ -622,10 +635,6 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
     image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
 
     # Dtype
-    llama_vec = llama_vec.to(transformer.dtype)
-    llama_vec_n = llama_vec_n.to(transformer.dtype)
-    clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-    clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
     image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
 
     total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
@@ -679,6 +688,9 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
 
         print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
 
+        if len(prompt_parameters) > 0:
+            [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
+
         if not high_vram:
             unload_complete_models()
             move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
@@ -723,12 +735,12 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
             clean_latents_4x = splits[split_idx]
             split_idx = 1
             if clean_latents_4x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
-                clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
+                clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x], dim=2)
 
         if num_2x_frames > 0 and split_idx < len(splits):
             clean_latents_2x = splits[split_idx]
             if clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
-                clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
+                clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x], dim=2)
             split_idx += 1
         elif clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
             clean_latents_2x = clean_latents_4x
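
The <=1 second edge case is handled differently now: the old line appended only the last latent frame and truncated to exactly two frames, while the new line concatenates the tensor with itself along the frame axis. For a single-frame tensor both produce the same shape; a toy comparison (illustrative data only):

import torch

lat = torch.randn(1, 4, 1, 8, 8)  # (B, C, T, H, W) with a single latent frame

old = torch.cat([lat, lat[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
new = torch.cat([lat, lat], dim=2)

print(old.shape, new.shape)  # both torch.Size([1, 4, 2, 8, 8]) when T == 1
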
@@ -798,7 +810,7 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
             save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
             print(f"Latest video saved: {output_filename}")
             # 20250508 pftq: Save prompt to mp4 metadata comments
-            set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}");
+            set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
             print(f"Prompt saved to mp4 metadata comments: {output_filename}")
 
             # 20250506 pftq: Clean up previous partial files
@@ -842,6 +854,8 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
     if randomize_seed:
         seed = random.randint(0, np.iinfo(np.int32).max)
 
+    prompts = prompt.split(";")
+
     # 20250506 pftq: Updated assertion for video input
     assert input_video is not None, 'No input video!'
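
process_video now splits the final prompt on ';' into a list of timed prompts; worker_video encodes each one up front and pops the next encoding at the start of each latent section, keeping the last one once the list is exhausted. A small sketch of that consumption pattern, with plain strings standing in for the encoded tensor bundles:

prompt = "the creature walks; the creature runs; the creature jumps"
prompts = prompt.split(";")

prompt_parameters = [p.strip() for p in prompts]  # stand-ins for encode_prompt(...) results
total_latent_sections = 5

active = None
for section_index in range(total_latent_sections):
    if len(prompt_parameters) > 0:
        active = prompt_parameters.pop(0)  # same pop-from-the-front pattern as worker_video
    print(section_index, active)
# sections 0-2 consume the three timed prompts; sections 3 and 4 keep the last one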
 
@@ -863,7 +877,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
     stream = AsyncStream()
 
     # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
-    async_run(worker_video, input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
+    async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
 
     output_filename = None
@@ -880,8 +894,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
 
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
-            break
+            return output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
 
 def end_process():
     stream.input_queue.push('end')
@@ -906,7 +919,10 @@ def refresh_prompt():
     sorted_dict_values = sorted(dict_values.items(), key=lambda x: x[0])
     array = []
     for sorted_dict_value in sorted_dict_values:
-        array.append(timeless_prompt_value[0] + ". " + sorted_dict_value[1])
+        if timeless_prompt_value[0] is not None and len(timeless_prompt_value[0]) and sorted_dict_value[1] is not None and len(sorted_dict_value[1]):
+            array.append(timeless_prompt_value[0] + ". " + sorted_dict_value[1])
+        else:
+            array.append(timeless_prompt_value[0] + sorted_dict_value[1])
     print(str(array))
     return ";".join(array)
@@ -914,7 +930,6 @@ title_html = """
 <h1><center>FramePack</center></h1>
 <big><center>Generate videos from text/image/video freely, without account, without watermark and download it</center></big>
 <br/>
-<br/>
 
 <p>This space is ready to work on ZeroGPU and GPU and has been tested successfully on ZeroGPU. Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack/discussions/new">message in discussion</a> if you encounter issues.</p>
 """
@@ -933,13 +948,12 @@ with block:
     gr.HTML(title_html)
     with gr.Row():
         with gr.Column():
-            generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video-to-Video", "video"]], label="Generation mode", value = "image")
+            generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], label="Generation mode", value = "image")
             text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.", visible=False)
             input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
             input_video = gr.Video(sources='upload', label="Input Video", height=320, visible=False)
-            timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, focus motion, consistent arm, consistent position, fixed camera")
-            prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Not for video extension')
-            prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
+            timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
+            prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
 
             @gr.render(inputs=prompt_number)
             def show_split(prompt_number):
@@ -949,7 +963,6 @@ with block:
                 timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
 
             final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
-            timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
             total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
 
             with gr.Row():
@@ -960,7 +973,7 @@ with block:
             with gr.Accordion("Advanced settings", open=False):
                 with gr.Row():
                     use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
-                    no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing) (only for video extension)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
+                    no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).', visible=False)
 
                 n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
@@ -968,18 +981,18 @@ with block:
 
                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. Changing this value is not recommended.')
-                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed (only for video extension).')
+                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.', visible=False)
 
-                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, info='Only for video extension')
+                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)
 
                 # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
-                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames; 3=blurred motions& & unsharped, 10=focus motion; changing this value is not recommended')
-                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01) # Should not change
+                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames; 3=follow the prompt but blurred motions & unsharped, 10=focus motion; changing this value is not recommended')
+                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, info='Should not change')
 
 
                 # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
-                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 if memory issues (only for video extension).")
+                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 to avoid memory issues or to give more weight to the prompt.", visible=False)
 
                 default_vae = 32
                 if high_vram:
@@ -987,7 +1000,7 @@ with block:
                 elif free_mem_gb>=20:
                     default_vae = 64
 
-                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion (only for video extension).")
+                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.", visible=False)
 
 
                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
@@ -1004,12 +1017,14 @@ with block:
     ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
     ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
 
+    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
+    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
     start_button.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
-    ], outputs = [], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
     start_button_video.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
-    ], outputs = [], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
     end_button.click(fn=end_process)
 
     gr.Examples(
@@ -1045,43 +1060,9 @@ with block:
             10.0, # gs
             0.0, # rs
             6, # gpu_memory_preservation
-            False, # use_teacache
+            True, # use_teacache
             16 # mp4_crf
-            ],
-            [
-                "./img_examples/Example1.png", # input_image
-                "We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "image", # generation_mode
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                True, # randomize_seed
-                42, # seed
-                1, # total_second_length
-                9, # latent_window_size
-                25, # steps
-                1.0, # cfg
-                10.0, # gs
-                0.0, # rs
-                6, # gpu_memory_preservation
-                False, # use_teacache
-                16 # mp4_crf
-            ],
-            [
-                "./img_examples/Example1.png", # input_image
-                "A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "image", # generation_mode
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                True, # randomize_seed
-                42, # seed
-                1, # total_second_length
-                9, # latent_window_size
-                25, # steps
-                1.0, # cfg
-                10.0, # gs
-                0.0, # rs
-                6, # gpu_memory_preservation
-                False, # use_teacache
-                16 # mp4_crf
-            ],
+            ]
         ],
         run_on_click = True,
         fn = process,
@@ -1121,19 +1102,23 @@ with block:
         cache_examples = torch.cuda.device_count() > 0,
     )
 
+    gr.Markdown('''
+    # Guide
+    To make all your generated scenes consistent, you can then apply a face swap on the main character.
+    ''')
 
     def handle_generation_mode_change(generation_mode_data):
         if generation_mode_data == "text":
-            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False)]
+            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "image":
-            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False)]
+            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "video":
-            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True)]
+            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
 
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
-        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video]
+        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, resolution, num_clean_frames, vae_batch]
     )
 
-block.launch(mcp_server=False, ssr_mode=False)
+block.launch(mcp_server=True, ssr_mode=False)
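
handle_generation_mode_change now returns ten gr.update objects instead of five, applied positionally to the extended outputs list, so the five video-only controls (no_resize, batch, resolution, num_clean_frames, vae_batch) are visible only in Video Extension mode. A condensed sketch of the same positional mapping (equivalent logic, not the commit's literal code):

import gradio as gr

# Output order mirrors this commit's outputs= list:
# text_to_video_hint, input_image, input_video, start_button, start_button_video,
# no_resize, batch, resolution, num_clean_frames, vae_batch
def handle_generation_mode_change(mode):
    is_text = mode == "text"
    is_image = mode == "image"
    is_video = mode == "video"
    return [
        gr.update(visible=is_text),       # text_to_video_hint
        gr.update(visible=is_image),      # input_image
        gr.update(visible=is_video),      # input_video
        gr.update(visible=not is_video),  # start_button
        gr.update(visible=is_video),      # start_button_video
    ] + [gr.update(visible=is_video)] * 5  # the five video-only advanced controls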
 