Fabrice-TIERCELIN committed
Commit e00725a · verified · 1 Parent(s): cdb2539

Generation time

Files changed (1):
    app.py (+201 -216)

app.py CHANGED
@@ -12,6 +12,7 @@ import einops
 import safetensors.torch as sf
 import numpy as np
 import random
+import time
 import math
 # 20250506 pftq: Added for video input loading
 import decord
@@ -422,6 +423,37 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
     clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
     clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
 
+    def post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
+        total_generated_latent_frames += int(generated_latents.shape[2])
+        history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
+
+        if not high_vram:
+            offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+            load_model_as_complete(vae, target_device=gpu)
+
+        if history_pixels is None:
+            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
+            history_pixels = vae_decode(real_history_latents, vae).cpu()
+        else:
+            section_latent_frames = latent_window_size * 2
+            overlapped_frames = latent_window_size * 4 - 3
+
+            real_history_latents = history_latents[:, :, max(-section_latent_frames, -total_generated_latent_frames):, :, :]
+            history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
+
+        if not high_vram:
+            unload_complete_models()
+
+        if enable_preview or section_index == total_latent_sections - 1:
+            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
+
+            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
+
+            print(f'Decoded. Current pixel shape {history_pixels.shape}')
+
+            stream.output_queue.push(('file', output_filename))
+        return [total_generated_latent_frames, history_latents, history_pixels]
+
     for section_index in range(total_latent_sections):
         if stream.input_queue.top() == 'end':
             stream.output_queue.push(('end', None))
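Note: soft_append_bcthw comes from the Space's helper utilities and cross-fades the frames where consecutive sections overlap instead of hard-cutting between them. A minimal sketch of that blend, assuming BCTHW tensors and a linear ramp (the real helper may differ in detail):

    import torch

    def soft_append_bcthw_sketch(history, current, overlap):
        # history, current: (batch, channels, time, height, width) pixel tensors
        # Fade history out and current in across the overlapping frames.
        weights = torch.linspace(1, 0, overlap).view(1, 1, overlap, 1, 1)
        blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
        return torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)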
@@ -475,35 +507,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
                 callback=callback,
             )
 
-            total_generated_latent_frames += int(generated_latents.shape[2])
-            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
-
-            if not high_vram:
-                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                load_model_as_complete(vae, target_device=gpu)
-
-            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
-
-            if history_pixels is None:
-                history_pixels = vae_decode(real_history_latents, vae).cpu()
-            else:
-                section_latent_frames = latent_window_size * 2
-                overlapped_frames = latent_window_size * 4 - 3
-
-                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
-                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
-
-            if not high_vram:
-                unload_complete_models()
-
-            if enable_preview or section_index == total_latent_sections - 1:
-                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-
-                save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
-
-                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-
-                stream.output_queue.push(('file', output_filename))
+            [total_generated_latent_frames, history_latents, history_pixels] = post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
     except:
         traceback.print_exc()
 
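Note: the bracketed assignment at the call site is ordinary Python sequence unpacking; post_process returns a 3-element list and the worker rebinds its decode state from it. A trivial illustration:

    def returns_three():
        return [1, 'two', 3.0]

    [a, b, c] = returns_three()  # identical in effect to: a, b, c = returns_three()
    print(a, b, c)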
@@ -516,8 +520,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
         return
 
 def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
-    return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (2**((resolution - 640) / 640)) * (1 + ((steps - 25) / 100))
-
+    return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
 
 @spaces.GPU(duration=get_duration)
 def process(input_image, prompt,
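Note: get_duration is the dynamic ZeroGPU time estimate, and the commit drops the 2**((resolution - 640) / 640) factor, so the requested quota now depends only on clip length, TeaCache, and step count (consistent with the new resolution dropdown's claim that resolution does not affect generation time). A quick check of the revised estimator, with illustrative inputs:

    def estimated_gpu_seconds(total_second_length, use_teacache, steps):
        # Mirrors the revised get_duration above.
        return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))

    print(estimated_gpu_seconds(2, False, 25))  # 180.0 s requested for a 2 s clip
    print(estimated_gpu_seconds(2, True, 25))   # 108.0 s with TeaCache enabled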
@@ -537,6 +540,7 @@ def process(input_image, prompt,
                         use_teacache=False,
                         mp4_crf=16
                         ):
+    start = time.time()
     global stream
 
     if torch.cuda.device_count() == 0:
@@ -575,7 +579,17 @@ def process(input_image, prompt,
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
 
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), gr.update(), 'To make all your generated scenes consistent, you can then apply a face swap on the main character.', gr.update(interactive=True), gr.update(interactive=False)
+            end = time.time()
+            secondes = int(end - start)
+            minutes = math.floor(secondes / 60)
+            secondes = secondes - (minutes * 60)
+            hours = math.floor(minutes / 60)
+            minutes = minutes - (hours * 60)
+            yield output_filename, gr.update(visible=False), gr.update(), "The video has been generated in " + \
+                ((str(hours) + " h, ") if hours != 0 else "") + \
+                ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
+                str(secondes) + " sec. " + \
+                "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", gr.update(interactive=True), gr.update(interactive=False)
             break
 
 # 20250506 pftq: Modified worker to accept video input and clean frame count
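Note: the hours/minutes/seconds arithmetic above can be written more compactly with divmod; an equivalent helper, shown only for comparison:

    def format_elapsed(total_seconds: int) -> str:
        minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        parts = []
        if hours:
            parts.append(f"{hours} h")
        if hours or minutes:
            parts.append(f"{minutes} min")
        parts.append(f"{seconds} sec")
        return ", ".join(parts)

    print(format_elapsed(3725))  # 1 h, 2 min, 5 sec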
@@ -663,6 +677,63 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         def callback(d):
             return
 
+        def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
+            # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
+            available_frames = history_latents.shape[2]  # Number of latent frames
+            max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
+            adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
+            # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
+            effective_clean_frames = max(0, num_clean_frames - 1)
+            effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
+            num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
+            num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec
+
+            total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
+            total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
+
+            indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
+            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
+                [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
+            )
+            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
+
+            # 20250506 pftq: Split history_latents dynamically based on available frames
+            fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
+            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
+
+            if total_context_frames > 0:
+                context_frames = history_latents[:, :, -total_context_frames:, :, :]
+                split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
+                split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
+                if split_sizes:
+                    splits = context_frames.split(split_sizes, dim=2)
+                    split_idx = 0
+
+                    if num_4x_frames > 0:
+                        clean_latents_4x = splits[split_idx]
+                        split_idx = 1
+                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                            print("Edge case for <=1 sec videos 4x")
+                            clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
+
+                    if num_2x_frames > 0 and split_idx < len(splits):
+                        clean_latents_2x = splits[split_idx]
+                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                            print("Edge case for <=1 sec videos 2x")
+                            clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
+                        split_idx += 1
+                    elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                        clean_latents_2x = clean_latents_4x
+
+                    if effective_clean_frames > 0 and split_idx < len(splits):
+                        clean_latents_1x = splits[split_idx]
+
+            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
+
+            # 20250507 pftq: Fix for <=1 sec videos.
+            max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
+            return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
+
         for idx in range(batch):
             if batch > 1:
                 print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
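Note: compute_latent budgets the latent history into 1x, 2x, and 4x context bands for sample_hunyuan, clamping each band so very short (<= 1 sec) histories still yield valid splits. A worked pass of the allocation arithmetic with illustrative values (30 latent frames available, num_clean_frames = 5):

    available_frames, num_clean_frames = 30, 5
    effective_clean_frames = min(max(0, num_clean_frames - 1), available_frames - 2)            # 4
    num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1))               # 2
    num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames))  # 16
    total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames               # 22 <= 30
    print(effective_clean_frames, num_2x_frames, num_4x_frames, total_context_frames)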
@@ -701,60 +772,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
             else:
                 transformer.initialize_teacache(enable_teacache=False)
 
-            # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
-            available_frames = history_latents.shape[2]  # Number of latent frames
-            max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
-            adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
-            # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
-            effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
-            effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
-            num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
-            num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec
-
-            total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
-            total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
-
-            indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
-                [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            )
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-
-            # 20250506 pftq: Split history_latents dynamically based on available frames
-            fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
-            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
-
-            if total_context_frames > 0:
-                context_frames = history_latents[:, :, -total_context_frames:, :, :]
-                split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
-                split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
-                if split_sizes:
-                    splits = context_frames.split(split_sizes, dim=2)
-                    split_idx = 0
-
-                    if num_4x_frames > 0:
-                        clean_latents_4x = splits[split_idx]
-                        split_idx = 1
-                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 4x")
-                            clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
-
-                    if num_2x_frames > 0 and split_idx < len(splits):
-                        clean_latents_2x = splits[split_idx]
-                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 2x")
-                            clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
-                        split_idx += 1
-                    elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        clean_latents_2x = clean_latents_4x
-
-                    if effective_clean_frames > 0 and split_idx < len(splits):
-                        clean_latents_1x = splits[split_idx]
-
-            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
-
-            # 20250507 pftq: Fix for <=1 sec videos.
-            max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
+            [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
 
             generated_latents = sample_hunyuan(
                 transformer=transformer,
@@ -801,8 +819,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
                 section_latent_frames = latent_window_size * 2
                 overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
 
-                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
-                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
+                history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu(), overlapped_frames)
 
             if not high_vram:
                 unload_complete_models()
@@ -844,11 +861,12 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         return
 
 def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (2**((resolution - 640) / 640)) * (1 + ((steps - 25) / 100))
+    return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
 
 # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 @spaces.GPU(duration=get_duration_video)
 def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+    start = time.time()
     global stream, high_vram
 
     if torch.cuda.device_count() == 0:
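Note: @spaces.GPU accepts a callable for duration; ZeroGPU invokes it with the same arguments as the decorated function to size the GPU reservation per request, which is how get_duration and get_duration_video are wired here. Minimal sketch of the pattern (assumes the Hugging Face spaces package):

    import spaces

    def my_duration(total_second_length, use_teacache):
        # Receives the same arguments as the decorated function below.
        return total_second_length * 60 * (0.9 if use_teacache else 2.3)

    @spaces.GPU(duration=my_duration)
    def generate(total_second_length, use_teacache):
        ...  # GPU-bound work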
@@ -899,7 +917,18 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)  # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
 
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), desc+' Video complete. To make all your generated scenes consistent, you can then apply a face swap on the main character.', '', gr.update(interactive=True), gr.update(interactive=False)
+            end = time.time()
+            secondes = int(end - start)
+            minutes = math.floor(secondes / 60)
+            secondes = secondes - (minutes * 60)
+            hours = math.floor(minutes / 60)
+            minutes = minutes - (hours * 60)
+            yield output_filename, gr.update(visible=False), desc + \
+                " The video has been generated in " + \
+                ((str(hours) + " h, ") if hours != 0 else "") + \
+                ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
+                str(secondes) + " sec. " + \
+                " Video complete. You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", '', gr.update(interactive=True), gr.update(interactive=False)
             break
 
 def end_process():
@@ -985,6 +1014,7 @@ with block:
     timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
 
     final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
+    prompt_hint = gr.HTML("Video extension barely follows the prompt; to force it to follow the prompt, you have to set the Distilled CFG Scale to 3.0 and the Context Frames to 2, but the video quality will be poor.")
    total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
 
     with gr.Row():
@@ -994,24 +1024,24 @@ with block:
 
     with gr.Accordion("Advanced settings", open=False):
         enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
-        use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
+        use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
 
-        n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
+        n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
 
         latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
-        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. Changing this value is not recommended.')
+        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very little motion, you may get an abrupt brightness change; this can be fixed by increasing the steps.')
 
         with gr.Row():
             no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
             resolution = gr.Dropdown([
-                640,
-                672,
-                704,
-                768,
-                832,
-                864,
-                960
-            ], value=640, label="Resolution (max width or height)")
+                ["409,600 px (working)", 640],
+                ["451,584 px (working)", 672],
+                ["495,616 px (VRAM issue on HF)", 704],
+                ["589,824 px (not tested)", 768],
+                ["692,224 px (not tested)", 832],
+                ["746,496 px (not tested)", 864],
+                ["921,600 px (not tested)", 960]
+            ], value=672, label="Resolution (width x height)", info="Does not affect the generation time")
 
         # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
         cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
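Note: gr.Dropdown accepts (display label, returned value) pairs, so the list above can show pixel counts and test status in the UI while the callbacks still receive the plain numeric size. Sketch of the mechanism:

    import gradio as gr

    resolution = gr.Dropdown(
        choices=[("409,600 px", 640), ("451,584 px", 672)],  # (label shown, value returned)
        value=672,
        label="Resolution (width x height)",
    )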
@@ -1049,157 +1079,74 @@ with block:
     ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
     ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
 
-    def save_preferences(preferences, value):
-        preferences["generation-mode"] = value
-        return preferences
-
-    def load_preferences(saved_prefs):
-        saved_prefs = init_preferences(saved_prefs)
-        return saved_prefs["generation-mode"]
-
-    def init_preferences(saved_prefs):
-        if saved_prefs is None:
-            saved_prefs = default_local_storage
-        return saved_prefs
-
-    def check_parameters(generation_mode, input_image, input_video):
-        if generation_mode == "image" and input_image is None:
-            raise gr.Error("Please provide an image to extend.")
-        if generation_mode == "video" and input_video is None:
-            raise gr.Error("Please provide a video to extend.")
-        return gr.update(interactive=True)
-
-    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
-    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
-    start_button.click(fn = check_parameters, inputs = [
-        generation_mode, input_image, input_video
-    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
-    start_button_video.click(fn = check_parameters, inputs = [
-        generation_mode, input_image, input_video
-    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
-    end_button.click(fn=end_process)
-
-    generation_mode.change(fn = save_preferences, inputs = [
-        local_storage,
-        generation_mode,
-    ], outputs = [
-        local_storage
-    ])
-
-    with gr.Row(elem_id="image_examples", visible=False):
-        gr.Examples(
+    gr.Examples(
+        label = "Examples from image",
         examples = [
             [
                 "./img_examples/Example1.png", # input_image
                 "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
                 672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
-                50, # steps
+                25, # steps
                 1.0, # cfg
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 16 # mp4_crf
             ],
             [
-                "./img_examples/Example1.png", # input_image
-                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                "./img_examples/Example2.webp", # input_image
+                "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
                 672, # resolution
-                1, # total_second_length
+                2, # total_second_length
                 9, # latent_window_size
-                35, # steps
+                25, # steps
                 1.0, # cfg
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 16 # mp4_crf
             ],
-        ],
-        run_on_click = True,
-        fn = process,
-        inputs = ips,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
-        cache_examples = torch.cuda.device_count() > 0,
-        )
-
-    with gr.Row(elem_id="video_examples", visible=False):
-        gr.Examples(
-            examples = [
             [
-                "./img_examples/Example1.mp4", # input_video
-                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                "./img_examples/Example2.webp", # input_image
+                "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
+                "image", # generation_mode
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
-                1, # batch
                 672, # resolution
-                1, # total_second_length
+                2, # total_second_length
                 9, # latent_window_size
-                50, # steps
+                25, # steps
                 1.0, # cfg
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
-                False, # no_resize
-                16, # mp4_crf
-                5, # num_clean_frames
-                default_vae
+                True, # use_teacache
+                16 # mp4_crf
             ],
             [
-                "./img_examples/Example1.mp4", # input_video
-                "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                True, # randomize_seed
-                42, # seed
-                1, # batch
-                640, # resolution
-                1, # total_second_length
-                9, # latent_window_size
-                35, # steps
-                1.0, # cfg
-                10.0, # gs
-                0.0, # rs
-                6, # gpu_memory_preservation
-                False, # enable_preview
-                False, # use_teacache
-                False, # no_resize
-                16, # mp4_crf
-                5, # num_clean_frames
-                default_vae
-            ],
-        ],
-        run_on_click = True,
-        fn = process_video,
-        inputs = ips_video,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
-        cache_examples = torch.cuda.device_count() > 0,
-    )
-
-    gr.Examples(
-        examples = [
-            [
-                "./img_examples/Example1.png", # input_image
-                "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                "./img_examples/Example3.jpg", # input_image
+                "A boy is walking to the right, full view, full-length view, cartoon",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
-                640, # resolution
+                672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
                 25, # steps
@@ -1208,7 +1155,7 @@ with block:
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 16 # mp4_crf
             ]
         ],
@@ -1220,15 +1167,16 @@ with block:
     )
 
     gr.Examples(
+        label = "Examples from video",
         examples = [
             [
                 "./img_examples/Example1.mp4", # input_video
                 "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
                 1, # batch
-                640, # resolution
+                672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
                 25, # steps
@@ -1237,7 +1185,7 @@ with block:
                 0.0, # rs
                 6, # gpu_memory_preservation
                 False, # enable_preview
-                False, # use_teacache
+                True, # use_teacache
                 False, # no_resize
                 16, # mp4_crf
                 5, # num_clean_frames
@@ -1250,20 +1198,57 @@ with block:
         outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
         cache_examples = False,
     )
+
+    def save_preferences(preferences, value):
+        preferences["generation-mode"] = value
+        return preferences
+
+    def load_preferences(saved_prefs):
+        saved_prefs = init_preferences(saved_prefs)
+        return saved_prefs["generation-mode"]
+
+    def init_preferences(saved_prefs):
+        if saved_prefs is None:
+            saved_prefs = default_local_storage
+        return saved_prefs
+
+    def check_parameters(generation_mode, input_image, input_video):
+        if generation_mode == "image" and input_image is None:
+            raise gr.Error("Please provide an image to extend.")
+        if generation_mode == "video" and input_video is None:
+            raise gr.Error("Please provide a video to extend.")
+        return gr.update(interactive=True)
 
     def handle_generation_mode_change(generation_mode_data):
         if generation_mode_data == "text":
-            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "image":
-            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "video":
-            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
+            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
 
 
+    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
+    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
+    start_button.click(fn = check_parameters, inputs = [
+        generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+    start_button_video.click(fn = check_parameters, inputs = [
+        generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
+    end_button.click(fn=end_process)
+
+    generation_mode.change(fn = save_preferences, inputs = [
+        local_storage,
+        generation_mode,
+    ], outputs = [
+        local_storage
+    ])
+
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
-        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch]
+        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
     )
 
     # Update display when the page loads
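Note: the preference helpers just round-trip a small dict through the browser-side storage component. A plain-Python usage sketch of the helpers defined in this hunk (default_local_storage is assumed to be the app's default dict):

    default_local_storage = {"generation-mode": "image"}  # assumed default shape

    prefs = init_preferences(None)            # falls back to the default dict
    prefs = save_preferences(prefs, "video")  # persist the chosen mode
    print(load_preferences(prefs))            # -> video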
@@ -1271,7 +1256,7 @@ with block:
         fn=handle_generation_mode_change, inputs = [
             generation_mode
         ], outputs = [
-            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch
+            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
         ]
     )
 