Fabrice-TIERCELIN committed on
Commit ce1c404 · verified · 1 Parent(s): 27a6551

Display resolution and save preferences

Files changed (1):
  1. app.py +290 -127
app.py CHANGED
@@ -108,12 +108,9 @@ stream = AsyncStream()
 outputs_folder = './outputs/'
 os.makedirs(outputs_folder, exist_ok=True)
 
-def check_parameters(generation_mode, input_image, input_video):
-    if generation_mode == "image" and input_image is None:
-        raise gr.Error("Please provide an image to extend.")
-    if generation_mode == "video" and input_video is None:
-        raise gr.Error("Please provide a video to extend.")
-    return [gr.update(interactive=True)]
+default_local_storage = {
+    "generation-mode": "image",
+}
 
 @spaces.GPU()
 @torch.no_grad()
@@ -306,7 +303,7 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
         return False
 
 @torch.no_grad()
-def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
+def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
     def encode_prompt(prompt, n_prompt):
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
 
@@ -356,7 +353,7 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
 
         H, W, C = input_image.shape
-        height, width = find_nearest_bucket(H, W, resolution=640)
+        height, width = find_nearest_bucket(H, W, resolution=resolution)
         input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
 
         Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
@@ -399,23 +396,27 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
         history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
         total_generated_latent_frames = 1
 
-        def callback(d):
-            preview = d['denoised']
-            preview = vae_decode_fake(preview)
-
-            preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-            preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-
-            if stream.input_queue.top() == 'end':
-                stream.output_queue.push(('end', None))
-                raise KeyboardInterrupt('User ends the task.')
-
-            current_step = d['i'] + 1
-            percentage = int(100.0 * current_step / steps)
-            hint = f'Sampling {current_step}/{steps}'
-            desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
-            stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-            return
+        if enable_preview:
+            def callback(d):
+                preview = d['denoised']
+                preview = vae_decode_fake(preview)
+
+                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
+
+                if stream.input_queue.top() == 'end':
+                    stream.output_queue.push(('end', None))
+                    raise KeyboardInterrupt('User ends the task.')
+
+                current_step = d['i'] + 1
+                percentage = int(100.0 * current_step / steps)
+                hint = f'Sampling {current_step}/{steps}'
+                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
+                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
+                return
+        else:
+            def callback(d):
+                return
 
         indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
         clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
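
Note: the preview path is unchanged in substance, but it is now built only when enable_preview is on. The einops pattern 'b c t h w -> (b h) (t w) c' tiles the decoded latent frames into a single image, batch rows stacked vertically and time steps left to right. A small sketch of what that rearrange does, with illustrative shapes only:

    import numpy as np
    import einops

    fake = np.zeros((1, 3, 4, 64, 64), dtype=np.uint8)  # (batch, channel, time, height, width)
    grid = einops.rearrange(fake, 'b c t h w -> (b h) (t w) c')
    print(grid.shape)  # (64, 256, 3): the 4 frames sit side by side in one preview image
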
@@ -495,13 +496,14 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
             if not high_vram:
                 unload_complete_models()
 
-            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-
-            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
-
-            print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-
-            stream.output_queue.push(('file', output_filename))
+            if enable_preview or section_index == total_latent_sections - 1:
+                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
+
+                save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
+
+                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
+
+                stream.output_queue.push(('file', output_filename))
     except:
         traceback.print_exc()
 
@@ -513,8 +515,8 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
     stream.output_queue.push(('end', None))
     return
 
-def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
-    return total_second_length * 60 * (0.7 if use_teacache else 1.3)
+def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
+    return total_second_length * 60 * (0.7 if use_teacache else 1.3) * (2**((resolution - 640) / 640)) * (1 + ((steps - 25) / 100))
 
 
 @spaces.GPU(duration=get_duration)
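
Note: get_duration now scales the requested ZeroGPU slot with resolution and step count on top of the TeaCache factor; get_duration_video further down applies the same two factors with a base multiplier of 2 instead of 1.3. A quick sanity check of the formula with illustrative values:

    # 5 s video, 672 px, 30 steps, TeaCache off:
    duration = 5 * 60 * 1.3 * (2 ** ((672 - 640) / 640)) * (1 + (30 - 25) / 100)
    print(round(duration))  # 300 * 1.3 * 2**0.05 * 1.05 ≈ 424 seconds requested
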
@@ -523,6 +525,7 @@ def process(input_image, prompt,
             n_prompt="",
             randomize_seed=True,
             seed=31337,
+            resolution=640,
             total_second_length=5,
             latent_window_size=9,
             steps=25,
@@ -530,14 +533,16 @@ def process(input_image, prompt,
             gs=10.0,
             rs=0.0,
             gpu_memory_preservation=6,
+            enable_preview=True,
             use_teacache=False,
             mp4_crf=16
             ):
-    global stream
+    global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value
 
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
-        return None, None, None, None, None, None
+        yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
+        return
 
     if randomize_seed:
         seed = random.randint(0, np.iinfo(np.int32).max)
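
Note: process (and process_video below) is a generator that streams UI updates to Gradio, so the old no-GPU branch that did `return None, None, ...` never actually delivered those outputs; the fix yields one final update per output component and then returns. The pattern, reduced to a sketch:

    import torch
    import gradio as gr

    def process_sketch():
        if torch.cuda.device_count() == 0:
            gr.Warning('Set this space to GPU config to make it work.')
            yield gr.update()  # push one last update to the output components
            return             # a bare return then ends the generator cleanly
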
@@ -554,7 +559,7 @@ def process(input_image, prompt,
 
     stream = AsyncStream()
 
-    async_run(worker, input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
+    async_run(worker, input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf)
 
     output_filename = None
 
@@ -570,12 +575,13 @@ def process(input_image, prompt,
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
 
         if flag == 'end':
-            return output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
+            yield output_filename, gr.update(visible=False), gr.update(), 'To make all your generated scenes consistent, you can then apply a face swap on the main character.', gr.update(interactive=True), gr.update(interactive=False)
+            break
 
 # 20250506 pftq: Modified worker to accept video input and clean frame count
 @spaces.GPU()
 @torch.no_grad()
-def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     def encode_prompt(prompt, n_prompt):
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
 
@@ -618,13 +624,8 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
 
         # 20250506 pftq: Encode video
-        #H, W = 640, 640 # Default resolution, will be adjusted
-        #height, width = find_nearest_bucket(H, W, resolution=640)
-        #start_latent, input_image_np, history_latents, fps = video_encode(input_video, vae, height, width, vae_batch_size=16, device=gpu)
         start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
 
-        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
-
         # CLIP Vision
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
@@ -640,23 +641,27 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
         total_latent_sections = int(max(round(total_latent_sections), 1))
 
-        def callback(d):
-            preview = d['denoised']
-            preview = vae_decode_fake(preview)
-
-            preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-            preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-
-            if stream.input_queue.top() == 'end':
-                stream.output_queue.push(('end', None))
-                raise KeyboardInterrupt('User ends the task.')
-
-            current_step = d['i'] + 1
-            percentage = int(100.0 * current_step / steps)
-            hint = f'Sampling {current_step}/{steps}'
-            desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
-            stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-            return
+        if enable_preview:
+            def callback(d):
+                preview = d['denoised']
+                preview = vae_decode_fake(preview)
+
+                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
+
+                if stream.input_queue.top() == 'end':
+                    stream.output_queue.push(('end', None))
+                    raise KeyboardInterrupt('User ends the task.')
+
+                current_step = d['i'] + 1
+                percentage = int(100.0 * current_step / steps)
+                hint = f'Sampling {current_step}/{steps}'
+                desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Resolution: {height}px * {width}px, Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
+                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
+                return
+        else:
+            def callback(d):
+                return
 
         for idx in range(batch):
             if batch > 1:
@@ -677,10 +682,6 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
             history_pixels = None
             previous_video = None
 
-            # 20250507 pftq: hot fix for initial video being corrupted by vae encoding, issue with ghosting because of slight differences
-            #history_pixels = input_video_pixels
-            #save_bcthw_as_mp4(vae_decode(video_latents, vae).cpu(), os.path.join(outputs_folder, f'{job_id}_input_video.mp4'), fps=fps, crf=mp4_crf) # 20250507 pftq: test fast movement corrupted by vae encoding if vae batch size too low
-
             for section_index in range(total_latent_sections):
                 if stream.input_queue.top() == 'end':
                     stream.output_queue.push(('end', None))
@@ -735,12 +736,14 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
                     clean_latents_4x = splits[split_idx]
                     split_idx = 1
                     if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x], dim=2)
+                        print("Edge case for <=1 sec videos 4x")
+                        clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
 
                 if num_2x_frames > 0 and split_idx < len(splits):
                     clean_latents_2x = splits[split_idx]
                     if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x], dim=2)
+                        print("Edge case for <=1 sec videos 2x")
+                        clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
                     split_idx += 1
                 elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                     clean_latents_2x = clean_latents_4x
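
Note: the <=1 sec edge case now duplicates the lone context frame with expand instead of torch.cat. expand returns a broadcast view over the size-1 time dimension, so nothing is copied, whereas cat allocated a new tensor. A minimal illustration, assuming the same (b, c, t, h, w) layout as the latents above:

    import torch

    x = torch.zeros(1, 4, 1, 8, 8)                 # a single latent frame, t == 1
    y = x.expand(-1, -1, 2, -1, -1)                # a view with t == 2, no copy made
    print(y.shape, y.data_ptr() == x.data_ptr())   # torch.Size([1, 4, 2, 8, 8]) True
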
@@ -804,27 +807,28 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
                 if not high_vram:
                     unload_complete_models()
 
-                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-
-                # 20250506 pftq: Use input video FPS for output
-                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
-                print(f"Latest video saved: {output_filename}")
-                # 20250508 pftq: Save prompt to mp4 metadata comments
-                set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
-                print(f"Prompt saved to mp4 metadata comments: {output_filename}")
-
-                # 20250506 pftq: Clean up previous partial files
-                if previous_video is not None and os.path.exists(previous_video):
-                    try:
-                        os.remove(previous_video)
-                        print(f"Previous partial video deleted: {previous_video}")
-                    except Exception as e:
-                        print(f"Error deleting previous partial video {previous_video}: {e}")
-                previous_video = output_filename
-
-                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-
-                stream.output_queue.push(('file', output_filename))
+                if enable_preview or section_index == total_latent_sections - 1:
+                    output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
+
+                    # 20250506 pftq: Use input video FPS for output
+                    save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
+                    print(f"Latest video saved: {output_filename}")
+                    # 20250508 pftq: Save prompt to mp4 metadata comments
+                    set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
+                    print(f"Prompt saved to mp4 metadata comments: {output_filename}")
+
+                    # 20250506 pftq: Clean up previous partial files
+                    if previous_video is not None and os.path.exists(previous_video):
+                        try:
+                            os.remove(previous_video)
+                            print(f"Previous partial video deleted: {previous_video}")
+                        except Exception as e:
+                            print(f"Error deleting previous partial video {previous_video}: {e}")
+                    previous_video = output_filename
+
+                    print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
+
+                    stream.output_queue.push(('file', output_filename))
 
             seed = (seed + 1) % np.iinfo(np.int32).max
 
@@ -839,17 +843,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         stream.output_queue.push(('end', None))
         return
 
-def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    return total_second_length * 60 * (0.7 if use_teacache else 2)
+def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+    return total_second_length * 60 * (0.7 if use_teacache else 2) * (2**((resolution - 640) / 640)) * (1 + ((steps - 25) / 100))
 
 # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 @spaces.GPU(duration=get_duration_video)
-def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     global stream, high_vram
 
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
-        return None, None, None, None, None, None
+        yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
+        return
 
     if randomize_seed:
         seed = random.randint(0, np.iinfo(np.int32).max)
@@ -877,7 +882,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
     stream = AsyncStream()
 
     # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
-    async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
+    async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
 
     output_filename = None
 
@@ -894,7 +899,8 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
 
         if flag == 'end':
-            return output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
+            yield output_filename, gr.update(visible=False), desc+' Video complete. To make all your generated scenes consistent, you can then apply a face swap on the main character.', '', gr.update(interactive=True), gr.update(interactive=False)
+            break
 
 def end_process():
     stream.input_queue.push('end')
@@ -934,8 +940,23 @@ title_html = """
 <p>This space is ready to work on ZeroGPU and GPU and has been tested successfully on ZeroGPU. Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack/discussions/new">message in discussion</a> if you encounter issues.</p>
 """
 
+js = """
+function createGradioAnimation() {
+    window.addEventListener("beforeunload", function (e) {
+        if (document.getElementById('end-button') && !document.getElementById('end-button').disabled) {
+            var confirmationMessage = 'A process is still running. '
+                + 'If you leave before saving, your changes will be lost.';
+
+            (e || window.event).returnValue = confirmationMessage;
+            return confirmationMessage;
+        }
+    });
+    return 'Animation created';
+}
+"""
+
 css = make_progress_bar_css()
-block = gr.Blocks(css=css).queue()
+block = gr.Blocks(css=css, js=js).queue()
 with block:
     if torch.cuda.device_count() == 0:
         with gr.Row():
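
Note: the js string is handed to gr.Blocks(js=...), which runs the function once when the page loads; the beforeunload handler it registers then warns before navigating away whenever the button with elem_id="end-button" is enabled, i.e. while a generation is still running. A minimal sketch of the same hook, assuming a recent Gradio release:

    import gradio as gr

    js = "function onLoad() { console.log('page ready'); }"

    with gr.Blocks(js=js) as demo:  # the js function runs once, on page load
        gr.Markdown("Hello")
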
@@ -946,12 +967,13 @@ with block:
             </big></big></big></p>
             """)
     gr.HTML(title_html)
+    local_storage = gr.BrowserState(default_local_storage)
     with gr.Row():
         with gr.Column():
-            generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], label="Generation mode", value = "image")
-            text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.", visible=False)
+            generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
+            text_to_video_hint = gr.HTML("I discourage using the Text-to-Video feature. Instead, generate an image with Flux and use Image-to-Video; you will save time.")
            input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
-            input_video = gr.Video(sources='upload', label="Input Video", height=320, visible=False)
+            input_video = gr.Video(sources='upload', label="Input Video", height=320)
             timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
             prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
 
@@ -967,23 +989,29 @@ with block:
 
             with gr.Row():
                 start_button = gr.Button(value="🎥 Generate", variant="primary")
-                start_button_video = gr.Button(value="🎥 Generate", variant="primary", visible=False)
-                end_button = gr.Button(value="End Generation", variant="stop", interactive=False, visible=False)
+                start_button_video = gr.Button(value="🎥 Generate", variant="primary")
+                end_button = gr.Button(elem_id="end-button", value="End Generation", variant="stop", interactive=False)
 
             with gr.Accordion("Advanced settings", open=False):
-                with gr.Row():
-                    use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
-                    no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).', visible=False)
+                enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated, but it costs about 2 sec. per second generated.')
+                use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
 
-                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
-                randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
-                seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
+                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
 
                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending, but higher VRAM cost. Should not change.')
                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. Changing this value is not recommended.')
-                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.', visible=False)
 
-                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)
+                with gr.Row():
+                    no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
+                    resolution = gr.Dropdown([
+                        640,
+                        672,
+                        704,
+                        768,
+                        832,
+                        864,
+                        960
+                    ], value=640, label="Resolution (max width or height)")
 
                 # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled is set to 1). Doubles render time. Should not change.')
@@ -992,7 +1020,7 @@ with block:
 
 
                 # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
-                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 to avoid memory issues or to give more weight to the prompt.", visible=False)
+                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 to avoid memory issues or to give more weight to the prompt.")
 
                 default_vae = 32
                 if high_vram:
@@ -1000,12 +1028,16 @@ with block:
                 elif free_mem_gb>=20:
                     default_vae = 64
 
-                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.", visible=False)
+                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.")
 
 
                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
 
                 mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs.")
+                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')
+                with gr.Row():
+                    randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
+                    seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
 
             with gr.Column():
                 preview_image = gr.Image(label="Next Latents", height=200, visible=False)
@@ -1014,8 +1046,28 @@ with block:
                 progress_bar = gr.HTML('', elem_classes='no-generating-animation')
 
     # 20250506 pftq: Updated inputs to include num_clean_frames
-    ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
-    ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
+    ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
+    ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
+
+    def save_preferences(preferences, value):
+        preferences["generation-mode"] = value
+        return preferences
+
+    def load_preferences(saved_prefs):
+        saved_prefs = init_preferences(saved_prefs)
+        return saved_prefs["generation-mode"]
+
+    def init_preferences(saved_prefs):
+        if saved_prefs is None:
+            saved_prefs = default_local_storage
+        return saved_prefs
+
+    def check_parameters(generation_mode, input_image, input_video):
+        if generation_mode == "image" and input_image is None:
+            raise gr.Error("Please provide an image to extend.")
+        if generation_mode == "video" and input_video is None:
+            raise gr.Error("Please provide a video to extend.")
+        return gr.update(interactive=True)
 
     prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
     timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
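
Note: the preference round trip pairs gr.BrowserState (persisted in the browser's localStorage) with a save on generation_mode.change and a load on block.load further down; init_preferences guards the first visit, when the stored value is still None. The same mechanism reduced to a sketch, assuming a Gradio release that provides gr.BrowserState:

    import gradio as gr

    defaults = {"generation-mode": "image"}

    with gr.Blocks() as demo:
        prefs = gr.BrowserState(defaults)  # survives page reloads
        mode = gr.Radio(["image", "video"], value="image")

        def save(p, v):
            p = p or dict(defaults)        # first visit: the stored state may be None
            p["generation-mode"] = v
            return p

        mode.change(save, [prefs, mode], [prefs])
        demo.load(lambda p: (p or defaults)["generation-mode"], [prefs], [mode])
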
@@ -1027,32 +1079,127 @@ with block:
     ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
     end_button.click(fn=end_process)
 
+    generation_mode.change(fn = save_preferences, inputs = [
+        local_storage,
+        generation_mode,
+    ], outputs = [
+        local_storage
+    ])
+
-    gr.Examples(
+    with gr.Row(elem_id="image_examples", visible=False):
+        gr.Examples(
             examples = [
+                [
+                    "./img_examples/Example1.png", # input_image
+                    "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "image", # generation_mode
+                    "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
+                    True, # randomize_seed
+                    42, # seed
+                    672, # resolution
+                    1, # total_second_length
+                    9, # latent_window_size
+                    50, # steps
+                    1.0, # cfg
+                    10.0, # gs
+                    0.0, # rs
+                    6, # gpu_memory_preservation
+                    False, # enable_preview
+                    False, # use_teacache
+                    16 # mp4_crf
+                ],
                 [
                     "./img_examples/Example1.png", # input_image
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "image", # generation_mode
-                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                    "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
+                    672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
-                    25, # steps
+                    35, # steps
                     1.0, # cfg
                     10.0, # gs
                     0.0, # rs
                     6, # gpu_memory_preservation
+                    False, # enable_preview
                     False, # use_teacache
                     16 # mp4_crf
                 ],
+            ],
+            run_on_click = True,
+            fn = process,
+            inputs = ips,
+            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
+            cache_examples = torch.cuda.device_count() > 0,
+        )
+
+    with gr.Row(elem_id="video_examples", visible=False):
+        gr.Examples(
+            examples = [
+                [
+                    "./img_examples/Example1.mp4", # input_video
+                    "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                    True, # randomize_seed
+                    42, # seed
+                    1, # batch
+                    672, # resolution
+                    1, # total_second_length
+                    9, # latent_window_size
+                    50, # steps
+                    1.0, # cfg
+                    10.0, # gs
+                    0.0, # rs
+                    6, # gpu_memory_preservation
+                    False, # enable_preview
+                    False, # use_teacache
+                    False, # no_resize
+                    16, # mp4_crf
+                    5, # num_clean_frames
+                    default_vae
+                ],
+                [
+                    "./img_examples/Example1.mp4", # input_video
+                    "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                    True, # randomize_seed
+                    42, # seed
+                    1, # batch
+                    672, # resolution
+                    1, # total_second_length
+                    9, # latent_window_size
+                    35, # steps
+                    1.0, # cfg
+                    10.0, # gs
+                    0.0, # rs
+                    6, # gpu_memory_preservation
+                    False, # enable_preview
+                    False, # use_teacache
+                    False, # no_resize
+                    16, # mp4_crf
+                    5, # num_clean_frames
+                    default_vae
+                ],
+            ],
+            run_on_click = True,
+            fn = process_video,
+            inputs = ips_video,
+            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
+            cache_examples = torch.cuda.device_count() > 0,
+        )
+
+    gr.Examples(
+        examples = [
             [
                 "./img_examples/Example1.png", # input_image
                 "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                 "image", # generation_mode
-                "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                "Missing arm, unrealistic position, impossible contortion, blurred, blurry", # n_prompt
                 True, # randomize_seed
                 42, # seed
+                672, # resolution
                 1, # total_second_length
                 9, # latent_window_size
                 25, # steps
@@ -1060,7 +1207,8 @@ with block:
                 10.0, # gs
                 0.0, # rs
                 6, # gpu_memory_preservation
-                True, # use_teacache
+                False, # enable_preview
+                False, # use_teacache
                 16 # mp4_crf
             ]
         ],
@@ -1068,7 +1216,7 @@ with block:
         fn = process,
         inputs = ips,
         outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
-        cache_examples = torch.cuda.device_count() > 0,
+        cache_examples = False,
     )
 
     gr.Examples(
@@ -1080,7 +1228,7 @@ with block:
             True, # randomize_seed
             42, # seed
             1, # batch
-            640, # resolution
+            672, # resolution
             1, # total_second_length
             9, # latent_window_size
             25, # steps
@@ -1088,37 +1236,52 @@ with block:
             10.0, # gs
             0.0, # rs
             6, # gpu_memory_preservation
+            False, # enable_preview
             False, # use_teacache
             False, # no_resize
             16, # mp4_crf
             5, # num_clean_frames
             default_vae
-            ],
+            ]
         ],
         run_on_click = True,
         fn = process_video,
         inputs = ips_video,
         outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
-        cache_examples = torch.cuda.device_count() > 0,
+        cache_examples = False,
     )
-
-    gr.Markdown('''
-    # Guide
-    To make all your generated scenes consistent, you can then apply a face swap on the main character.
-    ''')
 
     def handle_generation_mode_change(generation_mode_data):
         if generation_mode_data == "text":
-            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
        elif generation_mode_data == "image":
-            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
+            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "video":
-            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
+            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
 
+
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
-        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, resolution, num_clean_frames, vae_batch]
+        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch]
+    )
+
+    # Update display when the page loads
+    block.load(
+        fn=handle_generation_mode_change, inputs = [
+            generation_mode
+        ], outputs = [
+            text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch
+        ]
+    )
+
+    # Load saved preferences when the page loads
+    block.load(
+        fn=load_preferences, inputs = [
+            local_storage
+        ], outputs = [
+            generation_mode
+        ]
     )
 
 block.launch(mcp_server=True, ssr_mode=False)