Stable-Video-Diffusion-Img2Vid

Paused

App Files Community

xi0v

Fabrice-TIERCELIN committed on Jul 16, 2024

Commit

1759c53

verified ·

1 Parent(s): 8a3ce8b

Number inference steps (#45)

Browse files

- Number inference steps (90c77ceadccbffbb521dad66b02d8801f21d5a68)

Co-authored-by: Fabrice TIERCELIN <[email protected]>

Files changed (1) hide show

app.py +22 -15

app.py CHANGED Viewed

@@ -43,7 +43,8 @@ def animate(
     version: str = "auto",
     width: int = 1024,
     height: int = 576,
-    motion_control: bool = False
 ):
     start = time.time()
@@ -56,7 +57,7 @@ def animate(
         image_data = image_data.convert("RGB")
     if motion_control:
-        image_data = [image_data] * 25
     if randomize_seed:
         seed = random.randint(0, max_64_bit_int)
@@ -76,7 +77,8 @@ def animate(
         decoding_t,
         version,
         width,
-        height
     )
     os.makedirs(output_folder, exist_ok=True)
@@ -133,16 +135,17 @@ def animate_on_gpu(
     decoding_t: int = 3,
     version: str = "svdxt",
     width: int = 1024,
-    height: int = 576
 ):
     generator = torch.manual_seed(seed)
     if version == "dragnuwa":
-        return dragnuwaPipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25).frames[0]
     elif version == "svdxt":
-        return fps25Pipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25).frames[0]
     else:
-        return fps14Pipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25).frames[0]
 def resize_image(image, output_size=(1024, 576)):
@@ -193,7 +196,8 @@ def reset():
         "auto",
         1024,
         576,
-        False
     ]
 with gr.Blocks() as demo:
@@ -215,12 +219,13 @@ with gr.Blocks() as demo:
           with gr.Accordion("Advanced options", open=False):
               width = gr.Slider(label="Width", info="Width of the video", value=1024, minimum=256, maximum=1024, step=8)
               height = gr.Slider(label="Height", info="Height of the video", value=576, minimum=256, maximum=576, step=8)
-              motion_control = gr.Checkbox(label="Motion control (fixed camera)", info="Fix the camera", value=False)
               video_format = gr.Radio([["*.mp4", "mp4"], ["*.avi", "avi"], ["*.wmv", "wmv"], ["*.mkv", "mkv"], ["*.mov", "mov"], ["*.gif", "gif"]], label="Video format for result", info="File extention", value="mp4", interactive=True)
               frame_format = gr.Radio([["*.webp", "webp"], ["*.png", "png"], ["*.jpeg", "jpeg"], ["*.gif (unanimated)", "gif"], ["*.bmp", "bmp"]], label="Image format for frames", info="File extention", value="webp", interactive=True)
               fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=25, minimum=5, maximum=30)
               motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
               noise_aug_strength = gr.Slider(label="Noise strength", info="The noise to add", value=0.1, minimum=0, maximum=1, step=0.1)
               decoding_t = gr.Slider(label="Decoding", info="Number of frames decoded at a time; this eats more VRAM; reduce if necessary", value=3, minimum=1, maximum=5, step=1)
               version = gr.Radio([["Auto", "auto"], ["🏃🏻‍♀️ SVD (trained on 14 f/s)", "svd"], ["🏃🏻‍♀️💨 SVD-XT (trained on 25 f/s)", "svdxt"], ["DragNUWA (unstable)", "dragnuwa"]], label="Model", info="Trained model", value="auto", interactive=True)
               seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
@@ -249,7 +254,8 @@ with gr.Blocks() as demo:
       version,
       width,
       height,
-      motion_control
   ], outputs=[
       video_output,
       gif_output,
@@ -273,16 +279,17 @@ with gr.Blocks() as demo:
       version,
       width,
       height,
-      motion_control
   ], queue = False, show_progress = False)
   gr.Examples(
     examples=[
-        ["Examples/Fire.webp", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False],
-        ["Examples/Water.png", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False],
-        ["Examples/Town.jpeg", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False]
     ],
-    inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id, noise_aug_strength, decoding_t, video_format, frame_format, version, width, height, motion_control],
     outputs=[video_output, gif_output, download_button, gallery, seed, information_msg, reset_btn],
     fn=animate,
     run_on_click=True,

     version: str = "auto",
     width: int = 1024,
     height: int = 576,
+    motion_control: bool = False,
+    num_inference_steps: int = 25
 ):
     start = time.time()
         image_data = image_data.convert("RGB")
     if motion_control:
+        image_data = [image_data] * 2
     if randomize_seed:
         seed = random.randint(0, max_64_bit_int)
         decoding_t,
         version,
         width,
+        height,
+        num_inference_steps
     )
     os.makedirs(output_folder, exist_ok=True)
     decoding_t: int = 3,
     version: str = "svdxt",
     width: int = 1024,
+    height: int = 576,
+    num_inference_steps: int = 25
 ):
     generator = torch.manual_seed(seed)
     if version == "dragnuwa":
+        return dragnuwaPipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25, num_inference_steps=num_inference_steps).frames[0]
     elif version == "svdxt":
+        return fps25Pipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25, num_inference_steps=num_inference_steps).frames[0]
     else:
+        return fps14Pipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25, num_inference_steps=num_inference_steps).frames[0]
 def resize_image(image, output_size=(1024, 576)):
         "auto",
         1024,
         576,
+        False,
+        25
     ]
 with gr.Blocks() as demo:
           with gr.Accordion("Advanced options", open=False):
               width = gr.Slider(label="Width", info="Width of the video", value=1024, minimum=256, maximum=1024, step=8)
               height = gr.Slider(label="Height", info="Height of the video", value=576, minimum=256, maximum=576, step=8)
+              motion_control = gr.Checkbox(label="Motion control (experimental)", info="Fix the camera", value=False)
               video_format = gr.Radio([["*.mp4", "mp4"], ["*.avi", "avi"], ["*.wmv", "wmv"], ["*.mkv", "mkv"], ["*.mov", "mov"], ["*.gif", "gif"]], label="Video format for result", info="File extention", value="mp4", interactive=True)
               frame_format = gr.Radio([["*.webp", "webp"], ["*.png", "png"], ["*.jpeg", "jpeg"], ["*.gif (unanimated)", "gif"], ["*.bmp", "bmp"]], label="Image format for frames", info="File extention", value="webp", interactive=True)
               fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=25, minimum=5, maximum=30)
               motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
               noise_aug_strength = gr.Slider(label="Noise strength", info="The noise to add", value=0.1, minimum=0, maximum=1, step=0.1)
+              num_inference_steps = gr.Slider(label="Number inference steps", info="More denoising steps usually lead to a higher quality video at the expense of slower inference", value=25, minimum=1, maximum=100, step=1)
               decoding_t = gr.Slider(label="Decoding", info="Number of frames decoded at a time; this eats more VRAM; reduce if necessary", value=3, minimum=1, maximum=5, step=1)
               version = gr.Radio([["Auto", "auto"], ["🏃🏻‍♀️ SVD (trained on 14 f/s)", "svd"], ["🏃🏻‍♀️💨 SVD-XT (trained on 25 f/s)", "svdxt"], ["DragNUWA (unstable)", "dragnuwa"]], label="Model", info="Trained model", value="auto", interactive=True)
               seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
       version,
       width,
       height,
+      motion_control,
+      num_inference_steps
   ], outputs=[
       video_output,
       gif_output,
       version,
       width,
       height,
+      motion_control,
+      num_inference_steps
   ], queue = False, show_progress = False)
   gr.Examples(
     examples=[
+        ["Examples/Fire.webp", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25],
+        ["Examples/Water.png", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25],
+        ["Examples/Town.jpeg", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25]
     ],
+    inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id, noise_aug_strength, decoding_t, video_format, frame_format, version, width, height, motion_control, num_inference_steps],
     outputs=[video_output, gif_output, download_button, gallery, seed, information_msg, reset_btn],
     fn=animate,
     run_on_click=True,