Update app.py
app.py CHANGED
@@ -12,9 +12,8 @@ import gradio as gr
 import numpy as np
 import torch
 import wd14tagger
-import memory_management
 import uuid
-
+
 from PIL import Image
 from diffusers_helper.code_cond import unet_add_coded_conds
 from diffusers_helper.cat_cond import unet_add_concat_conds
@@ -24,8 +23,11 @@ from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPTextModel, CLIPTokenizer
 from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline
 from diffusers_vdm.utils import resize_and_center_crop, save_bcthw_as_mp4
+import spaces
+
+# Disable gradients globally
 torch.set_grad_enabled(False)
-
+
 class ModifiedUNet(UNet2DConditionModel):
     @classmethod
     def from_config(cls, *args, **kwargs):
@@ -37,9 +39,9 @@ class ModifiedUNet(UNet2DConditionModel):
 
 model_name = 'lllyasviel/paints_undo_single_frame'
 tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
-text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16)
-vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16)  # bfloat16 vae
-unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16)
+text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16).to("cuda")
+vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16).to("cuda")  # bfloat16 vae
+unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16).to("cuda")
 
 unet.set_attn_processor(AttnProcessor2_0())
 vae.set_attn_processor(AttnProcessor2_0())
@@ -47,12 +49,7 @@ vae.set_attn_processor(AttnProcessor2_0())
 video_pipe = LatentVideoDiffusionPipeline.from_pretrained(
     'lllyasviel/paints_undo_multi_frame',
     fp16=True
-)
-
-memory_management.unload_all_models([
-    video_pipe.unet, video_pipe.vae, video_pipe.text_encoder, video_pipe.image_projection, video_pipe.image_encoder,
-    unet, vae, text_encoder
-])
+).to("cuda")
 
 k_sampler = KDiffusionSampler(
     unet=unet,
@@ -73,18 +70,17 @@ def find_best_bucket(h, w, options):
     best_bucket = (bucket_h, bucket_w)
     return best_bucket
 
-
+
 def encode_cropped_prompt_77tokens(txt: str):
-    memory_management.load_models_to_gpu(text_encoder)
     cond_ids = tokenizer(txt,
                          padding="max_length",
                          max_length=tokenizer.model_max_length,
                          truncation=True,
-                         return_tensors="pt").input_ids.to(device=text_encoder.device)
+                         return_tensors="pt").input_ids.to(device="cuda")
     text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
     return text_cond
 
-
+
 def pytorch2numpy(imgs):
     results = []
     for x in imgs:
@@ -94,7 +90,7 @@ def pytorch2numpy(imgs):
         results.append(y)
     return results
 
-
+
 def numpy2pytorch(imgs):
     h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
     h = h.movedim(-1, 1)
@@ -106,28 +102,28 @@ def resize_without_crop(image, target_width, target_height):
     resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
     return np.array(resized_image)
 
+
 @spaces.GPU()
 def interrogator_process(x):
-    return wd14tagger.default_interrogator(x)
+    image_description = wd14tagger.default_interrogator(x)
+    return image_description, image_description
+
 
 @spaces.GPU()
 def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg,
             progress=gr.Progress()):
-    rng = torch.Generator(device=memory_management.gpu).manual_seed(int(seed))
+    rng = torch.Generator(device="cuda").manual_seed(int(seed))
 
-    memory_management.load_models_to_gpu(vae)
     fg = resize_and_center_crop(input_fg, image_width, image_height)
-    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
+    concat_conds = numpy2pytorch([fg]).clone().detach().to(device="cuda", dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
 
-    memory_management.load_models_to_gpu(text_encoder)
     conds = encode_cropped_prompt_77tokens(prompt)
     unconds = encode_cropped_prompt_77tokens(n_prompt)
 
-
-    fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
+    fs = torch.tensor(input_undo_steps).to(device="cuda", dtype=torch.long)
     initial_latents = torch.zeros_like(concat_conds)
-    concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
+    concat_conds = concat_conds.to(device="cuda", dtype=unet.dtype)
     latents = k_sampler(
         initial_latent=initial_latents,
         strength=1.0,
@@ -142,14 +138,13 @@ def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed,
         progress_tqdm=functools.partial(progress.tqdm, desc='Generating Key Frames')
     ).to(vae.dtype) / vae.config.scaling_factor
 
-    memory_management.load_models_to_gpu(vae)
     pixels = vae.decode(latents).sample
     pixels = pytorch2numpy(pixels)
     pixels = [fg] + pixels + [np.zeros_like(fg) + 255]
 
     return pixels
 
-
+
 def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=7.5, fs=3, progress_tqdm=None):
     random.seed(seed)
     np.random.seed(seed)
@@ -168,25 +163,21 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
     input_frames = numpy2pytorch([image_1, image_2])
     input_frames = input_frames.unsqueeze(0).movedim(1, 2)
 
-    memory_management.load_models_to_gpu(video_pipe.text_encoder)
     positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
     negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")
 
-    memory_management.load_models_to_gpu([video_pipe.image_projection, video_pipe.image_encoder])
-    input_frames = input_frames.to(device=video_pipe.image_encoder.device, dtype=video_pipe.image_encoder.dtype)
+    input_frames = input_frames.to(device="cuda", dtype=video_pipe.image_encoder.dtype)
     positive_image_cond = video_pipe.encode_clip_vision(input_frames)
     positive_image_cond = video_pipe.image_projection(positive_image_cond)
     negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
     negative_image_cond = video_pipe.image_projection(negative_image_cond)
 
-    memory_management.load_models_to_gpu([video_pipe.vae])
-    input_frames = input_frames.to(device=video_pipe.vae.device, dtype=video_pipe.vae.dtype)
+    input_frames = input_frames.to(device="cuda", dtype=video_pipe.vae.dtype)
     input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
     first_frame = input_frame_latents[:, :, 0]
     last_frame = input_frame_latents[:, :, 1]
     concat_cond = torch.stack([first_frame] + [torch.zeros_like(first_frame)] * (frames - 2) + [last_frame], dim=2)
 
-    memory_management.load_models_to_gpu([video_pipe.unet])
     latents = video_pipe(
         batch_size=1,
         steps=int(steps),
@@ -200,11 +191,11 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
         progress_tqdm=progress_tqdm
     )
 
-    memory_management.load_models_to_gpu([video_pipe.vae])
     video = video_pipe.decode_latents(latents, vae_hidden_states)
     return video, image_1, image_2
 
-
+
+@spaces.GPU(duration=360)
 def process_video(keyframes, prompt, steps, cfg, fps, seed, progress=gr.Progress()):
     result_frames = []
     cropped_images = []
@@ -282,7 +273,7 @@ with block:
     prompt_gen_button.click(
         fn=interrogator_process,
         inputs=[input_fg],
-        outputs=[prompt]
+        outputs=[prompt, i2v_input_text]
     ).then(lambda: [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=False)],
            outputs=[prompt_gen_button, key_gen_button, i2v_end_btn])
 
@@ -311,4 +302,4 @@ with block:
     examples_per_page=1024
 )
 
-block.queue().launch(server_name='0.0.0.0')
+block.queue().launch(server_name='0.0.0.0')
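Note: this change set migrates the Space from on-demand model offloading (the removed memory_management calls) to ZeroGPU-style execution: every model is pinned to "cuda" once at import time, and each GPU-touching Gradio handler is wrapped in @spaces.GPU(), with a longer time slice requested for the video pass. A minimal sketch of that pattern follows; the model choice and handler names are illustrative, not the Space's full setup.

import spaces
import torch
from diffusers import AutoencoderKL

# Pin the model to the GPU at import time; on ZeroGPU Spaces a real GPU
# is attached only while a decorated handler is running.
vae = AutoencoderKL.from_pretrained(
    'lllyasviel/paints_undo_single_frame', subfolder="vae"
).to(torch.bfloat16).to("cuda")

@spaces.GPU()  # default time slice, fine for short tasks
def encode(image_tensor: torch.Tensor) -> torch.Tensor:
    # Per-call inputs are moved to the model's device and dtype.
    x = image_tensor.to(device="cuda", dtype=vae.dtype)
    return vae.encode(x).latent_dist.mode() * vae.config.scaling_factor

@spaces.GPU(duration=360)  # long task: request a 360-second slice, as process_video does
def generate_video(*args):
    ...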
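For contrast, the removed memory_management helpers implemented dynamic offloading: models idled on the CPU and were shuttled to the GPU only around the operations that needed them. That module is not part of this diff, so the following is only a rough reconstruction, assuming helpers with the signatures used above (unload_all_models, load_models_to_gpu, and a gpu device attribute).

import torch

# Hypothetical sketch; the real memory_management module is not shown in this commit.
gpu = torch.device('cuda')
cpu = torch.device('cpu')

def _as_list(models):
    # The call sites above pass either a single model or a list of models.
    return list(models) if isinstance(models, (list, tuple)) else [models]

def unload_all_models(models):
    # Park every model on the CPU to free VRAM for the next stage.
    for m in _as_list(models):
        m.to(cpu)
    torch.cuda.empty_cache()

def load_models_to_gpu(models):
    # Bring only the model(s) needed by the current stage onto the GPU.
    for m in _as_list(models):
        m.to(gpu)

Hard-coding .to("cuda") trades this VRAM frugality for simplicity, which suits ZeroGPU: the device is attached per decorated call rather than held for the lifetime of the process.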