Iceclear committed
Commit 657048f (verified) · Parent(s): 09fd3d7

Update app.py

Files changed (1): app.py (+34 -38)
app.py CHANGED
@@ -36,6 +36,7 @@ else:
-from torchvision.transforms import Compose, Lambda, Normalize
+from torchvision.transforms import Compose, Lambda, Normalize, ToTensor
 from torchvision.io.video import read_video
 import argparse
+from PIL import Image
 
 from common.distributed import (
     get_device,
@@ -62,6 +63,7 @@ from urllib.parse import urlparse
 from torch.hub import download_url_to_file, get_dir
 import shlex
 import uuid
+import mimetypes
 
 
 os.environ["MASTER_ADDR"] = "127.0.0.1"
@@ -76,7 +78,7 @@ subprocess.run(
 )
 
 def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
-    """Load file form http url, will download models if necessary.
+    """Load file from http url, will download models if necessary.
 
     Reference: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py
 
@@ -225,29 +227,6 @@ def generation_step(runner, text_embeds_dict, cond_latents):
 @spaces.GPU(duration=100)
 def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size=1, cfg_scale=1.0, cfg_rescale=0.0, sample_steps=1, res_h=1280, res_w=720, sp_size=1):
     runner = configure_runner(1)
-    output_dir = 'output/' + str(uuid.uuid4()) + '.mp4'
-    def _build_pos_and_neg_prompt():
-        # read positive prompt
-        positive_text = "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, \
-            hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, \
-            skin pore detailing, hyper sharpness, perfect without deformations."
-        # read negative prompt
-        negative_text = "painting, oil painting, illustration, drawing, art, sketch, oil painting, cartoon, \
-            CG Style, 3D render, unreal engine, blurring, dirty, messy, worst quality, low quality, frames, watermark, \
-            signature, jpeg artifacts, deformed, lowres, over-smooth"
-        return positive_text, negative_text
-
-    def _build_test_prompts(video_path):
-        positive_text, negative_text = _build_pos_and_neg_prompt()
-        original_videos = []
-        prompts = {}
-        video_list = os.listdir(video_path)
-        for f in video_list:
-            # if f.endswith(".mp4"):
-            original_videos.append(f)
-            prompts[f] = positive_text
-        print(f"Total prompts to be generated: {len(original_videos)}")
-        return original_videos, prompts, negative_text
 
     def _extract_text_embeds():
         # Text encoder forward.
@@ -294,7 +273,6 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
     # set random seed
     set_seed(seed, same_across_ranks=True)
     os.makedirs('output/', exist_ok=True)
-    tgt_path = 'output/'
 
     # get test prompts
     original_videos = [video_path.split('/')[-1]]
@@ -331,13 +309,24 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
     # read condition latents
     cond_latents = []
     for video in videos:
-        video = (
-            read_video(
-                os.path.join(video_path), output_format="TCHW"
-            )[0]
-            / 255.0
-        )
-        print(f"Read video size: {video.size()}")
+        media_type, _ = mimetypes.guess_type(video_path)
+        is_image = media_type and media_type.startswith("image")
+        is_video = media_type and media_type.startswith("video")
+        if is_video:
+            video = (
+                read_video(
+                    os.path.join(video_path), output_format="TCHW"
+                )[0]
+                / 255.0
+            )
+            print(f"Read video size: {video.size()}")
+            output_dir = 'output/' + str(uuid.uuid4()) + '.mp4'
+        else:
+            img = Image.open(video_path).convert("RGB")
+            img_tensor = ToTensor()(img).unsqueeze(0)  # (1, C, H, W)
+            video = img_tensor  # a single frame treated as a (T=1, C, H, W) clip
+            print(f"Read image size: {video.size()}")
+            output_dir = 'output/' + str(uuid.uuid4()) + '.png'
         cond_latents.append(video_transform(video.to(torch.device("cuda"))))
 
     ori_lengths = [video.size(1) for video in cond_latents]
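
A note on the new routing above: mimetypes.guess_type classifies the upload purely by its file extension, without opening the file, and returns (None, None) for unknown suffixes, so is_image and is_video can both be falsy at once. In the image branch, ToTensor turns the PIL image into a (C, H, W) float tensor in [0, 1], and unsqueeze(0) prepends a frame axis so a still image travels through the pipeline as a one-frame clip. A minimal standalone sketch of the same extension check (the sample paths are hypothetical):

    import mimetypes

    def classify_upload(path):
        # guess_type inspects only the extension; unknown suffixes
        # yield (None, None), leaving both flags False.
        media_type, _ = mimetypes.guess_type(path)
        is_image = bool(media_type and media_type.startswith("image"))
        is_video = bool(media_type and media_type.startswith("video"))
        return is_image, is_video

    print(classify_upload("clip.mp4"))   # (False, True)
    print(classify_upload("frame.png"))  # (True, False)
    print(classify_upload("notes.txt"))  # (False, False)
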
@@ -386,14 +375,20 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
         sample = sample.clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round()
         sample = sample.to(torch.uint8).numpy()
 
-        mediapy.write_video(
-            output_dir, sample, fps=fps_out
-        )
+        if is_image:
+            mediapy.write_image(output_dir, sample[0])
+        else:
+            mediapy.write_video(
+                output_dir, sample, fps=fps_out
+            )
 
         # print(f"Generated video size: {sample.shape}")
         gc.collect()
         torch.cuda.empty_cache()
-    return output_dir, output_dir
+    if is_image:
+        return output_dir, None, output_dir
+    else:
+        return None, output_dir, output_dir
 
 
 with gr.Blocks(title="SeedVR2: One-Step Video Restoration via Diffusion Adversarial Post-Training") as demo:
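
The save branch above assumes sample reaches it as a uint8 array of frames in (T, H, W, C) order, so sample[0] is a single RGB image; stills go through mediapy.write_image and frame stacks through mediapy.write_video. A self-contained sketch of that split with synthetic data (paths and shapes are illustrative, and it branches on frame count rather than the MIME flag):

    import numpy as np
    import mediapy

    # Stand-in for the sampler output: one 128x128 RGB frame.
    sample = np.random.randint(0, 256, size=(1, 128, 128, 3), dtype=np.uint8)

    if sample.shape[0] == 1:
        mediapy.write_image("output/frame.png", sample[0])      # still image
    else:
        mediapy.write_video("output/clip.mp4", sample, fps=24)  # frame stack
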
@@ -411,16 +406,17 @@ with gr.Blocks(title="SeedVR2: One-Step Video Restoration via Diffusion Adversar
 
     # Interface
     with gr.Row():
-        input_video = gr.Video(label="Upload a video")
+        input_video = gr.File(label="Upload image or video", type="filepath")
         seed = gr.Number(label="Seeds", value=666)
         fps = gr.Number(label="fps", value=24)
 
     with gr.Row():
         output_video = gr.Video(label="Output")
+        output_image = gr.Image(label="Output_Image")
         download_link = gr.File(label="Download the output")
 
     run_button = gr.Button("Run")
-    run_button.click(fn=generation_loop, inputs=[input_video, seed, fps], outputs=[output_video, download_link])
+    run_button.click(fn=generation_loop, inputs=[input_video, seed, fps], outputs=[output_image, output_video, download_link])
 
     # Examples
     gr.Examples(
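
One wiring detail: Gradio maps the handler's return values positionally onto the components listed in outputs, and a returned None leaves that slot empty. With outputs=[output_image, output_video, download_link], generation_loop must therefore return three values, filling either the image or the video slot plus the download file. A minimal sketch of that contract (the handler body is hypothetical):

    def handler(path, seed, fps):
        # One return value per output component, in order:
        # (image, video, file); None clears the slot it maps to.
        out = "output/result.png"
        return out, None, out  # show the image, clear the video, offer a download

    # run_button.click(fn=handler,
    #                  inputs=[input_video, seed, fps],
    #                  outputs=[output_image, output_video, download_link])
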
 