Spaces:

pandaphd
/

generative_photography

Running on Zero

App Files Files Community

pandaphd committed 2 days ago

Commit

cc3773d

1 Parent(s): c0c90ad

nice demo

Browse files

Files changed (15) hide show

README.md +0 -14
app.py +17 -0
app_bokehK.py +77 -0
app_color_temperature.py +77 -0
app_focal_length.py +77 -0
app_shutter_speed.py +77 -0
configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml +5 -8
configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml +4 -7
configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml +5 -7
configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml +4 -7
inference_bokehK.py +13 -22
inference_color_temperature.py +21 -96
inference_focal_length.py +22 -95
inference_shutter_speed.py +26 -100
requirements.txt +5 -5

README.md DELETED Viewed

@@ -1,14 +0,0 @@
----
-title: Generative Photography
-emoji: 📈
-colorFrom: blue
-colorTo: blue
-sdk: gradio
-sdk_version: 5.20.0
-app_file: app.py
-pinned: false
-license: cc-by-nc-nd-4.0
-short_description: Demo for Generative Photography
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,11 +1,28 @@
 import gradio as gr
 import json
 import torch
 from inference_bokehK import load_models as load_bokeh_models, run_inference as run_bokeh_inference, OmegaConf
 from inference_focal_length import load_models as load_focal_models, run_inference as run_focal_inference
 from inference_shutter_speed import load_models as load_shutter_models, run_inference as run_shutter_inference
 from inference_color_temperature import load_models as load_color_models, run_inference as run_color_inference
 torch.manual_seed(42)
 bokeh_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")

+import os
 import gradio as gr
 import json
 import torch
+from huggingface_hub import snapshot_download
 from inference_bokehK import load_models as load_bokeh_models, run_inference as run_bokeh_inference, OmegaConf
 from inference_focal_length import load_models as load_focal_models, run_inference as run_focal_inference
 from inference_shutter_speed import load_models as load_shutter_models, run_inference as run_shutter_inference
 from inference_color_temperature import load_models as load_color_models, run_inference as run_color_inference
+model_path = "ckpts"
+os.makedirs(model_path, exist_ok=True)
+print("Downloading models from Hugging Face...")
+snapshot_download(repo_id="pandaphd/generative_photography", local_dir=model_path)
 torch.manual_seed(42)
 bokeh_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")

app_bokehK.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gradio as gr
+import tempfile
+import json
+from inference_bokehK import load_models, run_inference, OmegaConf
+import torch
+# Initialize models once at startup
+cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")
+pipeline, device = load_models(cfg)
+def generate_video(base_scene, bokehK_list):
+    """Validate the form inputs and run bokeh inference.
+
+    Args:
+        base_scene: Scene description text entered in the UI.
+        bokehK_list: JSON text that must decode to a list of exactly
+            5 numbers, e.g. "[2.5, 6.3, 10.1, 17.2, 24.0]".
+
+    Returns:
+        The value returned by ``run_inference``.
+
+    Raises:
+        gr.Error: If validation or inference fails; the message carries
+            the text of the underlying error.
+    """
+    try:
+        # Validate input. Checking only len() would also let a 5-character
+        # JSON string or a 5-key JSON object through, so check types too.
+        values = json.loads(bokehK_list)
+        if not isinstance(values, list) or len(values) != 5:
+            raise ValueError("Exactly 5 Bokeh K values required")
+        if any(
+            isinstance(v, bool) or not isinstance(v, (int, float))
+            for v in values
+        ):
+            raise ValueError("Bokeh K values must be numbers")
+        # Run inference; the original JSON text is forwarded unchanged.
+        video_path = run_inference(
+            pipeline=pipeline,
+            tokenizer=pipeline.tokenizer,
+            text_encoder=pipeline.text_encoder,
+            base_scene=base_scene,
+            bokehK_list=bokehK_list,
+            device=device
+        )
+        return video_path
+    except Exception as e:
+        # UI boundary: report every failure through Gradio, keeping the
+        # original exception chained for server-side tracebacks.
+        raise gr.Error(f"Generation failed: {e}") from e
+# Example inputs: [scene description, JSON list of 5 Bokeh K values]
+examples = [
+    [
+        "A young boy wearing an orange jacket is standing on a crosswalk, waiting to cross the street.",
+        "[2.5, 6.3, 10.1, 17.2, 24.0]"
+    ],
+    [
+        "A display of frozen desserts, including cupcakes and donuts, is arranged in a row on a counter.",
+        "[20.0, 18.5, 15.0, 10.5, 5.0]"
+    ]
+]
+with gr.Blocks(title="Bokeh Effect Generator") as demo:
+    # A Markdown ATX heading needs a space after "#"; without it the text
+    # is not parsed as a heading.
+    gr.Markdown("# Dynamic Bokeh Effect Generation")
+    with gr.Row():
+        # Left column: inputs and the submit button.
+        with gr.Column():
+            scene_input = gr.Textbox(
+                label="Scene Description",
+                placeholder="Describe the scene you want to generate..."
+            )
+            bokeh_input = gr.Textbox(
+                label="Bokeh Blur Values",
+                placeholder="Enter 5 comma-separated values from 1-30 (e.g., [2.44, 8.3, 10.1, 17.2, 24.0])"
+            )
+            submit_btn = gr.Button("Generate Video", variant="primary")
+        # Right column: the generated video.
+        with gr.Column():
+            video_output = gr.Video(label="Generated Video")
+            # Hidden and not wired to any event below; failures are raised
+            # as gr.Error from generate_video instead.
+            error_output = gr.Textbox(label="Error Messages", visible=False)
+    gr.Examples(
+        examples=examples,
+        inputs=[scene_input, bokeh_input],
+        outputs=[video_output],
+        fn=generate_video,
+        cache_examples=True
+    )
+    # Clicking the button sends both textboxes to generate_video and shows
+    # its return value in the video component.
+    submit_btn.click(
+        fn=generate_video,
+        inputs=[scene_input, bokeh_input],
+        outputs=[video_output],
+    )
+if __name__ == "__main__":
+    demo.launch(share=True)

app_color_temperature.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gradio as gr
+import tempfile
+import json
+from inference_color_temperature import load_models, run_inference, OmegaConf
+import torch
+# Initialize models once at startup
+cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml")
+pipeline, device = load_models(cfg)
+def generate_video(base_scene, color_temperature_list):
+    """Validate the form inputs and run color-temperature inference.
+
+    Args:
+        base_scene: Scene description text entered in the UI.
+        color_temperature_list: JSON text that must decode to a list of
+            exactly 5 numbers, e.g. "[3500.0, 5500.0, 6500.0, 7500.0, 8500.0]".
+
+    Returns:
+        The value returned by ``run_inference``.
+
+    Raises:
+        gr.Error: If validation or inference fails; the message carries
+            the text of the underlying error.
+    """
+    try:
+        # Validate input. Checking only len() would also let a 5-character
+        # JSON string or a 5-key JSON object through, so check types too.
+        values = json.loads(color_temperature_list)
+        if not isinstance(values, list) or len(values) != 5:
+            raise ValueError("Exactly 5 color_temperature values required")
+        if any(
+            isinstance(v, bool) or not isinstance(v, (int, float))
+            for v in values
+        ):
+            raise ValueError("color_temperature values must be numbers")
+        # Run inference; the original JSON text is forwarded unchanged.
+        video_path = run_inference(
+            pipeline=pipeline,
+            tokenizer=pipeline.tokenizer,
+            text_encoder=pipeline.text_encoder,
+            base_scene=base_scene,
+            color_temperature_list=color_temperature_list,
+            device=device
+        )
+        return video_path
+    except Exception as e:
+        # UI boundary: report every failure through Gradio, keeping the
+        # original exception chained for server-side tracebacks.
+        raise gr.Error(f"Generation failed: {e}") from e
+# Example inputs: [scene description, JSON list of 5 color temperature values]
+examples = [
+    [
+        "A beautiful blue sky with a mountain range in the background.",
+        "[5455.0, 5155.0, 5555.0, 6555.0, 7555.0]"
+    ],
+    [
+        "A red couch is situated in front of a window, which is filled with a variety of potted plants.",
+        "[3500.0, 5500.0, 6500.0, 7500.0, 8500.0]"
+    ]
+]
+# Build the UI: inputs in the first column, the generated video in the second.
+with gr.Blocks(title="Color Temperature Effect Generator") as demo:
+    gr.Markdown("# Dynamic Color Temperature Effect Generation")
+    with gr.Row():
+        with gr.Column():
+            scene_input = gr.Textbox(
+                label="Scene Description",
+                placeholder="Describe the scene you want to generate..."
+            )
+            color_temperature_input = gr.Textbox(
+                label="Color Temperature Values",
+                placeholder="Enter 5 comma-separated values from 2000-10000 (e.g., [3001.3, 4000.2, 4400.34, 5488.23, 8888.82])"
+            )
+            submit_btn = gr.Button("Generate Video", variant="primary")
+        with gr.Column():
+            video_output = gr.Video(label="Generated Video")
+            # Hidden and not wired to any event below; failures are raised
+            # as gr.Error from generate_video instead.
+            error_output = gr.Textbox(label="Error Messages", visible=False)
+    # The examples use the same function and components as the submit button.
+    gr.Examples(
+        examples=examples,
+        inputs=[scene_input, color_temperature_input],
+        outputs=[video_output],
+        fn=generate_video,
+        cache_examples=True
+    )
+    # Clicking the button sends both textboxes to generate_video and shows
+    # its return value in the video component.
+    submit_btn.click(
+        fn=generate_video,
+        inputs=[scene_input, color_temperature_input],
+        outputs=[video_output],
+    )
+if __name__ == "__main__":
+    demo.launch(share=True)

app_focal_length.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gradio as gr
+import tempfile
+import json
+from inference_focal_length import load_models, run_inference, OmegaConf
+import torch
+# Initialize models once at startup
+cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml")
+pipeline, device = load_models(cfg)
+def generate_video(base_scene, focal_length_list):
+    """Validate the form inputs and run focal-length inference.
+
+    Args:
+        base_scene: Scene description text entered in the UI.
+        focal_length_list: JSON text that must decode to a list of exactly
+            5 numbers, e.g. "[25.1, 36.1, 47.1, 58.1, 69.1]".
+
+    Returns:
+        The value returned by ``run_inference``.
+
+    Raises:
+        gr.Error: If validation or inference fails; the message carries
+            the text of the underlying error.
+    """
+    try:
+        # Validate input. Checking only len() would also let a 5-character
+        # JSON string or a 5-key JSON object through, so check types too.
+        values = json.loads(focal_length_list)
+        if not isinstance(values, list) or len(values) != 5:
+            raise ValueError("Exactly 5 focal_length values required")
+        if any(
+            isinstance(v, bool) or not isinstance(v, (int, float))
+            for v in values
+        ):
+            raise ValueError("focal_length values must be numbers")
+        # Run inference; the original JSON text is forwarded unchanged.
+        video_path = run_inference(
+            pipeline=pipeline,
+            tokenizer=pipeline.tokenizer,
+            text_encoder=pipeline.text_encoder,
+            base_scene=base_scene,
+            focal_length_list=focal_length_list,
+            device=device
+        )
+        return video_path
+    except Exception as e:
+        # UI boundary: report every failure through Gradio, keeping the
+        # original exception chained for server-side tracebacks.
+        raise gr.Error(f"Generation failed: {e}") from e
+# Example inputs: [scene description, JSON list of 5 focal length values]
+examples = [
+    [
+        "A small office cubicle with a desk, computer, and chair.",
+        "[25.1, 36.1, 47.1, 58.1, 69.1]"
+    ],
+    [
+        "A large, white couch is placed in a living room, with a mirror above it. The couch is covered with various items, including a blue box, a pink towel, and a pair of shoes.",
+        "[55.0, 46.0, 37.0, 28.0, 25.0]"
+    ]
+]
+with gr.Blocks(title="Focal Length Effect Generator") as demo:
+    # A Markdown ATX heading needs a space after "#"; without it the text
+    # is not parsed as a heading.
+    gr.Markdown("# Dynamic Focal Length Effect Generation")
+    with gr.Row():
+        # Left column: inputs and the submit button.
+        with gr.Column():
+            scene_input = gr.Textbox(
+                label="Scene Description",
+                placeholder="Describe the scene you want to generate..."
+            )
+            focal_length_input = gr.Textbox(
+                label="Focal Length Values",
+                placeholder="Enter 5 comma-separated values from 24-70 (e.g., [25.1, 30.2, 33.3, 40.8, 54.0])"
+            )
+            submit_btn = gr.Button("Generate Video", variant="primary")
+        # Right column: the generated video.
+        with gr.Column():
+            video_output = gr.Video(label="Generated Video")
+            # Hidden and not wired to any event below; failures are raised
+            # as gr.Error from generate_video instead.
+            error_output = gr.Textbox(label="Error Messages", visible=False)
+    gr.Examples(
+        examples=examples,
+        inputs=[scene_input, focal_length_input],
+        outputs=[video_output],
+        fn=generate_video,
+        cache_examples=True
+    )
+    # Clicking the button sends both textboxes to generate_video and shows
+    # its return value in the video component.
+    submit_btn.click(
+        fn=generate_video,
+        inputs=[scene_input, focal_length_input],
+        outputs=[video_output],
+    )
+if __name__ == "__main__":
+    demo.launch(share=True)

app_shutter_speed.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gradio as gr
+import tempfile
+import json
+from inference_shutter_speed import load_models, run_inference, OmegaConf
+import torch
+# Initialize models once at startup
+cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml")
+pipeline, device = load_models(cfg)
+def generate_video(base_scene, shutter_speed_list):
+    """Validate the form inputs and run shutter-speed inference.
+
+    Args:
+        base_scene: Scene description text entered in the UI.
+        shutter_speed_list: JSON text that must decode to a list of exactly
+            5 numbers, e.g. "[0.11, 0.22, 0.33, 0.44, 0.55]".
+
+    Returns:
+        The value returned by ``run_inference``.
+
+    Raises:
+        gr.Error: If validation or inference fails; the message carries
+            the text of the underlying error.
+    """
+    try:
+        # Validate input. Checking only len() would also let a 5-character
+        # JSON string or a 5-key JSON object through, so check types too.
+        values = json.loads(shutter_speed_list)
+        if not isinstance(values, list) or len(values) != 5:
+            raise ValueError("Exactly 5 shutter_speed values required")
+        if any(
+            isinstance(v, bool) or not isinstance(v, (int, float))
+            for v in values
+        ):
+            raise ValueError("shutter_speed values must be numbers")
+        # Run inference; the original JSON text is forwarded unchanged.
+        video_path = run_inference(
+            pipeline=pipeline,
+            tokenizer=pipeline.tokenizer,
+            text_encoder=pipeline.text_encoder,
+            base_scene=base_scene,
+            shutter_speed_list=shutter_speed_list,
+            device=device
+        )
+        return video_path
+    except Exception as e:
+        # UI boundary: report every failure through Gradio, keeping the
+        # original exception chained for server-side tracebacks.
+        raise gr.Error(f"Generation failed: {e}") from e
+# Example inputs: [scene description, JSON list of 5 shutter speed values]
+examples = [
+    [
+        "A brown and orange leather handbag with a paw print on it sits next to a book.",
+        "[0.11, 0.22, 0.33, 0.44, 0.55]"
+    ],
+    [
+        "A variety of potted plants are displayed on a windowsill, with some of them placed in yellow and white bowls. ",
+        "[0.29, 0.49, 0.69, 0.79, 0.89]"
+    ]
+]
+with gr.Blocks(title="Shutter Speed Effect Generator") as demo:
+    # A Markdown ATX heading needs a space after "#"; without it the text
+    # is not parsed as a heading.
+    gr.Markdown("# Dynamic Shutter Speed Effect Generation")
+    with gr.Row():
+        # Left column: inputs and the submit button.
+        with gr.Column():
+            scene_input = gr.Textbox(
+                label="Scene Description",
+                placeholder="Describe the scene you want to generate..."
+            )
+            shutter_speed_input = gr.Textbox(
+                label="Shutter Speed Values",
+                placeholder="Enter 5 comma-separated values from 0.1-1.0 (e.g., [0.15, 0.32, 0.53, 0.62, 0.82])"
+            )
+            submit_btn = gr.Button("Generate Video", variant="primary")
+        # Right column: the generated video.
+        with gr.Column():
+            video_output = gr.Video(label="Generated Video")
+            # Hidden and not wired to any event below; failures are raised
+            # as gr.Error from generate_video instead.
+            error_output = gr.Textbox(label="Error Messages", visible=False)
+    gr.Examples(
+        examples=examples,
+        inputs=[scene_input, shutter_speed_input],
+        outputs=[video_output],
+        fn=generate_video,
+        cache_examples=True
+    )
+    # Clicking the button sends both textboxes to generate_video and shows
+    # its return value in the video component.
+    submit_btn.click(
+        fn=generate_video,
+        inputs=[scene_input, shutter_speed_input],
+        outputs=[video_output],
+    )
+if __name__ == "__main__":
+    demo.launch(share=True)

configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml CHANGED Viewed

@@ -1,13 +1,11 @@
-output_dir: "inference_output/genphoto_bokehK"
-pretrained_model_repo: "pandaphd/generative_photography"
-pretrained_model_path: "stable-diffusion-v1-5"
 unet_subfolder: "unet_merged"
-camera_adaptor_ckpt: "weights/checkpoint-bokehK.ckpt"
-lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
-motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
 lora_rank: 2
 lora_scale: 1.0
@@ -43,7 +41,6 @@ camera_encoder_kwargs:
   attention_block_types: ["Temporal_Self", ]
   temporal_position_encoding: true
   temporal_position_encoding_max_len: 16
 attention_processor_kwargs:
   add_spatial: false
   spatial_attn_names: 'attn1'
@@ -53,7 +50,6 @@ attention_processor_kwargs:
   query_condition: true
   key_value_condition: true
   scale: 1.0
 noise_scheduler_kwargs:
   num_train_timesteps: 1000
   beta_start:          0.00085
@@ -62,5 +58,6 @@ noise_scheduler_kwargs:
   steps_offset:        1
   clip_sample:         false
 num_workers: 8
 global_seed: 42

+pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
 unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "./ckpts/weights/checkpoint-bokehK.ckpt"
+lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
 lora_rank: 2
 lora_scale: 1.0
   attention_block_types: ["Temporal_Self", ]
   temporal_position_encoding: true
   temporal_position_encoding_max_len: 16
 attention_processor_kwargs:
   add_spatial: false
   spatial_attn_names: 'attn1'
   query_condition: true
   key_value_condition: true
   scale: 1.0
 noise_scheduler_kwargs:
   num_train_timesteps: 1000
   beta_start:          0.00085
   steps_offset:        1
   clip_sample:         false
 num_workers: 8
 global_seed: 42

configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml CHANGED Viewed

@@ -1,16 +1,13 @@
 output_dir: "inference_output/genphoto_color_temperature"
-pretrained_model_repo: "pandaphd/generative_photography"
-pretrained_model_path: "stable-diffusion-v1-5"
 unet_subfolder: "unet_merged"
-camera_adaptor_ckpt: "weights/checkpoint-color_temperature.ckpt"
-lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
-motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
 lora_rank: 2
 lora_scale: 1.0
 motion_lora_rank: 0
 motion_lora_scale: 1.0

 output_dir: "inference_output/genphoto_color_temperature"
+pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
 unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "./ckpts/weights/checkpoint-color_temperature.ckpt"
 lora_rank: 2
 lora_scale: 1.0
+lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
 motion_lora_rank: 0
 motion_lora_scale: 1.0

configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml CHANGED Viewed

@@ -1,16 +1,14 @@
 output_dir: "inference_output/genphoto_focal_length"
-pretrained_model_repo: "pandaphd/generative_photography"
-pretrained_model_path: "stable-diffusion-v1-5"
 unet_subfolder: "unet_merged"
-camera_adaptor_ckpt: "weights/checkpoint-focal_length.ckpt"
-lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
-motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
 lora_rank: 2
 lora_scale: 1.0
 motion_lora_rank: 0
 motion_lora_scale: 1.0

 output_dir: "inference_output/genphoto_focal_length"
+pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
 unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "./ckpts/weights/checkpoint-focal_length.ckpt"
 lora_rank: 2
 lora_scale: 1.0
+lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
 motion_lora_rank: 0
 motion_lora_scale: 1.0

configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml CHANGED Viewed

@@ -1,16 +1,13 @@
 output_dir: "inference_output/genphoto_shutter_speed"
-pretrained_model_repo: "pandaphd/generative_photography"
-pretrained_model_path: "stable-diffusion-v1-5"
 unet_subfolder: "unet_merged"
-camera_adaptor_ckpt: "weights/checkpoint-shutter_speed.ckpt"
-lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
-motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
 lora_rank: 2
 lora_scale: 1.0
 motion_lora_rank: 0
 motion_lora_scale: 1.0

 output_dir: "inference_output/genphoto_shutter_speed"
+pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
 unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "./ckpts/weights/checkpoint-shutter_speed.ckpt"
 lora_rank: 2
 lora_scale: 1.0
+lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
 motion_lora_rank: 0
 motion_lora_scale: 1.0

inference_bokehK.py CHANGED Viewed

@@ -22,11 +22,6 @@ from genphoto.utils.util import save_videos_grid
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-from huggingface_hub import hf_hub_download
 def create_bokehK_embedding(bokehK_values, target_height, target_width):
     f = bokehK_values.shape[0]
     bokehK_embedding = torch.zeros((f, 3, target_height, target_width), dtype=bokehK_values.dtype)
@@ -94,24 +89,18 @@ class Camera_Embedding(Dataset):
         camera_embedding = torch.cat((bokehK_embedding, ccl_embedding), dim=1)
         return camera_embedding
 def load_models(cfg):
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
-    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
-    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
-    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-bokehK.ckpt")
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
-    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
-    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-        pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
@@ -132,26 +121,26 @@ def load_models(cfg):
     )
     if cfg.lora_ckpt is not None:
-        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
     if cfg.motion_module_ckpt is not None:
-        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
     if cfg.camera_adaptor_ckpt is not None:
-        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
     pipeline = GenPhotoPipeline(
         vae=vae,
         text_encoder=text_encoder,
@@ -160,10 +149,12 @@ def load_models(cfg):
         scheduler=noise_scheduler,
         camera_encoder=camera_encoder
     ).to(device)
     pipeline.enable_vae_slicing()
     return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, bokehK_list, device, video_length=5, height=256, width=384):

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 def create_bokehK_embedding(bokehK_values, target_height, target_width):
     f = bokehK_values.shape[0]
     bokehK_embedding = torch.zeros((f, 3, target_height, target_width), dtype=bokehK_values.dtype)
         camera_embedding = torch.cat((bokehK_embedding, ccl_embedding), dim=1)
         return camera_embedding
 def load_models(cfg):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        cfg.pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
     )
     if cfg.lora_ckpt is not None:
+        lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
     if cfg.motion_module_ckpt is not None:
+        mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
     if cfg.camera_adaptor_ckpt is not None:
+        camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
     pipeline = GenPhotoPipeline(
         vae=vae,
         text_encoder=text_encoder,
         scheduler=noise_scheduler,
         camera_encoder=camera_encoder
     ).to(device)
     pipeline.enable_vae_slicing()
     return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, bokehK_list, device, video_length=5, height=256, width=384):

inference_color_temperature.py CHANGED Viewed

@@ -22,7 +22,6 @@ from genphoto.utils.util import save_videos_grid
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-from huggingface_hub import hf_hub_download
 def kelvin_to_rgb(kelvin):
@@ -132,104 +131,19 @@ class Camera_Embedding(Dataset):
         camera_embedding = torch.cat((color_temperature_embedding, ccl_embedding), dim=1)
         return camera_embedding
-#
-# def load_models(cfg):
-#
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
-#
-#     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-#     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
-#     vae.requires_grad_(False)
-#     tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
-#     text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
-#     text_encoder.requires_grad_(False)
-#     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-#         cfg.pretrained_model_path,
-#         subfolder=cfg.unet_subfolder,
-#         unet_additional_kwargs=cfg.unet_additional_kwargs
-#     ).to(device)
-#     unet.requires_grad_(False)
-#
-#     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
-#     camera_encoder.requires_grad_(False)
-#     camera_adaptor = CameraAdaptor(unet, camera_encoder)
-#     camera_adaptor.requires_grad_(False)
-#     camera_adaptor.to(device)
-#
-#     logger.info("Setting the attention processors")
-#     unet.set_all_attn_processor(
-#         add_spatial_lora=cfg.lora_ckpt is not None,
-#         add_motion_lora=cfg.motion_lora_rank > 0,
-#         lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
-#         motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
-#         **cfg.attention_processor_kwargs
-#     )
-#
-#     if cfg.lora_ckpt is not None:
-#         print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
-#         lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
-#         if 'lora_state_dict' in lora_checkpoints.keys():
-#             lora_checkpoints = lora_checkpoints['lora_state_dict']
-#         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
-#         assert len(lora_u) == 0
-#         print(f'Loading done')
-#
-#     if cfg.motion_module_ckpt is not None:
-#         print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
-#         mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
-#         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
-#         assert len(mm_u) == 0
-#         print("Loading done")
-#
-#
-#     if cfg.camera_adaptor_ckpt is not None:
-#         logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
-#         camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
-#         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
-#         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
-#         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
-#
-#         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
-#         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
-#         assert len(attention_processor_u) == 0
-#
-#         logger.info("Camera Adaptor loading done")
-#     else:
-#         logger.info("No Camera Adaptor checkpoint used")
-#
-#     pipeline = GenPhotoPipeline(
-#         vae=vae,
-#         text_encoder=text_encoder,
-#         tokenizer=tokenizer,
-#         unet=unet,
-#         scheduler=noise_scheduler,
-#         camera_encoder=camera_encoder
-#     ).to(device)
-#
-#     pipeline.enable_vae_slicing()
-#
-#     return pipeline, device
 def load_models(cfg):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
-    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
-    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
-    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-color_temperature.ckpt")
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
-    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
-    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-        pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
@@ -241,6 +155,7 @@ def load_models(cfg):
     camera_adaptor.requires_grad_(False)
     camera_adaptor.to(device)
     unet.set_all_attn_processor(
         add_spatial_lora=cfg.lora_ckpt is not None,
         add_motion_lora=cfg.motion_lora_rank > 0,
@@ -250,25 +165,36 @@ def load_models(cfg):
     )
     if cfg.lora_ckpt is not None:
-        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
     if cfg.motion_module_ckpt is not None:
-        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
     if cfg.camera_adaptor_ckpt is not None:
-        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
     pipeline = GenPhotoPipeline(
         vae=vae,
@@ -280,9 +206,8 @@ def load_models(cfg):
     ).to(device)
     pipeline.enable_vae_slicing()
-    return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, color_temperature_list, device, video_length=5, height=256, width=384):

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 def kelvin_to_rgb(kelvin):
         camera_embedding = torch.cat((color_temperature_embedding, ccl_embedding), dim=1)
         return camera_embedding
 def load_models(cfg):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        cfg.pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
     camera_adaptor.requires_grad_(False)
     camera_adaptor.to(device)
+    logger.info("Setting the attention processors")
     unet.set_all_attn_processor(
         add_spatial_lora=cfg.lora_ckpt is not None,
         add_motion_lora=cfg.motion_lora_rank > 0,
     )
     if cfg.lora_ckpt is not None:
+        print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
+        lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
+        print(f'Loading done')
     if cfg.motion_module_ckpt is not None:
+        print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
+        mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
+        print("Loading done")
     if cfg.camera_adaptor_ckpt is not None:
+        logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
+        camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
+        logger.info("Camera Adaptor loading done")
+    else:
+        logger.info("No Camera Adaptor checkpoint used")
     pipeline = GenPhotoPipeline(
         vae=vae,
     ).to(device)
     pipeline.enable_vae_slicing()
+    return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, color_temperature_list, device, video_length=5, height=256, width=384):

inference_focal_length.py CHANGED Viewed

@@ -24,9 +24,6 @@ logger = logging.getLogger(__name__)
-from huggingface_hub import hf_hub_download
 def create_focal_length_embedding(focal_length_values, target_height, target_width, base_focal_length=24.0, sensor_height=24.0, sensor_width=36.0):
     device = 'cpu'
@@ -137,101 +134,19 @@ class Camera_Embedding(Dataset):
         camera_embedding = torch.cat((focal_length_embedding, ccl_embedding), dim=1)
         return camera_embedding
-#
-# def load_models(cfg):
-#
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
-#
-#     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-#     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
-#     vae.requires_grad_(False)
-#     tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
-#     text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
-#     text_encoder.requires_grad_(False)
-#     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-#         cfg.pretrained_model_path,
-#         subfolder=cfg.unet_subfolder,
-#         unet_additional_kwargs=cfg.unet_additional_kwargs
-#     ).to(device)
-#     unet.requires_grad_(False)
-#
-#     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
-#     camera_encoder.requires_grad_(False)
-#     camera_adaptor = CameraAdaptor(unet, camera_encoder)
-#     camera_adaptor.requires_grad_(False)
-#     camera_adaptor.to(device)
-#
-#     logger.info("Setting the attention processors")
-#     unet.set_all_attn_processor(
-#         add_spatial_lora=cfg.lora_ckpt is not None,
-#         add_motion_lora=cfg.motion_lora_rank > 0,
-#         lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
-#         motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
-#         **cfg.attention_processor_kwargs
-#     )
-#
-#     if cfg.lora_ckpt is not None:
-#         print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
-#         lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
-#         if 'lora_state_dict' in lora_checkpoints.keys():
-#             lora_checkpoints = lora_checkpoints['lora_state_dict']
-#         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
-#         assert len(lora_u) == 0
-#         print(f'Loading done')
-#
-#     if cfg.motion_module_ckpt is not None:
-#         print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
-#         mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
-#         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
-#         assert len(mm_u) == 0
-#         print("Loading done")
-#
-#     if cfg.camera_adaptor_ckpt is not None:
-#         logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
-#         camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
-#         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
-#         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
-#         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
-#
-#         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
-#         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
-#         assert len(attention_processor_u) == 0
-#
-#         logger.info("Camera Adaptor loading done")
-#     else:
-#         logger.info("No Camera Adaptor checkpoint used")
-#
-#     pipeline = GenPhotoPipeline(
-#         vae=vae,
-#         text_encoder=text_encoder,
-#         tokenizer=tokenizer,
-#         unet=unet,
-#         scheduler=noise_scheduler,
-#         camera_encoder=camera_encoder
-#     ).to(device)
-#     pipeline.enable_vae_slicing()
-#
-#     return pipeline, device
 def load_models(cfg):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
-    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
-    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
-    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-focal_length.ckpt")
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
-    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
-    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-        pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
@@ -243,6 +158,7 @@ def load_models(cfg):
     camera_adaptor.requires_grad_(False)
     camera_adaptor.to(device)
     unet.set_all_attn_processor(
         add_spatial_lora=cfg.lora_ckpt is not None,
         add_motion_lora=cfg.motion_lora_rank > 0,
@@ -252,25 +168,35 @@ def load_models(cfg):
     )
     if cfg.lora_ckpt is not None:
-        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
     if cfg.motion_module_ckpt is not None:
-        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
     if cfg.camera_adaptor_ckpt is not None:
-        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
     pipeline = GenPhotoPipeline(
         vae=vae,
@@ -280,10 +206,11 @@ def load_models(cfg):
         scheduler=noise_scheduler,
         camera_encoder=camera_encoder
     ).to(device)
     pipeline.enable_vae_slicing()
     return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, focal_length_list, device, video_length=5, height=256, width=384):
     focal_length_values = json.loads(focal_length_list)

 def create_focal_length_embedding(focal_length_values, target_height, target_width, base_focal_length=24.0, sensor_height=24.0, sensor_width=36.0):
     device = 'cpu'
         camera_embedding = torch.cat((focal_length_embedding, ccl_embedding), dim=1)
         return camera_embedding
 def load_models(cfg):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        cfg.pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
     camera_adaptor.requires_grad_(False)
     camera_adaptor.to(device)
+    logger.info("Setting the attention processors")
     unet.set_all_attn_processor(
         add_spatial_lora=cfg.lora_ckpt is not None,
         add_motion_lora=cfg.motion_lora_rank > 0,
     )
     if cfg.lora_ckpt is not None:
+        print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
+        lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
+        print(f'Loading done')
     if cfg.motion_module_ckpt is not None:
+        print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
+        mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
+        print("Loading done")
     if cfg.camera_adaptor_ckpt is not None:
+        logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
+        camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
+        logger.info("Camera Adaptor loading done")
+    else:
+        logger.info("No Camera Adaptor checkpoint used")
     pipeline = GenPhotoPipeline(
         vae=vae,
         scheduler=noise_scheduler,
         camera_encoder=camera_encoder
     ).to(device)
     pipeline.enable_vae_slicing()
     return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, focal_length_list, device, video_length=5, height=256, width=384):
     focal_length_values = json.loads(focal_length_list)

inference_shutter_speed.py CHANGED Viewed

@@ -22,11 +22,6 @@ from genphoto.utils.util import save_videos_grid
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-from huggingface_hub import hf_hub_download
 def create_shutter_speed_embedding(shutter_speed_values, target_height, target_width, base_exposure=0.5):
     """
     Create a shutter_speed (Exposure Value or shutter speed) embedding tensor using a constant fwc value.
@@ -119,115 +114,32 @@ class Camera_Embedding(Dataset):
         return camera_embedding
-# def load_models(cfg):
-#
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
-#
-#     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-#     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
-#     vae.requires_grad_(False)
-#     tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
-#     text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
-#     text_encoder.requires_grad_(False)
-#
-#     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-#         cfg.pretrained_model_path,
-#         subfolder=cfg.unet_subfolder,
-#         unet_additional_kwargs=cfg.unet_additional_kwargs
-#     ).to(device)
-#     unet.requires_grad_(False)
-#
-#
-#     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
-#     camera_encoder.requires_grad_(False)
-#     camera_adaptor = CameraAdaptor(unet, camera_encoder)
-#     camera_adaptor.requires_grad_(False)
-#     camera_adaptor.to(device)
-#
-#     logger.info("Setting the attention processors")
-#     unet.set_all_attn_processor(
-#         add_spatial_lora=cfg.lora_ckpt is not None,
-#         add_motion_lora=cfg.motion_lora_rank > 0,
-#         lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
-#         motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
-#         **cfg.attention_processor_kwargs
-#     )
-#
-#     if cfg.lora_ckpt is not None:
-#         print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
-#         lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
-#         if 'lora_state_dict' in lora_checkpoints.keys():
-#             lora_checkpoints = lora_checkpoints['lora_state_dict']
-#         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
-#         assert len(lora_u) == 0
-#         print(f'Loading done')
-#
-#     if cfg.motion_module_ckpt is not None:
-#         print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
-#         mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
-#         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
-#         assert len(mm_u) == 0
-#         print("Loading done")
-#
-#
-#     if cfg.camera_adaptor_ckpt is not None:
-#         logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
-#         camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
-#
-#         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
-#         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
-#
-#         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
-#
-#         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
-#         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
-#         assert len(attention_processor_u) == 0
-#
-#         logger.info("Camera Adaptor loading done")
-#     else:
-#         logger.info("No Camera Adaptor checkpoint used")
-#
-#     pipeline = GenPhotoPipeline(
-#         vae=vae,
-#         text_encoder=text_encoder,
-#         tokenizer=tokenizer,
-#         unet=unet,
-#         scheduler=noise_scheduler,
-#         camera_encoder=camera_encoder
-#     ).to(device)
-#     pipeline.enable_vae_slicing()
-#
-#     return pipeline, device
 def load_models(cfg):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
-    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
-    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
-    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-shutter_speed.ckpt")
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
-    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
-    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
-    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
-        pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
     unet.requires_grad_(False)
     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
     camera_encoder.requires_grad_(False)
     camera_adaptor = CameraAdaptor(unet, camera_encoder)
     camera_adaptor.requires_grad_(False)
     camera_adaptor.to(device)
     unet.set_all_attn_processor(
         add_spatial_lora=cfg.lora_ckpt is not None,
         add_motion_lora=cfg.motion_lora_rank > 0,
@@ -237,25 +149,40 @@ def load_models(cfg):
     )
     if cfg.lora_ckpt is not None:
-        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
     if cfg.motion_module_ckpt is not None:
-        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
     if cfg.camera_adaptor_ckpt is not None:
-        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
     pipeline = GenPhotoPipeline(
         vae=vae,
@@ -265,10 +192,9 @@ def load_models(cfg):
         scheduler=noise_scheduler,
         camera_encoder=camera_encoder
     ).to(device)
     pipeline.enable_vae_slicing()
-    return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, shutter_speed_list, device, video_length=5, height=256, width=384):

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 def create_shutter_speed_embedding(shutter_speed_values, target_height, target_width, base_exposure=0.5):
     """
     Create a shutter_speed (Exposure Value or shutter speed) embedding tensor using a constant fwc value.
         return camera_embedding
 def load_models(cfg):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
     vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
     text_encoder.requires_grad_(False)
     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        cfg.pretrained_model_path,
         subfolder=cfg.unet_subfolder,
         unet_additional_kwargs=cfg.unet_additional_kwargs
     ).to(device)
     unet.requires_grad_(False)
     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
     camera_encoder.requires_grad_(False)
     camera_adaptor = CameraAdaptor(unet, camera_encoder)
     camera_adaptor.requires_grad_(False)
     camera_adaptor.to(device)
+    logger.info("Setting the attention processors")
     unet.set_all_attn_processor(
         add_spatial_lora=cfg.lora_ckpt is not None,
         add_motion_lora=cfg.motion_lora_rank > 0,
     )
     if cfg.lora_ckpt is not None:
+        print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
+        lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
         if 'lora_state_dict' in lora_checkpoints.keys():
             lora_checkpoints = lora_checkpoints['lora_state_dict']
         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
         assert len(lora_u) == 0
+        print(f'Loading done')
     if cfg.motion_module_ckpt is not None:
+        print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
+        mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
         assert len(mm_u) == 0
+        print("Loading done")
+    # 🔥 Load the Camera Adaptor checkpoint
     if cfg.camera_adaptor_ckpt is not None:
+        logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
+        camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
+        # Load the Camera Encoder
         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
         assert len(attention_processor_u) == 0
+        logger.info("Camera Adaptor loading done")
+    else:
+        logger.info("No Camera Adaptor checkpoint used")
     pipeline = GenPhotoPipeline(
         vae=vae,
         scheduler=noise_scheduler,
         camera_encoder=camera_encoder
     ).to(device)
     pipeline.enable_vae_slicing()
+    return pipeline, device
 def run_inference(pipeline, tokenizer, text_encoder, base_scene, shutter_speed_list, device, video_length=5, height=256, width=384):

requirements.txt CHANGED Viewed

@@ -2,18 +2,18 @@
 torch==2.1.1
 torchvision==0.16.1
 torchaudio==2.1.1
-diffusers
 imageio==2.36.0
 imageio-ffmpeg
-transformers
-accelerate
 opencv-python
 gdown
 einops
 decord
 omegaconf
 safetensors
-gradio
 wandb
 triton
-huggingface_hub

 torch==2.1.1
 torchvision==0.16.1
 torchaudio==2.1.1
+diffusers==0.24.0
 imageio==2.36.0
 imageio-ffmpeg
+transformers==4.45.2
+accelerate==1.0.1
 opencv-python
 gdown
 einops
 decord
 omegaconf
 safetensors
+gradio==5.1.0
 wandb
 triton
+huggingface_hub==0.25.2