marigold-lcm-2

Paused

App Files Files Community

toshas committed on Mar 22, 2024

Commit

c732904

1 Parent(s): e8416d0

initial commit

Browse files

Files changed (39) hide show

.gitignore +3 -0
README.md +23 -7
app.py +828 -0
extrude.py +332 -0
files/basrelief/coin.jpg +3 -0
files/basrelief/einstein.jpg +3 -0
files/basrelief/food.jpeg +3 -0
files/image/arc.jpeg +3 -0
files/image/bee.jpg +3 -0
files/image/berries.jpeg +3 -0
files/image/butterfly.jpeg +3 -0
files/image/cat.jpg +3 -0
files/image/concert.jpeg +3 -0
files/image/dog.jpeg +3 -0
files/image/doughnuts.jpeg +3 -0
files/image/einstein.jpg +3 -0
files/image/food.jpeg +3 -0
files/image/glasses.jpeg +3 -0
files/image/house.jpg +3 -0
files/image/lake.jpeg +3 -0
files/image/marigold.jpeg +3 -0
files/image/portrait_1.jpeg +3 -0
files/image/portrait_2.jpeg +3 -0
files/image/pumpkins.jpg +3 -0
files/image/puzzle.jpeg +3 -0
files/image/road.jpg +3 -0
files/image/scientists.jpg +3 -0
files/image/surfboards.jpeg +3 -0
files/image/surfer.jpeg +3 -0
files/image/swings.jpg +3 -0
files/image/switzerland.jpeg +3 -0
files/image/teamwork.jpeg +3 -0
files/image/wave.jpeg +3 -0
files/video/cab.mp4 +3 -0
files/video/elephant.mp4 +3 -0
files/video/obama.mp4 +3 -0
marigold_depth_estimation_lcm.py +702 -0
marigold_logo_square.jpg +3 -0
requirements.txt +15 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.idea
+.DS_Store
+__pycache__

README.md CHANGED Viewed

@@ -1,13 +1,29 @@
 ---
-title: Marigold Lcm
-emoji: 🚀
-colorFrom: indigo
-colorTo: green
 sdk: gradio
 sdk_version: 4.22.0
 app_file: app.py
-pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Marigold-LCM Depth Estimation
+emoji: 🏵️
+colorFrom: blue
+colorTo: red
 sdk: gradio
 sdk_version: 4.22.0
 app_file: app.py
+pinned: true
+license: cc-by-sa-4.0
+models:
+- prs-eth/marigold-v1-0
+- prs-eth/marigold-lcm-v1-0
 ---
+This is a demo of Marigold-LCM, the state-of-the-art depth estimator for images in the wild.
+It combines the power of the original Marigold 10-step estimator and the Latent Consistency Models, delivering high-quality results in as little as one step.
+Find out more in our paper titled ["Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation"](https://arxiv.org/abs/2312.02145).
+```
+@misc{ke2023repurposing,
+      title={Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation},
+      author={Bingxin Ke and Anton Obukhov and Shengyu Huang and Nando Metzger and Rodrigo Caye Daudt and Konrad Schindler},
+      year={2023},
+      eprint={2312.02145},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,828 @@

+import functools
+import os
+import shutil
+import zipfile
+from io import BytesIO
+import gradio as gr
+import imageio as imageio
+import numpy as np
+import torch as torch
+from PIL import Image
+from diffusers import UNet2DConditionModel, LCMScheduler
+from gradio_imageslider import ImageSlider
+from huggingface_hub import login
+from tqdm import tqdm
+from extrude import extrude_depth_3d
+from marigold_depth_estimation_lcm import MarigoldDepthConsistencyPipeline
+# Seed handed to the pipeline whenever a fixed seed is requested.
+default_seed = 2024
+# Defaults for process_image; the "Image" tab controls start from these values.
+default_image_denoise_steps = 4
+default_image_ensemble_size = 1
+default_image_processing_res = 768
+# NOTE: "reproducuble" is a misspelling of "reproducible"; the name is kept
+# as-is because process_image refers to it.
+default_image_reproducuble = True
+# Defaults for process_video.
+default_video_depth_latent_init_strength = 0.1
+default_video_denoise_steps = 1
+default_video_ensemble_size = 1
+default_video_processing_res = 768
+default_video_out_fps = 15
+default_video_out_max_frames = 100
+# Defaults for process_bas; the "Bas-relief (3D)" tab controls start from
+# these values. process_bas divides embossing and the frame_* values by 100
+# and multiplies size_longest_cm by 10 before calling extrude_depth_3d.
+default_bas_plane_near = 0.0
+default_bas_plane_far = 1.0
+default_bas_embossing = 20
+default_bas_denoise_steps = 4
+default_bas_ensemble_size = 1
+default_bas_processing_res = 768
+default_bas_size_longest_px = 512
+default_bas_size_longest_cm = 10
+default_bas_filter_size = 3
+default_bas_frame_thickness = 5
+default_bas_frame_near = 1
+default_bas_frame_far = 1
+def process_image(
+    pipe,
+    path_input,
+    denoise_steps=default_image_denoise_steps,
+    ensemble_size=default_image_ensemble_size,
+    processing_res=default_image_processing_res,
+    reproducible=default_image_reproducuble,
+):
+    input_image = Image.open(path_input)
+    pipe_out = pipe(
+        input_image,
+        denoising_steps=denoise_steps,
+        ensemble_size=ensemble_size,
+        processing_res=processing_res,
+        batch_size=1 if processing_res == 0 else 0,
+        seed=default_seed if reproducible else None,
+        show_progress_bar=False,
+    )
+    depth_pred = pipe_out.depth_np
+    depth_colored = pipe_out.depth_colored
+    depth_16bit = (depth_pred * 65535.0).astype(np.uint16)
+    path_output_dir = os.path.splitext(path_input)[0] + "_output"
+    os.makedirs(path_output_dir, exist_ok=True)
+    name_base = os.path.splitext(os.path.basename(path_input))[0]
+    path_out_fp32 = os.path.join(path_output_dir, f"{name_base}_depth_fp32.npy")
+    path_out_16bit = os.path.join(path_output_dir, f"{name_base}_depth_16bit.png")
+    path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.png")
+    np.save(path_out_fp32, depth_pred)
+    Image.fromarray(depth_16bit).save(path_out_16bit, mode="I;16")
+    depth_colored.save(path_out_vis)
+    return (
+        [path_out_16bit, path_out_vis],
+        [path_out_16bit, path_out_fp32, path_out_vis],
+    )
+def process_video(
+    pipe,
+    path_input,
+    depth_latent_init_strength=default_video_depth_latent_init_strength,
+    denoise_steps=default_video_denoise_steps,
+    ensemble_size=default_video_ensemble_size,
+    processing_res=default_video_processing_res,
+    out_fps=default_video_out_fps,
+    out_max_frames=default_video_out_max_frames,
+    progress=gr.Progress(),
+):
+    path_output_dir = os.path.splitext(path_input)[0] + "_output"
+    os.makedirs(path_output_dir, exist_ok=True)
+    name_base = os.path.splitext(os.path.basename(path_input))[0]
+    path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.mp4")
+    path_out_16bit = os.path.join(path_output_dir, f"{name_base}_depth_16bit.zip")
+    reader = imageio.get_reader(path_input)
+    meta_data = reader.get_meta_data()
+    fps = meta_data["fps"]
+    size = meta_data["size"]
+    duration_sec = meta_data["duration"]
+    if fps <= out_fps:
+        frame_interval, out_fps = 1, fps
+    else:
+        frame_interval = round(fps / out_fps)
+        out_fps = fps / frame_interval
+    out_duration_sec = out_max_frames / out_fps
+    if duration_sec > out_duration_sec:
+        gr.Warning(
+            f"Only the first ~{int(out_duration_sec)} seconds will be processed; "
+            f"use alternative setups for full processing"
+        )
+    writer = imageio.get_writer(path_out_vis, fps=out_fps)
+    zipf = zipfile.ZipFile(path_out_16bit, "w", zipfile.ZIP_DEFLATED)
+    prev_depth_latent = None
+    pbar = tqdm(desc="Processing Video", total=out_max_frames)
+    out_frame_id = 0
+    for frame_id, frame in enumerate(reader):
+        if not (frame_id % frame_interval == 0):
+            continue
+        out_frame_id += 1
+        pbar.update(1)
+        if out_frame_id > out_max_frames:
+            break
+        frame_pil = Image.fromarray(frame)
+        pipe_out = pipe(
+            frame_pil,
+            denoising_steps=denoise_steps,
+            ensemble_size=ensemble_size,
+            processing_res=processing_res,
+            match_input_res=False,
+            batch_size=0,
+            depth_latent_init=prev_depth_latent,
+            depth_latent_init_strength=depth_latent_init_strength,
+            seed=default_seed,
+            show_progress_bar=False,
+        )
+        prev_depth_latent = pipe_out.depth_latent
+        processed_frame = pipe_out.depth_colored
+        processed_frame = imageio.core.util.Array(np.array(processed_frame))
+        writer.append_data(processed_frame)
+        processed_frame = (65535 * np.clip(pipe_out.depth_np, 0.0, 1.0)).astype(
+            np.uint16
+        )
+        processed_frame = Image.fromarray(processed_frame, mode="I;16")
+        archive_path = os.path.join(
+            f"{name_base}_depth_16bit", f"{out_frame_id:05d}.png"
+        )
+        img_byte_arr = BytesIO()
+        processed_frame.save(img_byte_arr, format="png")
+        img_byte_arr.seek(0)
+        zipf.writestr(archive_path, img_byte_arr.read())
+    reader.close()
+    writer.close()
+    zipf.close()
+    return (
+        path_out_vis,
+        [path_out_vis, path_out_16bit],
+    )
+def process_bas(
+    pipe,
+    path_input,
+    plane_near=default_bas_plane_near,
+    plane_far=default_bas_plane_far,
+    embossing=default_bas_embossing,
+    denoise_steps=default_bas_denoise_steps,
+    ensemble_size=default_bas_ensemble_size,
+    processing_res=default_bas_processing_res,
+    size_longest_px=default_bas_size_longest_px,
+    size_longest_cm=default_bas_size_longest_cm,
+    filter_size=default_bas_filter_size,
+    frame_thickness=default_bas_frame_thickness,
+    frame_near=default_bas_frame_near,
+    frame_far=default_bas_frame_far,
+):
+    if plane_near >= plane_far:
+        raise gr.Error("NEAR plane must have a value smaller than the FAR plane")
+    path_output_dir = os.path.splitext(path_input)[0] + "_output"
+    os.makedirs(path_output_dir, exist_ok=True)
+    name_base, name_ext = os.path.splitext(os.path.basename(path_input))
+    input_image = Image.open(path_input)
+    pipe_out = pipe(
+        input_image,
+        denoising_steps=denoise_steps,
+        ensemble_size=ensemble_size,
+        processing_res=processing_res,
+        seed=default_seed,
+        show_progress_bar=False,
+    )
+    depth_pred = pipe_out.depth_np * 65535
+    def _process_3d(
+        size_longest_px,
+        filter_size,
+        vertex_colors,
+        scene_lights,
+        output_model_scale=None,
+        prepare_for_3d_printing=False,
+    ):
+        image_rgb_w, image_rgb_h = input_image.width, input_image.height
+        image_rgb_d = max(image_rgb_w, image_rgb_h)
+        image_new_w = size_longest_px * image_rgb_w // image_rgb_d
+        image_new_h = size_longest_px * image_rgb_h // image_rgb_d
+        image_rgb_new = os.path.join(
+            path_output_dir, f"{name_base}_rgb_{size_longest_px}{name_ext}"
+        )
+        image_depth_new = os.path.join(
+            path_output_dir, f"{name_base}_depth_{size_longest_px}.png"
+        )
+        input_image.resize((image_new_w, image_new_h), Image.LANCZOS).save(
+            image_rgb_new
+        )
+        Image.fromarray(depth_pred).convert(mode="F").resize(
+            (image_new_w, image_new_h), Image.BILINEAR
+        ).convert("I").save(image_depth_new)
+        path_glb, path_stl = extrude_depth_3d(
+            image_rgb_new,
+            image_depth_new,
+            output_model_scale=size_longest_cm * 10
+            if output_model_scale is None
+            else output_model_scale,
+            filter_size=filter_size,
+            coef_near=plane_near,
+            coef_far=plane_far,
+            emboss=embossing / 100,
+            f_thic=frame_thickness / 100,
+            f_near=frame_near / 100,
+            f_back=frame_far / 100,
+            vertex_colors=vertex_colors,
+            scene_lights=scene_lights,
+            prepare_for_3d_printing=prepare_for_3d_printing,
+        )
+        return path_glb, path_stl
+    path_viewer_glb, _ = _process_3d(
+        256, filter_size, vertex_colors=False, scene_lights=True, output_model_scale=1
+    )
+    path_files_glb, path_files_stl = _process_3d(
+        size_longest_px, filter_size, vertex_colors=True, scene_lights=False, prepare_for_3d_printing=True
+    )
+    return path_viewer_glb, [path_files_glb, path_files_stl]
+def run_demo_server(pipe):
+    process_pipe_image = functools.partial(process_image, pipe)
+    process_pipe_video = functools.partial(process_video, pipe)
+    process_pipe_bas = functools.partial(process_bas, pipe)
+    os.environ["GRADIO_ALLOW_FLAGGING"] = "never"
+    gradio_theme = gr.themes.Default()
+    # gradio_theme.set(
+    #     section_header_text_size="20px",
+    #     section_header_text_weight="bold",
+    # )
+    with gr.Blocks(
+        theme=gradio_theme,
+        title="Marigold-LCM Depth Estimation",
+        css="""
+            #download {
+                height: 118px;
+            }
+            .slider .inner {
+                width: 5px;
+                background: #FFF;
+            }
+            .viewport {
+                aspect-ratio: 4/3;
+            }
+            .tabs button.selected {
+                font-size: 20px !important;
+                color: crimson !important;
+            }
+        """,
+        head="""
+            <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
+            <script>
+                window.dataLayer = window.dataLayer || [];
+                function gtag() {dataLayer.push(arguments);}
+                gtag('js', new Date());
+                gtag('config', 'G-1FWSVCGZTG');
+            </script>
+        """,
+    ) as demo:
+        gr.Markdown(
+            """
+            <h1 align="center">Marigold-LCM Depth Estimation</h1>
+            <p align="center">
+            <a title="Website" href="https://marigoldmonodepth.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://www.obukhov.ai/img/badges/badge-website.svg">
+            </a>
+            <a title="arXiv" href="https://arxiv.org/abs/2312.02145" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://www.obukhov.ai/img/badges/badge-pdf.svg">
+            </a>
+            <a title="Github" href="https://github.com/prs-eth/marigold" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/github/stars/prs-eth/marigold?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
+            </a>
+            <a title="Social" href="https://twitter.com/antonobukhov1" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
+            </a>
+            </p>
+            <p align="justify">
+                Marigold-LCM is the fast version of Marigold, the state-of-the-art depth estimator for images in the wild.
+                It combines the power of the original Marigold 10-step estimator and the Latent Consistency Models, delivering high-quality results in as little as <b>one step</b>.
+                We provide three functions in this demo: Image, Video, and Bas-relief 3D processing — <b>see the tabs below</b>.
+                Upload your content into the <b>left</b> side, or click any of the <b>examples</b> below.
+                Wait a second (for images and 3D) or a minute (for videos), and interact with the result in the <b>right</b> side.
+                To avoid queuing, fork the demo into your profile.
+            </p>
+        """
+        )
+        with gr.Tabs(elem_classes=["tabs"]):
+            with gr.Tab("Image"):
+                with gr.Row():
+                    with gr.Column():
+                        image_input = gr.Image(
+                            label="Input Image",
+                            type="filepath",
+                        )
+                        with gr.Row():
+                            image_submit_btn = gr.Button(
+                                value="Compute Depth", variant="primary"
+                            )
+                            image_reset_btn = gr.Button(value="Reset")
+                        with gr.Accordion("Advanced options", open=False):
+                            image_denoise_steps = gr.Slider(
+                                label="Number of denoising steps",
+                                minimum=1,
+                                maximum=4,
+                                step=1,
+                                value=default_image_denoise_steps,
+                            )
+                            image_ensemble_size = gr.Slider(
+                                label="Ensemble size",
+                                minimum=1,
+                                maximum=10,
+                                step=1,
+                                value=default_image_ensemble_size,
+                            )
+                            image_processing_res = gr.Radio(
+                                [
+                                    ("Native", 0),
+                                    ("Recommended", 768),
+                                ],
+                                label="Processing resolution",
+                                value=default_image_processing_res,
+                            )
+                    with gr.Column():
+                        image_output_slider = ImageSlider(
+                            label="Predicted depth (red-near, blue-far)",
+                            type="filepath",
+                            show_download_button=True,
+                            show_share_button=True,
+                            interactive=False,
+                            elem_classes="slider",
+                            position=0.25,
+                        )
+                        image_output_files = gr.Files(
+                            label="Depth outputs",
+                            elem_id="download",
+                            interactive=False,
+                        )
+                gr.Examples(
+                    fn=process_pipe_image,
+                    examples=[
+                        os.path.join("files", "image", name)
+                        for name in [
+                            "arc.jpeg",
+                            "berries.jpeg",
+                            "butterfly.jpeg",
+                            "cat.jpg",
+                            "concert.jpeg",
+                            "dog.jpeg",
+                            "doughnuts.jpeg",
+                            "einstein.jpg",
+                            "food.jpeg",
+                            "glasses.jpeg",
+                            "house.jpg",
+                            "lake.jpeg",
+                            "marigold.jpeg",
+                            "portrait_1.jpeg",
+                            "portrait_2.jpeg",
+                            "pumpkins.jpg",
+                            "puzzle.jpeg",
+                            "road.jpg",
+                            "scientists.jpg",
+                            "surfboards.jpeg",
+                            "surfer.jpeg",
+                            "swings.jpg",
+                            "switzerland.jpeg",
+                            "teamwork.jpeg",
+                            "wave.jpeg",
+                        ]
+                    ],
+                    inputs=[image_input],
+                    outputs=[image_output_slider, image_output_files],
+                    cache_examples=True,
+                )
+            with gr.Tab("Video"):
+                with gr.Row():
+                    with gr.Column():
+                        video_input = gr.Video(
+                            label="Input Video",
+                            sources=["upload"],
+                        )
+                        with gr.Row():
+                            video_submit_btn = gr.Button(
+                                value="Compute Depth", variant="primary"
+                            )
+                            video_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        video_output_video = gr.Video(
+                            label="Output video depth (red-near, blue-far)",
+                            interactive=False,
+                        )
+                        video_output_files = gr.Files(
+                            label="Depth outputs",
+                            elem_id="download",
+                            interactive=False,
+                        )
+                gr.Examples(
+                    fn=process_pipe_video,
+                    examples=[
+                        os.path.join("files", "video", name)
+                        for name in [
+                            "cab.mp4",
+                            "elephant.mp4",
+                            "obama.mp4",
+                        ]
+                    ],
+                    inputs=[video_input],
+                    outputs=[video_output_video, video_output_files],
+                    cache_examples=True,
+                )
+            with gr.Tab("Bas-relief (3D)"):
+                gr.Markdown(
+                    """
+                    <p align="justify">
+                        This part of the demo uses Marigold-LCM to create a bas-relief model.
+                        The models are watertight, with correct normals, and exported in the STL format, which makes them <b>3D-printable</b>.
+                        Start by uploading the image and click "Create" with the default parameters.
+                        To improve the result, click "Clear", adjust the geometry sliders below, and click "Create" again.
+                    </p>
+                    """,
+                )
+                with gr.Row():
+                    with gr.Column():
+                        bas_input = gr.Image(
+                            label="Input Image",
+                            type="filepath",
+                        )
+                        with gr.Row():
+                            bas_submit_btn = gr.Button(value="Create 3D", variant="primary")
+                            bas_clear_btn = gr.Button(value="Clear")
+                            bas_reset_btn = gr.Button(value="Reset")
+                        with gr.Accordion("3D printing demo: Main options", open=True):
+                            bas_plane_near = gr.Slider(
+                                label="Relative position of the near plane (between 0 and 1)",
+                                minimum=0.0,
+                                maximum=1.0,
+                                step=0.001,
+                                value=default_bas_plane_near,
+                            )
+                            bas_plane_far = gr.Slider(
+                                label="Relative position of the far plane (between near and 1)",
+                                minimum=0.0,
+                                maximum=1.0,
+                                step=0.001,
+                                value=default_bas_plane_far,
+                            )
+                            bas_embossing = gr.Slider(
+                                label="Embossing level",
+                                minimum=0,
+                                maximum=100,
+                                step=1,
+                                value=default_bas_embossing,
+                            )
+                        with gr.Accordion("3D printing demo: Advanced options", open=False):
+                            bas_denoise_steps = gr.Slider(
+                                label="Number of denoising steps",
+                                minimum=1,
+                                maximum=4,
+                                step=1,
+                                value=default_bas_denoise_steps,
+                            )
+                            bas_ensemble_size = gr.Slider(
+                                label="Ensemble size",
+                                minimum=1,
+                                maximum=10,
+                                step=1,
+                                value=default_bas_ensemble_size,
+                            )
+                            bas_processing_res = gr.Radio(
+                                [
+                                    ("Native", 0),
+                                    ("Recommended", 768),
+                                ],
+                                label="Processing resolution",
+                                value=default_bas_processing_res,
+                            )
+                            bas_size_longest_px = gr.Slider(
+                                label="Size (px) of the longest side",
+                                minimum=256,
+                                maximum=1024,
+                                step=256,
+                                value=default_bas_size_longest_px,
+                            )
+                            bas_size_longest_cm = gr.Slider(
+                                label="Size (cm) of the longest side",
+                                minimum=1,
+                                maximum=100,
+                                step=1,
+                                value=default_bas_size_longest_cm,
+                            )
+                            bas_filter_size = gr.Slider(
+                                label="Size (px) of the smoothing filter",
+                                minimum=1,
+                                maximum=5,
+                                step=2,
+                                value=default_bas_filter_size,
+                            )
+                            bas_frame_thickness = gr.Slider(
+                                label="Frame thickness",
+                                minimum=0,
+                                maximum=100,
+                                step=1,
+                                value=default_bas_frame_thickness,
+                            )
+                            bas_frame_near = gr.Slider(
+                                label="Frame's near plane offset",
+                                minimum=-100,
+                                maximum=100,
+                                step=1,
+                                value=default_bas_frame_near,
+                            )
+                            bas_frame_far = gr.Slider(
+                                label="Frame's far plane offset",
+                                minimum=1,
+                                maximum=10,
+                                step=1,
+                                value=default_bas_frame_far,
+                            )
+                    with gr.Column():
+                        bas_output_viewer = gr.Model3D(
+                            camera_position=(75.0, 90.0, 1.25),
+                            elem_classes="viewport",
+                            label="3D preview (low-res, relief highlight)",
+                            interactive=False,
+                        )
+                        bas_output_files = gr.Files(
+                            label="3D model outputs (high-res)",
+                            elem_id="download",
+                            interactive=False,
+                        )
+                gr.Examples(
+                    fn=process_pipe_bas,
+                    examples=[
+                        [
+                            "files/basrelief/coin.jpg",  # input
+                            0.0,  # plane_near
+                            0.66,  # plane_far
+                            15,  # embossing
+                            4,  # denoise_steps
+                            4,  # ensemble_size
+                            768,  # processing_res
+                            512,  # size_longest_px
+                            10,  # size_longest_cm
+                            3,  # filter_size
+                            5,  # frame_thickness
+                            0,  # frame_near
+                            1,  # frame_far
+                        ],
+                        [
+                            "files/basrelief/einstein.jpg",  # input
+                            0.0,  # plane_near
+                            0.5,  # plane_far
+                            50,  # embossing
+                            2,  # denoise_steps
+                            1,  # ensemble_size
+                            768,  # processing_res
+                            512,  # size_longest_px
+                            10,  # size_longest_cm
+                            3,  # filter_size
+                            5,  # frame_thickness
+                            -15,  # frame_near
+                            1,  # frame_far
+                        ],
+                        [
+                            "files/basrelief/food.jpeg",  # input
+                            0.0,  # plane_near
+                            1.0,  # plane_far
+                            20,  # embossing
+                            2,  # denoise_steps
+                            4,  # ensemble_size
+                            768,  # processing_res
+                            512,  # size_longest_px
+                            10,  # size_longest_cm
+                            3,  # filter_size
+                            5,  # frame_thickness
+                            -5,  # frame_near
+                            1,  # frame_far
+                        ],
+                    ],
+                    inputs=[
+                        bas_input,
+                        bas_plane_near,
+                        bas_plane_far,
+                        bas_embossing,
+                        bas_denoise_steps,
+                        bas_ensemble_size,
+                        bas_processing_res,
+                        bas_size_longest_px,
+                        bas_size_longest_cm,
+                        bas_filter_size,
+                        bas_frame_thickness,
+                        bas_frame_near,
+                        bas_frame_far,
+                    ],
+                    outputs=[bas_output_viewer, bas_output_files],
+                    cache_examples=True,
+                )
+        image_submit_btn.click(
+            fn=process_pipe_image,
+            inputs=[
+                image_input,
+                image_denoise_steps,
+                image_ensemble_size,
+                image_processing_res,
+            ],
+            outputs=[image_output_slider, image_output_files],
+            concurrency_limit=1,
+        )
+        image_reset_btn.click(
+            fn=lambda: (
+                None,
+                None,
+                None,
+                default_image_ensemble_size,
+                default_image_denoise_steps,
+                default_image_processing_res,
+            ),
+            inputs=[],
+            outputs=[
+                image_input,
+                image_output_slider,
+                image_output_files,
+                image_ensemble_size,
+                image_denoise_steps,
+                image_processing_res,
+            ],
+            concurrency_limit=1,
+        )
+        video_submit_btn.click(
+            fn=process_pipe_video,
+            inputs=[video_input],
+            outputs=[video_output_video, video_output_files],
+            concurrency_limit=1,
+        )
+        video_reset_btn.click(
+            fn=lambda: (None, None, None),
+            inputs=[],
+            outputs=[video_input, video_output_video, video_output_files],
+            concurrency_limit=1,
+        )
+        def wrapper_process_pipe_bas(*args, **kwargs):
+            out = list(process_pipe_bas(*args, **kwargs))
+            out = [gr.Button(interactive=False), gr.Image(interactive=False)] + out
+            return out
+        bas_submit_btn.click(
+            fn=wrapper_process_pipe_bas,
+            inputs=[
+                bas_input,
+                bas_plane_near,
+                bas_plane_far,
+                bas_embossing,
+                bas_denoise_steps,
+                bas_ensemble_size,
+                bas_processing_res,
+                bas_size_longest_px,
+                bas_size_longest_cm,
+                bas_filter_size,
+                bas_frame_thickness,
+                bas_frame_near,
+                bas_frame_far,
+            ],
+            outputs=[bas_submit_btn, bas_input, bas_output_viewer, bas_output_files],
+            concurrency_limit=1,
+        )
+        bas_clear_btn.click(
+            fn=lambda: (gr.Button(interactive=True), None, None),
+            inputs=[],
+            outputs=[
+                bas_submit_btn,
+                bas_output_viewer,
+                bas_output_files,
+            ],
+            concurrency_limit=1,
+        )
+        bas_reset_btn.click(
+            fn=lambda: (
+                gr.Button(interactive=True),
+                None,
+                None,
+                None,
+                default_bas_plane_near,
+                default_bas_plane_far,
+                default_bas_embossing,
+                default_bas_denoise_steps,
+                default_bas_ensemble_size,
+                default_bas_processing_res,
+                default_bas_size_longest_px,
+                default_bas_size_longest_cm,
+                default_bas_filter_size,
+                default_bas_frame_thickness,
+                default_bas_frame_near,
+                default_bas_frame_far,
+            ),
+            inputs=[],
+            outputs=[
+                bas_submit_btn,
+                bas_input,
+                bas_output_viewer,
+                bas_output_files,
+                bas_plane_near,
+                bas_plane_far,
+                bas_embossing,
+                bas_denoise_steps,
+                bas_ensemble_size,
+                bas_processing_res,
+                bas_size_longest_px,
+                bas_size_longest_cm,
+                bas_filter_size,
+                bas_frame_thickness,
+                bas_frame_near,
+                bas_frame_far,
+            ],
+            concurrency_limit=1,
+        )
+        demo.queue(
+            api_open=False,
+        ).launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+        )
+def prefetch_hf_cache(pipe):
+    process_image(pipe, "files/image/bee.jpg", 1, 1, 64)
+    shutil.rmtree("files/image/bee_output")
+def main():
+    CHECKPOINT = "prs-eth/marigold-v1-0"
+    CHECKPOINT_UNET_LCM = "prs-eth/marigold-lcm-v1-0"
+    login(token=os.environ["HF_TOKEN_COLAB_RO"])
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    pipe = MarigoldDepthConsistencyPipeline.from_pretrained(
+        CHECKPOINT,
+        unet=UNet2DConditionModel.from_pretrained(
+            CHECKPOINT_UNET_LCM, subfolder="unet", use_auth_token=True
+        ),
+    )
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+    try:
+        import xformers
+        pipe.enable_xformers_memory_efficient_attention()
+    except:
+        pass  # run without xformers
+    pipe = pipe.to(device)
+    prefetch_hf_cache(pipe)
+    run_demo_server(pipe)
+# Start the demo only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

extrude.py ADDED Viewed

	@@ -0,0 +1,332 @@

+import math
+import os
+import numpy as np
+import pygltflib
+import trimesh
+from PIL import Image, ImageFilter
+def quaternion_multiply(q1, q2):
+    x1, y1, z1, w1 = q1
+    x2, y2, z2, w2 = q2
+    return [
+        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
+        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
+        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
+        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
+    ]
+def glb_add_lights(path_input, path_output):
+    """
+    Adds directional lights in the horizontal plane to the glb file.
+    :param path_input: path to input glb
+    :param path_output: path to output glb
+    :return: None
+    """
+    glb = pygltflib.GLTF2().load(path_input)
+    N = 3  # default max num lights in Babylon.js is 4
+    angle_step = 2 * math.pi / N
+    elevation_angle = math.radians(75)
+    light_colors = [
+        [1.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0],
+        [0.0, 0.0, 1.0],
+    ]
+    lights_extension = {
+        "lights": [
+            {"type": "directional", "color": light_colors[i], "intensity": 2.0}
+            for i in range(N)
+        ]
+    }
+    if "KHR_lights_punctual" not in glb.extensionsUsed:
+        glb.extensionsUsed.append("KHR_lights_punctual")
+    glb.extensions["KHR_lights_punctual"] = lights_extension
+    light_nodes = []
+    for i in range(N):
+        angle = i * angle_step
+        pos_rot = [0.0, 0.0, math.sin(angle / 2), math.cos(angle / 2)]
+        elev_rot = [
+            math.sin(elevation_angle / 2),
+            0.0,
+            0.0,
+            math.cos(elevation_angle / 2),
+        ]
+        rotation = quaternion_multiply(pos_rot, elev_rot)
+        node = {
+            "rotation": rotation,
+            "extensions": {"KHR_lights_punctual": {"light": i}},
+        }
+        light_nodes.append(node)
+    light_node_indices = list(range(len(glb.nodes), len(glb.nodes) + N))
+    glb.nodes.extend(light_nodes)
+    root_node_index = glb.scenes[glb.scene].nodes[0]
+    root_node = glb.nodes[root_node_index]
+    if hasattr(root_node, "children"):
+        root_node.children.extend(light_node_indices)
+    else:
+        root_node.children = light_node_indices
+    glb.save(path_output)
+def extrude_depth_3d(
+    path_rgb,
+    path_depth,
+    output_model_scale=100,
+    filter_size=3,
+    coef_near=0.0,
+    coef_far=1.0,
+    emboss=0.3,
+    f_thic=0.05,
+    f_near=-0.15,
+    f_back=0.01,
+    vertex_colors=True,
+    scene_lights=True,
+    prepare_for_3d_printing=False,
+):
+    f_far_inner = -emboss
+    f_far_outer = f_far_inner - f_back
+    f_near = max(f_near, f_far_inner)
+    depth_image = Image.open(path_depth)
+    assert depth_image.mode == "I", depth_image.mode
+    depth_image = depth_image.filter(ImageFilter.MedianFilter(size=filter_size))
+    w, h = depth_image.size
+    d_max = max(w, h)
+    depth_image = np.array(depth_image).astype(np.double)
+    z_min, z_max = np.min(depth_image), np.max(depth_image)
+    depth_image = (depth_image.astype(np.double) - z_min) / (z_max - z_min)
+    depth_image[depth_image < coef_near] = coef_near
+    depth_image[depth_image > coef_far] = coef_far
+    depth_image = emboss * (depth_image - coef_near) / (coef_far - coef_near)
+    rgb_image = np.array(
+        Image.open(path_rgb).convert("RGB").resize((w, h), Image.Resampling.LANCZOS)
+    )
+    w_norm = w / float(d_max - 1)
+    h_norm = h / float(d_max - 1)
+    w_half = w_norm / 2
+    h_half = h_norm / 2
+    x, y = np.meshgrid(np.arange(w), np.arange(h))
+    x = x / float(d_max - 1) - w_half  # [-w_half, w_half]
+    y = -y / float(d_max - 1) + h_half  # [-h_half, h_half]
+    z = -depth_image  # -depth_emboss (far) - 0 (near)
+    vertices_2d = np.stack((x, y, z), axis=-1)
+    vertices = vertices_2d.reshape(-1, 3)
+    colors = rgb_image[:, :, :3].reshape(-1, 3) / 255.0
+    faces = []
+    for y in range(h - 1):
+        for x in range(w - 1):
+            idx = y * w + x
+            faces.append([idx, idx + w, idx + 1])
+            faces.append([idx + 1, idx + w, idx + 1 + w])
+    # OUTER frame
+    nv = len(vertices)
+    vertices = np.append(
+        vertices,
+        [
+            [-w_half - f_thic, -h_half - f_thic, f_near],  # 00
+            [-w_half - f_thic, -h_half - f_thic, f_far_outer],  # 01
+            [w_half + f_thic, -h_half - f_thic, f_near],  # 02
+            [w_half + f_thic, -h_half - f_thic, f_far_outer],  # 03
+            [w_half + f_thic, h_half + f_thic, f_near],  # 04
+            [w_half + f_thic, h_half + f_thic, f_far_outer],  # 05
+            [-w_half - f_thic, h_half + f_thic, f_near],  # 06
+            [-w_half - f_thic, h_half + f_thic, f_far_outer],  # 07
+        ],
+        axis=0,
+    )
+    faces.extend(
+        [
+            [nv + 0, nv + 1, nv + 2],
+            [nv + 2, nv + 1, nv + 3],
+            [nv + 2, nv + 3, nv + 4],
+            [nv + 4, nv + 3, nv + 5],
+            [nv + 4, nv + 5, nv + 6],
+            [nv + 6, nv + 5, nv + 7],
+            [nv + 6, nv + 7, nv + 0],
+            [nv + 0, nv + 7, nv + 1],
+        ]
+    )
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * 8, axis=0)
+    # INNER frame
+    nv = len(vertices)
+    vertices_left_data = vertices_2d[:, 0]  # H x 3
+    vertices_left_frame = vertices_2d[:, 0].copy()  # H x 3
+    vertices_left_frame[:, 2] = f_near
+    vertices = np.append(vertices, vertices_left_data, axis=0)
+    vertices = np.append(vertices, vertices_left_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 * h), axis=0)
+    for i in range(h - 1):
+        nvi_d = nv + i
+        nvi_f = nvi_d + h
+        faces.append([nvi_d, nvi_f, nvi_d + 1])
+        faces.append([nvi_d + 1, nvi_f, nvi_f + 1])
+    nv = len(vertices)
+    vertices_right_data = vertices_2d[:, -1]  # H x 3
+    vertices_right_frame = vertices_2d[:, -1].copy()  # H x 3
+    vertices_right_frame[:, 2] = f_near
+    vertices = np.append(vertices, vertices_right_data, axis=0)
+    vertices = np.append(vertices, vertices_right_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 * h), axis=0)
+    for i in range(h - 1):
+        nvi_d = nv + i
+        nvi_f = nvi_d + h
+        faces.append([nvi_d, nvi_d + 1, nvi_f])
+        faces.append([nvi_d + 1, nvi_f + 1, nvi_f])
+    nv = len(vertices)
+    vertices_top_data = vertices_2d[0, :]  # H x 3
+    vertices_top_frame = vertices_2d[0, :].copy()  # H x 3
+    vertices_top_frame[:, 2] = f_near
+    vertices = np.append(vertices, vertices_top_data, axis=0)
+    vertices = np.append(vertices, vertices_top_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 * w), axis=0)
+    for i in range(w - 1):
+        nvi_d = nv + i
+        nvi_f = nvi_d + w
+        faces.append([nvi_d, nvi_d + 1, nvi_f])
+        faces.append([nvi_d + 1, nvi_f + 1, nvi_f])
+    nv = len(vertices)
+    vertices_bottom_data = vertices_2d[-1, :]  # H x 3
+    vertices_bottom_frame = vertices_2d[-1, :].copy()  # H x 3
+    vertices_bottom_frame[:, 2] = f_near
+    vertices = np.append(vertices, vertices_bottom_data, axis=0)
+    vertices = np.append(vertices, vertices_bottom_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 * w), axis=0)
+    for i in range(w - 1):
+        nvi_d = nv + i
+        nvi_f = nvi_d + w
+        faces.append([nvi_d, nvi_f, nvi_d + 1])
+        faces.append([nvi_d + 1, nvi_f, nvi_f + 1])
+    # FRONT frame
+    nv = len(vertices)
+    vertices = np.append(
+        vertices,
+        [
+            [-w_half - f_thic, -h_half - f_thic, f_near],
+            [-w_half - f_thic, h_half + f_thic, f_near],
+        ],
+        axis=0,
+    )
+    vertices = np.append(vertices, vertices_left_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 + h), axis=0)
+    for i in range(h - 1):
+        faces.append([nv, nv + 2 + i + 1, nv + 2 + i])
+    faces.append([nv, nv + 2, nv + 1])
+    nv = len(vertices)
+    vertices = np.append(
+        vertices,
+        [
+            [w_half + f_thic, h_half + f_thic, f_near],
+            [w_half + f_thic, -h_half - f_thic, f_near],
+        ],
+        axis=0,
+    )
+    vertices = np.append(vertices, vertices_right_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 + h), axis=0)
+    for i in range(h - 1):
+        faces.append([nv, nv + 2 + i, nv + 2 + i + 1])
+    faces.append([nv, nv + h + 1, nv + 1])
+    nv = len(vertices)
+    vertices = np.append(
+        vertices,
+        [
+            [w_half + f_thic, h_half + f_thic, f_near],
+            [-w_half - f_thic, h_half + f_thic, f_near],
+        ],
+        axis=0,
+    )
+    vertices = np.append(vertices, vertices_top_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 + w), axis=0)
+    for i in range(w - 1):
+        faces.append([nv, nv + 2 + i, nv + 2 + i + 1])
+    faces.append([nv, nv + 1, nv + 2])
+    nv = len(vertices)
+    vertices = np.append(
+        vertices,
+        [
+            [-w_half - f_thic, -h_half - f_thic, f_near],
+            [w_half + f_thic, -h_half - f_thic, f_near],
+        ],
+        axis=0,
+    )
+    vertices = np.append(vertices, vertices_bottom_frame, axis=0)
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 + w), axis=0)
+    for i in range(w - 1):
+        faces.append([nv, nv + 2 + i + 1, nv + 2 + i])
+    faces.append([nv, nv + 1, nv + w + 1])
+    # BACK frame
+    nv = len(vertices)
+    vertices = np.append(
+        vertices,
+        [
+            [-w_half - f_thic, -h_half - f_thic, f_far_outer],  # 00
+            [w_half + f_thic, -h_half - f_thic, f_far_outer],  # 01
+            [w_half + f_thic, h_half + f_thic, f_far_outer],  # 02
+            [-w_half - f_thic, h_half + f_thic, f_far_outer],  # 03
+        ],
+        axis=0,
+    )
+    faces.extend(
+        [
+            [nv + 0, nv + 2, nv + 1],
+            [nv + 2, nv + 0, nv + 3],
+        ]
+    )
+    colors = np.append(colors, [[0.5, 0.5, 0.5]] * 4, axis=0)
+    trimesh_kwargs = {}
+    if vertex_colors:
+        trimesh_kwargs["vertex_colors"] = colors
+    mesh = trimesh.Trimesh(vertices=vertices, faces=faces, **trimesh_kwargs)
+    mesh.merge_vertices()
+    current_max_dimension = max(mesh.extents)
+    scaling_factor = output_model_scale / current_max_dimension
+    mesh.apply_scale(scaling_factor)
+    if prepare_for_3d_printing:
+        rotation_mat = trimesh.transformations.rotation_matrix(np.radians(90), [-1, 0, 0])
+        mesh.apply_transform(rotation_mat)
+    path_out_base = os.path.splitext(path_depth)[0].replace("_16bit", "")
+    path_out_glb = path_out_base + ".glb"
+    path_out_stl = path_out_base + ".stl"
+    mesh.export(path_out_glb, file_type="glb")
+    if scene_lights:
+        glb_add_lights(path_out_glb, path_out_glb)
+    mesh.export(path_out_stl, file_type="stl")
+    return path_out_glb, path_out_stl

files/basrelief/coin.jpg ADDED Viewed

Git LFS Details

SHA256: d5295c5cb301ef73099e3dd91f80916e7b013f6b04d75759df57081b16a18adc
Pointer size: 131 Bytes
Size of remote file: 632 kB

files/basrelief/einstein.jpg ADDED Viewed

Git LFS Details

SHA256: d4a4543c0fffb2ca5ea3c17e23e88fcfcf66eae8b487173fbc5c25d0d614bdb6
Pointer size: 131 Bytes
Size of remote file: 367 kB

files/basrelief/food.jpeg ADDED Viewed

Git LFS Details

SHA256: a26151050a574b0dc0014e9c4806da3d6f6bc1297ee1035a16b9ace007a179af
Pointer size: 132 Bytes
Size of remote file: 1.04 MB

files/image/arc.jpeg ADDED Viewed

Git LFS Details

SHA256: f888e3770134e2073459026f58c568f7cf30524dd26a9182413c84b709e1b63e
Pointer size: 132 Bytes
Size of remote file: 1.01 MB

files/image/bee.jpg ADDED Viewed

Git LFS Details

SHA256: 7643ccdbc9550e2bf6ebdd5c768db5bc829ef719b0d1a91b4f6f9184b52f4751
Pointer size: 131 Bytes
Size of remote file: 146 kB

files/image/berries.jpeg ADDED Viewed

Git LFS Details

SHA256: dac1411ea48cf83b7a59c6424032f95b2ff496b3a98cdccf168bbed1c8f0aed4
Pointer size: 131 Bytes
Size of remote file: 940 kB

files/image/butterfly.jpeg ADDED Viewed

Git LFS Details

SHA256: e0364b8eec31d2c113c15c2b6c892754130765e8e2c960adc87d51ca5c0ea8f9
Pointer size: 131 Bytes
Size of remote file: 878 kB

files/image/cat.jpg ADDED Viewed

Git LFS Details

SHA256: 794796a86e56a4b372287661dc934daa2d15e988d01afe88afc50b32644c007a
Pointer size: 131 Bytes
Size of remote file: 236 kB

files/image/concert.jpeg ADDED Viewed

Git LFS Details

SHA256: fc746e234cb8a3e483999ee4c4f4d22b4e6c48cb2655eaa47c0936f3a37b61dc
Pointer size: 131 Bytes
Size of remote file: 420 kB

files/image/dog.jpeg ADDED Viewed

Git LFS Details

SHA256: c932a965dfe63c8c6dbc1bb48f7ea245a6a6dd2fb40fd243545e908b3aa7aa62
Pointer size: 131 Bytes
Size of remote file: 672 kB

files/image/doughnuts.jpeg ADDED Viewed

Git LFS Details

SHA256: 2ede4170b4a17f0c076c1a336eb4d3c03d64688997a986e3a8101972016b799a
Pointer size: 131 Bytes
Size of remote file: 607 kB

files/image/einstein.jpg ADDED Viewed

Git LFS Details

SHA256: d4a4543c0fffb2ca5ea3c17e23e88fcfcf66eae8b487173fbc5c25d0d614bdb6
Pointer size: 131 Bytes
Size of remote file: 367 kB

files/image/food.jpeg ADDED Viewed

Git LFS Details

SHA256: a26151050a574b0dc0014e9c4806da3d6f6bc1297ee1035a16b9ace007a179af
Pointer size: 132 Bytes
Size of remote file: 1.04 MB

files/image/glasses.jpeg ADDED Viewed

Git LFS Details

SHA256: de8c0c20adb7c187357c21e467d3f178888574962027cdd366c390b63913ffec
Pointer size: 131 Bytes
Size of remote file: 677 kB

files/image/house.jpg ADDED Viewed

Git LFS Details

SHA256: 4087027e84a6323099fc839fd0b6816fd614814e92d12df21051cff3ed472819
Pointer size: 133 Bytes
Size of remote file: 14.9 MB

files/image/lake.jpeg ADDED Viewed

Git LFS Details

SHA256: 181dc0f684f0f3b94bc4bec829becd3dec817f69032731edf55ee8370c6898f0
Pointer size: 132 Bytes
Size of remote file: 1.03 MB

files/image/marigold.jpeg ADDED Viewed

Git LFS Details

SHA256: 575c1a7bc1199d86b5ec305b4efc12286842dee4a189e8699dcf8a6d0276807c
Pointer size: 131 Bytes
Size of remote file: 416 kB

files/image/portrait_1.jpeg ADDED Viewed

Git LFS Details

SHA256: 76e3ad74311975f0db43cdebd4202d1464e19b6950cc3e7c5aa0a160f95493c3
Pointer size: 131 Bytes
Size of remote file: 506 kB

files/image/portrait_2.jpeg ADDED Viewed

Git LFS Details

SHA256: 805ad1127b0d9d09068df70e3ab7aa7450ff802fa5464db8430787dfee1ec6a0
Pointer size: 131 Bytes
Size of remote file: 525 kB

files/image/pumpkins.jpg ADDED Viewed

Git LFS Details

SHA256: 92f03bc05dc882231bce735f2afb8c27eb9d0616166abe3794b39ff24314fd0a
Pointer size: 133 Bytes
Size of remote file: 11.3 MB

files/image/puzzle.jpeg ADDED Viewed

Git LFS Details

SHA256: 60b66432124a0936c6143301a9f9b793af4184bc9340c567d11fdd5a22cc98cc
Pointer size: 131 Bytes
Size of remote file: 374 kB

files/image/road.jpg ADDED Viewed

Git LFS Details

SHA256: 58bb01aea37f6e1206260eddb6d003589d779e8b3fb3ef0a0f1e2e38a8fa3925
Pointer size: 133 Bytes
Size of remote file: 13.1 MB

files/image/scientists.jpg ADDED Viewed

Git LFS Details

SHA256: 7b164dfbc4ab6e491ce81972b8c0e076fdc4af622289d0aa3cb43ee3c2be4030
Pointer size: 131 Bytes
Size of remote file: 444 kB

files/image/surfboards.jpeg ADDED Viewed

Git LFS Details

SHA256: 326f9ffd3b85b29b971205eb87c2d0c9b5e4409b496be1eb961b46d5f7c5d6c6
Pointer size: 132 Bytes
Size of remote file: 1.16 MB

files/image/surfer.jpeg ADDED Viewed

Git LFS Details

SHA256: 52827abf2c3951b752d4e58c88fff7ab907672c58fda70b813df3922650c7495
Pointer size: 132 Bytes
Size of remote file: 1.01 MB

files/image/swings.jpg ADDED Viewed

Git LFS Details

SHA256: cae2ac669c948313eae8aca53017f10b64b42f87c53b9c34639962b218fdf1f1
Pointer size: 131 Bytes
Size of remote file: 353 kB

files/image/switzerland.jpeg ADDED Viewed

Git LFS Details

SHA256: 81e35ba90f7736167ea3e8a0a58f932ecded07b00b012a5bd7df5dabbe0eb3ce
Pointer size: 131 Bytes
Size of remote file: 847 kB

files/image/teamwork.jpeg ADDED Viewed

Git LFS Details

SHA256: 3cd48af8f3db4d89760cd6f40f2716570e697ae74a9bd88ed1ba36c0e68326b3
Pointer size: 131 Bytes
Size of remote file: 700 kB

files/image/wave.jpeg ADDED Viewed

Git LFS Details

SHA256: 7f14e77f7990d75104d6e3447077eb176d6437c58f5fb0fffcdb6015193b2d03
Pointer size: 132 Bytes
Size of remote file: 1.07 MB

files/video/cab.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7857328de30257e2985e0218e18e35f0dbc6ca9dd9f89b28687881f13ca0a4a
+size 3268179

files/video/elephant.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d198ec2e3e5a308c5eeb18c9f3a882f6c5812d329d9e8497e1bf79ff466dd84
+size 3078416

files/video/obama.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4aa0ac19460e0966139247cc180f98398fb11a35e3ca5c90cb70f0c4704904de
+size 955458

marigold_depth_estimation_lcm.py ADDED Viewed

	@@ -0,0 +1,702 @@

+# Copyright 2024 Anton Obukhov, Bingxin Ke, ETH Zurich and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+import math
+from typing import Dict, Union, Tuple
+import matplotlib
+import numpy as np
+import torch
+from PIL import Image
+from scipy.optimize import minimize
+from torch.utils.data import DataLoader, TensorDataset
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils import BaseOutput, check_min_version
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.27.0.dev0")
+class MarigoldDepthConsistencyOutput(BaseOutput):
+    """
+    Output class for Marigold monocular depth prediction pipeline.
+    Args:
+        depth_np (`np.ndarray`):
+            Predicted depth map, with depth values in the range of [0, 1].
+        depth_colored (`None` or `PIL.Image.Image`):
+            Colorized depth map as an image built from 8-bit color values.
+            `None` when no color map was requested.
+        depth_latent (`torch.Tensor`):
+            Depth map's latent, with the shape of [4, h, w].
+        uncertainty (`None` or `np.ndarray`):
+            Uncalibrated uncertainty (MAD, median absolute deviation) coming from ensembling.
+            `None` when no ensembling was performed.
+    """
+    depth_np: np.ndarray
+    depth_colored: Union[None, Image.Image]
+    depth_latent: torch.Tensor
+    uncertainty: Union[None, np.ndarray]
+class MarigoldDepthConsistencyPipeline(DiffusionPipeline):
+    """
+    Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        unet (`UNet2DConditionModel`):
+            Conditional U-Net to denoise the depth latent, conditioned on image latent.
+        vae (`AutoencoderKL`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps
+            to and from latent representations.
+        scheduler (`DDIMScheduler`):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        text_encoder (`CLIPTextModel`):
+            Text-encoder, for empty text embedding.
+        tokenizer (`CLIPTokenizer`):
+            CLIP tokenizer.
+    """
+    rgb_latent_scale_factor = 0.18215
+    depth_latent_scale_factor = 0.18215
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: DDIMScheduler,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+    ):
+        super().__init__()
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.empty_text_embed = None
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_image: Image,
+        denoising_steps: int = 1,
+        ensemble_size: int = 1,
+        processing_res: int = 768,
+        match_input_res: bool = True,
+        batch_size: int = 0,
+        depth_latent_init: torch.Tensor = None,
+        depth_latent_init_strength: float = 0.1,
+        seed: int = None,
+        color_map: str = "Spectral",
+        show_progress_bar: bool = True,
+        ensemble_kwargs: Dict = None,
+    ) -> MarigoldDepthConsistencyOutput:
+        """
+        Function invoked when calling the pipeline.
+        Args:
+            input_image (`Image`):
+                Input RGB (or gray-scale) image.
+            processing_res (`int`, *optional*, defaults to `768`):
+                Maximum resolution of processing.
+                If set to 0: will not resize at all.
+            match_input_res (`bool`, *optional*, defaults to `True`):
+                Resize depth prediction to match input resolution.
+                Only valid if `limit_input_res` is not None.
+            denoising_steps (`int`, *optional*, defaults to `1`):
+                Number of diffusion denoising steps (consistency) during inference.
+            ensemble_size (`int`, *optional*, defaults to `1`):
+                Number of predictions to be ensembled.
+            batch_size (`int`, *optional*, defaults to `0`):
+                Inference batch size, no bigger than `num_ensemble`.
+                If set to 0, the script will automatically decide the proper batch size.
+            depth_latent_init (`torch.Tensor`, *optional*, defaults to `None`):
+                Initial depth map latent for better temporal consistency.
+            depth_latent_init_strength (`float`, *optional*, defaults to `0.1`)
+                Degree of initial depth latent influence, must be between 0 and 1.
+            seed (`int`, *optional*, defaults to `None`)
+                Reproducibility seed.
+            show_progress_bar (`bool`, *optional*, defaults to `True`):
+                Display a progress bar of diffusion denoising.
+            color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
+                Colormap used to colorize the depth map.
+            ensemble_kwargs (`dict`, *optional*, defaults to `None`):
+                Arguments for detailed ensembling settings.
+        Returns:
+            `MarigoldDepthConsistencyOutput`: Output class for Marigold monocular depth prediction pipeline, including:
+            - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
+            - **depth_colored** (`None` or `PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and
+                    values in [0, 1]. None if `color_map` is `None`
+            - **depth_latent** (`torch.Tensor`) Predicted depth map latent
+            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
+                    coming from ensembling. None if `ensemble_size = 1`
+        """
+        device = self.device
+        input_size = input_image.size
+        if not match_input_res:
+            assert (
+                processing_res is not None
+            ), "Value error: `resize_output_back` is only valid with "
+        assert processing_res >= 0, "Value error: `processing_res` must be non-negative"
+        assert (
+            1 <= denoising_steps <= 10
+        ), "Value error: This model degrades with large number of steps"
+        assert ensemble_size >= 1
+        # ----------------- Image Preprocess -----------------
+        # Resize image
+        if processing_res > 0:
+            input_image = self.resize_max_res(
+                input_image, max_edge_resolution=processing_res
+            )
+        # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel
+        input_image = input_image.convert("RGB")
+        image = np.asarray(input_image)
+        # Normalize rgb values
+        rgb = np.transpose(image, (2, 0, 1))  # [H, W, rgb] -> [rgb, H, W]
+        rgb_norm = rgb / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]
+        rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
+        rgb_norm = rgb_norm.to(device)
+        assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
+        # ----------------- Predicting depth -----------------
+        # Batch repeated input image
+        duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
+        batch_dataset = TensorDataset(duplicated_rgb)
+        if batch_size > 0:
+            _bs = batch_size
+        else:
+            _bs = self._find_batch_size(
+                ensemble_size=ensemble_size,
+                input_res=max(duplicated_rgb.shape[-2:]),
+                dtype=self.dtype,
+            )
+        batch_loader = DataLoader(batch_dataset, batch_size=_bs, shuffle=False)
+        # Predict depth maps (batched)
+        depth_pred_ls = []
+        if show_progress_bar:
+            iterable = tqdm(
+                batch_loader, desc=" " * 2 + "Inference batches", leave=False
+            )
+        else:
+            iterable = batch_loader
+        depth_latent = None
+        for batch in iterable:
+            (batched_img,) = batch
+            depth_pred_raw, depth_latent = self.single_infer(
+                rgb_in=batched_img,
+                num_inference_steps=denoising_steps,
+                depth_latent_init=depth_latent_init,
+                depth_latent_init_strength=depth_latent_init_strength,
+                seed=seed,
+                show_pbar=show_progress_bar,
+            )
+            depth_pred_ls.append(depth_pred_raw.detach())
+        depth_preds = torch.concat(depth_pred_ls, dim=0).squeeze()
+        torch.cuda.empty_cache()  # clear vram cache for ensembling
+        # ----------------- Test-time ensembling -----------------
+        if ensemble_size > 1:
+            depth_pred, pred_uncert = self.ensemble_depths(
+                depth_preds, **(ensemble_kwargs or {})
+            )
+        else:
+            depth_pred = depth_preds
+            pred_uncert = None
+        # ----------------- Post processing -----------------
+        # Scale prediction to [0, 1]
+        min_d = torch.min(depth_pred)
+        max_d = torch.max(depth_pred)
+        depth_pred = (depth_pred - min_d) / (max_d - min_d)
+        if ensemble_size > 1:
+            depth_latent = self._encode_depth(2 * depth_pred - 1)
+        # Convert to numpy
+        depth_pred = depth_pred.cpu().numpy().astype(np.float32)
+        # Resize back to original resolution
+        if match_input_res:
+            pred_img = Image.fromarray(depth_pred)
+            pred_img = pred_img.resize(input_size)
+            depth_pred = np.asarray(pred_img)
+        # Clip output range
+        depth_pred = depth_pred.clip(0, 1)
+        # Colorize
+        if color_map is not None:
+            depth_colored = self.colorize_depth_maps(
+                depth_pred, 0, 1, cmap=color_map
+            ).squeeze()  # [3, H, W], value in (0, 1)
+            depth_colored = (depth_colored * 255).astype(np.uint8)
+            depth_colored_hwc = self.chw2hwc(depth_colored)
+            depth_colored_img = Image.fromarray(depth_colored_hwc)
+        else:
+            depth_colored_img = None
+        return MarigoldDepthConsistencyOutput(
+            depth_np=depth_pred,
+            depth_colored=depth_colored_img,
+            depth_latent=depth_latent,
+            uncertainty=pred_uncert,
+        )
+    def _encode_empty_text(self):
+        """
+        Encode text embedding for empty prompt.
+        """
+        prompt = ""
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="do_not_pad",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
+        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
+    @torch.no_grad()
+    def single_infer(
+        self,
+        rgb_in: torch.Tensor,
+        num_inference_steps: int,
+        depth_latent_init: torch.Tensor,
+        depth_latent_init_strength: float,
+        seed: int,
+        show_pbar: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Perform an individual depth prediction without ensembling.
+        Args:
+            rgb_in (`torch.Tensor`):
+                Input RGB image.
+            num_inference_steps (`int`):
+                Number of diffusion denoisign steps (DDIM) during inference.
+            depth_latent_init (`torch.Tensor`, `optional`):
+                Initial depth latent
+            depth_latent_init_strength (`float`, `optional`):
+                Degree of initial depth latent influence, must be between 0 and 1
+            seed (`int`, *optional*, defaults to `None`)
+                Reproducibility seed.
+            show_pbar (`bool`):
+                Display a progress bar of diffusion denoising.
+        Returns:
+            `torch.Tensor`: Predicted depth map.
+        """
+        device = rgb_in.device
+        # Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps  # [T]
+        # Encode image
+        rgb_latent = self._encode_rgb(rgb_in)
+        # Initial depth map (noise)
+        if seed is None:
+            rng = None
+        else:
+            rng = torch.Generator(device=device)
+            rng.manual_seed(seed)
+        depth_latent = torch.randn(
+            rgb_latent.shape, device=device, dtype=self.dtype, generator=rng
+        )  # [B, 4, h, w]
+        if depth_latent_init is not None:
+            assert 0.0 <= depth_latent_init_strength <= 1.0
+            assert (
+                depth_latent_init.dim() == 4
+                and depth_latent.dim() == 4
+                and depth_latent_init.shape[0] == 1
+            )
+            if depth_latent.shape[0] != 1:
+                depth_latent_init = depth_latent_init.repeat(
+                    depth_latent.shape[0], 1, 1, 1
+                )
+            depth_latent *= 1.0 - depth_latent_init_strength
+            depth_latent = depth_latent + depth_latent_init * depth_latent_init_strength
+        # Batched empty text embedding
+        if self.empty_text_embed is None:
+            self._encode_empty_text()
+        batch_empty_text_embed = self.empty_text_embed.repeat(
+            (rgb_latent.shape[0], 1, 1)
+        )  # [B, 2, 1024]
+        # Denoising loop
+        if show_pbar:
+            iterable = tqdm(
+                enumerate(timesteps),
+                total=len(timesteps),
+                leave=False,
+                desc=" " * 4 + "Diffusion denoising",
+            )
+        else:
+            iterable = enumerate(timesteps)
+        for i, t in iterable:
+            unet_input = torch.cat(
+                [rgb_latent, depth_latent], dim=1
+            )  # this order is important
+            # predict the noise residual
+            noise_pred = self.unet(
+                unet_input, t, encoder_hidden_states=batch_empty_text_embed
+            ).sample  # [B, 4, h, w]
+            # compute the previous noisy sample x_t -> x_t-1
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
+        depth = self._decode_depth(depth_latent)
+        # clip prediction
+        depth = torch.clip(depth, -1.0, 1.0)
+        # shift to [0, 1]
+        depth = (depth + 1.0) / 2.0
+        return depth, depth_latent
+    def _encode_depth(self, depth_in: torch.Tensor) -> torch.Tensor:
+        """
+        Encode depth image into latent.
+        Args:
+            depth_in (`torch.Tensor`):
+                Input Depth image to be encoded.
+        Returns:
+            `torch.Tensor`: Depth latent.
+        """
+        # encode
+        dims = depth_in.squeeze().shape
+        h = self.vae.encoder(depth_in.reshape(1, 1, *dims).repeat(1, 3, 1, 1))
+        moments = self.vae.quant_conv(h)
+        mean, _ = torch.chunk(moments, 2, dim=1)
+        depth_latent = mean * self.depth_latent_scale_factor
+        return depth_latent
+    def _encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
+        """
+        Encode RGB image into latent.
+        Args:
+            rgb_in (`torch.Tensor`):
+                Input RGB image to be encoded.
+        Returns:
+            `torch.Tensor`: Image latent.
+        """
+        # encode
+        h = self.vae.encoder(rgb_in)
+        moments = self.vae.quant_conv(h)
+        mean, logvar = torch.chunk(moments, 2, dim=1)
+        # scale latent
+        rgb_latent = mean * self.rgb_latent_scale_factor
+        return rgb_latent
+    def _decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
+        """
+        Decode depth latent into depth map.
+        Args:
+            depth_latent (`torch.Tensor`):
+                Depth latent to be decoded.
+        Returns:
+            `torch.Tensor`: Decoded depth map.
+        """
+        # scale latent
+        depth_latent = depth_latent / self.depth_latent_scale_factor
+        # decode
+        z = self.vae.post_quant_conv(depth_latent)
+        stacked = self.vae.decoder(z)
+        # mean of output channels
+        depth_mean = stacked.mean(dim=1, keepdim=True)
+        return depth_mean
+    @staticmethod
+    def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
+        """
+        Resize image to limit maximum edge length while keeping aspect ratio.
+        Args:
+            img (`Image.Image`):
+                Image to be resized.
+            max_edge_resolution (`int`):
+                Maximum edge length (pixel).
+        Returns:
+            `Image.Image`: Resized image.
+        """
+        original_width, original_height = img.size
+        downscale_factor = min(
+            max_edge_resolution / original_width, max_edge_resolution / original_height
+        )
+        new_width = int(original_width * downscale_factor)
+        new_height = int(original_height * downscale_factor)
+        resized_img = img.resize((new_width, new_height))
+        return resized_img
+    @staticmethod
+    def colorize_depth_maps(
+        depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
+    ):
+        """
+        Colorize depth maps.
+        """
+        assert len(depth_map.shape) >= 2, "Invalid dimension"
+        if isinstance(depth_map, torch.Tensor):
+            depth = depth_map.detach().squeeze().numpy()
+        elif isinstance(depth_map, np.ndarray):
+            depth = depth_map.copy().squeeze()
+        # reshape to [ (B,) H, W ]
+        if depth.ndim < 3:
+            depth = depth[np.newaxis, :, :]
+        # colorize
+        cm = matplotlib.colormaps[cmap]
+        depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
+        img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3]  # value from 0 to 1
+        img_colored_np = np.rollaxis(img_colored_np, 3, 1)
+        if valid_mask is not None:
+            if isinstance(depth_map, torch.Tensor):
+                valid_mask = valid_mask.detach().numpy()
+            valid_mask = valid_mask.squeeze()  # [H, W] or [B, H, W]
+            if valid_mask.ndim < 3:
+                valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
+            else:
+                valid_mask = valid_mask[:, np.newaxis, :, :]
+            valid_mask = np.repeat(valid_mask, 3, axis=1)
+            img_colored_np[~valid_mask] = 0
+        if isinstance(depth_map, torch.Tensor):
+            img_colored = torch.from_numpy(img_colored_np).float()
+        elif isinstance(depth_map, np.ndarray):
+            img_colored = img_colored_np
+        return img_colored
+    @staticmethod
+    def chw2hwc(chw):
+        assert 3 == len(chw.shape)
+        if isinstance(chw, torch.Tensor):
+            hwc = torch.permute(chw, (1, 2, 0))
+        elif isinstance(chw, np.ndarray):
+            hwc = np.moveaxis(chw, 0, -1)
+        return hwc
+    @staticmethod
+    def _find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
+        """
+        Automatically search for suitable operating batch size.
+        Args:
+            ensemble_size (`int`):
+                Number of predictions to be ensembled.
+            input_res (`int`):
+                Operating resolution of the input image.
+        Returns:
+            `int`: Operating batch size.
+        """
+        # Search table for suggested max. inference batch size
+        bs_search_table = [
+            # tested on A100-PCIE-80GB
+            {"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
+            # tested on A100-PCIE-40GB
+            {"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
+            {"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
+            # tested on RTX3090, RTX4090
+            {"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
+            {"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
+            {"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
+            {"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
+            # tested on GTX1080Ti
+            {"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
+            {"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
+            {"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
+            {"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
+        ]
+        if not torch.cuda.is_available():
+            return 1
+        total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
+        filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
+        for settings in sorted(
+            filtered_bs_search_table,
+            key=lambda k: (k["res"], -k["total_vram"]),
+        ):
+            if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
+                bs = settings["bs"]
+                if bs > ensemble_size:
+                    bs = ensemble_size
+                elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
+                    bs = math.ceil(ensemble_size / 2)
+                return bs
+        return 1
+    @staticmethod
+    def ensemble_depths(
+        input_images: torch.Tensor,
+        regularizer_strength: float = 0.02,
+        max_iter: int = 2,
+        tol: float = 1e-3,
+        reduction: str = "median",
+        max_res: int = None,
+    ):
+        """
+        To ensemble multiple affine-invariant depth images (up to scale and shift),
+            by aligning estimating the scale and shift
+        """
+        def inter_distances(tensors: torch.Tensor):
+            """
+            To calculate the distance between each two depth maps.
+            """
+            distances = []
+            for i, j in torch.combinations(torch.arange(tensors.shape[0])):
+                arr1 = tensors[i : i + 1]
+                arr2 = tensors[j : j + 1]
+                distances.append(arr1 - arr2)
+            dist = torch.concatenate(distances, dim=0)
+            return dist
+        device = input_images.device
+        dtype = input_images.dtype
+        np_dtype = np.float32
+        original_input = input_images.clone()
+        n_img = input_images.shape[0]
+        ori_shape = input_images.shape
+        if max_res is not None:
+            scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:]))
+            if scale_factor < 1:
+                downscaler = torch.nn.Upsample(
+                    scale_factor=scale_factor, mode="nearest"
+                )
+                input_images = downscaler(torch.from_numpy(input_images)).numpy()
+        # init guess
+        _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
+        _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
+        s_init = 1.0 / (_max - _min).reshape((-1, 1, 1))
+        t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1))
+        x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype)
+        input_images = input_images.to(device)
+        # objective function
+        def closure(x):
+            l = len(x)
+            s = x[: int(l / 2)]
+            t = x[int(l / 2) :]
+            s = torch.from_numpy(s).to(dtype=dtype).to(device)
+            t = torch.from_numpy(t).to(dtype=dtype).to(device)
+            transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1))
+            dists = inter_distances(transformed_arrays)
+            sqrt_dist = torch.sqrt(torch.mean(dists**2))
+            if "mean" == reduction:
+                pred = torch.mean(transformed_arrays, dim=0)
+            elif "median" == reduction:
+                pred = torch.median(transformed_arrays, dim=0).values
+            else:
+                raise ValueError
+            near_err = torch.sqrt((0 - torch.min(pred)) ** 2)
+            far_err = torch.sqrt((1 - torch.max(pred)) ** 2)
+            err = sqrt_dist + (near_err + far_err) * regularizer_strength
+            err = err.detach().cpu().numpy().astype(np_dtype)
+            return err
+        res = minimize(
+            closure,
+            x,
+            method="BFGS",
+            tol=tol,
+            options={"maxiter": max_iter, "disp": False},
+        )
+        x = res.x
+        l = len(x)
+        s = x[: int(l / 2)]
+        t = x[int(l / 2) :]
+        # Prediction
+        s = torch.from_numpy(s).to(dtype=dtype).to(device)
+        t = torch.from_numpy(t).to(dtype=dtype).to(device)
+        transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1)
+        if "mean" == reduction:
+            aligned_images = torch.mean(transformed_arrays, dim=0)
+            std = torch.std(transformed_arrays, dim=0)
+            uncertainty = std
+        elif "median" == reduction:
+            aligned_images = torch.median(transformed_arrays, dim=0).values
+            # MAD (median absolute deviation) as uncertainty indicator
+            abs_dev = torch.abs(transformed_arrays - aligned_images)
+            mad = torch.median(abs_dev, dim=0).values
+            uncertainty = mad
+        else:
+            raise ValueError(f"Unknown reduction method: {reduction}")
+        # Scale and shift to [0, 1]
+        _min = torch.min(aligned_images)
+        _max = torch.max(aligned_images)
+        aligned_images = (aligned_images - _min) / (_max - _min)
+        uncertainty /= _max - _min
+        return aligned_images, uncertainty

marigold_logo_square.jpg ADDED Viewed

Git LFS Details

SHA256: bd5f1e527678fc913aee17ab69831551cfdb2934f673e9e97a7f011103b63c9e
Pointer size: 130 Bytes
Size of remote file: 76 kB

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+gradio==4.22.0
+gradio-imageslider==0.0.16
+pygltflib==1.16.1
+trimesh==4.0.5
+imageio
+imageio-ffmpeg
+Pillow
+accelerate>=0.22.0
+diffusers==0.27.2
+matplotlib==3.8.2
+scipy==1.11.4
+torch==2.0.1
+transformers>=4.32.1
+xformers>=0.0.21