Commit 390338e
Parent: d5a5fa0
Refactor inference configuration and pipeline logic; removed unused parameters and improved the frame-selection process. Updated inference settings in inference.yaml and streamlined surfel model initialization in pipeline.py.
Files changed:
- configs/inference/inference.yaml (+2 -22)
- modeling/pipeline.py (+88 -243)
configs/inference/inference.yaml (CHANGED)
@@ -4,20 +4,15 @@ model:
   width: 576
   original_height: 288
   original_width: 512
-
-  # pretrained_model_path: "stabilityai/stable-diffusion-2-1"
-  # pretrained_video_model_path: "stabilityai/stable-video-diffusion-img2vid"
+
 
   context_num_frames: 4
   target_num_frames: 4
   num_frames: 8
   vae_spatial_scale: 8
   latent_channels: 4
-  # num_ray_blocks: 2
   vae_scale_factor: 8
-  inference_mode: false
 
-  temporal_only: false
   use_non_maximum_suppression: true
   translation_distance_weight: 0.1
 
@@ -26,14 +21,7 @@ model:
   cfg_min: 1.2
   cfg: 2.0
   guider_types: 1
-
   samples_dir: "./visualization"
-  save_flag: false
-  use_wandb: false
-
-
-
-  # model_path: "/homes/55/runjia/storage/simview_weights/2025-04-30_12-08-55/checkpoint_230000.pth"
   model_path: "liguang0115/vmem"
 
 
@@ -45,7 +33,7 @@ surfel:
   merge_position_threshold: 0.2
   merge_normal_threshold: 0.6
   lr: 0.01
-  niter:
+  niter: 400
   model_path: "liguang0115/cut3r"
   width: 512
   height: 288
@@ -54,14 +42,6 @@ inference:
   visualize: true
   visualize_pointcloud: false
   visualize_surfel: false
-  save_surfels: false
-  image_dir: "/homes/55/runjia/storage/realestate10k/video_data/test"
-  meta_info_dir: "/homes/55/runjia/storage/realestate10k/RealEstate10K/test"
-
-
-
-
-
 
 
 
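Side note: pipeline.py reads this file through attribute-style access (self.config.model.context_num_frames, self.config.surfel.niter), which is consistent with an OmegaConf-style loader, though the actual loading code is not part of this diff. A minimal sketch of consuming the updated config under that assumption:

    # Minimal sketch; assumes an OmegaConf-style loader (not shown in this diff).
    from omegaconf import OmegaConf

    config = OmegaConf.load("configs/inference/inference.yaml")

    # Values present after this commit:
    print(config.model.context_num_frames)  # 4
    print(config.model.num_frames)          # 8
    print(config.surfel.niter)              # 400
    print(config.surfel.model_path)         # "liguang0115/cut3r"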
modeling/pipeline.py (CHANGED)
@@ -4,27 +4,19 @@ from copy import deepcopy
 
 import math
 
-# import matplotlib.pyplot as plt
-# from mpl_toolkits.mplot3d.art3d import Poly3DCollection
-
 import PIL
-from PIL import Image, ImageOps
 import numpy as np
 from einops import repeat
-
+
 
 import torch
 import torch.nn.functional as F
-from torch.amp import autocast
 import torchvision.transforms as tvf
 
 
-# from diffusers import AutoencoderKL, DiffusionPipeline
-# from diffusers.schedulers import DDIMScheduler
 from diffusers.utils import export_to_gif
 
 import sys
-# Add CUT3R to Python path for imports
 sys.path.append("./extern/CUT3R")
 from extern.CUT3R.surfel_inference import run_inference_from_pil
 from extern.CUT3R.add_ckpt_path import add_path_to_dust3r
@@ -91,32 +83,23 @@ class VMemPipeline:
         self.device = device
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Import CUT3R scene alignment module
-        from extern.CUT3R.cloud_opt.dust3r_opt import global_aligner, GlobalAlignerMode
-        self.GlobalAlignerMode = GlobalAlignerMode
-        self.global_aligner = global_aligner
+        surfel_model_path = hf_hub_download(repo_id=self.config.surfel.model_path, filename="cut3r_512_dpt_4_64.pth")
+        print(f"Loading model from {surfel_model_path}...")
+        add_path_to_dust3r(surfel_model_path)
+        self.surfel_model = ARCroco3DStereo.from_pretrained(surfel_model_path).to(device)
+        self.surfel_model.eval()
+
+
+        # Import CUT3R scene alignment module
+        from extern.CUT3R.cloud_opt.dust3r_opt import global_aligner, GlobalAlignerMode
+        self.GlobalAlignerMode = GlobalAlignerMode
+        self.global_aligner = global_aligner
 
 
 
-
-        self.surfel_model = None
+
 
 
-
-        self.temporal_only = self.config.model.temporal_only
         self.use_non_maximum_suppression = self.config.model.use_non_maximum_suppression
 
         self.context_num_frames = self.config.model.context_num_frames
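The constructor now initializes the surfel model eagerly instead of leaving self.surfel_model = None for deferred setup. hf_hub_download is the standard huggingface_hub helper: it fetches one file from a Hub repo and returns its local cache path, so repeated constructions reuse the cached checkpoint. A minimal sketch of just that resolution step (ARCroco3DStereo itself comes from the CUT3R extern and is not shown in this diff):

    from huggingface_hub import hf_hub_download

    # First call downloads the file; later calls return the cached local path.
    ckpt_path = hf_hub_download(repo_id="liguang0115/cut3r",
                                filename="cut3r_512_dpt_4_64.pth")
    print(ckpt_path)  # local path inside the Hugging Face cache directory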
@@ -537,33 +520,58 @@ class VMemPipeline:
             embeddings = [torch.from_numpy(self.encoder_embeddings[i]).to(self.device, self.dtype) for i in indices]
             intrinsics = [self.Ks[i] for i in indices]
             return c2ws, latents, embeddings, intrinsics, indices
-
-        if self.temporal_only:
-
-
-
-
-
-
-
-
-
+
+        if len(self.pil_frames) == 1:
+            context_time_indices = [0]
+        else:
+            # get the average camera pose
+            average_c2w = average_camera_pose(target_c2ws[-self.config.model.context_num_frames//4:])
+            transformed_average_c2w = self.get_transformed_c2ws(average_c2w)
+            target_K = np.mean(self.surfel_Ks, axis=0)
+            # Select frames using surfel-based relevance
+            retrieved_info = self.render_surfels_to_image(
+                self.surfels,
+                transformed_average_c2w,
+                [target_K*0.65] * 2,
+                principal_points=(int(self.config.surfel.width/2), int(self.config.surfel.height/2)),
+                image_width=int(self.config.surfel.width),
+                image_height=int(self.config.surfel.height)
+            )
+            _, frame_count = self.process_retrieved_spatial_information(retrieved_info)
+            if self.config.inference.visualize:
+                visualize_depth(retrieved_info["depth"],
+                                visualization_dir=self.visualize_dir,
+                                file_name=f"retrieved_depth_surfels.png",
+                                size=(self.width, self.height))
+
+
+            # Build candidate frames based on relevance count
+            candidates = []
+            for frame, count in frame_count:
+                candidates.extend([frame] * count)
+            indices_to_frame = {
+                i: frame for i, frame in enumerate(candidates)
+            }
+
+            # Sort candidates by distance to target view
+            distances = [self.geodesic_distance(torch.from_numpy(average_c2w).to(self.device, self.dtype),
+                                                torch.from_numpy(self.c2ws[frame]).to(self.device, self.dtype),
+                                                weight_translation=self.config.model.translation_distance_weight).item()
+                         for frame in candidates]
 
-
-
-            max_frames = min(self.config.model.context_num_frames, len(
+            sorted_indices = torch.argsort(torch.tensor(distances))
+            sorted_frames = [indices_to_frame[int(i.item())] for i in sorted_indices]
+            max_frames = min(self.config.model.context_num_frames, len(candidates), len(self.latents))
 
-
-            is_first_step = len(self.pil_frames) <= 1
+
             is_second_step = len(self.pil_frames) == 5
-
+
 
             # Adaptively determine initial threshold based on camera pose distribution
             if use_non_maximum_suppression is None:
                 use_non_maximum_suppression = self.use_non_maximum_suppression
 
             if use_non_maximum_suppression:
-
                 if is_second_step:
                     # Calculate pairwise distances between existing frames
                     pairwise_distances = []
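The retained branch scores past frames by surfel relevance: frame_count pairs each frame index with how many rendered surfels it contributed, each frame is duplicated that many times in the candidate pool, and the expanded pool is sorted by pose distance to the averaged target pose. A self-contained sketch of that weighting-and-sorting step; pose_distance is a hypothetical stand-in for the pipeline's geodesic_distance:

    import numpy as np

    def rank_candidates(frame_count, poses, target_pose, pose_distance):
        # Duplicate each frame in proportion to its surfel relevance count,
        # so frequently hit frames dominate the candidate pool.
        candidates = []
        for frame, count in frame_count:
            candidates.extend([frame] * count)
        # Order the expanded pool by distance between each frame's camera
        # pose and the (averaged) target pose.
        distances = [pose_distance(target_pose, poses[f]) for f in candidates]
        order = np.argsort(distances)
        return [candidates[i] for i in order]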
@@ -581,32 +589,26 @@ class VMemPipeline:
                         pairwise_distances.sort()
                         percentile_idx = int(len(pairwise_distances) * 0.5)  # 25th percentile
                         self.initial_threshold = pairwise_distances[percentile_idx]
-
-                        # Ensure threshold is within reasonable bounds
-                        # initial_threshold = max(0.00, min(0.001, initial_threshold))
                     else:
-                        self.initial_threshold =
-
-
-
+                        self.initial_threshold = 1
+
+
+
             else:
                 self.initial_threshold = 1e8
-
-
 
             selected_indices = []
+            current_threshold = self.initial_threshold
+
+            # Always start with the closest pose
+            selected_indices.append(sorted_frames[0])
+            if not use_non_maximum_suppression:
+                selected_indices.append(len(self.c2ws) - 1)
 
             # Try with increasingly relaxed thresholds until we get enough frames
-            current_threshold
-            while len(selected_indices) < min_required_frames and current_threshold <= 1.0:
-                # Reset selection with new threshold
-                selected_indices = []
-
-                # Always start with the closest pose
-                selected_indices.append(sorted_indices[0])
-
+            while len(selected_indices) < max_frames and current_threshold >= 1e-5 and use_non_maximum_suppression:
                 # Try to add each subsequent pose in order of distance
-                for idx in sorted_indices[1:]:
+                for idx in sorted_frames[1:]:
                     if len(selected_indices) >= max_frames:
                         break
 
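geodesic_distance is used throughout the selection logic but is not defined in this diff. A common formulation for such a pose metric, and one consistent with the weight_translation parameter, combines the relative rotation angle with a weighted translation gap; the following is a sketch under that assumption, not the pipeline's actual implementation:

    import torch

    def geodesic_distance(c2w_a, c2w_b, weight_translation=0.1):
        # Rotation part: angle of the relative rotation, arccos((tr(R) - 1) / 2).
        R = c2w_a[:3, :3].T @ c2w_b[:3, :3]
        cos_angle = torch.clamp((torch.trace(R) - 1.0) / 2.0, -1.0, 1.0)
        angle = torch.arccos(cos_angle)
        # Translation part: Euclidean gap between camera centers, down-weighted.
        t_gap = torch.linalg.norm(c2w_a[:3, 3] - c2w_b[:3, 3])
        return angle + weight_translation * t_gap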
@@ -627,148 +629,22 @@ class VMemPipeline:
                         selected_indices.append(idx)
 
                 # If we still don't have enough frames, relax the threshold and try again
-                if len(selected_indices) <
-                current_threshold
+                if len(selected_indices) < max_frames:
+                    current_threshold /= 1.2
                 else:
                     break
 
             # If we still don't have enough frames, just take the top frames by distance
-            if len(selected_indices) <
+            if len(selected_indices) < max_frames:
                 available_indices = []
-                for idx in
+                for idx in sorted_frames:
                     if idx not in selected_indices:
                         available_indices.append(idx)
-                selected_indices.extend(available_indices[:
+                selected_indices.extend(available_indices[:max_frames-len(selected_indices)])
 
             # Convert to tensor and maintain original order (don't reverse)
-            context_time_indices = torch.
-
-
-        else:
-            if len(self.pil_frames) == 1:
-                context_time_indices = [0]
-            else:
-                # get the average camera pose
-                average_c2w = average_camera_pose(target_c2ws[-self.config.model.context_num_frames//4:])
-                transformed_average_c2w = self.get_transformed_c2ws(average_c2w)
-                target_K = np.mean(self.surfel_Ks, axis=0)
-                # Select frames using surfel-based relevance
-                retrieved_info = self.render_surfels_to_image(
-                    self.surfels,
-                    transformed_average_c2w,
-                    [target_K*0.65] * 2,
-                    principal_points=(int(self.config.surfel.width/2), int(self.config.surfel.height/2)),
-                    image_width=int(self.config.surfel.width),
-                    image_height=int(self.config.surfel.height)
-                )
-                _, frame_count = self.process_retrieved_spatial_information(retrieved_info)
-                if self.config.inference.visualize:
-                    visualize_depth(retrieved_info["depth"],
-                                    visualization_dir=self.visualize_dir,
-                                    file_name=f"retrieved_depth_surfels.png",
-                                    size=(self.width, self.height))
-
-
-                # Build candidate frames based on relevance count
-                candidates = []
-                for frame, count in frame_count:
-                    candidates.extend([frame] * count)
-                indices_to_frame = {
-                    i: frame for i, frame in enumerate(candidates)
-                }
-
-                # Sort candidates by distance to target view
-                distances = [self.geodesic_distance(torch.from_numpy(average_c2w).to(self.device, self.dtype),
-                                                    torch.from_numpy(self.c2ws[frame]).to(self.device, self.dtype),
-                                                    weight_translation=self.config.model.translation_distance_weight).item()
-                             for frame in candidates]
-
-                sorted_indices = torch.argsort(torch.tensor(distances))
-                sorted_frames = [indices_to_frame[int(i.item())] for i in sorted_indices]
-                max_frames = min(self.config.model.context_num_frames, len(candidates), len(self.latents))
-
-
-                is_second_step = len(self.pil_frames) == 5
-
-
-                # Adaptively determine initial threshold based on camera pose distribution
-                if use_non_maximum_suppression is None:
-                    use_non_maximum_suppression = self.use_non_maximum_suppression
-
-                if use_non_maximum_suppression:
-                    if is_second_step:
-                        # Calculate pairwise distances between existing frames
-                        pairwise_distances = []
-                        for i in range(len(self.c2ws)):
-                            for j in range(i+1, len(self.c2ws)):
-                                sim = self.geodesic_distance(
-                                    torch.from_numpy(np.array(self.c2ws[i])).to(self.device, self.dtype),
-                                    torch.from_numpy(np.array(self.c2ws[j])).to(self.device, self.dtype),
-                                    weight_translation=self.config.model.translation_distance_weight
-                                )
-                                pairwise_distances.append(sim.item())
-
-                        if pairwise_distances:
-                            # Sort distances and take percentile as threshold
-                            pairwise_distances.sort()
-                            percentile_idx = int(len(pairwise_distances) * 0.5)  # 25th percentile
-                            self.initial_threshold = pairwise_distances[percentile_idx]
-                        else:
-                            self.initial_threshold = 1
-
-
-
-                else:
-                    self.initial_threshold = 1e8
-
-                selected_indices = []
-                current_threshold = self.initial_threshold
-
-                # Always start with the closest pose
-                selected_indices.append(sorted_frames[0])
-                if not use_non_maximum_suppression:
-                    selected_indices.append(len(self.c2ws) - 1)
-
-                # Try with increasingly relaxed thresholds until we get enough frames
-                while len(selected_indices) < max_frames and current_threshold >= 1e-5 and use_non_maximum_suppression:
-                    # Try to add each subsequent pose in order of distance
-                    for idx in sorted_frames[1:]:
-                        if len(selected_indices) >= max_frames:
-                            break
-
-                        # Check if this candidate is sufficiently different from all selected frames
-                        is_too_similar = False
-                        for selected_idx in selected_indices:
-                            similarity = self.geodesic_distance(
-                                torch.from_numpy(np.array(self.c2ws[idx])).to(self.device, self.dtype),
-                                torch.from_numpy(np.array(self.c2ws[selected_idx])).to(self.device, self.dtype),
-                                weight_translation=self.config.model.translation_distance_weight
-                            )
-                            if similarity < current_threshold:
-                                is_too_similar = True
-                                break
-
-                        # Add to selected frames if not too similar to any existing selection
-                        if not is_too_similar:
-                            selected_indices.append(idx)
-
-                    # If we still don't have enough frames, relax the threshold and try again
-                    if len(selected_indices) < max_frames:
-                        current_threshold /= 1.2
-                    else:
-                        break
-
-                # If we still don't have enough frames, just take the top frames by distance
-                if len(selected_indices) < max_frames:
-                    available_indices = []
-                    for idx in sorted_frames:
-                        if idx not in selected_indices:
-                            available_indices.append(idx)
-                    selected_indices.extend(available_indices[:max_frames-len(selected_indices)])
-
-                # Convert to tensor and maintain original order (don't reverse)
-                context_time_indices = torch.from_numpy(np.array(selected_indices))
-            context_data = prepare_context_data(context_time_indices)
+            context_time_indices = torch.from_numpy(np.array(selected_indices))
+        context_data = prepare_context_data(context_time_indices)
 
         (context_c2ws, context_latents, context_encoder_embeddings, context_Ks, context_time_indices) = context_data
         print(f"context_time_indices: {context_time_indices}")
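Taken together, the selection that survives this refactor is a greedy non-maximum suppression over camera poses: accept a candidate only if it is at least current_threshold away from everything already selected, divide the threshold by 1.2 whenever a pass comes up short, and finally top up with the nearest remaining frames. A condensed, self-contained sketch of that loop (illustrative only; the real code also special-cases the first steps and the NMS-off path):

    def select_frames(sorted_frames, distance, max_frames, threshold):
        # sorted_frames: candidate frame indices, nearest to target first.
        # distance: callable returning the pose distance between two frames.
        selected = [sorted_frames[0]]  # always keep the closest pose
        while len(selected) < max_frames and threshold >= 1e-5:
            for idx in sorted_frames[1:]:
                if len(selected) >= max_frames:
                    break
                # Keep idx only if it is far enough from every selected frame.
                if idx not in selected and all(distance(idx, s) >= threshold for s in selected):
                    selected.append(idx)
            if len(selected) < max_frames:
                threshold /= 1.2  # relax and retry
            else:
                break
        # Fallback: top up with the nearest remaining candidates.
        for idx in sorted_frames:
            if len(selected) >= max_frames:
                break
            if idx not in selected:
                selected.append(idx)
        return selected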
@@ -992,11 +868,7 @@ class VMemPipeline:
         # Flip Y and Z components of camera poses to match dataset convention
         c2ws_transformed = self.get_transformed_c2ws()
 
-
-        if self.global_step == 10:
-            visualize = True
-        else:
-            visualize = False
+
         scene = run_inference_from_pil(
             input_images,
             self.surfel_model,
@@ -1004,8 +876,7 @@ class VMemPipeline:
             depths=torch.from_numpy(np.array(self.surfel_depths)) if len(self.surfel_depths) > 0 else None,
             lr = lr,
             niter = niter,
-
-            visualize=visualize,
+            visualize=self.config.inference.visualize_surfel,
             device=device,
         )
 
@@ -1043,12 +914,10 @@ class VMemPipeline:
         )
         confs = confs.squeeze(1)
 
-
-        # self.surfel_to_timestep = {}
+
         start_idx = 0 if len(self.surfels) == 0 else len(pointcloud) - self.config.model.target_num_frames
         end_idx = len(pointcloud)
-
-        # Create surfels for the current frame
+
         for frame_idx in range(start_idx, end_idx):
             surfels = self.pointmap_to_surfels(
                 pointmap=pointcloud[frame_idx],
@@ -1077,30 +946,6 @@ class VMemPipeline:
             for surfel_index in range(num_surfels):
                 self.surfel_to_timestep[surfel_start_index + surfel_index] = [frame_idx]
 
-            # Save surfels if configured
-            if self.config.inference.save_surfels and len(self.surfels) > 0:
-                positions = np.array([s.position for s in surfels], dtype=np.float32)
-                normals = np.array([s.normal for s in surfels], dtype=np.float32)
-                radii = np.array([s.radius for s in surfels], dtype=np.float32)
-                colors = np.array([s.color for s in surfels], dtype=np.float32)
-
-                np.savez(f"{self.config.visualization_dir}/surfels_added.npz",
-                         positions=positions,
-                         normals=normals,
-                         radii=radii,
-                         colors=colors)
-
-                positions = np.array([s.position for s in self.surfels], dtype=np.float32)
-                normals = np.array([s.normal for s in self.surfels], dtype=np.float32)
-                radii = np.array([s.radius for s in self.surfels], dtype=np.float32)
-                colors = np.array([s.color for s in self.surfels], dtype=np.float32)
-
-                np.savez(f"{self.config.visualization_dir}/surfels_original.npz",
-                         positions=positions,
-                         normals=normals,
-                         radii=radii,
-                         colors=colors)
-
             self.surfels.extend(surfels)
 
             if self.config.inference.visualize_surfel:
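This hunk deletes the save_surfels debug dump along with its config flag. The removed np.savez calls wrote flat arrays keyed positions / normals / radii / colors; a short sketch for reading an archive produced by the old code back in, should one still be lying around:

    import numpy as np

    data = np.load("surfels_original.npz")  # archive written by the removed code
    positions = data["positions"]  # (N, 3) float32 surfel centers
    normals   = data["normals"]    # (N, 3) float32 normals
    radii     = data["radii"]      # (N,)   float32 disc radii
    colors    = data["colors"]     # (N, 3) float32 colors
    print(positions.shape, radii.shape)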
@@ -1323,12 +1168,12 @@ class VMemPipeline:
         self.pil_frames[-1].save(f"{self.config.visualization_dir}/final_{len(self.pil_frames):07d}.png")
 
         # Update scene reconstruction if needed
-
-
-
-
-
-
+
+        self.construct_and_store_scene(self.pil_frames,
+                                       time_indices=context_time_indices,
+                                       niter=self.config.surfel.niter,
+                                       lr=self.config.surfel.lr,
+                                       device=self.device)
         self.global_step += 1
 
         if self.config.inference.visualize:
@@ -1386,9 +1231,9 @@ class VMemPipeline:
 
         # Handle surfels if using reconstructor
         self.global_step -= frames_to_remove
-
-
-
+
+        for _ in range(frames_to_remove):
+            self.surfel_depths.pop()
 
 
         # Find surfels that belong only to the removed timesteps