Spaces:

liguang0115
/

vmem

Running on L4

App Files Files Community

vmem / navigation.py

liguang0115

Add initial project structure with core files, configurations, and sample images

2df809d 14 days ago

raw

history blame

17.8 kB

	import numpy as np
	import torch
	from PIL import Image
	import argparse
	import os
	import json
	from typing import List, Optional, Tuple
	import scipy.spatial.transform as spt
	from omegaconf import OmegaConf


	from modeling.pipeline import VMemPipeline
	from utils import load_img_and_K, transform_img_and_K, get_default_intrinsics





	class Navigator:
	"""
	Navigator class for moving through a 3D scene with a virtual camera.
	Provides methods to move forward and turn left/right, generating new camera poses
	and rendering frames using VMemPipeline.
	"""
	def __init__(self, pipeline: VMemPipeline, step_size: float = 0.1, num_interpolation_frames: int = 4):
	"""
	Initialize the Navigator.

	Args:
	pipeline: The VMemPipeline used for rendering frames
	step_size: The distance to move forward with each step
	num_interpolation_frames: Number of frames to generate for each movement
	"""
	self.pipeline = pipeline
	self.step_size = step_size
	self.current_pose = None
	self.current_K = None
	self.frames = []
	self.num_interpolation_frames = num_interpolation_frames
	self.pose_history = [] # Store history of camera poses

	def initialize(self, image, initial_pose, initial_K):
	"""
	Initialize the navigator with an image and camera parameters.
	Uses the pipeline's initialize method to set up the state.

	Args:
	image: Initial image tensor
	initial_pose: Initial camera pose (4x4 camera-to-world matrix)
	initial_K: Initial camera intrinsics matrix

	Returns:
	The initial frame as PIL Image
	"""
	self.current_pose = initial_pose
	self.current_K = initial_K

	# Use the pipeline's initialize method
	initial_frame = self.pipeline.initialize(image, initial_pose, initial_K)
	self.frames = [initial_frame]

	# Save the initial pose
	self.pose_history.append({
	"file_path": f"images/frame_001.png",
	"transform_matrix": initial_pose.tolist() if isinstance(initial_pose, np.ndarray) else initial_pose
	})

	return initial_frame

	def initialize_pipeline(self, image_tensor, initial_pose, initial_K):
	"""
	Initialize the pipeline with the first image and camera pose.
	Deprecated: Use initialize() instead.

	Args:
	image_tensor: Initial image tensor [1, C, H, W]
	initial_pose: Initial camera pose (4x4 camera-to-world matrix)
	initial_K: Initial camera intrinsics matrix

	Returns:
	The generated frame as PIL Image
	"""
	return self.initialize(image_tensor, initial_pose, initial_K)

	def initialize_with_image(self, image, initial_pose, initial_K):
	"""
	Initialize the navigator with an image and camera parameters.
	Deprecated: Use initialize() instead.

	Args:
	image: Initial image tensor
	initial_pose: Initial camera pose (4x4 camera-to-world matrix)
	initial_K: Initial camera intrinsics matrix

	Returns:
	The generated frame as PIL Image
	"""
	return self.initialize(image, initial_pose, initial_K)

	def _interpolate_poses(self, start_pose, end_pose, num_frames):
	"""
	Interpolate between two camera poses.

	Args:
	start_pose: Starting camera pose (4x4 matrix)
	end_pose: Ending camera pose (4x4 matrix)
	num_frames: Number of interpolation frames to generate (including end pose)

	Returns:
	List of interpolated camera poses
	"""
	# Extract rotation matrices
	start_R = start_pose[:3, :3]
	end_R = end_pose[:3, :3]

	# Extract translation vectors
	start_t = start_pose[:3, 3]
	end_t = end_pose[:3, 3]

	# Convert rotation matrices to quaternions for smooth interpolation
	start_quat = spt.Rotation.from_matrix(start_R).as_quat()
	end_quat = spt.Rotation.from_matrix(end_R).as_quat()

	# Generate interpolated poses
	interpolated_poses = []
	for i in range(num_frames):
	# Interpolation factor (0 to 1)
	t = (i + 1) / num_frames

	# Interpolate translation
	interp_t = (1 - t) * start_t + t * end_t

	# Interpolate rotation (SLERP)
	interp_quat = spt.Slerp(
	np.array([0, 1]),
	spt.Rotation.from_quat([start_quat, end_quat])
	)(t).as_matrix()

	# Create interpolated pose matrix
	interp_pose = np.eye(4)
	interp_pose[:3, :3] = interp_quat
	interp_pose[:3, 3] = interp_t

	interpolated_poses.append(interp_pose)

	return interpolated_poses

	def move_backward(self, num_steps: int = 1) -> List[Image.Image]:
	"""
	Move the camera backward along its viewing direction with smooth interpolation.

	Args:
	num_steps: Number of steps to move forward

	Returns:
	List of generated frames as PIL Images
	"""
	if self.current_pose is None:
	print("Navigator not initialized. Call initialize first.")
	return None

	# Get the current forward direction from the camera pose
	forward_dir = self.current_pose[:3, 2]

	# Create the target pose
	target_pose = self.current_pose.copy()
	target_pose[:3, 3] += forward_dir * self.step_size * num_steps

	# Interpolate between current pose and target pose
	interpolated_poses = self._interpolate_poses(
	self.current_pose,
	target_pose,
	self.num_interpolation_frames
	)

	# Create list of intrinsics (same for all frames)
	interpolated_Ks = [self.current_K] * len(interpolated_poses)

	# Generate frames for interpolated poses
	new_frames = self.pipeline.generate_trajectory_frames(interpolated_poses,
	interpolated_Ks,
	use_non_maximum_suppression=False)

	# Update the current pose to the final pose
	self.current_pose = interpolated_poses[-1]
	self.frames.extend(new_frames)

	# Save the final pose
	self.pose_history.append({
	"file_path": f"images/frame_{len(self.pose_history) + 1:03d}.png",
	"transform_matrix": self.current_pose.tolist() if isinstance(self.current_pose, np.ndarray) else self.current_pose
	})

	return new_frames


	def move_forward(self, num_steps: int = 1) -> List[Image.Image]:
	"""
	Move the camera forward along its viewing direction with smooth interpolation.

	Args:
	num_steps: Number of steps to move forward

	Returns:
	List of generated frames as PIL Images
	"""
	if self.current_pose is None:
	print("Navigator not initialized. Call initialize first.")
	return None

	# Get the current forward direction from the camera pose
	forward_dir = self.current_pose[:3, 2]

	# Create the target pose
	target_pose = self.current_pose.copy()
	target_pose[:3, 3] -= forward_dir * self.step_size * num_steps

	# Interpolate between current pose and target pose
	interpolated_poses = self._interpolate_poses(
	self.current_pose,
	target_pose,
	self.num_interpolation_frames
	)

	# Create list of intrinsics (same for all frames)
	interpolated_Ks = [self.current_K] * len(interpolated_poses)

	# Generate frames for interpolated poses
	new_frames = self.pipeline.generate_trajectory_frames(interpolated_poses,
	interpolated_Ks,
	use_non_maximum_suppression=False)

	# Update the current pose to the final pose
	self.current_pose = interpolated_poses[-1]
	self.frames.extend(new_frames)

	# Save the final pose
	self.pose_history.append({
	"file_path": f"images/frame_{len(self.pose_history) + 1:03d}.png",
	"transform_matrix": self.current_pose.tolist() if isinstance(self.current_pose, np.ndarray) else self.current_pose
	})

	return new_frames

	def turn_left(self, degrees: float = 3) -> List[Image.Image]:
	"""
	Rotate the camera left around the up vector with smooth interpolation.

	Args:
	degrees: Rotation angle in degrees

	Returns:
	List of generated frames as PIL Images
	"""
	return self._turn(degrees)

	def turn_right(self, degrees: float = 3) -> List[Image.Image]:
	"""
	Rotate the camera right around the up vector with smooth interpolation.

	Args:
	degrees: Rotation angle in degrees

	Returns:
	List of generated frames as PIL Images
	"""
	return self._turn(-degrees)

	def _turn(self, degrees: float) -> List[Image.Image]:
	"""
	Helper method to turn the camera by the specified angle with smooth interpolation.
	Positive angles turn left, negative angles turn right.

	Args:
	degrees: Rotation angle in degrees

	Returns:
	List of generated frames as PIL Images
	"""
	if self.current_pose is None:
	print("Navigator not initialized. Call initialize first.")
	return None

	# Convert degrees to radians
	angle_rad = np.radians(degrees)

	# Create rotation matrix around the up axis (assuming Y is up)
	rotation = np.array([
	[np.cos(angle_rad), 0, np.sin(angle_rad), 0],
	[0, 1, 0, 0],
	[-np.sin(angle_rad), 0, np.cos(angle_rad), 0],
	[0, 0, 0, 1]
	])

	# Apply rotation to the current pose
	position = self.current_pose[:3, 3].copy()
	rotation_matrix = self.current_pose[:3, :3].copy()

	# Create the target pose
	target_pose = np.eye(4)
	target_pose[:3, :3] = rotation[:3, :3] @ rotation_matrix
	target_pose[:3, 3] = position

	# Interpolate between current pose and target pose
	interpolated_poses = self._interpolate_poses(
	self.current_pose,
	target_pose,
	self.num_interpolation_frames
	)

	# Create list of intrinsics (same for all frames)

	interpolated_Ks = [self.current_K] * len(interpolated_poses)

	# Generate frames for interpolated poses
	new_frames = self.pipeline.generate_trajectory_frames(interpolated_poses, interpolated_Ks)

	# Update the current pose to the final pose
	self.current_pose = interpolated_poses[-1]
	self.frames.extend(new_frames)

	# Save the final pose
	self.pose_history.append({
	"file_path": f"images/frame_{len(self.pose_history) + 1:03d}.png",
	"transform_matrix": self.current_pose.tolist() if isinstance(self.current_pose, np.ndarray) else self.current_pose
	})

	return new_frames



	def navigate(self, commands: List[str]) -> List[List[Image.Image]]:
	"""
	Execute a series of navigation commands and return the generated frames.

	Args:
	commands: List of commands ('w', 'a', 'd', 'q')

	Returns:
	List of lists of generated frames, one list per command
	"""
	if self.current_pose is None:
	print("Navigator not initialized. Call initialize first.")
	return []

	all_generated_frames = []

	for idx, cmd in enumerate(commands):

	if cmd == 'w':
	frames = self.move_forward()
	if frames:
	all_generated_frames.extend(frames)
	elif cmd == 's':
	# self.pipeline.temporal_only = True
	frames = self.move_backward()
	if frames:
	all_generated_frames.extend(frames)
	elif cmd == 'a':
	frames = self.turn_left(4)
	if frames:
	all_generated_frames.extend(frames)
	elif cmd == 'd':
	frames = self.turn_right(4)
	if frames:
	all_generated_frames.extend(frames)



	return all_generated_frames

	def undo(self) -> bool:
	"""
	Undo the last navigation step by removing the most recent frames and poses.
	Uses the pipeline's undo_latest_move method to handle the frame removal.

	Returns:
	bool: True if undo was successful, False otherwise
	"""
	# Check if we have enough poses to undo
	if len(self.pose_history) <= 1:
	print("Cannot undo: at initial position")
	return False

	# Use pipeline's undo function to remove the last batch of frames
	success = self.pipeline.undo_latest_move()

	if success:
	# Remove the last pose from history
	self.pose_history.pop()

	# Set current pose to the previous pose
	prev_pose_data = self.pose_history[-1]
	self.current_pose = np.array(prev_pose_data["transform_matrix"])

	# Remove frames from the frames list
	frames_to_remove = min(self.pipeline.config.model.target_num_frames, len(self.frames) - 1)
	for _ in range(frames_to_remove):
	if len(self.frames) > 1: # Keep at least the initial frame
	self.frames.pop()

	print(f"Successfully undid last movement. Now at position {len(self.pose_history)}")
	return True

	return False

	def save_camera_poses(self, output_path):
	"""
	Save the camera pose history to a JSON file in the format
	required for NeRF training.

	Args:
	output_path: Path to save the JSON file
	"""
	# Create the output directory if it doesn't exist
	os.makedirs(os.path.dirname(output_path), exist_ok=True)

	# Format the data as required
	transforms_data = {
	"frames": self.pose_history
	}

	# Save to JSON file
	with open(output_path, 'w') as f:
	json.dump(transforms_data, f, indent=4)

	print(f"Camera poses saved to {output_path}")




	def main():
	parser = argparse.ArgumentParser(description="Interactive navigation in VMem")
	parser.add_argument("--config", type=str, default="configs/inference/inference.yaml", help="Path to config file")
	parser.add_argument("--step_size", type=float, default=0.1, help="Forward step size")
	parser.add_argument("--interpolation_frames", type=int, default=4, help="Number of frames for each movement")
	parser.add_argument("--commands", type=str, default="a,a,a,a,a,d,d,d,d,d,d,w,w,w,w,a,a,a,a,d,d,d,d,s,s,s,s", help="Comma-separated commands to execute (w,a,s,d,c,q) where c is circulate")
	# parser.add_argument("--commands", type=str, default="d,d,d,d,w,w,w,d,d,d,d,d,a,a,a,a,a,s,s", help="Comma-separated commands to execute (w,a,s,d,c,q) where c is circulate")
	parser.add_argument("--output_dir", type=str, default="./visualization/navigation_frames", help="Directory to save output frames")
	parser.add_argument("--save_poses", type=str, default="./visualization/transforms.json", help="Path to save camera poses in NeRF format")
	args = parser.parse_args()

	# Load configuration
	config = OmegaConf.load(args.config)

	# Initialize the pipeline
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	pipeline = VMemPipeline(config, device=device)

	# Create the navigator
	navigator = Navigator(pipeline, step_size=args.step_size, num_interpolation_frames=args.interpolation_frames)

	# Load episode data
	frame_path = "test_samples/oxford.jpeg"
	image, _ = load_img_and_K(frame_path, None, K=None, device=device)
	image, _ = transform_img_and_K(image, (config.model.height, config.model.width), mode="crop", K=None)
	ori_K = np.array(get_default_intrinsics()[0])
	initial_pose = np.eye(4)

	# Initialize the navigator with the first frame using pipeline's initialize method
	initial_frame = navigator.initialize(image, initial_pose, ori_K)

	# Create output directory if needed
	if args.output_dir:
	os.makedirs(args.output_dir, exist_ok=True)
	initial_frame.save(os.path.join(args.output_dir, "initial.png"))

	# If commands are provided, execute them in sequence
	commands = args.commands.split(',')
	all_frames_lists = navigator.navigate(commands)


	# Save camera poses
	if args.save_poses:
	navigator.save_camera_poses(args.save_poses)

	if __name__ == "__main__":
	main()