Borcherding committed on
Commit 6b4ae3a · verified · 1 parent: 661b808

Upload 3 files

src/inference/cycleGANtest.py ADDED
@@ -0,0 +1,226 @@
import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
import os
import numpy as np

# Generator architecture (simplified ResNet)
class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.conv_block = nn.Sequential(  # named 'conv_block' so the CycleGAN checkpoint keys line up
            nn.ReflectionPad2d(1),
            nn.Conv2d(channels, channels, 3),
            nn.InstanceNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.ReflectionPad2d(1),
            nn.Conv2d(channels, channels, 3),
            nn.InstanceNorm2d(channels)
        )

    def forward(self, x):
        return x + self.conv_block(x)

class Generator(nn.Module):
    def __init__(self, input_channels=3, output_channels=3, n_residual_blocks=9):
        super(Generator, self).__init__()

        # Initial convolution
        model = [
            nn.ReflectionPad2d(3),
            nn.Conv2d(input_channels, 64, 7),
            nn.InstanceNorm2d(64),
            nn.ReLU(inplace=True)
        ]

        # Downsampling
        in_features = 64
        out_features = in_features * 2
        for _ in range(2):
            model += [
                nn.Conv2d(in_features, out_features, 3, stride=2, padding=1),
                nn.InstanceNorm2d(out_features),
                nn.ReLU(inplace=True)
            ]
            in_features = out_features
            out_features = in_features * 2

        # Residual blocks
        for _ in range(n_residual_blocks):
            model += [ResidualBlock(in_features)]

        # Upsampling
        out_features = in_features // 2
        for _ in range(2):
            model += [
                nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1),
                nn.InstanceNorm2d(out_features),
                nn.ReLU(inplace=True)
            ]
            in_features = out_features
            out_features = in_features // 2

        # Output layer
        model += [
            nn.ReflectionPad2d(3),
            nn.Conv2d(64, output_channels, 7),
            nn.Tanh()
        ]

        self.model = nn.Sequential(*model)

    def forward(self, x):
        return self.model(x)

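# Sanity check (illustrative, not executed by the app): the generator is fully
# convolutional, so a normalized 256x256 RGB tensor maps back to a tensor of
# the same shape, with values in [-1, 1] from the final Tanh:
#     g = Generator()
#     y = g(torch.randn(1, 3, 256, 256))  # -> torch.Size([1, 3, 256, 256])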
# Image preprocessing
def preprocess_image(image_path):
    image = Image.open(image_path).convert('RGB')
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    return transform(image).unsqueeze(0)

# Image postprocessing
def postprocess_image(tensor):
    tensor = tensor.squeeze(0).cpu()
    tensor = (tensor + 1) / 2  # map [-1, 1] back to [0, 1]
    tensor = tensor.clamp(0, 1)
    tensor = tensor.permute(1, 2, 0).numpy()
    return (tensor * 255).astype(np.uint8)

# Model loading
def load_model(model_path):
    model = Generator()
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        state_dict = torch.load(model_path, map_location='cpu')
        try:
            model.load_state_dict(state_dict)
        except Exception as e:
            print(f"Warning: {e}")
            # Fall back to a non-strict load so renamed or missing keys are skipped
            model.load_state_dict(state_dict, strict=False)
            print("Loaded model with strict=False")
    else:
        print(f"Error: Model file not found at {model_path}")
        return None
    model.eval()
    return model

# Inference function; handles the numpy arrays delivered by Gradio
def transform_image(input_image, direction):
    if input_image is None:
        print("No input image provided")
        return None

    try:
        # Ensure the input image is RGB
        if len(input_image.shape) == 2:  # Grayscale
            input_image = np.stack([input_image] * 3, axis=-1)
        elif input_image.shape[-1] == 4:  # RGBA
            input_image = input_image[..., :3]

        if direction == "Depth to Image":
            model_path = "./checkpoints/depth2image/latest_net_G_A.pth"
        else:
            model_path = "./checkpoints/depth2image/latest_net_G_B.pth"

        # Load the model
        model = load_model(model_path)
        if model is None:
            print(f"Failed to load model from {model_path}")
            return None

        # Convert the numpy array to a PIL Image
        input_pil = Image.fromarray(input_image.astype('uint8'), 'RGB')

        # Create transforms
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])

        # Process the image
        input_tensor = transform(input_pil).unsqueeze(0)

        # Generate the output
        with torch.no_grad():
            output_tensor = model(input_tensor)

        # Convert to an image
        output_image = postprocess_image(output_tensor)

        return output_image

    except Exception as e:
        print(f"Error in transform_image: {e}")
        import traceback
        traceback.print_exc()
        return None

# Gradio interface
with gr.Blocks(title="CycleGAN Depth2Image Test", analytics_enabled=False) as demo:
    gr.Markdown("## Test CycleGAN Depth2Image Model")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Input Image",
                type="numpy",
                height=256,
                width=256
            )
            direction = gr.Radio(
                choices=["Depth to Image", "Image to Depth"],
                value="Depth to Image",
                label="Conversion Direction"
            )
            transform_btn = gr.Button("Transform", variant="primary")

        with gr.Column():
            output_image = gr.Image(
                label="Generated Output",
                height=256,
                width=256
            )
            error_output = gr.Textbox(  # status placeholder; not currently wired to the transform handler
                label="Status",
                interactive=False
            )

    # Connect components
    transform_btn.click(
        fn=transform_image,
        inputs=[input_image, direction],
        outputs=output_image
    )

    gr.Markdown("""
    ### Instructions:
    1. Upload an image
    2. Select the conversion direction:
       - "Depth to Image" converts depth maps to realistic images
       - "Image to Depth" converts realistic images to depth maps
    3. Click "Transform" to generate the output

    Note: input images are resized so their shorter side is 256 pixels.
    """)

if __name__ == "__main__":
    # Make sure the checkpoints directory exists
    os.makedirs("checkpoints/depth2image", exist_ok=True)

    # Launch with a custom server configuration
    demo.queue(max_size=5).launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Set a specific port
        show_error=True,        # Show detailed errors
        debug=True              # Enable debug mode
    )
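The inference path can also be smoke-tested without the UI. A minimal sketch, assuming the script is importable as `cycleGANtest` (i.e. run from `src/inference/`) and the checkpoint files above are in place; importing builds the Blocks UI but does not launch it:

```python
import numpy as np
from cycleGANtest import transform_image

# Synthetic stand-in for an uploaded depth map: random RGB noise.
fake_depth = np.random.randint(0, 256, size=(256, 256, 3), dtype=np.uint8)

result = transform_image(fake_depth, "Depth to Image")
if result is not None:
    print(result.shape, result.dtype)  # expected: (256, 256, 3) uint8
```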
src/inference/merged-discord-app.py ADDED
@@ -0,0 +1,1194 @@
import gradio as gr
import cv2
import numpy as np
import torch
from PIL import Image
import torchvision.transforms as transforms
import sys
import os
import threading
import pyvirtualcam
from pyvirtualcam import PixelFormat
from huggingface_hub import hf_hub_download, login, upload_file
import torch.nn as nn
import time
import mss
import traceback

# Ensure required environment variables
depth_anything_path = os.getenv('DEPTH_ANYTHING_V2_PATH')
if depth_anything_path is None:
    raise ValueError("Environment variable DEPTH_ANYTHING_V2_PATH is not set. Please set it to the path of Depth-Anything-V2")
sys.path.append(depth_anything_path)
from depth_anything_v2.dpt import DepthAnythingV2

# --- Global variables and constants ---
# Colormaps matching the dataset generator
DEPTH_COLORMAPS = {
    "TURBO": cv2.COLORMAP_TURBO,
    "JET": cv2.COLORMAP_JET,
    "PARULA": cv2.COLORMAP_PARULA,
    "HOT": cv2.COLORMAP_HOT,
    "WINTER": cv2.COLORMAP_WINTER,
    "RAINBOW": cv2.COLORMAP_RAINBOW,
    "OCEAN": cv2.COLORMAP_OCEAN,
    "SUMMER": cv2.COLORMAP_SUMMER,
    "SPRING": cv2.COLORMAP_SPRING,
    "COOL": cv2.COLORMAP_COOL,
    "HSV": cv2.COLORMAP_HSV,
    "PINK": cv2.COLORMAP_PINK,
    "BONE": cv2.COLORMAP_BONE,
    "VIRIDIS": cv2.COLORMAP_VIRIDIS,
    "PLASMA": cv2.COLORMAP_PLASMA,
    "INFERNO": cv2.COLORMAP_INFERNO,
    "MAGMA": cv2.COLORMAP_MAGMA  # kept from the webcam app
}

# Globals that store the current settings
current_colormap = "TURBO"
current_mode = "Depth to Robot"
current_model_name = "Small"
current_webcam_id = 0
current_invert_depth = False
current_input_source = "Webcam"  # or "Desktop"
current_bypass_depth = False
current_blend_opacity = 0.1      # default opacity for blending
current_blend_enabled = False    # option to enable/disable blending
DEPTH2ROBOT_LOCAL_PATH = './checkpoints/depth2image/latest_net_G_A.pth'
current_gan_source = "Local"     # or "HuggingFace"
current_gan_input = None         # stores the current GAN input for display
current_direction = "Depth to Image"  # or "Image to Depth"

# --- Device selection ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

# Global variables for thread management
webcam_thread = None
stop_signal = False

# --- Depth-Anything-V2 Model Configurations ---
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
}

encoder2name = {
    'vits': 'Small',
    'vitb': 'Base',
    'vitl': 'Large'
}

# Model IDs and filenames for HuggingFace Hub
DEPTH_MODEL_INFO = {
    'vits': {
        'repo_id': 'depth-anything/Depth-Anything-V2-Small',
        'filename': 'depth_anything_v2_vits.pth'
    },
    'vitb': {
        'repo_id': 'depth-anything/Depth-Anything-V2-Base',
        'filename': 'depth_anything_v2_vitb.pth'
    },
    'vitl': {
        'repo_id': 'depth-anything/Depth-Anything-V2-Large',
        'filename': 'depth_anything_v2_vitl.pth'
    }
}

# --- CycleGAN Network Architecture ---
class ResnetBlock(nn.Module):
    def __init__(self, dim, padding_type='reflect', norm_layer=nn.InstanceNorm2d, use_dropout=False):
        super(ResnetBlock, self).__init__()
        self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, use_dropout)

    def build_conv_block(self, dim, padding_type, norm_layer, use_dropout):
        conv_block = []
        p = 0
        if padding_type == 'reflect':
            conv_block += [nn.ReflectionPad2d(1)]
        elif padding_type == 'replicate':
            conv_block += [nn.ReplicationPad2d(1)]
        elif padding_type == 'zero':
            p = 1
        else:
            raise NotImplementedError(f'padding {padding_type} is not implemented')

        conv_block += [
            nn.Conv2d(dim, dim, kernel_size=3, padding=p),
            norm_layer(dim),
            nn.ReLU(True)
        ]
        if use_dropout:
            conv_block += [nn.Dropout(0.5)]

        p = 0
        if padding_type == 'reflect':
            conv_block += [nn.ReflectionPad2d(1)]
        elif padding_type == 'replicate':
            conv_block += [nn.ReplicationPad2d(1)]
        elif padding_type == 'zero':
            p = 1
        else:
            raise NotImplementedError(f'padding {padding_type} is not implemented')

        conv_block += [
            nn.Conv2d(dim, dim, kernel_size=3, padding=p),
            norm_layer(dim)
        ]

        return nn.Sequential(*conv_block)

    def forward(self, x):
        return x + self.conv_block(x)


class Generator(nn.Module):
    def __init__(self, input_nc=3, output_nc=3, ngf=64, n_blocks=9, norm_layer=nn.InstanceNorm2d):
        super(Generator, self).__init__()

        model = [
            nn.ReflectionPad2d(3),
            nn.Conv2d(input_nc, ngf, kernel_size=7, padding=0),
            norm_layer(ngf),
            nn.ReLU(True)
        ]

        # Downsampling
        n_downsampling = 2
        for i in range(n_downsampling):
            mult = 2 ** i
            model += [
                nn.Conv2d(ngf * mult, ngf * mult * 2, kernel_size=3, stride=2, padding=1),
                norm_layer(ngf * mult * 2),
                nn.ReLU(True)
            ]

        # Resnet blocks
        mult = 2 ** n_downsampling
        for i in range(n_blocks):
            model += [ResnetBlock(ngf * mult, norm_layer=norm_layer)]

        # Upsampling
        for i in range(n_downsampling):
            mult = 2 ** (n_downsampling - i)
            model += [
                nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2), kernel_size=3, stride=2, padding=1, output_padding=1),
                norm_layer(int(ngf * mult / 2)),
                nn.ReLU(True)
            ]

        model += [
            nn.ReflectionPad2d(3),
            nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0),
            nn.Tanh()
        ]

        self.model = nn.Sequential(*model)

    def forward(self, input):
        return self.model(input)


# --- Global variables for model management ---
current_depth_model = None
current_encoder = None
current_gan_model = None

# --- Model paths and HuggingFace configuration ---
DEPTH2ROBOT_MODEL_PATH = './checkpoints/depth2image/latest_net_G_A.pth'
DEPTH2ROBOT_HF_REPO = 'Borcherding/depth2AnythingCycleGAN_RobotsV2'  # replace with your HF username

def download_depth_model(encoder):
    """Download the specified depth model from HuggingFace Hub"""
    model_info = DEPTH_MODEL_INFO[encoder]
    model_path = hf_hub_download(
        repo_id=model_info['repo_id'],
        filename=model_info['filename'],
        local_dir='checkpoints'
    )
    return model_path

def load_depth_model(encoder):
    """Load the specified depth model, caching it between calls"""
    global current_depth_model, current_encoder
    if current_encoder != encoder:
        model_path = download_depth_model(encoder)
        current_depth_model = DepthAnythingV2(**model_configs[encoder])
        current_depth_model.load_state_dict(torch.load(model_path, map_location='cpu'))
        current_depth_model = current_depth_model.to(DEVICE).eval()
        current_encoder = encoder
    return current_depth_model

def apply_colormap(depth, colormap=cv2.COLORMAP_TURBO):
    """Apply a colormap to the depth image.

    COLORMAP_TURBO is the default because it has a wider color spectrum and
    better perceptual properties than COLORMAP_JET.
    """
    return cv2.applyColorMap(depth, colormap)

def load_gan_model():
    """Load the CycleGAN generator for the current direction.

    Returns the model on success and None on failure; the loaded model is
    also cached in the global current_gan_model so callers can invoke it
    directly.
    """
    global current_gan_model, current_direction

    try:
        print(f"\nLoading GAN model for direction: {current_direction}")

        # Select the correct model file
        if current_direction == "Depth to Image":
            model_path = './checkpoints/depth2image/latest_net_G_A.pth'
        else:
            model_path = './checkpoints/depth2image/latest_net_G_B.pth'

        print(f"Loading from: {os.path.abspath(model_path)}")

        if not os.path.exists(model_path):
            print(f"Model file not found: {model_path}")
            return None

        # Initialize the model
        current_gan_model = Generator().to(DEVICE)
        state_dict = torch.load(model_path, map_location=DEVICE)

        try:
            current_gan_model.load_state_dict(state_dict, strict=False)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading state dict: {e}")
            current_gan_model = None
            return None

        current_gan_model.eval()
        return current_gan_model

    except Exception as e:
        print(f"Error loading GAN: {e}")
        current_gan_model = None
        return None

def update_gan_source(source, path):
    """Update the GAN model source and path"""
    global current_gan_source, DEPTH2ROBOT_HF_REPO, current_gan_model, DEPTH2ROBOT_MODEL_PATH

    current_gan_source = source
    if source == "HuggingFace":
        DEPTH2ROBOT_HF_REPO = path
    else:  # Local
        DEPTH2ROBOT_MODEL_PATH = path  # update the model path globally

    # Force a reload of the GAN model
    current_gan_model = None

    # Test loading
    model = load_gan_model()
    if model is not None:
        return f"✅ Successfully updated GAN source to {source} using path: {path}"
    else:
        return "❌ Failed to load GAN model with new settings"

def toggle_invert_depth():
    """Toggle depth inversion without restarting the webcam"""
    global current_invert_depth

    if webcam_thread and webcam_thread.is_alive():
        current_invert_depth = not current_invert_depth
        orientation = "light=near, dark=far" if current_invert_depth else "dark=near, light=far"
        return f"✅ Depth colors swapped: {orientation}"
    else:
        return "⚠️ Webcam is not running. Please start it first."

def reverse_depth_colormap():
    """Reverse the depth colormap colors without restarting the webcam"""
    global current_invert_depth

    if webcam_thread and webcam_thread.is_alive():
        current_invert_depth = not current_invert_depth
        orientation = "dark=near, light=far" if current_invert_depth else "light=near, dark=far"
        return f"✅ Depth colors reversed: {orientation}"
    else:
        return "⚠️ Webcam is not running. Please start it first."

def blend_images(original, depth, opacity=0.1):
    """
    Blend the original image with the depth map.

    original: top layer (webcam/desktop)
    depth: bottom layer (depth map)
    opacity: 0.0 = depth only, 1.0 = original only
    """
    # Convert inputs to numpy arrays if needed
    if not isinstance(original, np.ndarray):
        original = np.array(original)
    if not isinstance(depth, np.ndarray):
        depth = np.array(depth)

    # Ensure both images are float32 for blending
    original = original.astype(np.float32)
    depth = depth.astype(np.float32)

    # Opacity is interpreted to match the UI slider
    # (0 = depth only, 1 = original/webcam only)
    blended = depth * (1 - opacity) + original * opacity

    # Clip values and convert back to uint8
    blended = np.clip(blended, 0, 255).astype(np.uint8)

    return blended

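# Worked example of the blend above (illustrative numbers, not app code):
# with opacity = 0.1, a depth pixel of 200 and an original pixel of 100 give
# 200 * (1 - 0.1) + 100 * 0.1 = 190, so the depth map dominates at low opacity.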
def toggle_blend_enabled():
    """Toggle blending without restarting the webcam"""
    global current_blend_enabled

    if webcam_thread and webcam_thread.is_alive():
        current_blend_enabled = not current_blend_enabled
        status = "enabled" if current_blend_enabled else "disabled"
        return f"✅ Image blending {status}"
    else:
        return "⚠️ Webcam is not running. Please start it first."

def update_blend_opacity(opacity):
    """Update the blend opacity without restarting the webcam"""
    global current_blend_opacity

    if webcam_thread and webcam_thread.is_alive():
        current_blend_opacity = opacity
        return f"✅ Updated blend opacity to {opacity:.1f}"
    else:
        return "⚠️ Webcam is not running. Please start it first."

@torch.inference_mode()
def predict_depth(image, encoder, invert_depth=None):
    """Predict depth using the selected model with pure output"""
    model = load_depth_model(encoder)
    depth = model.infer_image(image)

    # Linear normalization to the 0-255 range without enhancing contrast
    depth = depth - depth.min()
    max_val = depth.max()
    if max_val > 0:  # avoid division by zero
        depth = depth / max_val * 255.0

    # Convert to uint8 without any additional processing
    depth = depth.astype(np.uint8)

    # Simple inversion if requested
    if invert_depth:
        depth = 255 - depth

    return depth

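# Worked example of the normalization above (illustrative numbers, not app
# code): raw depths spanning [2.0, 10.0] map to (d - 2.0) / 8.0 * 255, so
# 2.0 -> 0, 6.0 -> 127 (after the uint8 cast), and 10.0 -> 255; the optional
# inversion then flips near and far.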
@torch.inference_mode()
def depth_to_robot(depth_image):
    """Convert a depth image to a robot image using CycleGAN"""
    try:
        model = load_gan_model()
        if model is None:
            print("No GAN model loaded!")
            return depth_image

        print(f"Input shape: {depth_image.shape}, dtype: {depth_image.dtype}")

        # Ensure the input is in the correct format
        if depth_image.dtype != np.uint8:
            depth_image = depth_image.astype(np.uint8)

        # Normalize to the [-1, 1] range for the GAN
        depth_tensor = torch.from_numpy(depth_image).float().permute(2, 0, 1).unsqueeze(0)
        depth_tensor = (depth_tensor / 127.5) - 1.0

        print(f"Tensor shape: {depth_tensor.shape}, device: {depth_tensor.device}")

        # Process through the GAN
        depth_tensor = depth_tensor.to(DEVICE)
        with torch.no_grad():
            robot_tensor = model(depth_tensor)

        print(f"Output tensor shape: {robot_tensor.shape}")

        # Convert back to an image (0-255 range)
        robot_tensor = (robot_tensor + 1.0) * 127.5
        robot_image = robot_tensor[0].permute(1, 2, 0).cpu().numpy().astype(np.uint8)

        return robot_image
    except Exception as e:
        print(f"Error in depth_to_robot: {e}")
        traceback.print_exc()
        return depth_image

def toggle_depth_bypass():
    """Toggle the depth map bypass"""
    global current_bypass_depth

    if webcam_thread and webcam_thread.is_alive():
        current_bypass_depth = not current_bypass_depth
        status = "enabled" if current_bypass_depth else "disabled"
        return f"✅ Depth bypass {status}"
    else:
        return "⚠️ Webcam is not running. Please start it first."

def process_frame(frame, encoder, use_gan=True, colormap="WINTER"):
    """Process a single frame, matching the test app's pattern"""
    global current_invert_depth, current_bypass_depth, current_blend_enabled
    global current_blend_opacity, current_gan_input, current_direction, current_gan_model

    try:
        # Convert the frame to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Load the GAN model if it is not loaded yet
        if use_gan and current_gan_model is None:
            if load_gan_model() is None:
                print("Failed to load GAN model, falling back to depth only")
                use_gan = False

        if current_direction == "Depth to Image":
            # First get the depth map
            depth = predict_depth(frame_rgb, encoder, invert_depth=current_invert_depth)

            # Apply a colormap to the depth
            selected_colormap = DEPTH_COLORMAPS.get(colormap, cv2.COLORMAP_WINTER)
            depth_colored = cv2.applyColorMap(depth, selected_colormap)

            # Apply blending if enabled
            if current_blend_enabled:
                depth_colored = blend_images(frame_rgb, depth_colored, current_blend_opacity)

            # Store the input we're sending to the GAN
            current_gan_input = depth_colored.copy()

            if use_gan and current_gan_model is not None:
                try:
                    # Convert to PIL and process like in the test app
                    input_pil = Image.fromarray(depth_colored)
                    transform = transforms.Compose([
                        transforms.Resize(256),
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
                    ])
                    input_tensor = transform(input_pil).unsqueeze(0)

                    # Process through the GAN
                    input_tensor = input_tensor.to(DEVICE)
                    with torch.no_grad():
                        output_tensor = current_gan_model(input_tensor)

                    # Convert back using the same post-processing as the test app
                    output_tensor = output_tensor.squeeze(0).cpu()
                    output_tensor = (output_tensor + 1) / 2
                    output_tensor = output_tensor.clamp(0, 1)
                    output_tensor = output_tensor.permute(1, 2, 0).numpy()
                    processed = (output_tensor * 255).astype(np.uint8)
                except Exception as e:
                    print(f"Error processing through GAN: {e}")
                    processed = depth_colored
            else:
                processed = depth_colored

        else:  # Image to Depth
            # Store the original frame as the GAN input
            current_gan_input = frame_rgb.copy()

            if use_gan:
                # Process like the test app
                input_pil = Image.fromarray(frame_rgb)
                transform = transforms.Compose([
                    transforms.Resize(256),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
                ])
                input_tensor = transform(input_pil).unsqueeze(0)

                input_tensor = input_tensor.to(DEVICE)
                with torch.no_grad():
                    output_tensor = current_gan_model(input_tensor)

                output_tensor = output_tensor.squeeze(0).cpu()
                output_tensor = (output_tensor + 1) / 2
                output_tensor = output_tensor.clamp(0, 1)
                output_tensor = output_tensor.permute(1, 2, 0).numpy()
                processed = (output_tensor * 255).astype(np.uint8)
            else:
                processed = frame_rgb

    except Exception as e:
        print(f"Error in processing: {e}")
        traceback.print_exc()
        processed = frame

    return processed

def virtual_webcam_stream(encoder, use_gan=True, webcam_id=0):
    """Stream the depth display or robot conversion to a virtual webcam with dynamic colormap changes"""
    global current_colormap, current_input_source, current_gan_input, current_gan_model

    try:
        # First, ensure the GAN model is loaded if needed
        if use_gan and current_gan_model is None:
            if load_gan_model() is None:
                print("Failed to load GAN model, falling back to depth only")
                use_gan = False

        # Initialize the webcam
        cap = None
        if current_input_source == "Webcam":
            cap = cv2.VideoCapture(int(webcam_id))
            if not cap.isOpened():
                raise RuntimeError(f"Failed to open webcam {webcam_id}")
            print(f"Successfully opened webcam {webcam_id}")

        # Initialize screen capture if using the desktop
        sct = None
        if current_input_source == "Desktop":
            sct = mss.mss()
            monitor = sct.monitors[1]

        # Try different virtual camera backends in order of preference
        cam = None
        backends = ['droidcam', 'unity', 'obs']  # droidcam is tried first
        errors = []

        for backend in backends:
            try:
                cam = pyvirtualcam.Camera(
                    width=640,
                    height=480,
                    fps=30,
                    fmt=PixelFormat.BGR,
                    backend=backend,
                    device='/dev/video2' if backend == 'v4l2' else None  # for Linux
                )
                print(f'Successfully initialized virtual camera using {backend} backend')
                break
            except Exception as e:
                errors.append(f'{backend} error: {str(e)}')
                continue

        if cam is None:
            raise RuntimeError("Failed to initialize any virtual camera backend:\n" +
                               "\n".join(errors) +
                               "\nPlease install OBS Virtual Camera or another compatible virtual camera.")

        print(f'Using virtual camera: {cam.device}')
        print(f'Mode: {"Depth to Robot" if use_gan else "Depth Only"}')
        print(f'Input Source: {current_input_source}')

        frame_count = 0
        last_time = time.time()
        fps = 0

        while not stop_signal:
            try:
                # Get a frame based on the input source
                if current_input_source == "Webcam":
                    ret, frame = cap.read()
                    if not ret:
                        print("Failed to get frame from webcam")
                        time.sleep(0.1)  # small delay before retrying
                        continue
                else:  # Desktop
                    frame = get_screen_capture()

                # Calculate FPS
                frame_count += 1
                if frame_count % 30 == 0:
                    current_time = time.time()
                    fps = 30 / (current_time - last_time)
                    last_time = current_time
                    print(f"FPS: {fps:.1f}")

                # Resize the frame to match the virtual camera resolution
                frame = cv2.resize(frame, (640, 480))

                # Process the frame
                processed = process_frame(frame, encoder, use_gan, current_colormap)

                # Add a GAN input preview if available
                if current_gan_input is not None and use_gan:
                    preview_width = 160
                    preview_height = 120
                    preview = cv2.resize(current_gan_input, (preview_width, preview_height))

                    y_offset = 10
                    x_offset = processed.shape[1] - preview_width - 10

                    # Create a copy for modification
                    output = processed.copy()

                    # Add a semi-transparent black background behind the preview
                    overlay = np.zeros((preview_height + 2, preview_width + 2, 3), dtype=np.uint8)
                    alpha = 0.7
                    output[y_offset-1:y_offset+preview_height+1,
                           x_offset-1:x_offset+preview_width+1] = cv2.addWeighted(
                        output[y_offset-1:y_offset+preview_height+1,
                               x_offset-1:x_offset+preview_width+1],
                        1 - alpha,
                        overlay,
                        alpha,
                        0
                    )

                    # Add the preview
                    output[y_offset:y_offset+preview_height,
                           x_offset:x_offset+preview_width] = preview

                    processed = output

                # Send to the virtual camera
                cam.send(processed)
                cam.sleep_until_next_frame()

            except Exception as e:
                print(f"Error processing frame: {e}")
                traceback.print_exc()
                time.sleep(0.1)  # delay before retrying
                continue

        # Cleanup
        if cap is not None:
            cap.release()
        if sct is not None:
            sct.close()

    except Exception as e:
        print(f"Critical error in virtual_webcam_stream: {e}")
        traceback.print_exc()
        return False

    return True

def toggle_input_source():
    """Toggle between webcam and desktop capture"""
    global current_input_source, webcam_thread, stop_signal

    # Stop the current stream
    if webcam_thread and webcam_thread.is_alive():
        stop_signal = True
        webcam_thread.join(timeout=1.0)

    # Toggle the source
    current_input_source = "Desktop" if current_input_source == "Webcam" else "Webcam"

    # Restart the stream if it was running
    if webcam_thread:
        return start_webcam_thread(
            current_model_name,
            current_mode,
            current_webcam_id,
            current_colormap
        )

    return f"✅ Switched to {current_input_source} input"

def get_screen_capture():
    """Capture the desktop screen"""
    sct = mss.mss()
    monitor = sct.monitors[1]  # primary monitor
    screenshot = np.array(sct.grab(monitor))
    return cv2.cvtColor(screenshot, cv2.COLOR_BGRA2BGR)

def verify_model_path():
    """Verify that the GAN model file exists"""
    model_path = './checkpoints/depth2image/latest_net_G_A.pth'
    if not os.path.exists(model_path):
        print(f"Model file not found at: {model_path}")
        print("Current working directory:", os.getcwd())
        return False
    return True

def start_webcam_thread(model_name, mode, webcam_id=0, colormap="TURBO"):
    """Start (or restart) the virtual webcam thread; checks the model file first."""
    global webcam_thread, stop_signal, current_colormap, current_mode
    global current_model_name, current_webcam_id, current_direction

    # Verify the model exists if using the GAN
    if mode != "Depth Only" and not verify_model_path():
        return "❌ GAN model file not found! Please check the model path."

    # Update the current settings
    current_colormap = colormap
    current_mode = mode
    current_model_name = model_name
    current_webcam_id = webcam_id

    # Set the direction based on the mode
    if mode == "Depth to Image":
        current_direction = "Depth to Image"
    elif mode == "Image to Depth":
        current_direction = "Image to Depth"

    # If a thread is already running, stop it
    if webcam_thread and webcam_thread.is_alive():
        stop_signal = True
        webcam_thread.join(timeout=1.0)

    # Reset the stop signal
    stop_signal = False

    # Start a new thread
    encoder = {v: k for k, v in encoder2name.items()}[model_name]
    use_gan = (mode != "Depth Only")

    webcam_thread = threading.Thread(
        target=virtual_webcam_stream,
        args=(encoder, use_gan, int(webcam_id)),
        daemon=True
    )
    webcam_thread.start()

    return f"✅ Started virtual webcam: {mode} with {model_name} model using {colormap} colormap"

def update_colormap(colormap):
    """Update the colormap without restarting the webcam"""
    global current_colormap

    if webcam_thread and webcam_thread.is_alive():
        current_colormap = colormap
        return f"✅ Updated colormap to {colormap}"
    else:
        return "⚠️ Webcam is not running. Please start it first."

def stop_webcam():
    """Stop the webcam thread"""
    global webcam_thread, stop_signal

    if webcam_thread and webcam_thread.is_alive():
        stop_signal = True
        webcam_thread.join(timeout=1.0)
        return "✅ Webcam stopped"
    else:
        return "No webcam is running"

def set_device_mode(choice):
    """Set the device to use for model inference"""
    global DEVICE
    if choice == "Auto":
        DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
    elif choice == "CUDA":
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

    # Reset the loaded models to ensure they're recreated on the correct device
    global current_depth_model, current_gan_model
    current_depth_model = None
    current_gan_model = None

    return f"Device set to: {DEVICE}"

def test_gan_model():
    """Test whether the GAN model loads and runs correctly"""
    try:
        # Try loading the model to verify it works
        model = load_gan_model()
        if model is None:
            return "❌ Failed to load GAN model. Check console for errors."

        # Create a simple test tensor
        test_input = torch.zeros(1, 3, 64, 64).to(DEVICE)

        # Try running inference
        with torch.no_grad():
            output = model(test_input)

        return f"✅ GAN model loaded and tested successfully on {DEVICE}!"
    except Exception as e:
        return f"❌ Error testing GAN model: {str(e)}"

def upload_to_huggingface(hf_token, repo_name=None):
    """Upload the GAN model to HuggingFace"""
    if not repo_name:
        repo_name = DEPTH2ROBOT_HF_REPO

    if not os.path.exists(DEPTH2ROBOT_MODEL_PATH):
        return f"❌ Model file not found. Please make sure it exists at: {DEPTH2ROBOT_MODEL_PATH}"

    try:
        # Log in to HuggingFace
        login(token=hf_token)

        # Upload the model file
        upload_info = upload_file(
            path_or_fileobj=DEPTH2ROBOT_MODEL_PATH,
            path_in_repo="latest_net_G.pth",
            repo_id=repo_name,
            repo_type="model",
            create_pr=False
        )

        # Create a simple model card if it doesn't exist
        model_card = """---
tags:
- depth-to-robot
- image-to-image
- cyclegan
---

# Depth2Robot GAN Model

This model transforms depth maps into robot-style images using CycleGAN.

## Model Description

- This model was trained on depth maps and robot images.
- It converts grayscale depth maps to colorful robot-style imagery.
- Trained using the CycleGAN architecture.

## Usage

```python
import torch
from huggingface_hub import hf_hub_download

# Download the model
model_path = hf_hub_download(repo_id="{repo_name}", filename="latest_net_G.pth")

# Load the model (you need to define the Generator class)
model = Generator()
model.load_state_dict(torch.load(model_path), strict=False)
model.eval()

# Use the model for inference
# ...
```
""".format(repo_name=repo_name)

        # Create a temporary model card file
        with open("./README.md", "w") as f:
            f.write(model_card)

        # Upload the model card
        upload_file(
            path_or_fileobj="./README.md",
            path_in_repo="README.md",
            repo_id=repo_name,
            repo_type="model",
            create_pr=False
        )

        # Clean up
        os.remove("./README.md")

        return f"✅ Successfully uploaded model to HuggingFace!\n\nYou can view it at: https://huggingface.co/{repo_name}"
    except Exception as e:
        return f"❌ Error uploading to HuggingFace: {e}"

def toggle_mode():
    """Quick toggle between the Depth Only and Depth to Robot modes"""
    global current_mode
    if webcam_thread and webcam_thread.is_alive():
        current_mode = "Depth Only" if current_mode == "Depth to Robot" else "Depth to Robot"
        return start_webcam_thread(
            current_model_name,
            current_mode,
            current_webcam_id,
            current_colormap
        )
    return "⚠️ Webcam is not running. Please start it first."

def update_gan_preview():
    """Update the GAN input preview"""
    global current_gan_input
    if current_gan_input is not None:
        return current_gan_input
    return None

def test_webcams():
    """Scan for available webcams"""
    available_cams = []
    for i in range(10):  # test the first 10 indices
        cap = cv2.VideoCapture(i)
        if cap.isOpened():
            ret, _ = cap.read()
            if ret:
                available_cams.append(i)
            cap.release()
    return f"Available webcam IDs: {available_cams}"

def stop_gan():
    """Stop the GAN processing"""
    global current_gan_model
    if current_gan_model is not None:
        current_gan_model = None
        return "✅ GAN processing stopped"
    return "GAN was not running"

# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as demo:
    gr.Markdown("# 🤖 Depth Anything V2 to Robot Virtual Webcam for Discord")

    with gr.Row():
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### 📹 Webcam Settings")

                # Define the status box used by all the connections below
                webcam_status = gr.Textbox(
                    label="Status",
                    placeholder="Not started",
                    interactive=False
                )

                with gr.Row():
                    with gr.Column(scale=1):
                        model_dropdown = gr.Dropdown(
                            choices=list(encoder2name.values()),
                            value="Small",
                            label="Depth Model Size",
                            info="Smaller = faster, larger = more detailed"
                        )

                    with gr.Column(scale=1):
                        mode_dropdown = gr.Dropdown(
                            choices=["Depth Only", "Depth to Image", "Image to Depth"],
                            value="Depth to Image",
                            label="Output Mode",
                            info="Select conversion direction or depth visualization"
                        )

                    with gr.Column(scale=1):
                        gan_source_radio = gr.Radio(
                            choices=["Local", "HuggingFace"],
                            value="Local",
                            label="GAN Model Source",
                            info="Choose between a local model or a download from HuggingFace"
                        )
                        gan_path = gr.Textbox(
                            value=DEPTH2ROBOT_LOCAL_PATH,
                            label="Local GAN Path/HF Repo",
                            info="Local path or HuggingFace repo name"
                        )

                    with gr.Column(scale=1):
                        colormap_dropdown = gr.Dropdown(
                            choices=list(DEPTH_COLORMAPS.keys()),
                            value="TURBO",
                            label="Depth Colormap",
                            info="Color scheme for depth visualization"
                        )

                with gr.Row():
                    update_colormap_button = gr.Button("Update Colormap", variant="secondary")
                    reverse_depth_button = gr.Button("Reverse Depth Colors", variant="secondary")
                    bypass_depth_button = gr.Button("Toggle Depth Bypass", variant="secondary")
                    toggle_invert_button = gr.Button("Toggle Depth Inversion", variant="secondary")

                webcam_id = gr.Number(
                    value=0,
                    label="Webcam ID",
                    info="Usually 0 for a built-in webcam; try 1 or 2 for external cameras",
                    precision=0
                )

                with gr.Row():
                    start_button = gr.Button("▶️ Start Webcam", variant="primary", scale=2)
                    stop_button = gr.Button("⏹️ Stop Webcam", variant="stop", scale=1)

                with gr.Row():
                    quick_mode_toggle = gr.Button("🔄 Toggle Depth/Robot Mode", variant="primary")
                    input_source_button = gr.Button("🔄 Toggle Webcam/Desktop", variant="secondary")

            with gr.Group():
                gr.Markdown("### 🎨 Blending Settings")

                with gr.Row():
                    blend_enabled_toggle = gr.Checkbox(
                        label="Enable Blending",
                        value=False,
                        info="Blend the original video on top of the depth map"
                    )

                    blend_opacity_slider = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.1,
                        step=0.1,
                        label="Blend Opacity",
                        info="0 = Depth only, 1 = Camera only"
                    )

                with gr.Row():
                    update_blend_button = gr.Button("Update Blend Settings", variant="secondary")

            # Testing section
            with gr.Group():
                gr.Markdown("### 🧪 Test Your GAN Model")
                test_button = gr.Button("🧪 Test GAN Model", variant="secondary")
                test_output = gr.Textbox(label="Test Results", interactive=False)

        # Right column
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### ⚙️ Advanced Settings")

                device_radio = gr.Radio(
                    choices=["Auto", "CUDA", "CPU"],
                    value="Auto",
                    label="Device Selection",
                    info="Use CPU if you experience GPU errors"
                )

                device_output = gr.Textbox(
                    label="Device Status",
                    value=f"Current device: {DEVICE}",
                    interactive=False
                )

                device_radio.change(fn=set_device_mode, inputs=device_radio, outputs=device_output)

            with gr.Group():
                gr.Markdown("### 🚀 Upload to HuggingFace")

                hf_token = gr.Textbox(
                    label="HuggingFace API Token",
                    placeholder="hf_...",
                    type="password",
                    info="Get your token from huggingface.co/settings/tokens"
                )

                repo_name = gr.Textbox(
                    label="Repository Name",
                    placeholder="username/depth2robot-model",
                    info="Format: username/repo-name"
                )

                upload_button = gr.Button("📤 Upload Model", variant="secondary")
                upload_result = gr.Textbox(label="Upload Result", interactive=False)

            with gr.Group():
                gr.Markdown("### 🎥 Test Webcams")
                test_webcams_button = gr.Button("Scan for Webcams")
                webcams_output = gr.Textbox(label="Available Webcams", interactive=False)

    # Connect the UI elements to functions (all connections are grouped here)
    start_button.click(
        fn=start_webcam_thread,
        inputs=[model_dropdown, mode_dropdown, webcam_id, colormap_dropdown],
        outputs=webcam_status
    )

    stop_button.click(fn=stop_webcam, inputs=[], outputs=webcam_status)

    update_colormap_button.click(
        fn=update_colormap,
        inputs=colormap_dropdown,
        outputs=webcam_status
    )

    input_source_button.click(
        fn=toggle_input_source,
        inputs=[],
        outputs=webcam_status
    )

    blend_enabled_toggle.change(
        fn=toggle_blend_enabled,
        inputs=[],
        outputs=webcam_status
    )

    update_blend_button.click(
        fn=update_blend_opacity,
        inputs=blend_opacity_slider,
        outputs=webcam_status
    )

    reverse_depth_button.click(
        fn=reverse_depth_colormap,
        inputs=[],
        outputs=webcam_status
    )

    toggle_invert_button.click(
        fn=toggle_invert_depth,
        inputs=[],
        outputs=webcam_status
    )

    gan_source_radio.change(
        fn=update_gan_source,
        inputs=[gan_source_radio, gan_path],
        outputs=webcam_status
    )

    test_button.click(fn=test_gan_model, inputs=[], outputs=test_output)

    upload_button.click(
        fn=upload_to_huggingface,
        inputs=[hf_token, repo_name],
        outputs=upload_result
    )

    quick_mode_toggle.click(
        fn=toggle_mode,
        inputs=[],
        outputs=webcam_status
    )

    test_webcams_button.click(
        fn=test_webcams,
        inputs=[],
        outputs=webcams_output
    )

    bypass_depth_button.click(
        fn=toggle_depth_bypass,
        inputs=[],
        outputs=webcam_status
    )

    with gr.Row():
        stop_gan_button = gr.Button("⏹️ Stop GAN", variant="stop")

    stop_gan_button.click(
        fn=stop_gan,
        inputs=[],
        outputs=webcam_status
    )

    with gr.Row():
        gan_status = gr.Textbox(
            label="GAN Status",
            value="Not loaded",
            interactive=False
        )

    # Help section
    with gr.Accordion("Help & Troubleshooting", open=False):
        gr.Markdown("""
        ## Common Issues

        ### Model not loading
        - Make sure your model file is at `./checkpoints/depth2image/latest_net_G_A.pth`
        - Try the "Test GAN Model" button to check whether it loads correctly
        - If you see errors about missing keys, the model structure is different; this script uses `strict=False` to load it anyway

        ### Virtual camera not showing in Discord
        - Make sure OBS Virtual Camera is installed
        - Try stopping and starting the webcam
        - Restart Discord after starting the virtual camera

        ### Performance issues
        - Use the "Small" depth model for better performance
        - Try the "CPU" device option if you're having GPU memory issues
        """)

if __name__ == "__main__":
    # Make sure the checkpoints directory exists
    os.makedirs("checkpoints/depth2image", exist_ok=True)

    # Launch the Gradio interface
    demo.launch()
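Before launching, the app needs the Depth-Anything-V2 repository on the import path (via `DEPTH_ANYTHING_V2_PATH`) and the CycleGAN generator weights under `./checkpoints/depth2image/`. A minimal launcher sketch; all paths here are illustrative placeholders, not values from this commit:

```python
import os
import runpy

# Hypothetical checkout location of the Depth-Anything-V2 repository.
os.environ["DEPTH_ANYTHING_V2_PATH"] = "/path/to/Depth-Anything-V2"

# Expected weights, relative to the working directory:
#   checkpoints/depth2image/latest_net_G_A.pth  (Depth -> Image generator)
#   checkpoints/depth2image/latest_net_G_B.pth  (Image -> Depth generator)

# The filename contains dashes, so run it as a script rather than importing it.
runpy.run_path("src/inference/merged-discord-app.py", run_name="__main__")
```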
src/training/trainDepth2AnythingGAN.ipynb ADDED
The diff for this file is too large to render. See raw diff