import os
import sys

# Make the bundled sub-repositories importable; the demo then runs from the depth/ directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'refer')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'stable-diffusion')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src/taming-transformers')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src/clip')))
os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))

import cv2
import numpy as np
import torch
from depth.models_depth.model import EVPDepth
from models_refer.model import EVPRefer
from depth.configs.train_options import TrainOptions
from depth.configs.test_options import TestOptions
import glob
import utils
import torchvision.transforms as transforms
from utils_depth.misc import colorize
from PIL import Image
import torch.nn.functional as F
import gradio as gr
import tempfile
from transformers import CLIPTokenizer, AutoModel

css = """
#img-display-container {
    max-height: 50vh;
}
#img-display-input {
    max-height: 40vh;
}
#img-display-output {
    max-height: 40vh;
}
"""


def create_depth_demo(model, device):
    gr.Markdown("### Depth Prediction demo")
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
        depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
    raw_file = gr.File(label="16-bit raw depth, multiplier:256")
    submit = gr.Button("Submit")

    def on_submit(image):
        transform = transforms.ToTensor()
        image = transform(image).unsqueeze(0).to(device)
        shape = image.shape
        # Resize to 440x480 and pad 40 rows on top so the network sees a 480x480 input.
        image = torch.nn.functional.interpolate(image, (440, 480), mode='bilinear', align_corners=True)
        image = F.pad(image, (0, 0, 40, 0))
        with torch.no_grad():
            pred = model(image)  # ['pred_d']
        # The remote model may return a NumPy array; convert to a tensor if needed.
        if isinstance(pred, np.ndarray):
            pred = torch.from_numpy(pred)
        pred = pred.to(device).float()
        if pred.dim() == 2:  # H x W
            pred = pred.unsqueeze(0).unsqueeze(0)
        # Remove the padded rows and resize back to the original resolution.
        pred = pred[:, :, 40:, :]
        pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
        pred_d_numpy = pred.squeeze().cpu().numpy()
        colored_depth, _, _ = colorize(pred_d_numpy, cmap='gray_r')
        # Save the raw depth as a 16-bit PNG (values scaled by 256, per the file label).
        tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth = Image.fromarray((pred_d_numpy * 256).astype('uint16'))
        raw_depth.save(tmp.name)
        return [colored_depth, tmp.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image, raw_file])
    examples = gr.Examples(
        examples=["imgs/test_img1.jpg", "imgs/test_img2.jpg", "imgs/test_img3.jpg",
                  "imgs/test_img4.jpg", "imgs/test_img5.jpg"],
        inputs=[input_image])


def create_refseg_demo(model, tokenizer, device):
    gr.Markdown("### Referring Segmentation demo")
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
        refseg_image = gr.Image(label="Output Mask", elem_id='img-display-output')
    input_text = gr.Textbox(label='Prompt', placeholder='Please upload your image first', lines=2)
    submit = gr.Button("Submit")

    def on_submit(image, text):
        # Convert PIL image -> batched tensor; the prompt is handed to the model as a
        # raw string, so the tokenizer passed to this demo is not used here.
        transform = transforms.ToTensor()
        image_t = transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            out = model(image_t, text)
        # Ensure a NumPy mask
        if isinstance(out, torch.Tensor):
            mask = out.squeeze().detach().cpu().numpy()
        else:
            mask = out
        # If the model returns a multi-channel output, collapse it with argmax
        if mask.ndim > 2:
            mask = np.argmax(mask, axis=0)
        mask = mask.astype(np.uint8)
        # Overlay: darken everything outside the predicted region
        image_np = np.array(image).copy()
        alpha = 0.65
        image_np[mask == 0] = (image_np[mask == 0] * alpha).astype(np.uint8)
        # Outline the predicted region in green
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(image_np, contours, -1, (0, 255, 0), 2)
        return Image.fromarray(image_np)

    submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
    examples = gr.Examples(
        examples=[
            ["imgs/test_img2.jpg", "green plant"],
            ["imgs/test_img3.jpg", "chair"],
            ["imgs/test_img4.jpg", "left green plant"],
            ["imgs/test_img5.jpg", "man walking on foot"],
            ["imgs/test_img5.jpg", "the rightest camel"],
        ],
        inputs=[input_image, input_text]
    )


def main():
    upload_2_models = True
    opt = TestOptions().initialize()
    args = opt.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the checkpoints from the Hugging Face Hub; the depth model is optional.
    if upload_2_models:
        model = AutoModel.from_pretrained("MykolaL/evp_depth", trust_remote_code=True).to(device).eval()
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    model_refseg = AutoModel.from_pretrained("MykolaL/evp_refer", trust_remote_code=True).to(device).eval()
    print('Models loaded successfully')

    title = "# EVP"
    description = """Official demo for **EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature Refinement and Regularized Image-Text Alignment**.
EVP is a deep learning model for metric depth estimation from a single image as well as referring segmentation.
Please refer to our [project page](https://lavreniuk.github.io/EVP), [paper](https://arxiv.org/abs/2312.08548) or [github](https://github.com/Lavreniuk/EVP) for more details."""

    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        if upload_2_models:
            with gr.Tab("Depth Prediction"):
                create_depth_demo(model, device)
        with gr.Tab("Referring Segmentation"):
            create_refseg_demo(model_refseg, tokenizer, device)
        gr.HTML('''
            You can duplicate this Space to skip the queue: Duplicate Space
            visitors
        ''')
    demo.queue().launch(share=True)


if __name__ == '__main__':
    main()
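
# A minimal sketch (not executed by the demo) of how the depth checkpoint loaded
# above could be queried outside Gradio. It reuses only calls that already appear
# in this script; the preprocessing mirrors create_depth_demo, and the example
# image path is an assumption.
#
#   import torch
#   import torchvision.transforms as transforms
#   from PIL import Image
#   from transformers import AutoModel
#
#   model = AutoModel.from_pretrained("MykolaL/evp_depth", trust_remote_code=True).eval()
#   x = transforms.ToTensor()(Image.open("imgs/test_img1.jpg")).unsqueeze(0)
#   x = torch.nn.functional.interpolate(x, (440, 480), mode='bilinear', align_corners=True)
#   x = torch.nn.functional.pad(x, (0, 0, 40, 0))
#   with torch.no_grad():
#       depth = model(x)  # predicted depth; crop the 40 padded rows before use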