Grounded-Segment-Anything

Runtime error

App Files Files Community

liuyizhang committed on Apr 12, 2023

Commit

a71406a

1 Parent(s): 9403943

update files

Browse files

Files changed (7) hide show

automatic_label_demo.py +18 -8
gradio_app.py +65 -15
gradio_auto_label.py +392 -0
grounded_sam.ipynb +12 -3
grounded_sam_inpainting_demo.py +10 -1
grounded_sam_whisper_demo.py +258 -0
grounded_dino_sam_inpainting_demo.py → grounded_sam_whisper_inpainting_demo.py +127 -124

automatic_label_demo.py CHANGED Viewed

@@ -43,20 +43,23 @@ def load_image(image_path):
     return image_pil, image
-def generate_caption(raw_image):
     # unconditional image captioning
-    inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
     out = blip_model.generate(**inputs)
     caption = processor.decode(out[0], skip_special_tokens=True)
     return caption
-def generate_tags(caption, max_tokens=100, model="gpt-3.5-turbo"):
     prompt = [
         {
             'role': 'system',
-            'content': 'Extrat the unique nouns in the caption. Remove all the adjectives. ' + \
-                       'List the nouns in singular form. Split them by ".". ' + \
                        f'Caption: {caption}.'
         }
     ]
@@ -197,6 +200,7 @@ if __name__ == "__main__":
         "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
     )
     parser.add_argument("--input_image", type=str, required=True, help="path to image file")
     parser.add_argument("--openai_key", type=str, required=True, help="key for chatgpt")
     parser.add_argument("--openai_proxy", default=None, type=str, help="proxy for chatgpt")
     parser.add_argument(
@@ -215,6 +219,7 @@ if __name__ == "__main__":
     grounded_checkpoint = args.grounded_checkpoint  # change the path of the model
     sam_checkpoint = args.sam_checkpoint
     image_path = args.input_image
     openai_key = args.openai_key
     openai_proxy = args.openai_proxy
     output_dir = args.output_dir
@@ -242,9 +247,14 @@ if __name__ == "__main__":
     # https://huggingface.co/spaces/xinyu1205/Tag2Text
     # but there are some bugs...
     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to("cuda")
-    caption = generate_caption(image_pil)
-    text_prompt = generate_tags(caption)
     print(f"Caption: {caption}")
     print(f"Tags: {text_prompt}")

     return image_pil, image
+def generate_caption(raw_image, device):
     # unconditional image captioning
+    if device == "cuda":
+        inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
+    else:
+        inputs = processor(raw_image, return_tensors="pt")
     out = blip_model.generate(**inputs)
     caption = processor.decode(out[0], skip_special_tokens=True)
     return caption
+def generate_tags(caption, split=',', max_tokens=100, model="gpt-3.5-turbo"):
     prompt = [
         {
             'role': 'system',
+            'content': 'Extract the unique nouns in the caption. Remove all the adjectives. ' + \
+                       f'List the nouns in singular form. Split them by "{split} ". ' + \
                        f'Caption: {caption}.'
         }
     ]
         "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
     )
     parser.add_argument("--input_image", type=str, required=True, help="path to image file")
+    parser.add_argument("--split", default=",", type=str, help="split for text prompt")
     parser.add_argument("--openai_key", type=str, required=True, help="key for chatgpt")
     parser.add_argument("--openai_proxy", default=None, type=str, help="proxy for chatgpt")
     parser.add_argument(
     grounded_checkpoint = args.grounded_checkpoint  # change the path of the model
     sam_checkpoint = args.sam_checkpoint
     image_path = args.input_image
+    split = args.split
     openai_key = args.openai_key
     openai_proxy = args.openai_proxy
     output_dir = args.output_dir
     # https://huggingface.co/spaces/xinyu1205/Tag2Text
     # but there are some bugs...
     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+    if device == "cuda":
+        blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to("cuda")
+    else:
+        blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+    caption = generate_caption(image_pil, device=device)
+    # Currently ", " is better for detecting single tags
+    # while ". " is a little worse in some case
+    text_prompt = generate_tags(caption, split=split)
     print(f"Caption: {caption}")
     print(f"Tags: {text_prompt}")

gradio_app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import gradio as gr
 import argparse
-import os
 import copy
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 # Grounding DINO
@@ -30,6 +32,10 @@ from io import BytesIO
 from diffusers import StableDiffusionInpaintPipeline
 from huggingface_hub import hf_hub_download
 def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
     args = SLConfig.fromfile(model_config_path)
     model = build_model(args)
@@ -42,6 +48,13 @@ def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
     _ = model.eval()
     return model
 def plot_boxes_to_image(image_pil, tgt):
     H, W = tgt["size"]
     boxes = tgt["boxes"]
@@ -135,14 +148,16 @@ def get_grounding_output(model, image, caption, box_threshold, text_threshold, w
     tokenized = tokenlizer(caption)
     # build pred
     pred_phrases = []
     for logit, box in zip(logits_filt, boxes_filt):
         pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
         if with_logits:
             pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
         else:
             pred_phrases.append(pred_phrase)
-    return boxes_filt, pred_phrases
 def show_mask(mask, ax, random_color=False):
     if random_color:
@@ -164,12 +179,11 @@ def show_box(box, ax, label):
 config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
 ckpt_repo_id = "ShilongLiu/GroundingDINO"
 ckpt_filenmae = "groundingdino_swint_ogc.pth"
-sam_checkpoint='/home/ecs-user/download/sam_vit_h_4b8939.pth'
 output_dir="outputs"
 device="cuda"
-def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold):
-    assert text_prompt, 'text_prompt is not found!'
     # make dir
     os.makedirs(output_dir, exist_ok=True)
@@ -177,18 +191,29 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
     image_pil, image = load_image(image_path.convert("RGB"))
     # load model
     model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
     # visualize raw image
     image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
     # run grounding dino model
-    boxes_filt, pred_phrases = get_grounding_output(
         model, image, text_prompt, box_threshold, text_threshold, device=device
     )
     size = image_pil.size
-    if task_type == 'seg' or task_type == 'inpainting':
         # initialize SAM
         predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
         image = np.array(image_path)
@@ -201,6 +226,16 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
             boxes_filt[i][2:] += boxes_filt[i][:2]
         boxes_filt = boxes_filt.cpu()
         transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
         masks, _, _ = predictor.predict_torch(
@@ -224,7 +259,7 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
         image_with_box.save(image_path)
         image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
         return image_result
-    elif task_type == 'seg':
         assert sam_checkpoint, 'sam_checkpoint is not found!'
         # draw output image
@@ -234,6 +269,8 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
             show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
         for box, label in zip(boxes_filt, pred_phrases):
             show_box(box.numpy(), plt.gca(), label)
         plt.axis('off')
         image_path = os.path.join(output_dir, "grounding_dino_output.jpg")
         plt.savefig(image_path, bbox_inches="tight")
@@ -242,16 +279,24 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
     elif task_type == 'inpainting':
         assert inpaint_prompt, 'inpaint_prompt is not found!'
         # inpainting pipeline
-        mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
         mask_pil = Image.fromarray(mask)
-        image_pil = Image.fromarray(image)
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
         "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
         )
         pipe = pipe.to("cuda")
         image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
         image_path = os.path.join(output_dir, "grounded_sam_inpainting_output.jpg")
         image.save(image_path)
         image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
@@ -264,15 +309,16 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
     parser.add_argument("--debug", action="store_true", help="using debug mode")
     parser.add_argument("--share", action="store_true", help="share the app")
     args = parser.parse_args()
     block = gr.Blocks().queue()
     with block:
         with gr.Row():
             with gr.Column():
-                input_image = gr.Image(source='upload', type="pil")
-                text_prompt = gr.Textbox(label="Detection Prompt")
-                task_type = gr.Textbox(label="task type: det/seg/inpainting")
                 inpaint_prompt = gr.Textbox(label="Inpaint Prompt")
                 run_button = gr.Button(label="Run")
                 with gr.Accordion("Advanced options", open=False):
@@ -282,6 +328,10 @@ if __name__ == "__main__":
                     text_threshold = gr.Slider(
                         label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
                     )
             with gr.Column():
                 gallery = gr.outputs.Image(
@@ -289,7 +339,7 @@ if __name__ == "__main__":
                 ).style(full_width=True, full_height=True)
         run_button.click(fn=run_grounded_sam, inputs=[
-                        input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold], outputs=[gallery])
-    block.launch(server_name='0.0.0.0', server_port=7589, debug=args.debug, share=args.share)

+import os
+# os.system('pip install v0.1.0-alpha2.tar.gz')
 import gradio as gr
 import argparse
 import copy
 import numpy as np
 import torch
+import torchvision
 from PIL import Image, ImageDraw, ImageFont
 # Grounding DINO
 from diffusers import StableDiffusionInpaintPipeline
 from huggingface_hub import hf_hub_download
+# BLIP
+from transformers import BlipProcessor, BlipForConditionalGeneration
 def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
     args = SLConfig.fromfile(model_config_path)
     model = build_model(args)
     _ = model.eval()
     return model
+def generate_caption(processor, blip_model, raw_image, device="cuda"):
+    """Generate an unconditional caption for ``raw_image`` with BLIP.
+
+    Args:
+        processor: Object used to encode the image and decode the output ids.
+        blip_model: Model whose ``generate`` produces the caption token ids.
+        raw_image: Image handed to ``processor``.
+        device: When ``"cuda"`` (the default, matching the previous
+            hard-coded behaviour) the inputs are moved to CUDA as float16;
+            otherwise they are left as the processor returned them.
+
+    Returns:
+        The decoded caption with special tokens skipped.
+    """
+    inputs = processor(raw_image, return_tensors="pt")
+    if device == "cuda":
+        # NOTE(review): assumes blip_model was itself loaded as fp16 on CUDA.
+        inputs = inputs.to("cuda", torch.float16)
+    out = blip_model.generate(**inputs)
+    caption = processor.decode(out[0], skip_special_tokens=True)
+    return caption
 def plot_boxes_to_image(image_pil, tgt):
     H, W = tgt["size"]
     boxes = tgt["boxes"]
     tokenized = tokenlizer(caption)
     # build pred
     pred_phrases = []
+    scores = []
     for logit, box in zip(logits_filt, boxes_filt):
         pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
         if with_logits:
             pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
         else:
             pred_phrases.append(pred_phrase)
+        scores.append(logit.max().item())
+    return boxes_filt, torch.Tensor(scores), pred_phrases
 def show_mask(mask, ax, random_color=False):
     if random_color:
 config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
 ckpt_repo_id = "ShilongLiu/GroundingDINO"
 ckpt_filenmae = "groundingdino_swint_ogc.pth"
+sam_checkpoint='sam_vit_h_4b8939.pth'
 output_dir="outputs"
 device="cuda"
+def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode):
     # make dir
     os.makedirs(output_dir, exist_ok=True)
     image_pil, image = load_image(image_path.convert("RGB"))
     # load model
     model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
+    # model = load_model(config_file, ckpt_filenmae, device=device)
     # visualize raw image
     image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
+    if task_type == 'automatic':
+        # generate caption and tags
+        # use Tag2Text can generate better captions
+        # https://huggingface.co/spaces/xinyu1205/Tag2Text
+        # but there are some bugs...
+        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+        blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to("cuda")
+        text_prompt = generate_caption(processor, blip_model, image_pil)
+        print(f"Caption: {text_prompt}")
     # run grounding dino model
+    boxes_filt, scores, pred_phrases = get_grounding_output(
         model, image, text_prompt, box_threshold, text_threshold, device=device
     )
     size = image_pil.size
+    if task_type == 'seg' or task_type == 'inpainting' or task_type == 'automatic':
         # initialize SAM
         predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
         image = np.array(image_path)
             boxes_filt[i][2:] += boxes_filt[i][:2]
         boxes_filt = boxes_filt.cpu()
+        if task_type == 'automatic':
+            # use NMS to handle overlapped boxes
+            print(f"Before NMS: {boxes_filt.shape[0]} boxes")
+            nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
+            boxes_filt = boxes_filt[nms_idx]
+            pred_phrases = [pred_phrases[idx] for idx in nms_idx]
+            print(f"After NMS: {boxes_filt.shape[0]} boxes")
+            print(f"Revise caption with number: {text_prompt}")
         transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
         masks, _, _ = predictor.predict_torch(
         image_with_box.save(image_path)
         image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
         return image_result
+    elif task_type == 'seg' or task_type == 'automatic':
         assert sam_checkpoint, 'sam_checkpoint is not found!'
         # draw output image
             show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
         for box, label in zip(boxes_filt, pred_phrases):
             show_box(box.numpy(), plt.gca(), label)
+        if task_type == 'automatic':
+            plt.title(text_prompt)
         plt.axis('off')
         image_path = os.path.join(output_dir, "grounding_dino_output.jpg")
         plt.savefig(image_path, bbox_inches="tight")
     elif task_type == 'inpainting':
         assert inpaint_prompt, 'inpaint_prompt is not found!'
         # inpainting pipeline
+        if inpaint_mode == 'merge':
+            masks = torch.sum(masks, dim=0).unsqueeze(0)
+            masks = torch.where(masks > 0, True, False)
+        else:
+            mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
         mask_pil = Image.fromarray(mask)
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
         "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
         )
         pipe = pipe.to("cuda")
+        image_pil = image_pil.resize((512, 512))
+        mask_pil = mask_pil.resize((512, 512))
         image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
+        image = image.resize(size)
         image_path = os.path.join(output_dir, "grounded_sam_inpainting_output.jpg")
         image.save(image_path)
         image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
     parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
     parser.add_argument("--debug", action="store_true", help="using debug mode")
     parser.add_argument("--share", action="store_true", help="share the app")
+    parser.add_argument('--port', type=int, default=7589, help='port to run the server')
     args = parser.parse_args()
     block = gr.Blocks().queue()
     with block:
         with gr.Row():
             with gr.Column():
+                input_image = gr.Image(source='upload', type="pil", value="assets/demo1.jpg")
+                task_type = gr.Dropdown(["det", "seg", "inpainting", "automatic"], value="automatic", label="task_type")
+                text_prompt = gr.Textbox(label="Text Prompt")
                 inpaint_prompt = gr.Textbox(label="Inpaint Prompt")
                 run_button = gr.Button(label="Run")
                 with gr.Accordion("Advanced options", open=False):
                     text_threshold = gr.Slider(
                         label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
                     )
+                    iou_threshold = gr.Slider(
+                        label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.5, step=0.001
+                    )
+                    inpaint_mode = gr.Dropdown(["merge", "first"], value="merge", label="inpaint_mode")
             with gr.Column():
                 gallery = gr.outputs.Image(
                 ).style(full_width=True, full_height=True)
         run_button.click(fn=run_grounded_sam, inputs=[
+                        input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode], outputs=[gallery])
+    block.launch(server_name='0.0.0.0', server_port=args.port, debug=args.debug, share=args.share)

gradio_auto_label.py ADDED Viewed

	@@ -0,0 +1,392 @@

+import gradio as gr
+import json
+import argparse
+import os
+import copy
+import numpy as np
+import torch
+import torchvision
+from PIL import Image, ImageDraw, ImageFont
+import openai
+# Grounding DINO
+import GroundingDINO.groundingdino.datasets.transforms as T
+from GroundingDINO.groundingdino.models import build_model
+from GroundingDINO.groundingdino.util import box_ops
+from GroundingDINO.groundingdino.util.slconfig import SLConfig
+from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
+from transformers import BlipProcessor, BlipForConditionalGeneration
+# segment anything
+from segment_anything import build_sam, SamPredictor
+from segment_anything.utils.amg import remove_small_regions
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+# diffusers
+import PIL
+import requests
+import torch
+from io import BytesIO
+from huggingface_hub import hf_hub_download
+from sys import platform
+#macos
+if platform == 'darwin':
+    import matplotlib
+    matplotlib.use('agg')
+def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
+    """Build GroundingDINO from a config and load weights fetched from the Hub.
+
+    Args:
+        model_config_path: Path to the GroundingDINO config file.
+        repo_id: Hugging Face Hub repository that holds the checkpoint.
+        filename: Checkpoint file name inside ``repo_id``.
+        device: Device name stored on the config before the model is built.
+
+    Returns:
+        The model, switched to eval mode.
+    """
+    args = SLConfig.fromfile(model_config_path)
+    # Set the device before building, as load_model() does; assigning it
+    # after build_model() cannot influence how the model is constructed.
+    args.device = device
+    model = build_model(args)
+    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
+    # Deserialize onto CPU regardless of where the checkpoint was saved.
+    checkpoint = torch.load(cache_file, map_location='cpu')
+    # strict=False: mismatched keys are reported in the log, not raised.
+    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
+    print("Model loaded from {} \n => {}".format(cache_file, log))
+    model.eval()
+    return model
+def plot_boxes_to_image(image_pil, tgt):
+    """Draw labelled boxes onto ``image_pil`` and build a matching box mask.
+
+    ``image_pil`` is drawn on in place.
+
+    Args:
+        image_pil: PIL image to annotate.
+        tgt: Mapping with ``"size"`` (H, W), ``"boxes"`` and ``"labels"``.
+            Each box is scaled by (W, H, W, H) and then converted from
+            centre/size form to corner form.
+
+    Returns:
+        ``(image_pil, mask)`` where ``mask`` is an "L" image of the same size
+        with every box rectangle filled with 255.
+
+    Raises:
+        AssertionError: If ``boxes`` and ``labels`` differ in length.
+    """
+    H, W = tgt["size"]
+    boxes = tgt["boxes"]
+    labels = tgt["labels"]
+    assert len(boxes) == len(labels), "boxes and labels must have same length"
+    draw = ImageDraw.Draw(image_pil)
+    mask = Image.new("L", image_pil.size, 0)
+    mask_draw = ImageDraw.Draw(mask)
+    # Loop invariants: the default font, the measuring API and the scale.
+    font = ImageFont.load_default()
+    has_textbbox = hasattr(font, "getbbox")
+    scale = torch.Tensor([W, H, W, H])
+    for box, label in zip(boxes, labels):
+        text = str(label)
+        # from 0..1 to 0..W, 0..H
+        box = box * scale
+        # from xywh to xyxy
+        box[:2] -= box[2:] / 2
+        box[2:] += box[:2]
+        # random color
+        color = tuple(np.random.randint(0, 255, size=3).tolist())
+        x0, y0, x1, y1 = (int(v) for v in box)
+        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
+        if has_textbbox:
+            bbox = draw.textbbox((x0, y0), text, font)
+        else:
+            # Fallback for Pillow versions whose fonts lack getbbox().
+            w, h = draw.textsize(text, font)
+            bbox = (x0, y0, w + x0, y0 + h)
+        # Filled background so the white label stays readable.
+        draw.rectangle(bbox, fill=color)
+        draw.text((x0, y0), text, fill="white")
+        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
+    return image_pil, mask
+def load_image(image_path):
+    """Run the GroundingDINO preprocessing pipeline on an image.
+
+    Despite the parameter name, ``image_path`` is used directly as the image
+    object; nothing is opened from disk here.
+
+    Returns:
+        ``(image_pil, image)``: the input object unchanged, and the result of
+        resize -> to-tensor -> normalize applied to it.
+    """
+    preprocess = T.Compose([
+        T.RandomResize([800], max_size=1333),
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+    ])
+    # The transform takes (image, target); there is no target here.
+    tensor, _ = preprocess(image_path, None)
+    return image_path, tensor
+def load_model(model_config_path, model_checkpoint_path, device):
+    """Build GroundingDINO from a config and load weights from a local file.
+
+    Args:
+        model_config_path: Path to the GroundingDINO config file.
+        model_checkpoint_path: Path to a checkpoint with a ``"model"`` entry.
+        device: Device name stored on the config before the model is built.
+
+    Returns:
+        The model, switched to eval mode.
+    """
+    args = SLConfig.fromfile(model_config_path)
+    args.device = device
+    model = build_model(args)
+    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+    # strict=False tolerates key mismatches; the load report is not needed.
+    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+    model.eval()
+    return model
+def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
+    """Run GroundingDINO on one image and keep boxes above ``box_threshold``.
+
+    Args:
+        model: Grounding model exposing ``tokenizer`` and callable as
+            ``model(images, captions=[...])``.
+        image: Image tensor; a batch dimension is added before inference.
+        caption: Text prompt. It is lower-cased, stripped and given a
+            trailing "." if it lacks one.
+        box_threshold: Minimum per-query maximum score for a box to be kept.
+        text_threshold: Per-token score threshold used to build each phrase.
+        with_logits: Append "(score)" (first four characters) to each phrase.
+        device: Device the model and image are moved to for inference.
+
+    Returns:
+        ``(boxes_filt, scores, pred_phrases)``: the kept boxes, a tensor with
+        the maximum score of each kept box, and one phrase per kept box.
+    """
+    caption = caption.lower().strip()
+    if not caption.endswith("."):
+        caption = caption + "."
+    model = model.to(device)
+    image = image.to(device)
+    with torch.no_grad():
+        outputs = model(image[None], captions=[caption])
+    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
+    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
+    # filter output; boolean-mask indexing yields new tensors, so callers
+    # may modify the returned boxes without touching the model outputs
+    filt_mask = logits.max(dim=1)[0] > box_threshold
+    logits_filt = logits[filt_mask]  # num_filt, 256
+    boxes_filt = boxes[filt_mask]  # num_filt, 4
+    # get phrase
+    tokenlizer = model.tokenizer
+    tokenized = tokenlizer(caption)
+    # build pred
+    pred_phrases = []
+    scores = []
+    for logit in logits_filt:
+        score = logit.max().item()
+        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
+        if with_logits:
+            pred_phrase = pred_phrase + f"({str(score)[:4]})"
+        pred_phrases.append(pred_phrase)
+        scores.append(score)
+    return boxes_filt, torch.Tensor(scores), pred_phrases
+def show_mask(mask, ax, random_color=False):
+    """Show ``mask`` on ``ax`` as an RGBA overlay with alpha 0.6.
+
+    Args:
+        mask: Array whose last two dimensions are (height, width).
+        ax: Object with an ``imshow`` method that receives the overlay.
+        random_color: Use a random RGB colour instead of the fixed blue.
+    """
+    if random_color:
+        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+    else:
+        rgba = np.array([30/255, 144/255, 255/255, 0.6])
+    height, width = mask.shape[-2:]
+    # Broadcast (h, w, 1) against (1, 1, 4) to colour every mask pixel.
+    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
+    ax.imshow(overlay)
+def save_mask_data(output_dir, mask_list, box_list, label_list):
+    """Write an index-coloured mask image and a JSON description of it.
+
+    Args:
+        output_dir: Directory that receives ``mask.jpg`` and ``mask.json``.
+        mask_list: Tensor of masks; its last two dimensions give the image
+            size and element ``[0]`` of each mask is used as the boolean map.
+        box_list: Boxes (tensors), one per label.
+        label_list: Labels formatted as ``"name(logit)"``.
+
+    Returns:
+        ``(mask_img_path, mask_json_path)``.
+
+    Raises:
+        ValueError: If a label has no "(" or its logit is not a float.
+    """
+    value = 0  # 0 for background
+    mask_img = torch.zeros(mask_list.shape[-2:])
+    for idx, mask in enumerate(mask_list):
+        # Later masks overwrite earlier ones where they overlap.
+        mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
+    fig = plt.figure(figsize=(10, 10))
+    plt.imshow(mask_img.numpy())
+    plt.axis('off')
+    mask_img_path = os.path.join(output_dir, 'mask.jpg')
+    plt.savefig(mask_img_path, bbox_inches="tight", dpi=300, pad_inches=0.0)
+    # Release the figure; otherwise every call leaves one open.
+    plt.close(fig)
+    json_data = [{
+        'value': value,
+        'label': 'background'
+    }]
+    for label, box in zip(label_list, box_list):
+        value += 1
+        # Split on the last "(" so a phrase containing "(" still parses.
+        name, logit = label.rsplit('(', 1)
+        logit = logit[:-1] # the last is ')'
+        json_data.append({
+            'value': value,
+            'label': name,
+            'logit': float(logit),
+            'box': box.numpy().tolist(),
+        })
+    mask_json_path = os.path.join(output_dir, 'mask.json')
+    with open(mask_json_path, 'w') as f:
+        json.dump(json_data, f)
+    return mask_img_path, mask_json_path
+def show_box(box, ax, label):
+    """Outline ``box`` in green on ``ax`` and write ``label`` at (x0, y0).
+
+    ``box`` is indexed as (x0, y0, x1, y1).
+    """
+    left, top, right, bottom = box[0], box[1], box[2], box[3]
+    outline = plt.Rectangle((left, top), right - left, bottom - top, edgecolor='green', facecolor=(0,0,0,0), lw=2)
+    ax.add_patch(outline)
+    ax.text(left, top, label)
+# GroundingDINO config file and the Hub location of its checkpoint.
+config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
+ckpt_repo_id = "ShilongLiu/GroundingDINO"
+# (sic) the misspelled name is what the rest of this file references.
+ckpt_filenmae = "groundingdino_swint_ogc.pth"
+# SAM checkpoint path, output directory and inference device.
+sam_checkpoint = 'sam_vit_h_4b8939.pth'
+output_dir = "outputs"
+device = "cpu"
+# BLIP captioning processor and model, loaded once when the module loads.
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+def generate_caption(raw_image):
+    """Caption ``raw_image`` using the module-level BLIP processor and model.
+
+    Returns:
+        The decoded caption with special tokens skipped.
+    """
+    # Unconditional captioning: only the image is passed to the processor.
+    encoded = processor(raw_image, return_tensors="pt")
+    generated = blip_model.generate(**encoded)
+    return processor.decode(generated[0], skip_special_tokens=True)
+def generate_tags(caption, split=',', max_tokens=100, model="gpt-3.5-turbo", openai_key=''):
+    """Ask the chat model to extract the nouns of ``caption`` as tags.
+
+    Sets ``openai.api_key`` to ``openai_key`` before the request.
+
+    Args:
+        caption: Caption to extract nouns from.
+        split: Separator the model is told to put between nouns.
+        max_tokens: Token limit passed to the completion request.
+        model: Chat model name.
+        openai_key: API key assigned to the ``openai`` module.
+
+    Returns:
+        The reply text after its last ":" (the whole reply if there is
+        none), stripped of surrounding whitespace.
+    """
+    openai.api_key = openai_key
+    instruction = (
+        'Extract the unique nouns in the caption. Remove all the adjectives. '
+        f'List the nouns in singular form. Split them by "{split} ". '
+        f'Caption: {caption}.'
+    )
+    messages = [{'role': 'system', 'content': instruction}]
+    response = openai.ChatCompletion.create(model=model, messages=messages, temperature=0.6, max_tokens=max_tokens)
+    answer = response['choices'][0]['message']['content']
+    # sometimes return with "noun: xxx, xxx, xxx"
+    return answer.split(':')[-1].strip()
+def check_caption(caption, pred_phrases, max_tokens=100, model="gpt-3.5-turbo"):
+    """Ask the chat model to correct object counts stated in ``caption``.
+
+    No API key is set here; the request uses whatever the ``openai`` module
+    currently holds.
+
+    Args:
+        caption: Caption whose numbers may be wrong.
+        pred_phrases: Detected phrases; the text before the first "(" of
+            each is taken as the object name.
+        max_tokens: Token limit passed to the completion request.
+        model: Chat model name.
+
+    Returns:
+        The reply text after its last ":" (the whole reply if there is
+        none), stripped of surrounding whitespace.
+    """
+    from collections import Counter
+    # Counter keeps first-seen order, so the prompt is the same on every
+    # run; iterating a set of strings gave a hash-dependent order.
+    object_counts = Counter(obj.split('(')[0] for obj in pred_phrases)
+    object_num = ', '.join(f'{count} {obj}' for obj, count in object_counts.items())
+    print(f"Correct object number: {object_num}")
+    prompt = [
+        {
+            'role': 'system',
+            'content': 'Revise the number in the caption if it is wrong. ' + \
+                       f'Caption: {caption}. ' + \
+                       f'True object number: {object_num}. ' + \
+                       'Only give the revised caption: '
+        }
+    ]
+    response = openai.ChatCompletion.create(model=model, messages=prompt, temperature=0.6, max_tokens=max_tokens)
+    reply = response['choices'][0]['message']['content']
+    # sometimes return with "Caption: xxx, xxx, xxx"
+    caption = reply.split(':')[-1].strip()
+    return caption
+def run_grounded_sam(image_path, openai_key, box_threshold, text_threshold, iou_threshold, area_threshold):
+    """Caption, tag, detect and segment one uploaded image.
+
+    Pipeline: BLIP caption -> chat-model tags -> GroundingDINO boxes -> NMS
+    -> caption revision -> SAM masks -> small-region clean-up -> rendering.
+    Files are written into the module-level ``output_dir``.
+
+    Args:
+        image_path: The uploaded PIL image (not a path, despite the name).
+        openai_key: API key used for the chat-model requests.
+        box_threshold: Box score threshold for GroundingDINO.
+        text_threshold: Token score threshold for phrase extraction.
+        iou_threshold: IoU threshold for non-maximum suppression.
+        area_threshold: Holes and islands smaller than this many pixels are
+            removed from every mask.
+
+    Returns:
+        ``(image_result, mask_img, caption, tags)``: the annotated image and
+        the mask image as RGB arrays, the revised caption and the tag string.
+
+    Raises:
+        AssertionError: If ``openai_key`` or ``sam_checkpoint`` is empty.
+    """
+    assert openai_key, 'Openai key is not found!'
+    # Checked before the checkpoint is used rather than after prediction.
+    assert sam_checkpoint, 'sam_checkpoint is not found!'
+    # make dir
+    os.makedirs(output_dir, exist_ok=True)
+    # load image
+    image_pil, image = load_image(image_path.convert("RGB"))
+    # load model
+    model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
+    # visualize raw image
+    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
+    caption = generate_caption(image_pil)
+    # Currently ", " is better for detecting single tags
+    # while ". " is a little worse in some case
+    split = ','
+    tags = generate_tags(caption, split=split, openai_key=openai_key)
+    # run grounding dino model
+    boxes_filt, scores, pred_phrases = get_grounding_output(
+        model, image, tags, box_threshold, text_threshold, device=device
+    )
+    # initialize SAM
+    predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
+    # Give SAM the same RGB-converted image that was captioned and detected
+    # on, not the unconverted upload.
+    image = np.array(image_pil)
+    predictor.set_image(image)
+    W, H = image_pil.size
+    # Scale the boxes to pixels, then convert centre/size to corners.
+    boxes_filt = boxes_filt * torch.Tensor([W, H, W, H])
+    boxes_filt[:, :2] -= boxes_filt[:, 2:] / 2
+    boxes_filt[:, 2:] += boxes_filt[:, :2]
+    boxes_filt = boxes_filt.cpu()
+    # use NMS to handle overlapped boxes
+    print(f"Before NMS: {boxes_filt.shape[0]} boxes")
+    nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
+    boxes_filt = boxes_filt[nms_idx]
+    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
+    print(f"After NMS: {boxes_filt.shape[0]} boxes")
+    caption = check_caption(caption, pred_phrases)
+    print(f"Revise caption with number: {caption}")
+    transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
+    masks, _, _ = predictor.predict_torch(
+        point_coords = None,
+        point_labels = None,
+        boxes = transformed_boxes,
+        multimask_output = False,
+    )
+    # area threshold: remove the mask when area < area_thresh (in pixels)
+    new_masks = []
+    for mask in masks:
+        # reshape to be used in remove_small_regions()
+        mask = mask.cpu().numpy().squeeze()
+        mask, _ = remove_small_regions(mask, area_threshold, mode="holes")
+        mask, _ = remove_small_regions(mask, area_threshold, mode="islands")
+        new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+    # One entry per kept box, each with a leading singleton dimension.
+    # NOTE(review): torch.stack raises if no box survived; confirm whether
+    # an empty detection should be reported to the user instead.
+    masks = torch.stack(new_masks, dim=0)
+    # draw output image
+    fig = plt.figure(figsize=(10, 10))
+    plt.imshow(image)
+    for mask in masks:
+        show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
+    for box, label in zip(boxes_filt, pred_phrases):
+        show_box(box.numpy(), plt.gca(), label)
+    plt.axis('off')
+    output_path = os.path.join(output_dir, "grounding_dino_output.jpg")
+    plt.savefig(output_path, bbox_inches="tight")
+    # Release the figure; this function runs once per request.
+    plt.close(fig)
+    image_result = cv2.cvtColor(cv2.imread(output_path), cv2.COLOR_BGR2RGB)
+    mask_img_path, _ = save_mask_data(output_dir, masks, boxes_filt, pred_phrases)
+    mask_img = cv2.cvtColor(cv2.imread(mask_img_path), cv2.COLOR_BGR2RGB)
+    return image_result, mask_img, caption, tags
+if __name__ == "__main__":
+    # Command-line flags only control how the Gradio server is launched.
+    parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
+    parser.add_argument("--debug", action="store_true", help="using debug mode")
+    parser.add_argument("--share", action="store_true", help="share the app")
+    # The default keeps the port that used to be hard-coded.
+    parser.add_argument("--port", type=int, default=7589, help="port to run the server")
+    args = parser.parse_args()
+    block = gr.Blocks().queue()
+    with block:
+        with gr.Row():
+            # Left column: inputs and thresholds.
+            with gr.Column():
+                input_image = gr.Image(source='upload', type="pil")
+                openai_key = gr.Textbox(label="OpenAI key")
+                run_button = gr.Button(label="Run")
+                with gr.Accordion("Advanced options", open=False):
+                    box_threshold = gr.Slider(
+                        label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
+                    )
+                    text_threshold = gr.Slider(
+                        label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
+                    )
+                    iou_threshold = gr.Slider(
+                        label="IoU Threshold", minimum=0.0, maximum=1.0, value=0.5, step=0.001
+                    )
+                    area_threshold = gr.Slider(
+                        label="Area Threshold", minimum=0.0, maximum=2500, value=100, step=10
+                    )
+            # Right column: caption, tags and the two result images.
+            with gr.Column():
+                image_caption = gr.Textbox(label="Image Caption")
+                identified_labels = gr.Textbox(label="Key objects extracted by ChatGPT")
+                gallery = gr.outputs.Image(
+                    type="pil",
+                ).style(full_width=True, full_height=True)
+                mask_gallery = gr.outputs.Image(
+                    type="pil",
+                ).style(full_width=True, full_height=True)
+        # The output order matches run_grounded_sam's return tuple.
+        run_button.click(fn=run_grounded_sam, inputs=[
+                        input_image, openai_key, box_threshold, text_threshold, iou_threshold, area_threshold],
+                        outputs=[gallery, mask_gallery, image_caption, identified_labels])
+    block.launch(server_name='0.0.0.0', server_port=args.port, debug=args.debug, share=args.share)

grounded_sam.ipynb CHANGED Viewed

@@ -53,12 +53,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 187,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
     "\n",
     "# If you have multiple GPUs, you can set the GPU to use here.\n",
     "# The default is to use the first GPU, which is usually GPU 0.\n",
     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
@@ -85,7 +94,7 @@
     "from GroundingDINO.groundingdino.util import box_ops\n",
     "from GroundingDINO.groundingdino.util.slconfig import SLConfig\n",
     "from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap\n",
-    "from groundingdino.util.inference import annotate, load_image, predict\n",
     "\n",
     "import supervision as sv\n",
     "\n",

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os, sys\n",
     "\n",
+    "sys.path.append(os.path.join(os.getcwd(), \"GroundingDINO\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 187,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# If you have multiple GPUs, you can set the GPU to use here.\n",
     "# The default is to use the first GPU, which is usually GPU 0.\n",
     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
     "from GroundingDINO.groundingdino.util import box_ops\n",
     "from GroundingDINO.groundingdino.util.slconfig import SLConfig\n",
     "from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap\n",
+    "from GroundingDINO.groundingdino.util.inference import annotate, load_image, predict\n",
     "\n",
     "import supervision as sv\n",
     "\n",

grounded_sam_inpainting_demo.py CHANGED Viewed

@@ -125,6 +125,7 @@ if __name__ == "__main__":
     parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
     parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
     parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
     args = parser.parse_args()
@@ -138,6 +139,7 @@ if __name__ == "__main__":
     output_dir = args.output_dir
     box_threshold = args.box_threshold
     text_threshold = args.box_threshold
     device = args.device
     # make dir
@@ -181,7 +183,11 @@ if __name__ == "__main__":
     # masks: [1, 1, 512, 512]
     # inpainting pipeline
-    mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
     mask_pil = Image.fromarray(mask)
     image_pil = Image.fromarray(image)
@@ -190,8 +196,11 @@ if __name__ == "__main__":
     )
     pipe = pipe.to("cuda")
     # prompt = "A sofa, high quality, detailed"
     image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
     image.save(os.path.join(output_dir, "grounded_sam_inpainting_output.jpg"))
     # draw output image

     parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
     parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
+    parser.add_argument("--inpaint_mode", type=str, default="first", help="inpaint mode")
     parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
     args = parser.parse_args()
     output_dir = args.output_dir
     box_threshold = args.box_threshold
     text_threshold = args.box_threshold
+    inpaint_mode = args.inpaint_mode
     device = args.device
     # make dir
     # masks: [1, 1, 512, 512]
     # inpainting pipeline
+    if inpaint_mode == 'merge':
+        masks = torch.sum(masks, dim=0).unsqueeze(0)
+        masks = torch.where(masks > 0, True, False)
+    else:
+        mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
     mask_pil = Image.fromarray(mask)
     image_pil = Image.fromarray(image)
     )
     pipe = pipe.to("cuda")
+    image_pil = image_pil.resize((512, 512))
+    mask_pil = mask_pil.resize((512, 512))
     # prompt = "A sofa, high quality, detailed"
     image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
+    image = image.resize(size)
     image.save(os.path.join(output_dir, "grounded_sam_inpainting_output.jpg"))
     # draw output image

grounded_sam_whisper_demo.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import argparse
+import os
+import copy
+import numpy as np
+import json
+import torch
+import torchvision
+from PIL import Image, ImageDraw, ImageFont
+# Grounding DINO
+import GroundingDINO.groundingdino.datasets.transforms as T
+from GroundingDINO.groundingdino.models import build_model
+from GroundingDINO.groundingdino.util import box_ops
+from GroundingDINO.groundingdino.util.slconfig import SLConfig
+from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
+# segment anything
+from segment_anything import build_sam, SamPredictor
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+# whisper
+import whisper
+def load_image(image_path):
+    # load image
+    image_pil = Image.open(image_path).convert("RGB")  # load image
+    transform = T.Compose(
+        [
+            T.RandomResize([800], max_size=1333),
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ]
+    )
+    image, _ = transform(image_pil, None)  # 3, h, w
+    return image_pil, image
+def load_model(model_config_path, model_checkpoint_path, device):
+    args = SLConfig.fromfile(model_config_path)
+    args.device = device
+    model = build_model(args)
+    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+    print(load_res)
+    _ = model.eval()
+    return model
+def get_grounding_output(model, image, caption, box_threshold, text_threshold,device="cpu"):
+    caption = caption.lower()
+    caption = caption.strip()
+    if not caption.endswith("."):
+        caption = caption + "."
+    model = model.to(device)
+    image = image.to(device)
+    with torch.no_grad():
+        outputs = model(image[None], captions=[caption])
+    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
+    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
+    logits.shape[0]
+    # filter output
+    logits_filt = logits.clone()
+    boxes_filt = boxes.clone()
+    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+    logits_filt = logits_filt[filt_mask]  # num_filt, 256
+    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
+    logits_filt.shape[0]
+    # get phrase
+    tokenlizer = model.tokenizer
+    tokenized = tokenlizer(caption)
+    # build pred
+    pred_phrases = []
+    scores = []
+    for logit, box in zip(logits_filt, boxes_filt):
+        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
+        pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
+        scores.append(logit.max().item())
+    return boxes_filt, torch.Tensor(scores), pred_phrases
+def show_mask(mask, ax, random_color=False):
+    if random_color:
+        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+    else:
+        color = np.array([30/255, 144/255, 255/255, 0.6])
+    h, w = mask.shape[-2:]
+    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+    ax.imshow(mask_image)
+def show_box(box, ax, label):
+    x0, y0 = box[0], box[1]
+    w, h = box[2] - box[0], box[3] - box[1]
+    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
+    ax.text(x0, y0, label)
+def save_mask_data(output_dir, mask_list, box_list, label_list):
+    value = 0  # 0 for background
+    mask_img = torch.zeros(mask_list.shape[-2:])
+    for idx, mask in enumerate(mask_list):
+        mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
+    plt.figure(figsize=(10, 10))
+    plt.imshow(mask_img.numpy())
+    plt.axis('off')
+    plt.savefig(os.path.join(output_dir, 'mask.jpg'), bbox_inches="tight", dpi=300, pad_inches=0.0)
+    json_data = [{
+        'value': value,
+        'label': 'background'
+    }]
+    for label, box in zip(label_list, box_list):
+        value += 1
+        name, logit = label.split('(')
+        logit = logit[:-1] # the last is ')'
+        json_data.append({
+            'value': value,
+            'label': name,
+            'logit': float(logit),
+            'box': box.numpy().tolist(),
+        })
+    with open(os.path.join(output_dir, 'mask.json'), 'w') as f:
+        json.dump(json_data, f)
+def speech_recognition(speech_file, model):
+    # whisper
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(speech_file)
+    audio = whisper.pad_or_trim(audio)
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    speech_language = max(probs, key=probs.get)
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    # print the recognized text
+    speech_text = result.text
+    return speech_text, speech_language
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
+    parser.add_argument("--config", type=str, required=True, help="path to config file")
+    parser.add_argument(
+        "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
+    )
+    parser.add_argument(
+        "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
+    )
+    parser.add_argument("--input_image", type=str, required=True, help="path to image file")
+    parser.add_argument("--speech_file", type=str, required=True, help="speech file")
+    parser.add_argument(
+        "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
+    )
+    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
+    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
+    parser.add_argument("--iou_threshold", type=float, default=0.5, help="iou threshold")
+    parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
+    args = parser.parse_args()
+    # cfg
+    config_file = args.config  # change the path of the model config file
+    grounded_checkpoint = args.grounded_checkpoint  # change the path of the model
+    sam_checkpoint = args.sam_checkpoint
+    image_path = args.input_image
+    output_dir = args.output_dir
+    box_threshold = args.box_threshold
+    text_threshold = args.text_threshold
+    iou_threshold = args.iou_threshold
+    device = args.device
+    # load speech
+    whisper_model = whisper.load_model("base")
+    speech_text, speech_language = speech_recognition(args.speech_file, whisper_model)
+    print(f"speech_text: {speech_text}")
+    print(f"speech_language: {speech_language}")
+    # make dir
+    os.makedirs(output_dir, exist_ok=True)
+    # load image
+    image_pil, image = load_image(image_path)
+    # load model
+    model = load_model(config_file, grounded_checkpoint, device=device)
+    # visualize raw image
+    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
+    # run grounding dino model
+    text_prompt = speech_text
+    boxes_filt, scores, pred_phrases = get_grounding_output(
+        model, image, text_prompt, box_threshold, text_threshold, device=device
+    )
+    # initialize SAM
+    predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint).to(args.device))
+    image = cv2.imread(image_path)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    predictor.set_image(image)
+    size = image_pil.size
+    H, W = size[1], size[0]
+    for i in range(boxes_filt.size(0)):
+        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+        boxes_filt[i][2:] += boxes_filt[i][:2]
+    boxes_filt = boxes_filt.cpu()
+    # use NMS to handle overlapped boxes
+    print(f"Before NMS: {boxes_filt.shape[0]} boxes")
+    nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
+    boxes_filt = boxes_filt[nms_idx]
+    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
+    print(f"After NMS: {boxes_filt.shape[0]} boxes")
+    transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
+    masks, _, _ = predictor.predict_torch(
+        point_coords = None,
+        point_labels = None,
+        boxes = transformed_boxes.to(args.device),
+        multimask_output = False,
+    )
+    # draw output image
+    plt.figure(figsize=(10, 10))
+    plt.imshow(image)
+    for mask in masks:
+        show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
+    for box, label in zip(boxes_filt, pred_phrases):
+        show_box(box.numpy(), plt.gca(), label)
+    plt.title(speech_text)
+    plt.axis('off')
+    plt.savefig(
+        os.path.join(output_dir, "grounded_sam_whisper_output.jpg"),
+        bbox_inches="tight", dpi=300, pad_inches=0.0
+    )
+    save_mask_data(output_dir, masks, boxes_filt, pred_phrases)

grounded_dino_sam_inpainting_demo.py → grounded_sam_whisper_inpainting_demo.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import argparse
 import os
-import copy
 import numpy as np
 import torch
@@ -27,45 +27,12 @@ import torch
 from io import BytesIO
 from diffusers import StableDiffusionInpaintPipeline
-def plot_boxes_to_image(image_pil, tgt):
-    H, W = tgt["size"]
-    boxes = tgt["boxes"]
-    labels = tgt["labels"]
-    assert len(boxes) == len(labels), "boxes and labels must have same length"
-    draw = ImageDraw.Draw(image_pil)
-    mask = Image.new("L", image_pil.size, 0)
-    mask_draw = ImageDraw.Draw(mask)
-    # draw boxes and masks
-    for box, label in zip(boxes, labels):
-        # from 0..1 to 0..W, 0..H
-        box = box * torch.Tensor([W, H, W, H])
-        # from xywh to xyxy
-        box[:2] -= box[2:] / 2
-        box[2:] += box[:2]
-        # random color
-        color = tuple(np.random.randint(0, 255, size=3).tolist())
-        # draw
-        x0, y0, x1, y1 = box
-        x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
-        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
-        # draw.text((x0, y0), str(label), fill=color)
-        font = ImageFont.load_default()
-        if hasattr(font, "getbbox"):
-            bbox = draw.textbbox((x0, y0), str(label), font)
-        else:
-            w, h = draw.textsize(str(label), font)
-            bbox = (x0, y0, w + x0, y0 + h)
-        # bbox = draw.textbbox((x0, y0), str(label))
-        draw.rectangle(bbox, fill=color)
-        draw.text((x0, y0), str(label), fill="white")
-        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
-    return image_pil, mask
 def load_image(image_path):
     # load image
@@ -143,6 +110,48 @@ def show_box(box, ax, label):
     w, h = box[2] - box[0], box[3] - box[1]
     ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
     ax.text(x0, y0, label)
 if __name__ == "__main__":
@@ -153,36 +162,38 @@ if __name__ == "__main__":
         "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
     )
     parser.add_argument(
-        "--sam_checkpoint", type=str, required=False, help="path to checkpoint file"
     )
-    parser.add_argument("--task_type", type=str, required=True, help="select task")
     parser.add_argument("--input_image", type=str, required=True, help="path to image file")
-    parser.add_argument("--text_prompt", type=str, required=True, help="text prompt")
-    parser.add_argument("--inpaint_prompt", type=str, required=False, help="inpaint prompt")
     parser.add_argument(
         "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
     )
     parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
     parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
     parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
     args = parser.parse_args()
     # cfg
     config_file = args.config  # change the path of the model config file
     grounded_checkpoint = args.grounded_checkpoint  # change the path of the model
     sam_checkpoint = args.sam_checkpoint
-    task_type = args.task_type
     image_path = args.input_image
-    text_prompt = args.text_prompt
-    inpaint_prompt = args.inpaint_prompt
     output_dir = args.output_dir
     box_threshold = args.box_threshold
     text_threshold = args.box_threshold
     device = args.device
-    assert text_prompt, 'text_prompt is not found!'
     # make dir
     os.makedirs(output_dir, exist_ok=True)
     # load image
@@ -192,87 +203,79 @@ if __name__ == "__main__":
     # visualize raw image
     image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
     # run grounding dino model
     boxes_filt, pred_phrases = get_grounding_output(
-        model, image, text_prompt, box_threshold, text_threshold, device=device
     )
     size = image_pil.size
-    if task_type == 'seg' or task_type == 'inpainting':
-        # initialize SAM
-        predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
-        image = cv2.imread(image_path)
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-        predictor.set_image(image)
-        H, W = size[1], size[0]
-        for i in range(boxes_filt.size(0)):
-            boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
-            boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
-            boxes_filt[i][2:] += boxes_filt[i][:2]
-        boxes_filt = boxes_filt.cpu()
-        transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
-        masks, _, _ = predictor.predict_torch(
-            point_coords = None,
-            point_labels = None,
-            boxes = transformed_boxes,
-            multimask_output = False,
-        )
-        # masks: [1, 1, 512, 512]
-    if task_type == 'det':
-        assert grounded_checkpoint, 'grounded_checkpoint is not found!'
-        pred_dict = {
-            "boxes": boxes_filt,
-            "size": [size[1], size[0]],  # H,W
-            "labels": pred_phrases,
-        }
-        # import ipdb; ipdb.set_trace()
-        image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
-        image_with_box.save(os.path.join(output_dir, "grounding_dino_output.jpg"))
-    elif task_type == 'seg':
-        assert sam_checkpoint, 'sam_checkpoint is not found!'
-        # draw output image
-        plt.figure(figsize=(10, 10))
-        plt.imshow(image)
-        for mask in masks:
-            show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
-        for box, label in zip(boxes_filt, pred_phrases):
-            show_box(box.numpy(), plt.gca(), label)
-        plt.axis('off')
-        plt.savefig(os.path.join(output_dir, "grounded_sam_output.jpg"), bbox_inches="tight")
-    elif task_type == 'inpainting':
-        assert inpaint_prompt, 'inpaint_prompt is not found!'
-        # inpainting pipeline
-        mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
-        mask_pil = Image.fromarray(mask)
-        image_pil = Image.fromarray(image)
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-        "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
-        )
-        pipe = pipe.to("cuda")
-        # prompt = "A sofa, high quality, detailed"
-        image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
-        image.save(os.path.join(output_dir, "grounded_sam_inpainting_output.jpg"))
-        # draw output image
-        # plt.figure(figsize=(10, 10))
-        # plt.imshow(image)
-        # for mask in masks:
-        #     show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
-        # for box, label in zip(boxes_filt, pred_phrases):
-        #     show_box(box.numpy(), plt.gca(), label)
-        # plt.axis('off')
-        # plt.savefig(os.path.join(output_dir, "grounded_sam_output.jpg"), bbox_inches="tight")
     else:
-        print("task_type:{} error!".format(task_type))

 import argparse
 import os
+from warnings import warn
 import numpy as np
 import torch
 from io import BytesIO
 from diffusers import StableDiffusionInpaintPipeline
+# whisper
+import whisper
+# ChatGPT
+import openai
 def load_image(image_path):
     # load image
     w, h = box[2] - box[0], box[3] - box[1]
     ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
     ax.text(x0, y0, label)
+def speech_recognition(speech_file, model):
+    # whisper
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(speech_file)
+    audio = whisper.pad_or_trim(audio)
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    speech_language = max(probs, key=probs.get)
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    # print the recognized text
+    speech_text = result.text
+    return speech_text, speech_language
+def filter_prompts_with_chatgpt(caption, max_tokens=100, model="gpt-3.5-turbo"):
+    prompt = [
+        {
+            'role': 'system',
+            'content': f"Extract the main object to be replaced and marked it as 'main_object', " + \
+                       f"Extract the remaining part as 'other prompt' " + \
+                       f"Return (main_object, other prompt)" + \
+                       f'Given caption: {caption}.'
+        }
+    ]
+    response = openai.ChatCompletion.create(model=model, messages=prompt, temperature=0.6, max_tokens=max_tokens)
+    reply = response['choices'][0]['message']['content']
+    try:
+        det_prompt, inpaint_prompt = reply.split('\n')[0].split(':')[-1].strip(), reply.split('\n')[1].split(':')[-1].strip()
+    except:
+        warn(f"Failed to extract tags from caption") # use caption as det_prompt, inpaint_prompt
+        det_prompt, inpaint_prompt = caption, caption
+    return det_prompt, inpaint_prompt
 if __name__ == "__main__":
         "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
     )
     parser.add_argument(
+        "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
     )
     parser.add_argument("--input_image", type=str, required=True, help="path to image file")
     parser.add_argument(
         "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
     )
+    parser.add_argument("--det_speech_file", type=str, help="grounding speech file")
+    parser.add_argument("--inpaint_speech_file", type=str, help="inpaint speech file")
+    parser.add_argument("--prompt_speech_file", type=str, help="prompt speech file, no need to provide det_speech_file")
+    parser.add_argument("--enable_chatgpt", action="store_true", help="enable chatgpt")
+    parser.add_argument("--openai_key", type=str, help="key for chatgpt")
+    parser.add_argument("--openai_proxy", default=None, type=str, help="proxy for chatgpt")
+    parser.add_argument("--whisper_model", type=str, default="small", help="whisper model version: tiny, base, small, medium, large")
     parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
     parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
+    parser.add_argument("--inpaint_mode", type=str, default="first", help="inpaint mode")
     parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
+    parser.add_argument("--prompt_extra", type=str, default=" high resolution, real scene", help="extra prompt for inpaint")
     args = parser.parse_args()
     # cfg
     config_file = args.config  # change the path of the model config file
     grounded_checkpoint = args.grounded_checkpoint  # change the path of the model
     sam_checkpoint = args.sam_checkpoint
     image_path = args.input_image
     output_dir = args.output_dir
     box_threshold = args.box_threshold
     text_threshold = args.box_threshold
+    inpaint_mode = args.inpaint_mode
     device = args.device
     # make dir
     os.makedirs(output_dir, exist_ok=True)
     # load image
     # visualize raw image
     image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
+    # recognize speech
+    whisper_model = whisper.load_model(args.whisper_model)
+    if args.enable_chatgpt:
+        openai.api_key = args.openai_key
+        if args.openai_proxy:
+            openai.proxy = {"http": args.openai_proxy, "https": args.openai_proxy}
+        speech_text, _ = speech_recognition(args.prompt_speech_file, whisper_model)
+        det_prompt, inpaint_prompt = filter_prompts_with_chatgpt(speech_text)
+        inpaint_prompt += args.prompt_extra
+        print(f"det_prompt: {det_prompt}, inpaint_prompt: {inpaint_prompt}")
+    else:
+        det_prompt, det_speech_language = speech_recognition(args.det_speech_file, whisper_model)
+        inpaint_prompt, inpaint_speech_language = speech_recognition(args.inpaint_speech_file, whisper_model)
+        print(f"det_prompt: {det_prompt}, using language: {det_speech_language}")
+        print(f"inpaint_prompt: {inpaint_prompt}, using language: {inpaint_speech_language}")
     # run grounding dino model
     boxes_filt, pred_phrases = get_grounding_output(
+        model, image, det_prompt, box_threshold, text_threshold, device=device
     )
+    # initialize SAM
+    predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
+    image = cv2.imread(image_path)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    predictor.set_image(image)
     size = image_pil.size
+    H, W = size[1], size[0]
+    for i in range(boxes_filt.size(0)):
+        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+        boxes_filt[i][2:] += boxes_filt[i][:2]
+    boxes_filt = boxes_filt.cpu()
+    transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
+    masks, _, _ = predictor.predict_torch(
+        point_coords = None,
+        point_labels = None,
+        boxes = transformed_boxes,
+        multimask_output = False,
+    )
+    # masks: [1, 1, 512, 512]
+    # inpainting pipeline
+    if inpaint_mode == 'merge':
+        masks = torch.sum(masks, dim=0).unsqueeze(0)
+        masks = torch.where(masks > 0, True, False)
     else:
+        mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
+    mask_pil = Image.fromarray(mask)
+    image_pil = Image.fromarray(image)
+    pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
+    )
+    pipe = pipe.to("cuda")
+    # prompt = "A sofa, high quality, detailed"
+    image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
+    image.save(os.path.join(output_dir, "grounded_sam_inpainting_output.jpg"))
+    # draw output image
+    # plt.figure(figsize=(10, 10))
+    # plt.imshow(image)
+    # for mask in masks:
+    #     show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
+    # for box, label in zip(boxes_filt, pred_phrases):
+    #     show_box(box.numpy(), plt.gca(), label)
+    # plt.axis('off')
+    # plt.savefig(os.path.join(output_dir, "grounded_sam_output.jpg"), bbox_inches="tight")