update sam version
- app_w_sam.py +139 -0
- models/__pycache__/image_text_transformation.cpython-38.pyc +0 -0
- models/blip2_model.py +8 -5
- models/image_text_transformation.py +2 -1
- models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc +0 -0
- models/segment_models/semantic_segment_anything_model.py +8 -5
    	
        app_w_sam.py
    ADDED
    
@@ -0,0 +1,139 @@
+import gradio as gr
+import cv2
+import numpy as np
+from PIL import Image
+import base64
+from io import BytesIO
+from models.image_text_transformation import ImageTextTransformation
+import argparse
+import torch
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
+parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
+parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
+parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=True, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
+parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, <6G GPU is not recommended>')
+
+args = parser.parse_args()
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# device = "cpu"
+
+if device == "cuda":
+    args.image_caption_device = "cpu"
+    args.dense_caption_device = "cuda"
+    args.semantic_segment_device = "cuda"
+    args.contolnet_device = "cuda"
+else:
+    args.image_caption_device = "cpu"
+    args.dense_caption_device = "cpu"
+    args.semantic_segment_device = "cpu"
+    args.contolnet_device = "cpu"
+
+def pil_image_to_base64(image):
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return img_str
+
+def add_logo():
+    with open("examples/logo.png", "rb") as f:
+        logo_base64 = base64.b64encode(f.read()).decode()
+    return logo_base64
+
+def process_image(image_src, options=None, processor=None):
+    print(options)
+    if options is None:
+        options = []
+    processor.args.semantic_segment = "Semantic Segment" in options
+    image_generation_status = "Image Generation" in options
+    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)
+    if image_generation_status:
+        gen_image = processor.text_to_image(gen_text)
+        gen_image_str = pil_image_to_base64(gen_image)
+    # Combine the outputs into a single HTML output
+    custom_output = f'''
+    <h2>Image->Text:</h2>
+    <div style="display: flex; flex-wrap: wrap;">
+        <div style="flex: 1;">
+            <h3>Image Caption</h3>
+            <p>{image_caption}</p>
+        </div>
+        <div style="flex: 1;">
+            <h3>Dense Caption</h3>
+            <p>{dense_caption}</p>
+        </div>
+        <div style="flex: 1;">
+            <h3>Region Semantic</h3>
+            <p>{region_semantic}</p>
+        </div>
+    </div>
+    <div style="display: flex; flex-wrap: wrap;">
+        <div style="flex: 1;">
+            <h3>GPT4 Reasoning:</h3>
+            <p>{gen_text}</p>
+        </div>
+    </div>
+    '''
+    if image_generation_status:
+        custom_output += f'''
+        <h2>Text->Image:</h2>
+        <div style="display: flex; flex-wrap: wrap;">
+            <div style="flex: 1;">
+                <h3>Generated Image</h3>
+                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
+            </div>
+        </div>
+        '''
+    return custom_output
+
+processor = ImageTextTransformation(args)
+
+# Create Gradio input and output components
+image_input = gr.inputs.Image(type='filepath', label="Input Image")
+semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
+image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)
+
+
+extra_title = r'' + '\n' + \
+              r'[](https://huggingface.co/spaces/Awiny/Image2Paragraph?duplicate=true)' + '\n\n'
+
+
+
+logo_base64 = add_logo()
+# Create the title with the logo
+title_with_logo = \
+    f'<img src="data:image/jpeg;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'
+
+examples = [
+    ["examples/test_4.jpg"],
+]
+
+# Create Gradio interface
+interface = gr.Interface(
+    fn=lambda image, options: process_image(image, options, processor),
+    inputs=[image_input,
+            gr.CheckboxGroup(
+                label="Options",
+                choices=["Image Generation", "Semantic Segment"],
+            ),
+            ],
+    outputs=gr.outputs.HTML(),
+    title=title_with_logo,
+    examples=examples,
+    description=extra_title + """
+    Image.txt: this code supports image-to-text transformation; the generated text can then be used for retrieval, question answering, and other tasks in a zero-shot manner.
+    \n Github: https://github.com/showlab/Image2Paragraph
+    \n Twitter: https://twitter.com/awinyimgprocess/status/1646225454599372800?s=46&t=HvOe9T2n35iFuCHP5aIHpQ
+    \n Since GPU is expensive, this demo runs on CPU and does not include Semantic Segment Anything. Run the code locally with a GPU, or use the provided Google Colab, for faster speed.
+    \n The text-to-image model is ControlNet (very slow on CPU, ~2 min), which uses Canny edges as reference.
+    \n To speed up, images are generated at a small size (384); run the code locally for high-quality samples.
+    """
+)
+
+# Launch the interface
+interface.launch()
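Below is a minimal sketch (not part of the commit) of how process_image from app_w_sam.py above consumes the "Options" checkbox values and assembles its HTML output. It assumes process_image is in scope; _StubProcessor is a hypothetical stand-in for ImageTextTransformation so the flow can be exercised without loading any models.

# Hypothetical stand-in: implements only what process_image touches
# (args.semantic_segment and image_to_text's 4-tuple return contract).
from types import SimpleNamespace

class _StubProcessor:
    def __init__(self):
        self.args = SimpleNamespace(semantic_segment=False)

    def image_to_text(self, image_src):
        return "a dog on the grass", "a dense caption", "region semantics", "GPT reasoning text"

# "Semantic Segment" checked, "Image Generation" unchecked -> no text_to_image call.
html = process_image("examples/test_4.jpg", options=["Semantic Segment"], processor=_StubProcessor())
print(html[:120])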
    	
        models/__pycache__/image_text_transformation.cpython-38.pyc
    CHANGED
    
Binary files a/models/__pycache__/image_text_transformation.cpython-38.pyc and b/models/__pycache__/image_text_transformation.cpython-38.pyc differ
    	
        models/blip2_model.py
    CHANGED
    
@@ -1,6 +1,6 @@
 from PIL import Image
 import requests
-from transformers import Blip2Processor, Blip2ForConditionalGeneration
+from transformers import Blip2Processor, Blip2ForConditionalGeneration, BlipProcessor, BlipForConditionalGeneration
 import torch
 from utils.util import resize_long_edge
 
@@ -15,10 +15,13 @@ class ImageCaptioning:
             self.data_type = torch.float32
         else:
             self.data_type = torch.float16
-        processor = Blip2Processor.from_pretrained("pretrained_models/blip2-opt-2.7b")
-        model = Blip2ForConditionalGeneration.from_pretrained(
-            "pretrained_models/blip2-opt-2.7b", torch_dtype=self.data_type
-        )
+        # uncomment for load stronger captioner
+        # processor = Blip2Processor.from_pretrained("pretrained_models/blip2-opt-2.7b")
+        # model = Blip2ForConditionalGeneration.from_pretrained(
+        #     "pretrained_models/blip2-opt-2.7b", torch_dtype=self.data_type
+        # )
+        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
         model.to(self.device)
         return processor, model
 
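For reference, a minimal captioning sketch with the lighter BLIP checkpoint the commit switches to, using the standard transformers API; "example.jpg" is a placeholder path and the generation settings are illustrative.

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Same checkpoint as in initialize_model above; small enough to run on CPU.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

image = Image.open("example.jpg").convert("RGB")            # placeholder image path
inputs = processor(images=image, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(out[0], skip_special_tokens=True))   # e.g. "a dog sitting on the grass"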
    	
        models/image_text_transformation.py
    CHANGED
    
@@ -35,7 +35,8 @@ class ImageTextTransformation:
         self.gpt_model = ImageToText(openai_key)
         self.controlnet_model = TextToImage(device=self.args.contolnet_device)
         # time-conusimg on CPU, run on local
-        self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
+        if self.args.semantic_segment:
+            self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
         print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m')
 
 
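A short sketch of the effect of the new guard, assuming ImageTextTransformation only reads the flags shown and accepts the same args namespace that app_w_sam.py builds with argparse: with semantic_segment left False, RegionSemantic is never constructed, so its weights are not loaded.

from types import SimpleNamespace
from models.image_text_transformation import ImageTextTransformation

# Mirrors the flags parsed in app_w_sam.py; values chosen for a CPU-only run.
args = SimpleNamespace(
    gpt_version="gpt-3.5-turbo",
    image_caption=True,
    dense_caption=True,
    semantic_segment=False,          # new guard: RegionSemantic is skipped entirely
    image_caption_device="cpu",
    dense_caption_device="cpu",
    semantic_segment_device="cpu",
    contolnet_device="cpu",
)
processor = ImageTextTransformation(args)   # no RegionSemantic weights are downloaded or loaded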
    	
        models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc
    CHANGED
    
Binary files a/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc and b/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc differ
    	
        models/segment_models/semantic_segment_anything_model.py
    CHANGED
    
@@ -27,27 +27,30 @@ class SemanticSegment():
         self.init_clipseg()
 
     def init_clip(self):
-        model_name = "openai/clip-vit-large-patch14"
+        # model_name = "openai/clip-vit-large-patch14"
+        model_name = "openai/clip-vit-base-patch32"
         self.clip_processor = CLIPProcessor.from_pretrained(model_name)
         self.clip_model = CLIPModel.from_pretrained(model_name).to(self.device)
 
     def init_oneformer_ade20k(self):
-        model_name = "shi-labs/oneformer_ade20k_swin_large"
+        # model_name = "shi-labs/oneformer_ade20k_swin_large"
+        model_name = "shi-labs/oneformer_ade20k_swin_tiny"
        self.oneformer_ade20k_processor = OneFormerProcessor.from_pretrained(model_name)
        self.oneformer_ade20k_model = OneFormerForUniversalSegmentation.from_pretrained(model_name).to(self.device)
 
    def init_oneformer_coco(self):
-        model_name = "shi-labs/oneformer_coco_swin_large"
+        model_name = "shi-labs/oneformer_coco_swin_large"
        self.oneformer_coco_processor = OneFormerProcessor.from_pretrained(model_name)
        self.oneformer_coco_model = OneFormerForUniversalSegmentation.from_pretrained(model_name).to(self.device)
 
    def init_blip(self):
-        model_name = "Salesforce/blip-image-captioning-large"
+        model_name = "Salesforce/blip-image-captioning-base"
+        # model_name = "Salesforce/blip-image-captioning-large"
        self.blip_processor = BlipProcessor.from_pretrained(model_name)
        self.blip_model = BlipForConditionalGeneration.from_pretrained(model_name).to(self.device)
 
    def init_clipseg(self):
-        model_name = "CIDAS/clipseg-rd64-refined"
+        model_name = "CIDAS/clipseg-rd64-refined"
        self.clipseg_processor = AutoProcessor.from_pretrained(model_name)
        self.clipseg_model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(self.device)
        self.clipseg_processor.image_processor.do_resize = False
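The swaps above trade some accuracy for memory and CPU latency. As a rough illustration of the size difference (parameter counts only; actual speedups depend on hardware), one can load the smaller CLIP checkpoint used after this commit and count its parameters:

from transformers import CLIPModel

# Load only the smaller checkpoint used after this commit and count its parameters.
base = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
n_params = sum(p.numel() for p in base.parameters())
print(f"clip-vit-base-patch32: {n_params / 1e6:.0f}M parameters")
# openai/clip-vit-large-patch14 is on the order of ~430M parameters, so it is
# several times heavier to download and run on CPU than the base model loaded here.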
