Yw22 committed on
Commit b2682d8
1 Parent(s): 6444ed9

brushedit demo

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +30 -20
  2. README.md +3 -3
  3. app/down_load_brushedit.py +13 -0
  4. app/down_load_brushedit.sh +3 -0
  5. app/gpt4_o/brushedit_app.py +0 -914
  6. app/gpt4_o/instructions.py +11 -10
  7. app/gpt4_o/requirements.txt +0 -18
  8. app/llava/instructions.py +108 -0
  9. app/qwen2/instructions.py +103 -0
  10. app/{gpt4_o/run_app.sh → run_app.sh} +1 -1
  11. app/src/aspect_ratio_template.py +88 -0
  12. app/src/base_model_template.py +61 -0
  13. app/{gpt4_o → src}/brushedit_all_in_one_pipeline.py +6 -13
  14. app/src/brushedit_app.py +1690 -0
  15. app/{gpt4_o → src}/vlm_pipeline.py +118 -34
  16. app/src/vlm_template.py +120 -0
  17. app/utils/GroundingDINO_SwinT_OGC.py +43 -0
  18. assets/angel_christmas/angel_christmas.png +3 -0
  19. assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png +3 -0
  20. assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png +3 -0
  21. assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png +3 -0
  22. assets/angel_christmas/prompt.txt +3 -0
  23. assets/anime_flower/anime_flower.png +3 -0
  24. assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png +3 -0
  25. assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png +3 -0
  26. assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png +3 -0
  27. assets/anime_flower/prompt.txt +1 -0
  28. assets/brushedit_teaser.png +3 -0
  29. assets/chenduling/chengduling.jpg +3 -0
  30. assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png +3 -0
  31. assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png +3 -0
  32. assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png +3 -0
  33. assets/chenduling/prompt.txt +1 -0
  34. assets/chinese_girl/chinese_girl.png +3 -0
  35. assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png +3 -0
  36. assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png +3 -0
  37. assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png +3 -0
  38. assets/chinese_girl/prompt.txt +1 -0
  39. assets/demo_vis.png +3 -0
  40. assets/example.png +3 -0
  41. assets/frog/frog.jpeg +3 -0
  42. assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png +3 -0
  43. assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png +3 -0
  44. assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png +3 -0
  45. assets/frog/prompt.txt +1 -0
  46. assets/girl_on_sun/girl_on_sun.png +3 -0
  47. assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png +3 -0
  48. assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png +3 -0
  49. assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png +3 -0
  50. assets/girl_on_sun/prompt.txt +1 -0
.gitattributes CHANGED
@@ -40,23 +40,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.gif filter=lfs diff=lfs merge=lfs -text
  *.bmp filter=lfs diff=lfs merge=lfs -text
  *.tiff filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rm_fg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rm_fg/image_edit_82314e18-c64c-4003-9ef9-52cebf254b2f_2.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rm_fg/mask_82314e18-c64c-4003-9ef9-52cebf254b2f.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rm_fg/masked_image_82314e18-c64c-4003-9ef9-52cebf254b2f.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_bg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_fg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_fg/image_edit_5cab3448-5a3a-459c-9144-35cca3d34273_0.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_fg/mask_5cab3448-5a3a-459c-9144-35cca3d34273.png filter=lfs diff=lfs merge=lfs -text
- assets/hedgehog_rp_fg/masked_image_5cab3448-5a3a-459c-9144-35cca3d34273.png filter=lfs diff=lfs merge=lfs -text
- assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png filter=lfs diff=lfs merge=lfs -text
- assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png filter=lfs diff=lfs merge=lfs -text
- assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png filter=lfs diff=lfs merge=lfs -text
- assets/mona_lisa/mona_lisa.png filter=lfs diff=lfs merge=lfs -text
- assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png filter=lfs diff=lfs merge=lfs -text
- assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png filter=lfs diff=lfs merge=lfs -text
- assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png filter=lfs diff=lfs merge=lfs -text
- assets/sunflower_girl/sunflower_girl.png filter=lfs diff=lfs merge=lfs -text
+ assets/angel_christmas/angel_christmas.png filter=lfs diff=lfs merge=lfs -text
+ assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png filter=lfs diff=lfs merge=lfs -text
+ assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png filter=lfs diff=lfs merge=lfs -text
+ assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png filter=lfs diff=lfs merge=lfs -text
+ assets/angel_christmas/prompt.txt filter=lfs diff=lfs merge=lfs -text
+ assets/pigeon_rm filter=lfs diff=lfs merge=lfs -text
+ assets/brushedit_teaser.png filter=lfs diff=lfs merge=lfs -text
+ assets/chenduling filter=lfs diff=lfs merge=lfs -text
+ assets/chinese_girl filter=lfs diff=lfs merge=lfs -text
+ assets/example.png filter=lfs diff=lfs merge=lfs -text
+ assets/frog filter=lfs diff=lfs merge=lfs -text
+ assets/hedgehog_rm_fg filter=lfs diff=lfs merge=lfs -text
+ assets/hedgehog_rp_fg filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_curl filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_cowboy_hat filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_crown filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_rm filter=lfs diff=lfs merge=lfs -text
+ assets/angel_christmas filter=lfs diff=lfs merge=lfs -text
+ assets/anime_flower filter=lfs diff=lfs merge=lfs -text
+ assets/logo_brushedit.png filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_devil_horn filter=lfs diff=lfs merge=lfs -text
+ assets/sunflower_girl filter=lfs diff=lfs merge=lfs -text
+ assets/upload.png filter=lfs diff=lfs merge=lfs -text
+ assets/demo_vis.png filter=lfs diff=lfs merge=lfs -text
+ assets/girl_on_sun filter=lfs diff=lfs merge=lfs -text
+ assets/hedgehog_rp_bg filter=lfs diff=lfs merge=lfs -text
+ assets/mona_lisa filter=lfs diff=lfs merge=lfs -text
+ assets/olsen filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_cat_ears filter=lfs diff=lfs merge=lfs -text
+ assets/spider_man_witch_hat filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
  colorTo: gray
  sdk: gradio
  sdk_version: 4.38.1
- app_file: app/gpt4_o/brushedit_app.py
+ app_file: app/src/brushedit_app.py
  pinned: false
- python_version: 3.1
- ---
+ python_version: 3.10
+ ---
app/down_load_brushedit.py ADDED
@@ -0,0 +1,13 @@
+ import os
+ from huggingface_hub import snapshot_download
+
+ # download hf models
+ BrushEdit_path = "models/"
+ if not os.path.exists(BrushEdit_path):
+     BrushEdit_path = snapshot_download(
+         repo_id="TencentARC/BrushEdit",
+         local_dir=BrushEdit_path,
+         token=os.getenv("HF_TOKEN"),
+     )
+
+ print("Downloaded BrushEdit to ", BrushEdit_path)
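For orientation (not part of this commit): a minimal sketch of how the snapshot fetched above is typically consumed, mirroring the pipeline setup in the deleted app/gpt4_o/brushedit_app.py. The local subfolder names are assumptions that the TencentARC/BrushEdit snapshot keeps the layout the old hf_hub_download calls used, and StableDiffusionBrushNetPipeline/BrushNetModel come from the diffusers build this project ships with.

import torch
from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler

# Assumed paths: "models/" is the local_dir used by app/down_load_brushedit.py,
# and the subfolders follow the layout of the old hf_hub_download calls.
base_model_path = "models/base_model/realisticVisionV60B1_v51VAE"
brushnet_path = "models/brushnetX"

brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch.float16)
pipe = StableDiffusionBrushNetPipeline.from_pretrained(
    base_model_path, brushnet=brushnet, torch_dtype=torch.float16, low_cpu_mem_usage=False
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()  # same memory optimization the old app applied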
app/down_load_brushedit.sh ADDED
@@ -0,0 +1,3 @@
+ export PYTHONPATH=.:$PYTHONPATH
+
+ python app/down_load_brushedit.py
app/gpt4_o/brushedit_app.py DELETED
@@ -1,914 +0,0 @@
1
- ##!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- import os, random
4
- import numpy as np
5
- import torch
6
-
7
- import gradio as gr
8
- import spaces
9
-
10
- from PIL import Image
11
-
12
-
13
- from huggingface_hub import hf_hub_download
14
-
15
- from segment_anything import SamPredictor, build_sam, SamAutomaticMaskGenerator
16
- from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
17
- from scipy.ndimage import binary_dilation, binary_erosion
18
-
19
- from app.gpt4_o.vlm_pipeline import (
20
- vlm_response_editing_type,
21
- vlm_response_object_wait_for_edit,
22
- vlm_response_mask,
23
- vlm_response_prompt_after_apply_instruction
24
- )
25
- from app.gpt4_o.brushedit_all_in_one_pipeline import BrushEdit_Pipeline
26
- from app.utils.utils import load_grounding_dino_model
27
-
28
-
29
- #### Description ####
30
- head = r"""
31
- <div style="text-align: center;">
32
- <h1> BrushEdit: All-In-One Image Inpainting and Editing</h1>
33
- <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
34
- <a href='https://tencentarc.github.io/BrushNet/'><img src='https://img.shields.io/badge/Project_Page-BrushNet-green' alt='Project Page'></a>
35
- <a href='https://github.com/TencentARC/BrushNet/blob/main/InstructionGuidedEditing/CVPR2024workshop_technique_report.pdf'><img src='https://img.shields.io/badge/Paper-Arxiv-blue'></a>
36
- <a href='https://github.com/TencentARC/BrushNet'><img src='https://img.shields.io/badge/Code-Github-orange'></a>
37
-
38
- </div>
39
- </br>
40
- </div>
41
- """
42
- descriptions = r"""
43
- Official Gradio Demo for <a href='https://tencentarc.github.io/BrushNet/'><b>BrushEdit: All-In-One Image Inpainting and Editing</b></a><br>
44
- 🧙 BrushEdit enables precise, user-friendly instruction-based image editing via a inpainting model.<br>
45
- """
46
-
47
- instructions = r"""
48
- Currently, we support two modes: <b>fully automated command editing</b> and <b>interactive command editing</b>.
49
-
50
- 🛠️ <b>Fully automated instruction-based editing</b>:
51
- <ul>
52
- <li> ⭐️ <b>step1:</b> Upload or select one image from Example. </li>
53
- <li> ⭐️ <b>step2:</b> Input the instructions (supports addition, deletion, and modification), e.g. remove xxx .</li>
54
- <li> ⭐️ <b>step3:</b> Click <b>Run</b> button to automatic edit image.</li>
55
- </ul>
56
-
57
- 🛠️ <b>Interactive instruction-based editing</b>:
58
- <ul>
59
- <li> ⭐️ <b>step1:</b> Upload or select one image from Example. </li>
60
- <li> ⭐️ <b>step2:</b> Use a brush to outline the area you want to edit. </li>
61
- <li> ⭐️ <b>step3:</b> Input the instructions. </li>
62
- <li> ⭐️ <b>step4:</b> Click <b>Run</b> button to automatic edit image. </li>
63
- </ul>
64
-
65
- 💡 <b>Some tips</b>:
66
- <ul>
67
- <li> 🤠 After input the instructions, you can click the <b>Generate Mask</b> button. The mask generated by VLM will be displayed in the preview panel on the right side. </li>
68
- <li> 🤠 After generating the mask or when you use the brush to draw the mask, you can perform operations such as <b>randomization</b>, <b>dilation</b>, <b>erosion</b>, and <b>movement</b>. </li>
69
- <li> 🤠 After input the instructions, you can click the <b>Generate Target Prompt</b> button. The target prompt will be displayed in the text box, and you can modify it according to your ideas. </li>
70
- </ul>
71
-
72
- ☕️ Have fun!
73
- """
74
-
75
-
76
- # - - - - - examples - - - - - #
77
- EXAMPLES = [
78
- # [
79
- # {"background": Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA"),
80
- # "layers": [Image.new("RGBA", (Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").width, Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").height), (0, 0, 0, 0))],
81
- # "composite": Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA")},
82
- # # Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA"),
83
- # "add a shining necklace",
84
- # # [Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.jpg")],
85
- # # [Image.open("assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png")],
86
- # # [Image.open("assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png")]
87
- # ],
88
-
89
- [
90
- # load_image_from_url("https://github.com/liyaowei-stu/BrushEdit/blob/main/assets/mona_lisa/mona_lisa.png"),
91
- Image.open("assets/mona_lisa/mona_lisa.png").convert("RGBA"),
92
- "add a shining necklace",
93
- # [Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.jpg")],
94
- # [Image.open("assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png")],
95
- # [Image.open("assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png")]
96
- ],
97
-
98
-
99
-
100
-
101
- ]
102
-
103
-
104
- ## init VLM
105
- from openai import OpenAI
106
-
107
- OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
108
- os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
109
- vlm = OpenAI(base_url="http://v2.open.venus.oa.com/llmproxy")
110
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
111
-
112
-
113
-
114
- # download hf models
115
- base_model_path = hf_hub_download(
116
- repo_id="Yw22/BrushEdit",
117
- subfolder="base_model/realisticVisionV60B1_v51VAE",
118
- token=os.getenv("HF_TOKEN"),
119
- )
120
-
121
-
122
- brushnet_path = hf_hub_download(
123
- repo_id="Yw22/BrushEdit",
124
- subfolder="brushnetX",
125
- token=os.getenv("HF_TOKEN"),
126
- )
127
-
128
- sam_path = hf_hub_download(
129
- repo_id="Yw22/BrushEdit",
130
- subfolder="sam",
131
- filename="sam_vit_h_4b8939.pth",
132
- token=os.getenv("HF_TOKEN"),
133
- )
134
-
135
- groundingdino_path = hf_hub_download(
136
- repo_id="Yw22/BrushEdit",
137
- subfolder="grounding_dino",
138
- filename="groundingdino_swint_ogc.pth",
139
- token=os.getenv("HF_TOKEN"),
140
- )
141
-
142
-
143
- # input brushnetX ckpt path
144
- brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch.float16)
145
- pipe = StableDiffusionBrushNetPipeline.from_pretrained(
146
- base_model_path, brushnet=brushnet, torch_dtype=torch.float16, low_cpu_mem_usage=False
147
- )
148
- # speed up diffusion process with faster scheduler and memory optimization
149
- pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
150
- # remove following line if xformers is not installed or when using Torch 2.0.
151
- # pipe.enable_xformers_memory_efficient_attention()
152
- pipe.enable_model_cpu_offload()
153
-
154
-
155
- ## init SAM
156
- sam = build_sam(checkpoint=sam_path)
157
- sam.to(device=device)
158
- sam_predictor = SamPredictor(sam)
159
- sam_automask_generator = SamAutomaticMaskGenerator(sam)
160
-
161
- ## init groundingdino_model
162
- config_file = 'third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
163
- groundingdino_model = load_grounding_dino_model(config_file, groundingdino_path, device=device)
164
-
165
- ## Ordinary function
166
- def crop_and_resize(image: Image.Image,
167
- target_width: int,
168
- target_height: int) -> Image.Image:
169
- """
170
- Crops and resizes an image while preserving the aspect ratio.
171
-
172
- Args:
173
- image (Image.Image): Input PIL image to be cropped and resized.
174
- target_width (int): Target width of the output image.
175
- target_height (int): Target height of the output image.
176
-
177
- Returns:
178
- Image.Image: Cropped and resized image.
179
- """
180
- # Original dimensions
181
- original_width, original_height = image.size
182
- original_aspect = original_width / original_height
183
- target_aspect = target_width / target_height
184
-
185
- # Calculate crop box to maintain aspect ratio
186
- if original_aspect > target_aspect:
187
- # Crop horizontally
188
- new_width = int(original_height * target_aspect)
189
- new_height = original_height
190
- left = (original_width - new_width) / 2
191
- top = 0
192
- right = left + new_width
193
- bottom = original_height
194
- else:
195
- # Crop vertically
196
- new_width = original_width
197
- new_height = int(original_width / target_aspect)
198
- left = 0
199
- top = (original_height - new_height) / 2
200
- right = original_width
201
- bottom = top + new_height
202
-
203
- # Crop and resize
204
- cropped_image = image.crop((left, top, right, bottom))
205
- resized_image = cropped_image.resize((target_width, target_height), Image.NEAREST)
206
-
207
- return resized_image
208
-
209
-
210
- def move_mask_func(mask, direction, units):
211
- binary_mask = mask.squeeze()>0
212
- rows, cols = binary_mask.shape
213
-
214
- moved_mask = np.zeros_like(binary_mask, dtype=bool)
215
-
216
- if direction == 'down':
217
- # move down
218
- moved_mask[max(0, units):, :] = binary_mask[:rows - units, :]
219
-
220
- elif direction == 'up':
221
- # move up
222
- moved_mask[:rows - units, :] = binary_mask[units:, :]
223
-
224
- elif direction == 'right':
225
- # move left
226
- moved_mask[:, max(0, units):] = binary_mask[:, :cols - units]
227
-
228
- elif direction == 'left':
229
- # move right
230
- moved_mask[:, :cols - units] = binary_mask[:, units:]
231
-
232
- return moved_mask
233
-
234
-
235
- def random_mask_func(mask, dilation_type='square'):
236
- # Randomly select the size of dilation
237
- dilation_size = np.random.randint(20, 40) # Randomly select the size of dilation
238
- binary_mask = mask.squeeze()>0
239
-
240
- if dilation_type == 'square_dilation':
241
- structure = np.ones((dilation_size, dilation_size), dtype=bool)
242
- dilated_mask = binary_dilation(binary_mask, structure=structure)
243
- elif dilation_type == 'square_erosion':
244
- structure = np.ones((dilation_size, dilation_size), dtype=bool)
245
- dilated_mask = binary_erosion(binary_mask, structure=structure)
246
- elif dilation_type == 'bounding_box':
247
- # find the most left top and left bottom point
248
- rows, cols = np.where(binary_mask)
249
- if len(rows) == 0 or len(cols) == 0:
250
- return mask # return original mask if no valid points
251
-
252
- min_row = np.min(rows)
253
- max_row = np.max(rows)
254
- min_col = np.min(cols)
255
- max_col = np.max(cols)
256
-
257
- # create a bounding box
258
- dilated_mask = np.zeros_like(binary_mask, dtype=bool)
259
- dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True
260
-
261
- elif dilation_type == 'bounding_ellipse':
262
- # find the most left top and left bottom point
263
- rows, cols = np.where(binary_mask)
264
- if len(rows) == 0 or len(cols) == 0:
265
- return mask # return original mask if no valid points
266
-
267
- min_row = np.min(rows)
268
- max_row = np.max(rows)
269
- min_col = np.min(cols)
270
- max_col = np.max(cols)
271
-
272
- # calculate the center and axis length of the ellipse
273
- center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
274
- a = (max_col - min_col) // 2 # half long axis
275
- b = (max_row - min_row) // 2 # half short axis
276
-
277
- # create a bounding ellipse
278
- y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
279
- ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
280
- dilated_mask = np.zeros_like(binary_mask, dtype=bool)
281
- dilated_mask[ellipse_mask] = True
282
- else:
283
- raise ValueError("dilation_type must be 'square' or 'ellipse'")
284
-
285
- # use binary dilation
286
- dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
287
- return dilated_mask
288
-
289
-
290
- ## Gradio component function
291
- @spaces.GPU(duration=180)
292
- def process(input_image,
293
- original_image,
294
- original_mask,
295
- prompt,
296
- negative_prompt,
297
- control_strength,
298
- seed,
299
- randomize_seed,
300
- guidance_scale,
301
- num_inference_steps,
302
- num_samples,
303
- blending,
304
- category,
305
- target_prompt,
306
- resize_and_crop):
307
-
308
- import ipdb; ipdb.set_trace()
309
- if original_image is None:
310
- raise gr.Error('Please upload the input image')
311
- if prompt is None:
312
- raise gr.Error("Please input your instructions, e.g., remove the xxx")
313
-
314
-
315
- alpha_mask = input_image["layers"][0].split()[3]
316
- input_mask = np.asarray(alpha_mask)
317
- if resize_and_crop:
318
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
319
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
320
- original_image = np.array(original_image)
321
- input_mask = np.array(input_mask)
322
-
323
- if input_mask.max() == 0:
324
- original_mask = original_mask
325
- else:
326
- original_mask = input_mask[:,:,None]
327
-
328
- # load example image
329
- # if isinstance(original_image, str):
330
- # # image_name = image_examples[original_image][0]
331
- # # original_image = cv2.imread(image_name)
332
- # # original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
333
- # original_image = input_image
334
- # num_samples = 1
335
- # blending = True
336
-
337
- if category is not None:
338
- pass
339
- else:
340
- category = vlm_response_editing_type(vlm, original_image, prompt)
341
-
342
-
343
- if original_mask is not None:
344
- original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
345
- else:
346
- object_wait_for_edit = vlm_response_object_wait_for_edit(vlm,
347
- category,
348
- prompt)
349
- original_mask = vlm_response_mask(vlm,
350
- category,
351
- original_image,
352
- prompt,
353
- object_wait_for_edit,
354
- sam,
355
- sam_predictor,
356
- sam_automask_generator,
357
- groundingdino_model,
358
- )[:,:,None]
359
-
360
-
361
- if len(target_prompt) <= 1:
362
- prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(vlm,
363
- original_image,
364
- prompt)
365
- else:
366
- prompt_after_apply_instruction = target_prompt
367
-
368
- generator = torch.Generator("cuda").manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)
369
-
370
-
371
-
372
- image, mask_image = BrushEdit_Pipeline(pipe,
373
- prompt_after_apply_instruction,
374
- original_mask,
375
- original_image,
376
- generator,
377
- num_inference_steps,
378
- guidance_scale,
379
- control_strength,
380
- negative_prompt,
381
- num_samples,
382
- blending)
383
-
384
- masked_image = original_image * (1 - (original_mask>0))
385
- masked_image = masked_image.astype(np.uint8)
386
- masked_image = Image.fromarray(masked_image)
387
-
388
- import uuid
389
- uuid = str(uuid.uuid4())
390
- image[0].save(f"outputs/image_edit_{uuid}_0.png")
391
- image[1].save(f"outputs/image_edit_{uuid}_1.png")
392
- image[2].save(f"outputs/image_edit_{uuid}_2.png")
393
- image[3].save(f"outputs/image_edit_{uuid}_3.png")
394
- mask_image.save(f"outputs/mask_{uuid}.png")
395
- masked_image.save(f"outputs/masked_image_{uuid}.png")
396
- return image, [mask_image], [masked_image], ''
397
-
398
-
399
- def generate_target_prompt(input_image,
400
- original_image,
401
- prompt):
402
- # load example image
403
- if isinstance(original_image, str):
404
- original_image = input_image
405
-
406
- prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(vlm,
407
- original_image,
408
- prompt)
409
- return prompt_after_apply_instruction
410
-
411
-
412
- def process_mask(input_image,
413
- original_image,
414
- prompt,
415
- resize_and_crop):
416
- if original_image is None:
417
- raise gr.Error('Please upload the input image')
418
- if prompt is None:
419
- raise gr.Error("Please input your instructions, e.g., remove the xxx")
420
-
421
- ## load mask
422
- alpha_mask = input_image["layers"][0].split()[3]
423
- input_mask = np.array(alpha_mask)
424
-
425
- # load example image
426
- if isinstance(original_image, str):
427
- original_image = input_image["background"]
428
-
429
- if resize_and_crop:
430
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
431
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
432
- original_image = np.array(original_image)
433
- input_mask = np.array(input_mask)
434
-
435
-
436
- if input_mask.max() == 0:
437
- category = vlm_response_editing_type(vlm, original_image, prompt)
438
-
439
- object_wait_for_edit = vlm_response_object_wait_for_edit(vlm,
440
- category,
441
- prompt)
442
- # original mask: h,w,1 [0, 255]
443
- original_mask = vlm_response_mask(
444
- vlm,
445
- category,
446
- original_image,
447
- prompt,
448
- object_wait_for_edit,
449
- sam,
450
- sam_predictor,
451
- sam_automask_generator,
452
- groundingdino_model,
453
- )[:,:,None]
454
- else:
455
- original_mask = input_mask[:,:,None]
456
- category = None
457
-
458
-
459
- mask_image = Image.fromarray(original_mask.squeeze().astype(np.uint8)).convert("RGB")
460
-
461
- masked_image = original_image * (1 - (original_mask>0))
462
- masked_image = masked_image.astype(np.uint8)
463
- masked_image = Image.fromarray(masked_image)
464
-
465
- ## not work for image editor
466
- # background = input_image["background"]
467
- # mask_array = original_mask.squeeze()
468
- # layer_rgba = np.array(input_image['layers'][0])
469
- # layer_rgba[mask_array > 0] = [0, 0, 0, 255]
470
- # layer_rgba = Image.fromarray(layer_rgba, 'RGBA')
471
- # black_image = Image.new("RGBA", layer_rgba.size, (0, 0, 0, 255))
472
- # composite = Image.composite(black_image, background, layer_rgba)
473
- # output_base = {"layers": [layer_rgba], "background": background, "composite": composite}
474
-
475
-
476
- return [masked_image], [mask_image], original_mask.astype(np.uint8), category
477
-
478
-
479
- def process_random_mask(input_image, original_image, original_mask, resize_and_crop):
480
-
481
- alpha_mask = input_image["layers"][0].split()[3]
482
- input_mask = np.asarray(alpha_mask)
483
- if resize_and_crop:
484
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
485
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
486
- original_image = np.array(original_image)
487
- input_mask = np.array(input_mask)
488
-
489
-
490
- if input_mask.max() == 0:
491
- if original_mask is None:
492
- raise gr.Error('Please generate mask first')
493
- original_mask = original_mask
494
- else:
495
- original_mask = input_mask[:,:,None]
496
-
497
-
498
- dilation_type = np.random.choice(['bounding_box', 'bounding_ellipse'])
499
- random_mask = random_mask_func(original_mask, dilation_type).squeeze()
500
-
501
- mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
502
-
503
- masked_image = original_image * (1 - (random_mask[:,:,None]>0))
504
- masked_image = masked_image.astype(original_image.dtype)
505
- masked_image = Image.fromarray(masked_image)
506
-
507
-
508
- return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
509
-
510
-
511
- def process_dilation_mask(input_image, original_image, original_mask, resize_and_crop):
512
-
513
- alpha_mask = input_image["layers"][0].split()[3]
514
- input_mask = np.asarray(alpha_mask)
515
- if resize_and_crop:
516
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
517
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
518
- original_image = np.array(original_image)
519
- input_mask = np.array(input_mask)
520
-
521
- if input_mask.max() == 0:
522
- if original_mask is None:
523
- raise gr.Error('Please generate mask first')
524
- original_mask = original_mask
525
- else:
526
- original_mask = input_mask[:,:,None]
527
-
528
- dilation_type = np.random.choice(['square_dilation'])
529
- random_mask = random_mask_func(original_mask, dilation_type).squeeze()
530
-
531
- mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
532
-
533
- masked_image = original_image * (1 - (random_mask[:,:,None]>0))
534
- masked_image = masked_image.astype(original_image.dtype)
535
- masked_image = Image.fromarray(masked_image)
536
-
537
- return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
538
-
539
-
540
- def process_erosion_mask(input_image, original_image, original_mask, resize_and_crop):
541
- alpha_mask = input_image["layers"][0].split()[3]
542
- input_mask = np.asarray(alpha_mask)
543
- if resize_and_crop:
544
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
545
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
546
- original_image = np.array(original_image)
547
- input_mask = np.array(input_mask)
548
-
549
- if input_mask.max() == 0:
550
- if original_mask is None:
551
- raise gr.Error('Please generate mask first')
552
- original_mask = original_mask
553
- else:
554
- original_mask = input_mask[:,:,None]
555
-
556
- dilation_type = np.random.choice(['square_erosion'])
557
- random_mask = random_mask_func(original_mask, dilation_type).squeeze()
558
-
559
- mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
560
-
561
- masked_image = original_image * (1 - (random_mask[:,:,None]>0))
562
- masked_image = masked_image.astype(original_image.dtype)
563
- masked_image = Image.fromarray(masked_image)
564
-
565
-
566
- return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
567
-
568
-
569
- def move_mask_left(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
570
-
571
- alpha_mask = input_image["layers"][0].split()[3]
572
- input_mask = np.asarray(alpha_mask)
573
- if resize_and_crop:
574
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
575
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
576
- original_image = np.array(original_image)
577
- input_mask = np.array(input_mask)
578
-
579
- if input_mask.max() == 0:
580
- if original_mask is None:
581
- raise gr.Error('Please generate mask first')
582
- original_mask = original_mask
583
- else:
584
- original_mask = input_mask[:,:,None]
585
-
586
- moved_mask = move_mask_func(original_mask, 'left', int(moving_pixels)).squeeze()
587
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
588
-
589
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
590
- masked_image = masked_image.astype(original_image.dtype)
591
- masked_image = Image.fromarray(masked_image)
592
-
593
- if moved_mask.max() <= 1:
594
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
595
- original_mask = moved_mask
596
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
597
-
598
-
599
- def move_mask_right(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
600
- alpha_mask = input_image["layers"][0].split()[3]
601
- input_mask = np.asarray(alpha_mask)
602
- if resize_and_crop:
603
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
604
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
605
- original_image = np.array(original_image)
606
- input_mask = np.array(input_mask)
607
-
608
- if input_mask.max() == 0:
609
- if original_mask is None:
610
- raise gr.Error('Please generate mask first')
611
- original_mask = original_mask
612
- else:
613
- original_mask = input_mask[:,:,None]
614
-
615
- moved_mask = move_mask_func(original_mask, 'right', int(moving_pixels)).squeeze()
616
-
617
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
618
-
619
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
620
- masked_image = masked_image.astype(original_image.dtype)
621
- masked_image = Image.fromarray(masked_image)
622
-
623
-
624
- if moved_mask.max() <= 1:
625
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
626
- original_mask = moved_mask
627
-
628
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
629
-
630
-
631
- def move_mask_up(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
632
- alpha_mask = input_image["layers"][0].split()[3]
633
- input_mask = np.asarray(alpha_mask)
634
- if resize_and_crop:
635
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
636
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
637
- original_image = np.array(original_image)
638
- input_mask = np.array(input_mask)
639
-
640
- if input_mask.max() == 0:
641
- if original_mask is None:
642
- raise gr.Error('Please generate mask first')
643
- original_mask = original_mask
644
- else:
645
- original_mask = input_mask[:,:,None]
646
-
647
- moved_mask = move_mask_func(original_mask, 'up', int(moving_pixels)).squeeze()
648
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
649
-
650
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
651
- masked_image = masked_image.astype(original_image.dtype)
652
- masked_image = Image.fromarray(masked_image)
653
-
654
- if moved_mask.max() <= 1:
655
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
656
- original_mask = moved_mask
657
-
658
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
659
-
660
-
661
- def move_mask_down(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
662
- alpha_mask = input_image["layers"][0].split()[3]
663
- input_mask = np.asarray(alpha_mask)
664
- if resize_and_crop:
665
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
666
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
667
- original_image = np.array(original_image)
668
- input_mask = np.array(input_mask)
669
-
670
- if input_mask.max() == 0:
671
- if original_mask is None:
672
- raise gr.Error('Please generate mask first')
673
- original_mask = original_mask
674
- else:
675
- original_mask = input_mask[:,:,None]
676
-
677
- moved_mask = move_mask_func(original_mask, 'down', int(moving_pixels)).squeeze()
678
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
679
-
680
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
681
- masked_image = masked_image.astype(original_image.dtype)
682
- masked_image = Image.fromarray(masked_image)
683
-
684
- if moved_mask.max() <= 1:
685
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
686
- original_mask = moved_mask
687
-
688
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
689
-
690
-
691
- def store_img(base):
692
- import ipdb; ipdb.set_trace()
693
- image_pil = base["background"].convert("RGB")
694
- original_image = np.array(image_pil)
695
- # import ipdb; ipdb.set_trace()
696
- if max(original_image.shape[0], original_image.shape[1]) * 1.0 / min(original_image.shape[0], original_image.shape[1])>2.0:
697
- raise gr.Error('image aspect ratio cannot be larger than 2.0')
698
- return base, original_image, None, "", None, None, None, None, None
699
-
700
-
701
- def reset_func(input_image, original_image, original_mask, prompt, target_prompt):
702
- input_image = None
703
- original_image = None
704
- original_mask = None
705
- prompt = ''
706
- mask_gallery = []
707
- masked_gallery = []
708
- result_gallery = []
709
- target_prompt = ''
710
- return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt
711
-
712
-
713
- block = gr.Blocks(
714
- theme=gr.themes.Soft(
715
- radius_size=gr.themes.sizes.radius_none,
716
- text_size=gr.themes.sizes.text_md
717
- )
718
- ).queue()
719
- with block as demo:
720
- with gr.Row():
721
- with gr.Column():
722
- gr.HTML(head)
723
-
724
- gr.Markdown(descriptions)
725
-
726
- with gr.Accordion(label="🧭 Instructions:", open=True, elem_id="accordion"):
727
- with gr.Row(equal_height=True):
728
- gr.Markdown(instructions)
729
-
730
- original_image = gr.State(value=None)
731
- original_mask = gr.State(value=None)
732
- category = gr.State(value=None)
733
-
734
- with gr.Row():
735
- with gr.Column():
736
- with gr.Row():
737
- input_image = gr.ImageEditor(
738
- label="Input Image",
739
- type="pil",
740
- brush=gr.Brush(colors=["#000000"], default_size = 30, color_mode="fixed"),
741
- layers = False,
742
- interactive=True,
743
- height=800,
744
- # transforms=("crop"),
745
- # crop_size=(640, 640),
746
- )
747
-
748
- prompt = gr.Textbox(label="Prompt", placeholder="Please input your instruction.",value='',lines=1)
749
-
750
- with gr.Row():
751
- mask_button = gr.Button("Generate Mask")
752
- random_mask_button = gr.Button("Random Generated Mask")
753
- with gr.Row():
754
- dilation_mask_button = gr.Button("Dilation Generated Mask")
755
- erosion_mask_button = gr.Button("Erosion Generated Mask")
756
-
757
- with gr.Row():
758
- generate_target_prompt_button = gr.Button("Generate Target Prompt")
759
- run_button = gr.Button("Run")
760
-
761
-
762
- target_prompt = gr.Text(
763
- label="Target prompt",
764
- max_lines=5,
765
- placeholder="VLM-generated target prompt, you can first generate if and then modify it (optional)",
766
- value='',
767
- lines=2
768
- )
769
-
770
- resize_and_crop = gr.Checkbox(label="Resize and Crop (640 x 640)", value=False)
771
-
772
- with gr.Accordion("More input params (highly-recommended)", open=False, elem_id="accordion1"):
773
- negative_prompt = gr.Text(
774
- label="Negative Prompt",
775
- max_lines=5,
776
- placeholder="Please input your negative prompt",
777
- value='ugly, low quality',lines=1
778
- )
779
-
780
- control_strength = gr.Slider(
781
- label="Control Strength: ", show_label=True, minimum=0, maximum=1.1, value=1, step=0.01
782
- )
783
- with gr.Group():
784
- seed = gr.Slider(
785
- label="Seed: ", minimum=0, maximum=2147483647, step=1, value=648464818
786
- )
787
- randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
788
-
789
- blending = gr.Checkbox(label="Blending mode", value=True)
790
-
791
-
792
- num_samples = gr.Slider(
793
- label="Num samples", minimum=0, maximum=4, step=1, value=4
794
- )
795
-
796
- with gr.Group():
797
- with gr.Row():
798
- guidance_scale = gr.Slider(
799
- label="Guidance scale",
800
- minimum=1,
801
- maximum=12,
802
- step=0.1,
803
- value=7.5,
804
- )
805
- num_inference_steps = gr.Slider(
806
- label="Number of inference steps",
807
- minimum=1,
808
- maximum=50,
809
- step=1,
810
- value=50,
811
- )
812
-
813
-
814
- with gr.Column():
815
- with gr.Row():
816
- with gr.Tabs(elem_classes=["feedback"]):
817
- with gr.TabItem("Mask"):
818
- mask_gallery = gr.Gallery(label='Mask', show_label=False, elem_id="gallery", preview=True, height=360)
819
- with gr.Tabs(elem_classes=["feedback"]):
820
- with gr.TabItem("Masked Image"):
821
- masked_gallery = gr.Gallery(label='Masked Image', show_label=False, elem_id="gallery", preview=True, height=360)
822
-
823
- moving_pixels = gr.Slider(
824
- label="Moving pixels:", show_label=True, minimum=0, maximum=50, value=4, step=1
825
- )
826
- with gr.Row():
827
- move_left_button = gr.Button("Move Left")
828
- move_right_button = gr.Button("Move Right")
829
- with gr.Row():
830
- move_up_button = gr.Button("Move Up")
831
- move_down_button = gr.Button("Move Down")
832
-
833
- with gr.Tabs(elem_classes=["feedback"]):
834
- with gr.TabItem("Outputs"):
835
- result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery", preview=True, height=360)
836
-
837
- reset_button = gr.Button("Reset")
838
-
839
-
840
- with gr.Row():
841
- # # example = gr.Examples(
842
- # # label="Quick Example",
843
- # # examples=EXAMPLES,
844
- # # inputs=[prompt, seed, result_gallery, mask_gallery, masked_gallery],
845
- # # examples_per_page=10,
846
- # # cache_examples=False,
847
- # # )
848
- example = gr.Examples(
849
- label="Quick Example",
850
- examples=EXAMPLES,
851
- inputs=[input_image, prompt],
852
- examples_per_page=10,
853
- cache_examples=False,
854
- )
855
- # def process_example(prompt, seed, eg_output):
856
- # import ipdb; ipdb.set_trace()
857
- # eg_output_path = os.path.join("assets/", eg_output)
858
- # return prompt, seed, [Image.open(eg_output_path)]
859
- # example = gr.Examples(
860
- # label="Quick Example",
861
- # examples=EXAMPLES,
862
- # inputs=[prompt, seed, eg_output],
863
- # outputs=[prompt, seed, result_gallery],
864
- # fn=process_example,
865
- # examples_per_page=10,
866
- # run_on_click=True,
867
- # cache_examples=False,
868
- # )
869
-
870
- input_image.upload(
871
- store_img,
872
- [input_image],
873
- [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt]
874
- )
875
-
876
-
877
- ips=[input_image,
878
- original_image,
879
- original_mask,
880
- prompt,
881
- negative_prompt,
882
- control_strength,
883
- seed,
884
- randomize_seed,
885
- guidance_scale,
886
- num_inference_steps,
887
- num_samples,
888
- blending,
889
- category,
890
- target_prompt,
891
- resize_and_crop]
892
-
893
- ## run brushedit
894
- run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, target_prompt])
895
-
896
- ## mask func
897
- mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask, category])
898
- random_mask_button.click(fn=process_random_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
899
- dilation_mask_button.click(fn=process_dilation_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[ masked_gallery, mask_gallery, original_mask])
900
- erosion_mask_button.click(fn=process_erosion_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[ masked_gallery, mask_gallery, original_mask])
901
-
902
- ## move mask func
903
- move_left_button.click(fn=move_mask_left, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
904
- move_right_button.click(fn=move_mask_right, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
905
- move_up_button.click(fn=move_mask_up, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
906
- move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
907
-
908
- ## prompt func
909
- generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt])
910
-
911
- ## reset func
912
- reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt])
913
-
914
- demo.launch(server_name="0.0.0.0")
app/gpt4_o/instructions.py CHANGED
@@ -1,15 +1,16 @@
- def create_editing_category_messages(editing_prompt):
+ def create_editing_category_messages_gpt4o(editing_prompt):
  messages = [{
  "role": "system",
  "content": [
  {
  "type": "text",
- "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
- 1. Addition: Adding new objects within the images, e.g., add a bird to the image \n\
- 2. Remove: Removing objects, e.g., remove the mask \n\
- 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
- 4. Global: Edit the entire image, e.g., let's see it in winter \n\
- 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.",
+ "text": "I will give you an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
+ 1. Addition: Adding new objects within the images, e.g., add a bird \n\
+ 2. Remove: Removing objects, e.g., remove the mask \n\
+ 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
+ 4. Global: Edit the entire image, e.g., let's see it in winter \n\
+ 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc. \n\
+ Only output a single word, e.g., 'Addition'.",
  },]
  },
  {
@@ -24,7 +25,7 @@ def create_editing_category_messages(editing_prompt):
  return messages


- def create_ori_object_messages(editing_prompt):
+ def create_ori_object_messages_gpt4o(editing_prompt):

  messages = [
  {
@@ -49,7 +50,7 @@ def create_ori_object_messages(editing_prompt):
  return messages


- def create_add_object_messages(editing_prompt, base64_image, height=640, width=640):
+ def create_add_object_messages_gpt4o(editing_prompt, base64_image, height=640, width=640):

  size_str = f"The image size is height {height}px and width {width}px. The top - left corner is coordinate [0 , 0]. The bottom - right corner is coordinnate [{height} , {width}]. "

@@ -77,7 +78,7 @@ def create_add_object_messages(editing_prompt, base64_image, height=640, width=6
  return messages


- def create_apply_editing_messages(editing_prompt, base64_image):
+ def create_apply_editing_messages_gpt4o(editing_prompt, base64_image):
  messages = [
  {
  "role": "system",
app/gpt4_o/requirements.txt DELETED
@@ -1,18 +0,0 @@
- torchvision
- transformers>=4.25.1
- ftfy
- tensorboard
- datasets
- Pillow==9.5.0
- opencv-python
- imgaug
- accelerate==0.20.3
- image-reward
- hpsv2
- torchmetrics
- open-clip-torch
- clip
- # gradio==4.44.1
- gradio==4.38.1
- segment_anything
- openai
app/llava/instructions.py ADDED
@@ -0,0 +1,108 @@
1
+ def create_editing_category_messages_llava(editing_prompt):
2
+ messages = [{
3
+ "role": "system",
4
+ "content": [
5
+ {
6
+ "type": "text",
7
+ "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
8
+ 1. Addition: Adding new objects within the images, e.g., add a bird \n\
9
+ 2. Remove: Removing objects, e.g., remove the mask \n\
10
+ 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
11
+ 4. Global: Edit the entire image, e.g., let's see it in winter \n\
12
+ 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc. \n\
13
+ Only output a single word, e.g., 'Addition'.",
14
+ },]
15
+ },
16
+ {
17
+ "role": "user",
18
+ "content": [
19
+ {
20
+ "type": "image"
21
+ },
22
+ {
23
+ "type": "text",
24
+ "text": editing_prompt
25
+ },
26
+ ]
27
+ }]
28
+ return messages
29
+
30
+
31
+ def create_ori_object_messages_llava(editing_prompt):
32
+
33
+ messages = [
34
+ {
35
+ "role": "system",
36
+ "content": [
37
+ {
38
+ "type": "text",
39
+ "text": "I will give you an editing instruction of the image. Please output the object needed to be edited. You only need to output the basic description of the object in no more than 5 words. The output should only contain one noun. \n \
40
+ For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else."
41
+ },]
42
+ },
43
+ {
44
+ "role": "user",
45
+ "content": [
46
+ {
47
+ "type": "image"
48
+ },
49
+ {
50
+ "type": "text",
51
+ "text": editing_prompt
52
+ }
53
+ ]
54
+ }
55
+ ]
56
+ return messages
57
+
58
+
59
+ def create_add_object_messages_llava(editing_prompt, height=640, width=640):
60
+
61
+ size_str = f"The image size is height {height}px and width {width}px. The top - left corner is coordinate [0 , 0]. The bottom - right corner is coordinnate [{height} , {width}]. "
62
+
63
+ messages = [
64
+ {
65
+ "role": "user",
66
+ "content": [
67
+ {
68
+ "type": "image"
69
+ },
70
+ {
71
+ "type": "text",
72
+ "text": "I need to add an object to the image following the instruction: " + editing_prompt + ". " + size_str + " \n \
73
+ Can you give me a possible bounding box of the location for the added object? Please output with the format of [top - left x coordinate , top - left y coordinate , box width , box height]. You should only output the bounding box position and nothing else. Please refer to the example below for the desired format.\n\
74
+ [Examples]\n \
75
+ [19, 101, 32, 153]\n \
76
+ [54, 12, 242, 96]"
77
+ },
78
+ ]
79
+ }
80
+ ]
81
+ return messages
82
+
83
+
84
+ def create_apply_editing_messages_llava(editing_prompt):
85
+ messages = [
86
+ {
87
+ "role": "system",
88
+ "content": [
89
+ {
90
+ "type": "text",
91
+ "text": "I will provide an image along with an editing instruction. Please describe the new content that should be present in the image after applying the instruction. \n \
92
+ For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'. The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example. Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct."
93
+ },]
94
+ },
95
+ {
96
+ "role": "user",
97
+ "content": [
98
+ {
99
+ "type": "image"
100
+ },
101
+ {
102
+ "type": "text",
103
+ "text": editing_prompt
104
+ },
105
+ ]
106
+ },
107
+ ]
108
+ return messages
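A hedged sketch (not part of the commit) of how these LLaVA message builders are typically passed through a transformers chat template. The checkpoint id is an assumption and may differ from whatever app/src/vlm_template.py actually loads, and processor.apply_chat_template assumes a recent transformers release whose LLaVA processors ship a chat template.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
from app.llava.instructions import create_add_object_messages_llava

model_id = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

image = Image.open("assets/frog/frog.jpeg")
messages = create_add_object_messages_llava("add a crown on the frog", height=640, width=640)
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=32)
print(processor.decode(output[0], skip_special_tokens=True))  # should end with a box like [54, 12, 242, 96]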
app/qwen2/instructions.py ADDED
@@ -0,0 +1,103 @@
1
+ def create_editing_category_messages_qwen2(editing_prompt):
2
+ messages = [{
3
+ "role": "system",
4
+ "content": [
5
+ {
6
+ "type": "text",
7
+ "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
8
+ 1. Addition: Adding new objects within the images, e.g., add a bird to the image \n\
9
+ 2. Remove: Removing objects, e.g., remove the mask \n\
10
+ 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
11
+ 4. Global: Edit the entire image, e.g., let's see it in winter \n\
12
+ 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.",
13
+ },]
14
+ },
15
+ {
16
+ "role": "user",
17
+ "content": [
18
+ {
19
+ "type": "text",
20
+ "text": editing_prompt
21
+ },
22
+ ]
23
+ }]
24
+ return messages
25
+
26
+
27
+ def create_ori_object_messages_qwen2(editing_prompt):
28
+
29
+ messages = [
30
+ {
31
+ "role": "system",
32
+ "content": [
33
+ {
34
+ "type": "text",
35
+ "text": "I will give you an editing instruction of the image. Please output the object needed to be edited. You only need to output the basic description of the object in no more than 5 words. The output should only contain one noun. \n \
36
+ For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else."
37
+ },]
38
+ },
39
+ {
40
+ "role": "user",
41
+ "content": [
42
+ {
43
+ "type": "text",
44
+ "text": editing_prompt
45
+ }
46
+ ]
47
+ }
48
+ ]
49
+ return messages
50
+
51
+
52
+ def create_add_object_messages_qwen2(editing_prompt, base64_image, height=640, width=640):
53
+
54
+ size_str = f"The image size is height {height}px and width {width}px. The top - left corner is coordinate [0 , 0]. The bottom - right corner is coordinnate [{height} , {width}]. "
55
+
56
+ messages = [
57
+ {
58
+ "role": "user",
59
+ "content": [
60
+ {
61
+ "type": "text",
62
+ "text": "I need to add an object to the image following the instruction: " + editing_prompt + ". " + size_str + " \n \
63
+ Can you give me a possible bounding box of the location for the added object? Please output with the format of [top - left x coordinate , top - left y coordinate , box width , box height]. You should only output the bounding box position and nothing else. Please refer to the example below for the desired format.\n\
64
+ [Examples]\n \
65
+ [19, 101, 32, 153]\n \
66
+ [54, 12, 242, 96]"
67
+ },
68
+ {
69
+ "type": "image",
70
+ "image": f"data:image;base64,{base64_image}",
71
+ }
72
+ ]
73
+ }
74
+ ]
75
+ return messages
76
+
77
+
78
+ def create_apply_editing_messages_qwen2(editing_prompt, base64_image):
79
+ messages = [
80
+ {
81
+ "role": "system",
82
+ "content": [
83
+ {
84
+ "type": "text",
85
+ "text": "I will provide an image along with an editing instruction. Please describe the new content that should be present in the image after applying the instruction. \n \
86
+ For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'. The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example. Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct."
87
+ },]
88
+ },
89
+ {
90
+ "role": "user",
91
+ "content": [
92
+ {
93
+ "type": "text",
94
+ "text": editing_prompt
95
+ },
96
+ {
97
+ "type": "image",
98
+ "image": f"data:image;base64,{base64_image}",
99
+ },
100
+ ]
101
+ }
102
+ ]
103
+ return messages
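
For orientation, here is a minimal sketch of how one of these message builders might be fed to Qwen2-VL through the Hugging Face `transformers` chat-template API. The instruction text and generation settings below are illustrative assumptions, not values taken from this repository.

```python
# Minimal, hypothetical usage sketch for the Qwen2-VL message builders above.
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

from app.qwen2.instructions import create_editing_category_messages_qwen2

processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# Build the chat messages for editing-category classification (a text-only request).
messages = create_editing_category_messages_qwen2("remove the mask")
prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt_text], return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=16)
# Drop the prompt tokens and decode only the newly generated answer (e.g., "Remove").
answer = processor.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(answer)
```

The image-bearing builders (`create_add_object_messages_qwen2`, `create_apply_editing_messages_qwen2`) would additionally need the decoded image passed to the processor; in this repo that wiring presumably lives in app/src/vlm_pipeline.py.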
app/{gpt4_o/run_app.sh → run_app.sh} RENAMED
@@ -2,4 +2,4 @@ export PYTHONPATH=.:$PYTHONPATH
2
 
3
  export CUDA_VISIBLE_DEVICES=0
4
 
5
- python app/gpt4_o/brushedit_app.py
 
2
 
3
  export CUDA_VISIBLE_DEVICES=0
4
 
5
+ python app/src/brushedit_app.py
app/src/aspect_ratio_template.py ADDED
@@ -0,0 +1,88 @@
1
+ # From https://github.com/TencentARC/PhotoMaker/pull/120 written by https://github.com/DiscoNova
2
+ # Note: Since output width & height need to be divisible by 8, the w & h -values do
3
+ # not exactly match the stated aspect ratios... but they are "close enough":)
4
+
5
+ aspect_ratio_list = [
6
+ {
7
+ "name": "Small Square (1:1)",
8
+ "w": 640,
9
+ "h": 640,
10
+ },
11
+ {
12
+ "name": "Custom resolution",
13
+ "w": "",
14
+ "h": "",
15
+ },
16
+ {
17
+ "name": "Instagram (1:1)",
18
+ "w": 1024,
19
+ "h": 1024,
20
+ },
21
+ {
22
+ "name": "35mm film / Landscape (3:2)",
23
+ "w": 1024,
24
+ "h": 680,
25
+ },
26
+ {
27
+ "name": "35mm film / Portrait (2:3)",
28
+ "w": 680,
29
+ "h": 1024,
30
+ },
31
+ {
32
+ "name": "CRT Monitor / Landscape (4:3)",
33
+ "w": 1024,
34
+ "h": 768,
35
+ },
36
+ {
37
+ "name": "CRT Monitor / Portrait (3:4)",
38
+ "w": 768,
39
+ "h": 1024,
40
+ },
41
+ {
42
+ "name": "Widescreen TV / Landscape (16:9)",
43
+ "w": 1024,
44
+ "h": 576,
45
+ },
46
+ {
47
+ "name": "Widescreen TV / Portrait (9:16)",
48
+ "w": 576,
49
+ "h": 1024,
50
+ },
51
+ {
52
+ "name": "Widescreen Monitor / Landscape (16:10)",
53
+ "w": 1024,
54
+ "h": 640,
55
+ },
56
+ {
57
+ "name": "Widescreen Monitor / Portrait (10:16)",
58
+ "w": 640,
59
+ "h": 1024,
60
+ },
61
+ {
62
+ "name": "Cinemascope (2.39:1)",
63
+ "w": 1024,
64
+ "h": 424,
65
+ },
66
+ {
67
+ "name": "Widescreen Movie (1.85:1)",
68
+ "w": 1024,
69
+ "h": 552,
70
+ },
71
+ {
72
+ "name": "Academy Movie (1.37:1)",
73
+ "w": 1024,
74
+ "h": 744,
75
+ },
76
+ {
77
+ "name": "Sheet-print (A-series) / Landscape (297:210)",
78
+ "w": 1024,
79
+ "h": 720,
80
+ },
81
+ {
82
+ "name": "Sheet-print (A-series) / Portrait (210:297)",
83
+ "w": 720,
84
+ "h": 1024,
85
+ },
86
+ ]
87
+
88
+ aspect_ratios = {k["name"]: (k["w"], k["h"]) for k in aspect_ratio_list}
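
To make the intent of this table concrete, here is a small illustrative helper (not part of the repo) that resolves a selected name to output dimensions, falling back to the source image size for the empty "Custom resolution" entry and keeping both sides divisible by 8 as the comment at the top of the file requires.

```python
# Illustrative only: turn an aspect-ratio choice into concrete, /8-divisible output dimensions.
from app.src.aspect_ratio_template import aspect_ratios

def resolve_resolution(name, fallback_hw):
    w, h = aspect_ratios[name]
    if w == "" or h == "":           # "Custom resolution" stores empty strings
        h_src, w_src = fallback_hw   # fall back to the input image's (height, width)
        w, h = w_src, h_src
    return (w // 8) * 8, (h // 8) * 8

print(resolve_resolution("Widescreen TV / Landscape (16:9)", (480, 640)))  # (1024, 576)
print(resolve_resolution("Custom resolution", (480, 640)))                 # (640, 480)
```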
app/src/base_model_template.py ADDED
@@ -0,0 +1,61 @@
1
+ import os
2
+ import torch
3
+ from huggingface_hub import snapshot_download
4
+
5
+ from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
6
+
7
+
8
+
9
+ torch_dtype = torch.float16
10
+ device = "cpu"
11
+
12
+ BrushEdit_path = "models/"
13
+ if not os.path.exists(BrushEdit_path):
14
+ BrushEdit_path = snapshot_download(
15
+ repo_id="TencentARC/BrushEdit",
16
+ local_dir=BrushEdit_path,
17
+ token=os.getenv("HF_TOKEN"),
18
+ )
19
+ brushnet_path = os.path.join(BrushEdit_path, "brushnetX")
20
+ brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch_dtype)
21
+
22
+
23
+ base_models_list = [
24
+ {
25
+ "name": "dreamshaper_8 (Preload)",
26
+ "local_path": "models/base_model/dreamshaper_8",
27
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
28
+ "models/base_model/dreamshaper_8", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
29
+ ).to(device)
30
+ },
31
+ {
32
+ "name": "epicrealism (Preload)",
33
+ "local_path": "models/base_model/epicrealism_naturalSinRC1VAE",
34
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
35
+ "models/base_model/epicrealism_naturalSinRC1VAE", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
36
+ ).to(device)
37
+ },
38
+ {
39
+ "name": "henmixReal (Preload)",
40
+ "local_path": "models/base_model/henmixReal_v5c",
41
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
42
+ "models/base_model/henmixReal_v5c", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
43
+ ).to(device)
44
+ },
45
+ {
46
+ "name": "meinamix (Preload)",
47
+ "local_path": "models/base_model/meinamix_meinaV11",
48
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
49
+ "models/base_model/meinamix_meinaV11", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
50
+ ).to(device)
51
+ },
52
+ {
53
+ "name": "realisticVision (Default)",
54
+ "local_path": "models/base_model/realisticVisionV60B1_v51VAE",
55
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
56
+ "models/base_model/realisticVisionV60B1_v51VAE", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
57
+ ).to(device)
58
+ },
59
+ ]
60
+
61
+ base_models_template = {k["name"]: (k["local_path"], k["pipe"]) for k in base_models_list}
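
A quick sketch of how the resulting `base_models_template` dictionary is meant to be consumed; the device handling here is an illustrative assumption, and the demo's own `update_base_model` in brushedit_app.py does the real work.

```python
# Illustrative lookup of a preloaded base-model pipeline by its display name.
import torch
from app.src.base_model_template import base_models_template

local_path, pipe = base_models_template["realisticVision (Default)"]
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipe.to(device)  # the pipelines above are built on CPU, so move them before sampling
print(local_path, type(pipe).__name__)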
app/{gpt4_o → src}/brushedit_all_in_one_pipeline.py RENAMED
@@ -22,10 +22,6 @@ def BrushEdit_Pipeline(pipe,
22
 
23
  mask_np = mask_np / 255
24
  height, width = mask_np.shape[0], mask_np.shape[1]
25
- # back/foreground
26
- # if mask_np[94:547,94:546].sum() < mask_np.sum() - mask_np[94:547,94:546].sum() and mask_np[0,:].sum()>0 and mask_np[-1,:].sum()>0 and mask_np[:,0].sum()>0 and mask_np[:,-1].sum()>0 and mask_np[1,:].sum()>0 and mask_np[-2,:].sum()>0 and mask_np[:,1].sum()>0 and mask_np[:,-2].sum()>0 :
27
- # mask_np = 1 - mask_np
28
-
29
  ## resize the mask and original image to the same size which is divisible by vae_scale_factor
30
  image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
31
  height_new, width_new = image_processor.get_default_height_width(original_image, height, width)
@@ -53,16 +49,13 @@ def BrushEdit_Pipeline(pipe,
53
  height=height_new,
54
  width=width_new,
55
  ).images
56
-
 
 
 
 
57
  if blending:
58
-
59
  mask_blurred = mask_blurred * 0.5 + 0.5
60
-
61
- ## convert to vae shape format, must be divisible by 8
62
- original_image_pil = Image.fromarray(original_image).convert("RGB")
63
- init_image_np = np.array(image_processor.preprocess(original_image_pil, height=height_new, width=width_new).squeeze())
64
- init_image_np = ((init_image_np.transpose(1,2,0) + 1.) / 2.) * 255
65
- init_image_np = init_image_np.astype(np.uint8)
66
  image_all = []
67
  for image_i in images:
68
  image_np = np.array(image_i)
@@ -75,6 +68,6 @@ def BrushEdit_Pipeline(pipe,
75
  image_all = images
76
 
77
 
78
- return image_all, mask_image
79
 
80
 
 
22
 
23
  mask_np = mask_np / 255
24
  height, width = mask_np.shape[0], mask_np.shape[1]
 
 
 
 
25
  ## resize the mask and original image to the same size which is divisible by vae_scale_factor
26
  image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
27
  height_new, width_new = image_processor.get_default_height_width(original_image, height, width)
 
49
  height=height_new,
50
  width=width_new,
51
  ).images
52
+ ## convert to vae shape format, must be divisible by 8
53
+ original_image_pil = Image.fromarray(original_image).convert("RGB")
54
+ init_image_np = np.array(image_processor.preprocess(original_image_pil, height=height_new, width=width_new).squeeze())
55
+ init_image_np = ((init_image_np.transpose(1,2,0) + 1.) / 2.) * 255
56
+ init_image_np = init_image_np.astype(np.uint8)
57
  if blending:
 
58
  mask_blurred = mask_blurred * 0.5 + 0.5
 
 
 
 
 
 
59
  image_all = []
60
  for image_i in images:
61
  image_np = np.array(image_i)
 
68
  image_all = images
69
 
70
 
71
+ return image_all, mask_image, mask_np, init_image_np
72
 
73
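
Note that this rename also widens the pipeline's return value from two items to four (the edited images, the blurred mask image, the normalized mask array, and the resized source image). Below is a hypothetical caller sketch mirroring how app/src/brushedit_app.py unpacks it; the prompt, seed, and sampler settings are placeholders.

```python
# Hypothetical wrapper showing the new 4-tuple return of BrushEdit_Pipeline.
import numpy as np
import torch
from PIL import Image

from app.src.brushedit_all_in_one_pipeline import BrushEdit_Pipeline

def run_brushedit(pipe, target_prompt, original_mask, original_image, seed=42):
    generator = torch.Generator("cuda").manual_seed(seed)
    images, mask_image, mask_np, init_image_np = BrushEdit_Pipeline(
        pipe,                 # StableDiffusionBrushNetPipeline with BrushNet attached
        target_prompt,        # description of the image after the edit
        original_mask,        # H x W x 1 uint8 mask in [0, 255]
        original_image,       # H x W x 3 uint8 source image
        generator,
        50,                   # num_inference_steps
        7.5,                  # guidance_scale
        1.0,                  # control_strength
        "ugly, low quality",  # negative_prompt
        4,                    # num_samples
        True,                 # blending
    )
    # The two extra outputs make it easy to rebuild a masked preview of the resized source.
    masked_preview = Image.fromarray((init_image_np * (1 - (mask_np > 0))).astype(np.uint8))
    return images, mask_image, masked_preview
```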
 
app/src/brushedit_app.py ADDED
@@ -0,0 +1,1690 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os, random, sys
4
+ import numpy as np
5
+ import requests
6
+ import torch
7
+ import spaces
8
+
9
+
10
+ import gradio as gr
11
+
12
+ from PIL import Image
13
+
14
+
15
+ from huggingface_hub import hf_hub_download, snapshot_download
16
+ from scipy.ndimage import binary_dilation, binary_erosion
17
+ from transformers import (LlavaNextProcessor, LlavaNextForConditionalGeneration,
18
+ Qwen2VLForConditionalGeneration, Qwen2VLProcessor)
19
+
20
+ from segment_anything import SamPredictor, build_sam, SamAutomaticMaskGenerator
21
+ from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
22
+ from diffusers.image_processor import VaeImageProcessor
23
+
24
+
25
+ from app.src.vlm_pipeline import (
26
+ vlm_response_editing_type,
27
+ vlm_response_object_wait_for_edit,
28
+ vlm_response_mask,
29
+ vlm_response_prompt_after_apply_instruction
30
+ )
31
+ from app.src.brushedit_all_in_one_pipeline import BrushEdit_Pipeline
32
+ from app.utils.utils import load_grounding_dino_model
33
+
34
+ from app.src.vlm_template import vlms_template
35
+ from app.src.base_model_template import base_models_template
36
+ from app.src.aspect_ratio_template import aspect_ratios
37
+
38
+ from openai import OpenAI
39
+ # base_openai_url = ""
40
+
41
+ #### Description ####
42
+ logo = r"""
43
+ <center><img src='./assets/logo_brushedit.png' alt='BrushEdit logo' style="width:80px; margin-bottom:10px"></center>
44
+ """
45
+ head = r"""
46
+ <div style="text-align: center;">
47
+ <h1> BrushEdit: All-In-One Image Inpainting and Editing</h1>
48
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
49
+ <a href='https://liyaowei-stu.github.io/project/BrushEdit/'><img src='https://img.shields.io/badge/Project_Page-BrushEdit-green' alt='Project Page'></a>
50
+ <a href='https://arxiv.org/abs/2412.10316'><img src='https://img.shields.io/badge/Paper-Arxiv-blue'></a>
51
+ <a href='https://github.com/TencentARC/BrushEdit'><img src='https://img.shields.io/badge/Code-Github-orange'></a>
52
+
53
+ </div>
54
+ </br>
55
+ </div>
56
+ """
57
+ descriptions = r"""
58
+ Official Gradio Demo for <a href='https://tencentarc.github.io/BrushNet/'><b>BrushEdit: All-In-One Image Inpainting and Editing</b></a><br>
59
+ 🧙 BrushEdit enables precise, user-friendly instruction-based image editing via an inpainting model.<br>
60
+ """
61
+
62
+ instructions = r"""
63
+ Currently, we support two modes: <b>fully automated instruction-based editing</b> and <b>interactive instruction-based editing</b>.
64
+
65
+ 🛠️ <b>Fully automated instruction-based editing</b>:
66
+ <ul>
67
+ <li> ⭐️ <b>1.Choose Image: </b> Upload <img src="https://github.com/user-attachments/assets/f2dca1e6-31f9-4716-ae84-907f24415bac" alt="upload" style="display:inline; height:1em; vertical-align:middle;"> or select <img src="https://github.com/user-attachments/assets/de808f7d-c74a-44c7-9cbf-f0dbfc2c1abf" alt="example" style="display:inline; height:1em; vertical-align:middle;"> one image from Example. </li>
68
+ <li> ⭐️ <b>2.Input ⌨️ Instructions: </b> Input the instructions (supports addition, deletion, and modification), e.g. remove xxx .</li>
69
+ <li> ⭐️ <b>3.Run: </b> Click the <b>💫 Run</b> button to automatically edit the image.</li>
70
+ </ul>
71
+
72
+ 🛠️ <b>Interactive instruction-based editing</b>:
73
+ <ul>
74
+ <li> ⭐️ <b>1.Choose Image: </b> Upload <img src="https://github.com/user-attachments/assets/f2dca1e6-31f9-4716-ae84-907f24415bac" alt="upload" style="display:inline; height:1em; vertical-align:middle;"> or select <img src="https://github.com/user-attachments/assets/de808f7d-c74a-44c7-9cbf-f0dbfc2c1abf" alt="example" style="display:inline; height:1em; vertical-align:middle;"> one image from Example. </li>
75
+ <li> ⭐️ <b>2.Finely Brushing: </b> Use a brush <img src="https://github.com/user-attachments/assets/c466c5cc-ac8f-4b4a-9bc5-04c4737fe1ef" alt="brush" style="display:inline; height:1em; vertical-align:middle;"> to outline the area you want to edit. You can also use the eraser <img src="https://github.com/user-attachments/assets/b6370369-b080-4550-b0d0-830ff22d9068" alt="eraser" style="display:inline; height:1em; vertical-align:middle;"> to restore. </li>
76
+ <li> ⭐️ <b>3.Input ⌨️ Instructions: </b> Input the instructions. </li>
77
+ <li> ⭐️ <b>4.Run: </b> Click the <b>💫 Run</b> button to automatically edit the image. </li>
78
+ </ul>
79
+
80
+ <b> We strongly recommend using GPT-4o for reasoning. </b> After selecting GPT4-o as the VLM model, enter the API key and click the Submit and Verify button. If the output shows success, you can use GPT4-o normally. As a second choice, we recommend the Qwen2-VL model.
81
+
82
+ <b> We recommend zooming out in your browser for a better viewing range and experience. </b>
83
+
84
+ <b> For more detailed feature descriptions, see the bottom. </b>
85
+
86
+ ☕️ Have fun! 🎄 Wishing you a merry Christmas!
87
+ """
88
+
89
+ tips = r"""
90
+ 💡 <b>Some Tips</b>:
91
+ <ul>
92
+ <li> 🤠 After inputting the instructions, you can click the <b>Generate Mask</b> button. The mask generated by the VLM will be displayed in the preview panel on the right side. </li>
93
+ <li> 🤠 After generating the mask or when you use the brush to draw the mask, you can perform operations such as <b>randomization</b>, <b>dilation</b>, <b>erosion</b>, and <b>movement</b>. </li>
94
+ <li> 🤠 After inputting the instructions, you can click the <b>Generate Target Prompt</b> button. The target prompt will be displayed in the text box, and you can modify it according to your ideas. </li>
95
+ </ul>
96
+
97
+ 💡 <b>Detailed Features</b>:
98
+ <ul>
99
+ <li> 🎨 <b>Aspect Ratio</b>: Select the aspect ratio of the image. To prevent OOM, 1024px is the maximum resolution.</li>
100
+ <li> 🎨 <b>VLM Model</b>: Select the VLM model. We use preloaded models to save time. To use other VLM models, download them and uncomment the relevant lines in vlm_template.py from our GitHub repo. </li>
101
+ <li> 🎨 <b>Generate Mask</b>: According to the input instructions, generate a mask for the area that may need to be edited. </li>
102
+ <li> 🎨 <b>Square/Circle Mask</b>: Based on the existing mask, generate a square or circular mask that covers it. (A coarse-grained mask leaves more room for creative edits.) </li>
103
+ <li> 🎨 <b>Invert Mask</b>: Invert the mask to generate a new mask. </li>
104
+ <li> 🎨 <b>Dilation/Erosion Mask</b>: Expand or shrink the mask to include or exclude more areas. </li>
105
+ <li> 🎨 <b>Move Mask</b>: Move the mask to a new position. </li>
106
+ <li> 🎨 <b>Generate Target Prompt</b>: Generate a target prompt based on the input instructions. </li>
107
+ <li> 🎨 <b>Target Prompt</b>: Description of the masked area; you can enter or modify it manually when the content generated by the VLM does not meet expectations. </li>
108
+ <li> 🎨 <b>Blending</b>: Blend BrushNet's output with the original input, preserving the original image details in unedited areas. (Turning it off is better when removing objects.) </li>
109
+ <li> 🎨 <b>Control length</b>: The intensity of editing and inpainting. </li>
110
+ </ul>
111
+
112
+ 💡 <b>Advanced Features</b>:
113
+ <ul>
114
+ <li> 🎨 <b>Base Model</b>: We use preloaded models to save time. To use other base models, download them and uncomment the relevant lines in base_model_template.py from our GitHub repo. </li>
115
+ <li> 🎨 <b>Blending</b>: Blend BrushNet's output with the original input, preserving the original image details in unedited areas. (Turning it off is better when removing objects.) </li>
116
+ <li> 🎨 <b>Control length</b>: The intensity of editing and inpainting. </li>
117
+ <li> 🎨 <b>Num samples</b>: The number of samples to generate. </li>
118
+ <li> 🎨 <b>Negative prompt</b>: The negative prompt for the classifier-free guidance. </li>
119
+ <li> 🎨 <b>Guidance scale</b>: The guidance scale for the classifier-free guidance. </li>
120
+ </ul>
121
+
122
+
123
+ """
124
+
125
+
126
+
127
+ citation = r"""
128
+ If BrushEdit is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/BrushEdit' target='_blank'>Github Repo</a>. Thanks!
129
+ [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/BrushEdit?style=social)](https://github.com/TencentARC/BrushEdit)
130
+ ---
131
+ 📝 **Citation**
132
+ <br>
133
+ If our work is useful for your research, please consider citing:
134
+ ```bibtex
135
+ @misc{li2024brushedit,
136
+ title={BrushEdit: All-In-One Image Inpainting and Editing},
137
+ author={Yaowei Li and Yuxuan Bian and Xuan Ju and Zhaoyang Zhang and Junhao Zhuang and Ying Shan and Yuexian Zou and Qiang Xu},
138
+ year={2024},
139
+ eprint={2412.10316},
140
+ archivePrefix={arXiv},
141
+ primaryClass={cs.CV}
142
+ }
143
+ ```
144
+ 📧 **Contact**
145
+ <br>
146
+ If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
147
+ """
148
+
149
+ # - - - - - examples - - - - - #
150
+ EXAMPLES = [
151
+
152
+ [
153
+ Image.open("./assets/frog/frog.jpeg").convert("RGBA"),
154
+ "add a magic hat on frog head.",
155
+ 642087011,
156
+ "frog",
157
+ "frog",
158
+ True,
159
+ False,
160
+ "GPT4-o (Highly Recommended)"
161
+ ],
162
+ [
163
+ Image.open("./assets/chinese_girl/chinese_girl.png").convert("RGBA"),
164
+ "replace the background to ancient China.",
165
+ 648464818,
166
+ "chinese_girl",
167
+ "chinese_girl",
168
+ True,
169
+ False,
170
+ "GPT4-o (Highly Recommended)"
171
+ ],
172
+ [
173
+ Image.open("./assets/angel_christmas/angel_christmas.png").convert("RGBA"),
174
+ "remove the deer.",
175
+ 648464818,
176
+ "angel_christmas",
177
+ "angel_christmas",
178
+ False,
179
+ False,
180
+ "GPT4-o (Highly Recommended)"
181
+ ],
182
+ [
183
+ Image.open("./assets/sunflower_girl/sunflower_girl.png").convert("RGBA"),
184
+ "add a wreath on head.",
185
+ 648464818,
186
+ "sunflower_girl",
187
+ "sunflower_girl",
188
+ True,
189
+ False,
190
+ "GPT4-o (Highly Recommended)"
191
+ ],
192
+ [
193
+ Image.open("./assets/girl_on_sun/girl_on_sun.png").convert("RGBA"),
194
+ "add a butterfly fairy.",
195
+ 648464818,
196
+ "girl_on_sun",
197
+ "girl_on_sun",
198
+ True,
199
+ False,
200
+ "GPT4-o (Highly Recommended)"
201
+ ],
202
+ [
203
+ Image.open("./assets/spider_man_rm/spider_man.png").convert("RGBA"),
204
+ "remove the christmas hat.",
205
+ 642087011,
206
+ "spider_man_rm",
207
+ "spider_man_rm",
208
+ False,
209
+ False,
210
+ "GPT4-o (Highly Recommended)"
211
+ ],
212
+ [
213
+ Image.open("./assets/anime_flower/anime_flower.png").convert("RGBA"),
214
+ "remove the flower.",
215
+ 642087011,
216
+ "anime_flower",
217
+ "anime_flower",
218
+ False,
219
+ False,
220
+ "GPT4-o (Highly Recommended)"
221
+ ],
222
+ [
223
+ Image.open("./assets/chenduling/chengduling.jpg").convert("RGBA"),
224
+ "replace the clothes to a delicated floral skirt.",
225
+ 648464818,
226
+ "chenduling",
227
+ "chenduling",
228
+ True,
229
+ False,
230
+ "GPT4-o (Highly Recommended)"
231
+ ],
232
+ [
233
+ Image.open("./assets/hedgehog_rp_bg/hedgehog.png").convert("RGBA"),
234
+ "make the hedgehog in Italy.",
235
+ 648464818,
236
+ "hedgehog_rp_bg",
237
+ "hedgehog_rp_bg",
238
+ True,
239
+ False,
240
+ "GPT4-o (Highly Recommended)"
241
+ ],
242
+
243
+ ]
244
+
245
+ INPUT_IMAGE_PATH = {
246
+ "frog": "./assets/frog/frog.jpeg",
247
+ "chinese_girl": "./assets/chinese_girl/chinese_girl.png",
248
+ "angel_christmas": "./assets/angel_christmas/angel_christmas.png",
249
+ "sunflower_girl": "./assets/sunflower_girl/sunflower_girl.png",
250
+ "girl_on_sun": "./assets/girl_on_sun/girl_on_sun.png",
251
+ "spider_man_rm": "./assets/spider_man_rm/spider_man.png",
252
+ "anime_flower": "./assets/anime_flower/anime_flower.png",
253
+ "chenduling": "./assets/chenduling/chengduling.jpg",
254
+ "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/hedgehog.png",
255
+ }
256
+ MASK_IMAGE_PATH = {
257
+ "frog": "./assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png",
258
+ "chinese_girl": "./assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png",
259
+ "angel_christmas": "./assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png",
260
+ "sunflower_girl": "./assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png",
261
+ "girl_on_sun": "./assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png",
262
+ "spider_man_rm": "./assets/spider_man_rm/mask_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png",
263
+ "anime_flower": "./assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png",
264
+ "chenduling": "./assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png",
265
+ "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png",
266
+ }
267
+ MASKED_IMAGE_PATH = {
268
+ "frog": "./assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png",
269
+ "chinese_girl": "./assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png",
270
+ "angel_christmas": "./assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png",
271
+ "sunflower_girl": "./assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png",
272
+ "girl_on_sun": "./assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png",
273
+ "spider_man_rm": "./assets/spider_man_rm/masked_image_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png",
274
+ "anime_flower": "./assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png",
275
+ "chenduling": "./assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png",
276
+ "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png",
277
+ }
278
+ OUTPUT_IMAGE_PATH = {
279
+ "frog": "./assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png",
280
+ "chinese_girl": "./assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png",
281
+ "angel_christmas": "./assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png",
282
+ "sunflower_girl": "./assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png",
283
+ "girl_on_sun": "./assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png",
284
+ "spider_man_rm": "./assets/spider_man_rm/image_edit_a5d410e6-8e8d-432f-8144-defbc3e1eae9_0.png",
285
+ "anime_flower": "./assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png",
286
+ "chenduling": "./assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png",
287
+ "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png",
288
+ }
289
+
290
+ # os.environ['GRADIO_TEMP_DIR'] = 'gradio_temp_dir'
291
+ # os.makedirs('gradio_temp_dir', exist_ok=True)
292
+
293
+ VLM_MODEL_NAMES = list(vlms_template.keys())
294
+ DEFAULT_VLM_MODEL_NAME = "Qwen2-VL-7B-Instruct (Default)"
295
+ BASE_MODELS = list(base_models_template.keys())
296
+ DEFAULT_BASE_MODEL = "realisticVision (Default)"
297
+
298
+ ASPECT_RATIO_LABELS = list(aspect_ratios)
299
+ DEFAULT_ASPECT_RATIO = ASPECT_RATIO_LABELS[0]
300
+
301
+
302
+ ## init device
303
+ try:
304
+ if torch.cuda.is_available():
305
+ device = "cuda"
306
+ elif sys.platform == "darwin" and torch.backends.mps.is_available():
307
+ device = "mps"
308
+ else:
309
+ device = "cpu"
310
+ except:
311
+ device = "cpu"
312
+
313
+ # ## init torch dtype
314
+ # if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
315
+ # torch_dtype = torch.bfloat16
316
+ # else:
317
+ # torch_dtype = torch.float16
318
+
319
+ # if device == "mps":
320
+ # torch_dtype = torch.float16
321
+
322
+ torch_dtype = torch.float16
323
+
324
+
325
+
326
+ # download hf models
327
+ BrushEdit_path = "models/"
328
+ if not os.path.exists(BrushEdit_path):
329
+ BrushEdit_path = snapshot_download(
330
+ repo_id="TencentARC/BrushEdit",
331
+ local_dir=BrushEdit_path,
332
+ token=os.getenv("HF_TOKEN"),
333
+ )
334
+
335
+ ## init default VLM
336
+ vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[DEFAULT_VLM_MODEL_NAME]
337
+ if vlm_processor != "" and vlm_model != "":
338
+ vlm_model.to(device)
339
+ else:
340
+ gr.Error("Please Download default VLM model "+ DEFAULT_VLM_MODEL_NAME +" first.")
341
+
342
+
343
+ ## init base model
344
+ base_model_path = os.path.join(BrushEdit_path, "base_model/realisticVisionV60B1_v51VAE")
345
+ brushnet_path = os.path.join(BrushEdit_path, "brushnetX")
346
+ sam_path = os.path.join(BrushEdit_path, "sam/sam_vit_h_4b8939.pth")
347
+ groundingdino_path = os.path.join(BrushEdit_path, "grounding_dino/groundingdino_swint_ogc.pth")
348
+
349
+
350
+ # input brushnetX ckpt path
351
+ brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch_dtype)
352
+ pipe = StableDiffusionBrushNetPipeline.from_pretrained(
353
+ base_model_path, brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
354
+ )
355
+ # speed up diffusion process with faster scheduler and memory optimization
356
+ pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
357
+ # remove following line if xformers is not installed or when using Torch 2.0.
358
+ # pipe.enable_xformers_memory_efficient_attention()
359
+ pipe.enable_model_cpu_offload()
360
+
361
+
362
+ ## init SAM
363
+ sam = build_sam(checkpoint=sam_path)
364
+ sam.to(device=device)
365
+ sam_predictor = SamPredictor(sam)
366
+ sam_automask_generator = SamAutomaticMaskGenerator(sam)
367
+
368
+ ## init groundingdino_model
369
+ config_file = 'app/utils/GroundingDINO_SwinT_OGC.py'
370
+ groundingdino_model = load_grounding_dino_model(config_file, groundingdino_path, device=device)
371
+
372
+ ## Ordinary function
373
+ def crop_and_resize(image: Image.Image,
374
+ target_width: int,
375
+ target_height: int) -> Image.Image:
376
+ """
377
+ Crops and resizes an image while preserving the aspect ratio.
378
+
379
+ Args:
380
+ image (Image.Image): Input PIL image to be cropped and resized.
381
+ target_width (int): Target width of the output image.
382
+ target_height (int): Target height of the output image.
383
+
384
+ Returns:
385
+ Image.Image: Cropped and resized image.
386
+ """
387
+ # Original dimensions
388
+ original_width, original_height = image.size
389
+ original_aspect = original_width / original_height
390
+ target_aspect = target_width / target_height
391
+
392
+ # Calculate crop box to maintain aspect ratio
393
+ if original_aspect > target_aspect:
394
+ # Crop horizontally
395
+ new_width = int(original_height * target_aspect)
396
+ new_height = original_height
397
+ left = (original_width - new_width) / 2
398
+ top = 0
399
+ right = left + new_width
400
+ bottom = original_height
401
+ else:
402
+ # Crop vertically
403
+ new_width = original_width
404
+ new_height = int(original_width / target_aspect)
405
+ left = 0
406
+ top = (original_height - new_height) / 2
407
+ right = original_width
408
+ bottom = top + new_height
409
+
410
+ # Crop and resize
411
+ cropped_image = image.crop((left, top, right, bottom))
412
+ resized_image = cropped_image.resize((target_width, target_height), Image.NEAREST)
413
+ return resized_image
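
A tiny sanity check of the crop-box logic above (the 1920x1080 input is an arbitrary illustration): a 16:9 image cropped to a 1:1 target loses equal margins on the left and right before the final resize.

```python
# Illustrative: 1920x1080 -> center crop to 1080x1080 (box 420, 0, 1500, 1080) -> resize to 640x640.
from PIL import Image

src = Image.new("RGB", (1920, 1080), "gray")
out = crop_and_resize(src, target_width=640, target_height=640)  # function defined above
print(out.size)  # (640, 640)
```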
414
+
415
+
416
+ ## Ordinary function
417
+ def resize(image: Image.Image,
418
+ target_width: int,
419
+ target_height: int) -> Image.Image:
420
+ """
421
+ Resizes an image to the target dimensions (the aspect ratio is not preserved).
422
+
423
+ Args:
424
+ image (Image.Image): Input PIL image to be resized.
425
+ target_width (int): Target width of the output image.
426
+ target_height (int): Target height of the output image.
427
+
428
+ Returns:
429
+ Image.Image: Resized image.
430
+ """
431
+ # Direct resize to the target dimensions (no cropping)
432
+ resized_image = image.resize((target_width, target_height), Image.NEAREST)
433
+ return resized_image
434
+
435
+
436
+ def move_mask_func(mask, direction, units):
437
+ binary_mask = mask.squeeze()>0
438
+ rows, cols = binary_mask.shape
439
+ moved_mask = np.zeros_like(binary_mask, dtype=bool)
440
+
441
+ if direction == 'down':
442
+ # move down
443
+ moved_mask[max(0, units):, :] = binary_mask[:rows - units, :]
444
+
445
+ elif direction == 'up':
446
+ # move up
447
+ moved_mask[:rows - units, :] = binary_mask[units:, :]
448
+
449
+ elif direction == 'right':
450
+ # move right
451
+ moved_mask[:, max(0, units):] = binary_mask[:, :cols - units]
452
+
453
+ elif direction == 'left':
454
+ # move left
455
+ moved_mask[:, :cols - units] = binary_mask[:, units:]
456
+
457
+ return moved_mask
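
A toy check of the shifting logic (purely illustrative): a single-pixel mask moved "down" by two rows ends up two rows lower.

```python
# Illustrative: shift a one-pixel mask down by 2 rows with move_mask_func (defined above).
import numpy as np

mask = np.zeros((5, 5, 1), dtype=np.uint8)
mask[1, 2] = 255                  # single "on" pixel at row 1, col 2
moved = move_mask_func(mask, "down", 2)
print(np.argwhere(moved))         # [[3 2]] -> the pixel now sits at row 3, col 2
```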
458
+
459
+
460
+ def random_mask_func(mask, dilation_type='square', dilation_size=20):
461
+ # Randomly select the size of dilation
462
+ binary_mask = mask.squeeze()>0
463
+
464
+ if dilation_type == 'square_dilation':
465
+ structure = np.ones((dilation_size, dilation_size), dtype=bool)
466
+ dilated_mask = binary_dilation(binary_mask, structure=structure)
467
+ elif dilation_type == 'square_erosion':
468
+ structure = np.ones((dilation_size, dilation_size), dtype=bool)
469
+ dilated_mask = binary_erosion(binary_mask, structure=structure)
470
+ elif dilation_type == 'bounding_box':
471
+ # find the most left top and left bottom point
472
+ rows, cols = np.where(binary_mask)
473
+ if len(rows) == 0 or len(cols) == 0:
474
+ return mask # return original mask if no valid points
475
+
476
+ min_row = np.min(rows)
477
+ max_row = np.max(rows)
478
+ min_col = np.min(cols)
479
+ max_col = np.max(cols)
480
+
481
+ # create a bounding box
482
+ dilated_mask = np.zeros_like(binary_mask, dtype=bool)
483
+ dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True
484
+
485
+ elif dilation_type == 'bounding_ellipse':
486
+ # find the most left top and left bottom point
487
+ rows, cols = np.where(binary_mask)
488
+ if len(rows) == 0 or len(cols) == 0:
489
+ return mask # return original mask if no valid points
490
+
491
+ min_row = np.min(rows)
492
+ max_row = np.max(rows)
493
+ min_col = np.min(cols)
494
+ max_col = np.max(cols)
495
+
496
+ # calculate the center and axis length of the ellipse
497
+ center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
498
+ a = (max_col - min_col) // 2 # half long axis
499
+ b = (max_row - min_row) // 2 # half short axis
500
+
501
+ # create a bounding ellipse
502
+ y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
503
+ ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
504
+ dilated_mask = np.zeros_like(binary_mask, dtype=bool)
505
+ dilated_mask[ellipse_mask] = True
506
+ else:
507
+ raise ValueError("dilation_type must be 'square_dilation', 'square_erosion', 'bounding_box', or 'bounding_ellipse'")
508
+
509
+ # use binary dilation
510
+ dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
511
+ return dilated_mask
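
And a small illustration of the 'bounding_box' branch (again just a sketch): scattered mask pixels are replaced by their filled bounding rectangle, returned as an H x W x 1 uint8 mask with values in {0, 255}.

```python
# Illustrative: collapse a sparse mask to its filled bounding box via random_mask_func (defined above).
import numpy as np

mask = np.zeros((6, 6, 1), dtype=np.uint8)
mask[1, 1] = 255
mask[4, 3] = 255
box_mask = random_mask_func(mask, dilation_type="bounding_box")
print(box_mask.shape)                        # (6, 6, 1)
print((box_mask[1:5, 1:4, 0] == 255).all())  # True: rows 1-4, cols 1-3 are filled
```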
512
+
513
+
514
+ ## Gradio component function
515
+ def update_vlm_model(vlm_name):
516
+ global vlm_model, vlm_processor
517
+ if vlm_model is not None:
518
+ del vlm_model
519
+ torch.cuda.empty_cache()
520
+
521
+ vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[vlm_name]
522
+
523
+ ## We recommend using preloaded models; otherwise it will take a long time to download the model. You can edit the code via vlm_template.py
524
+ if vlm_type == "llava-next":
525
+ if vlm_processor != "" and vlm_model != "":
526
+ vlm_model.to(device)
527
+ return vlm_model_dropdown
528
+ else:
529
+ if os.path.exists(vlm_local_path):
530
+ vlm_processor = LlavaNextProcessor.from_pretrained(vlm_local_path)
531
+ vlm_model = LlavaNextForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype="auto", device_map="auto")
532
+ else:
533
+ if vlm_name == "llava-v1.6-mistral-7b-hf (Preload)":
534
+ vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
535
+ vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype="auto", device_map="auto")
536
+ elif vlm_name == "llama3-llava-next-8b-hf (Preload)":
537
+ vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llama3-llava-next-8b-hf")
538
+ vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llama3-llava-next-8b-hf", torch_dtype="auto", device_map="auto")
539
+ elif vlm_name == "llava-v1.6-vicuna-13b-hf (Preload)":
540
+ vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf")
541
+ vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype="auto", device_map="auto")
542
+ elif vlm_name == "llava-v1.6-34b-hf (Preload)":
543
+ vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-34b-hf")
544
+ vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-34b-hf", torch_dtype="auto", device_map="auto")
545
+ elif vlm_name == "llava-next-72b-hf (Preload)":
546
+ vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-next-72b-hf")
547
+ vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-next-72b-hf", torch_dtype="auto", device_map="auto")
548
+ elif vlm_type == "qwen2-vl":
549
+ if vlm_processor != "" and vlm_model != "":
550
+ vlm_model.to(device)
551
+ return vlm_model_dropdown
552
+ else:
553
+ if os.path.exists(vlm_local_path):
554
+ vlm_processor = Qwen2VLProcessor.from_pretrained(vlm_local_path)
555
+ vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype="auto", device_map="auto")
556
+ else:
557
+ if vlm_name == "qwen2-vl-2b-instruct (Preload)":
558
+ vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
559
+ vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
560
+ elif vlm_name == "qwen2-vl-7b-instruct (Preload)":
561
+ vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
562
+ vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto")
563
+ elif vlm_name == "qwen2-vl-72b-instruct (Preload)":
564
+ vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct")
565
+ vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-72B-Instruct", torch_dtype="auto", device_map="auto")
566
+ elif vlm_type == "openai":
567
+ pass
568
+ return "success"
569
+
570
+
571
+ def update_base_model(base_model_name):
572
+ global pipe
573
+ ## We recommend using preloaded models; otherwise it will take a long time to download the model. You can edit the code via base_model_template.py
574
+ if pipe is not None:
575
+ del pipe
576
+ torch.cuda.empty_cache()
577
+ base_model_path, pipe = base_models_template[base_model_name]
578
+ if pipe != "":
579
+ pipe.to(device)
580
+ else:
581
+ if os.path.exists(base_model_path):
582
+ pipe = StableDiffusionBrushNetPipeline.from_pretrained(
583
+ base_model_path, brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
584
+ )
585
+ # pipe.enable_xformers_memory_efficient_attention()
586
+ pipe.enable_model_cpu_offload()
587
+ else:
588
+ raise gr.Error(f"The base model {base_model_name} does not exist")
589
+ return "success"
590
+
591
+
592
+ def submit_GPT4o_KEY(GPT4o_KEY):
593
+ global vlm_model, vlm_processor
594
+ if vlm_model is not None:
595
+ del vlm_model
596
+ torch.cuda.empty_cache()
597
+ try:
598
+ vlm_model = OpenAI(api_key=GPT4o_KEY)
599
+ vlm_processor = ""
600
+ response = vlm_model.chat.completions.create(
601
+ model="gpt-4o-2024-08-06",
602
+ messages=[
603
+ {"role": "system", "content": "You are a helpful assistant."},
604
+ {"role": "user", "content": "Say this is a test"}
605
+ ]
606
+ )
607
+ response_str = response.choices[0].message.content
608
+
609
+ return "Success, " + response_str, "GPT4-o (Highly Recommended)"
610
+ except Exception as e:
611
+ return "Invalid GPT4o API Key", "GPT4-o (Highly Recommended)"
612
+
613
+
614
+
615
+ @spaces.GPU(duration=180)
616
+ def process(input_image,
617
+ original_image,
618
+ original_mask,
619
+ prompt,
620
+ negative_prompt,
621
+ control_strength,
622
+ seed,
623
+ randomize_seed,
624
+ guidance_scale,
625
+ num_inference_steps,
626
+ num_samples,
627
+ blending,
628
+ category,
629
+ target_prompt,
630
+ resize_default,
631
+ aspect_ratio_name,
632
+ invert_mask_state):
633
+ if original_image is None:
634
+ if input_image is None:
635
+ raise gr.Error('Please upload the input image')
636
+ else:
637
+ image_pil = input_image["background"].convert("RGB")
638
+ original_image = np.array(image_pil)
639
+ if prompt is None or prompt == "":
640
+ raise gr.Error("Please input your instructions, e.g., remove the xxx")
641
+
642
+ alpha_mask = input_image["layers"][0].split()[3]
643
+ input_mask = np.asarray(alpha_mask)
644
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
645
+ if output_w == "" or output_h == "":
646
+ output_h, output_w = original_image.shape[:2]
647
+
648
+ if resize_default:
649
+ short_side = min(output_w, output_h)
650
+ scale_ratio = 640 / short_side
651
+ output_w = int(output_w * scale_ratio)
652
+ output_h = int(output_h * scale_ratio)
653
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
654
+ original_image = np.array(original_image)
655
+ if input_mask is not None:
656
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
657
+ input_mask = np.array(input_mask)
658
+ if original_mask is not None:
659
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
660
+ original_mask = np.array(original_mask)
661
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
662
+ else:
663
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
664
+ pass
665
+ else:
666
+ if resize_default:
667
+ short_side = min(output_w, output_h)
668
+ scale_ratio = 640 / short_side
669
+ output_w = int(output_w * scale_ratio)
670
+ output_h = int(output_h * scale_ratio)
671
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
672
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
673
+ original_image = np.array(original_image)
674
+ if input_mask is not None:
675
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
676
+ input_mask = np.array(input_mask)
677
+ if original_mask is not None:
678
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
679
+ original_mask = np.array(original_mask)
680
+
681
+ if invert_mask_state:
682
+ original_mask = original_mask
683
+ else:
684
+ if input_mask.max() == 0:
685
+ original_mask = original_mask
686
+ else:
687
+ original_mask = input_mask
688
+
689
+
690
+
691
+ if category is not None:
692
+ pass
693
+ else:
694
+ category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)
695
+
696
+
697
+ if original_mask is not None:
698
+ original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
699
+ else:
700
+ object_wait_for_edit = vlm_response_object_wait_for_edit(
701
+ vlm_processor,
702
+ vlm_model,
703
+ original_image,
704
+ category,
705
+ prompt,
706
+ device)
707
+
708
+ original_mask = vlm_response_mask(vlm_processor,
709
+ vlm_model,
710
+ category,
711
+ original_image,
712
+ prompt,
713
+ object_wait_for_edit,
714
+ sam,
715
+ sam_predictor,
716
+ sam_automask_generator,
717
+ groundingdino_model,
718
+ device)
719
+ if original_mask.ndim == 2:
720
+ original_mask = original_mask[:,:,None]
721
+
722
+
723
+ if len(target_prompt) <= 1:
724
+ prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(
725
+ vlm_processor,
726
+ vlm_model,
727
+ original_image,
728
+ prompt,
729
+ device)
730
+ else:
731
+ prompt_after_apply_instruction = target_prompt
732
+
733
+ generator = torch.Generator(device).manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)
734
+
735
+
736
+ with torch.autocast(device):
737
+ image, mask_image, mask_np, init_image_np = BrushEdit_Pipeline(pipe,
738
+ prompt_after_apply_instruction,
739
+ original_mask,
740
+ original_image,
741
+ generator,
742
+ num_inference_steps,
743
+ guidance_scale,
744
+ control_strength,
745
+ negative_prompt,
746
+ num_samples,
747
+ blending)
748
+ original_image = np.array(init_image_np)
749
+ masked_image = original_image * (1 - (mask_np>0))
750
+ masked_image = masked_image.astype(np.uint8)
751
+ masked_image = Image.fromarray(masked_image)
752
+ # Save the images (optional)
753
+ # import uuid
754
+ # uuid = str(uuid.uuid4())
755
+ # image[0].save(f"outputs/image_edit_{uuid}_0.png")
756
+ # image[1].save(f"outputs/image_edit_{uuid}_1.png")
757
+ # image[2].save(f"outputs/image_edit_{uuid}_2.png")
758
+ # image[3].save(f"outputs/image_edit_{uuid}_3.png")
759
+ # mask_image.save(f"outputs/mask_{uuid}.png")
760
+ # masked_image.save(f"outputs/masked_image_{uuid}.png")
761
+ return image, [mask_image], [masked_image], prompt, '', prompt_after_apply_instruction, False
762
+
763
+
764
+ def generate_target_prompt(input_image,
765
+ original_image,
766
+ prompt):
767
+ # load example image
768
+ if isinstance(original_image, str):
769
+ original_image = input_image
770
+
771
+ prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(
772
+ vlm_processor,
773
+ vlm_model,
774
+ original_image,
775
+ prompt,
776
+ device)
777
+ return prompt_after_apply_instruction, prompt_after_apply_instruction
778
+
779
+
780
+ def process_mask(input_image,
781
+ original_image,
782
+ prompt,
783
+ resize_default,
784
+ aspect_ratio_name):
785
+ if original_image is None:
786
+ raise gr.Error('Please upload the input image')
787
+ if prompt is None:
788
+ raise gr.Error("Please input your instructions, e.g., remove the xxx")
789
+
790
+ ## load mask
791
+ alpha_mask = input_image["layers"][0].split()[3]
792
+ input_mask = np.array(alpha_mask)
793
+
794
+ # load example image
795
+ if isinstance(original_image, str):
796
+ original_image = input_image["background"]
797
+
798
+ if input_mask.max() == 0:
799
+ category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)
800
+
801
+ object_wait_for_edit = vlm_response_object_wait_for_edit(vlm_processor,
802
+ vlm_model,
803
+ original_image,
804
+ category,
805
+ prompt,
806
+ device)
807
+ # original mask: h,w,1 [0, 255]
808
+ original_mask = vlm_response_mask(
809
+ vlm_processor,
810
+ vlm_model,
811
+ category,
812
+ original_image,
813
+ prompt,
814
+ object_wait_for_edit,
815
+ sam,
816
+ sam_predictor,
817
+ sam_automask_generator,
818
+ groundingdino_model,
819
+ device)
820
+ else:
821
+ original_mask = input_mask
822
+ category = None
823
+
824
+ ## resize mask if needed
825
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
826
+ if output_w == "" or output_h == "":
827
+ output_h, output_w = original_image.shape[:2]
828
+ if resize_default:
829
+ short_side = min(output_w, output_h)
830
+ scale_ratio = 640 / short_side
831
+ output_w = int(output_w * scale_ratio)
832
+ output_h = int(output_h * scale_ratio)
833
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
834
+ original_image = np.array(original_image)
835
+ if input_mask is not None:
836
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
837
+ input_mask = np.array(input_mask)
838
+ if original_mask is not None:
839
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
840
+ original_mask = np.array(original_mask)
841
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
842
+ else:
843
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
844
+ pass
845
+ else:
846
+ if resize_default:
847
+ short_side = min(output_w, output_h)
848
+ scale_ratio = 640 / short_side
849
+ output_w = int(output_w * scale_ratio)
850
+ output_h = int(output_h * scale_ratio)
851
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
852
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
853
+ original_image = np.array(original_image)
854
+ if input_mask is not None:
855
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
856
+ input_mask = np.array(input_mask)
857
+ if original_mask is not None:
858
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
859
+ original_mask = np.array(original_mask)
860
+
861
+
862
+
863
+ if original_mask.ndim == 2:
864
+ original_mask = original_mask[:,:,None]
865
+
866
+ mask_image = Image.fromarray(original_mask.squeeze().astype(np.uint8)).convert("RGB")
867
+
868
+ masked_image = original_image * (1 - (original_mask>0))
869
+ masked_image = masked_image.astype(np.uint8)
870
+ masked_image = Image.fromarray(masked_image)
871
+
872
+ return [masked_image], [mask_image], original_mask.astype(np.uint8), category
873
+
874
+
875
+ def process_random_mask(input_image,
876
+ original_image,
877
+ original_mask,
878
+ resize_default,
879
+ aspect_ratio_name,
880
+ ):
881
+
882
+ alpha_mask = input_image["layers"][0].split()[3]
883
+ input_mask = np.asarray(alpha_mask)
884
+
885
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
886
+ if output_w == "" or output_h == "":
887
+ output_h, output_w = original_image.shape[:2]
888
+ if resize_default:
889
+ short_side = min(output_w, output_h)
890
+ scale_ratio = 640 / short_side
891
+ output_w = int(output_w * scale_ratio)
892
+ output_h = int(output_h * scale_ratio)
893
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
894
+ original_image = np.array(original_image)
895
+ if input_mask is not None:
896
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
897
+ input_mask = np.array(input_mask)
898
+ if original_mask is not None:
899
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
900
+ original_mask = np.array(original_mask)
901
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
902
+ else:
903
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
904
+ pass
905
+ else:
906
+ if resize_default:
907
+ short_side = min(output_w, output_h)
908
+ scale_ratio = 640 / short_side
909
+ output_w = int(output_w * scale_ratio)
910
+ output_h = int(output_h * scale_ratio)
911
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
912
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
913
+ original_image = np.array(original_image)
914
+ if input_mask is not None:
915
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
916
+ input_mask = np.array(input_mask)
917
+ if original_mask is not None:
918
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
919
+ original_mask = np.array(original_mask)
920
+
921
+
922
+ if input_mask.max() == 0:
923
+ original_mask = original_mask
924
+ else:
925
+ original_mask = input_mask
926
+
927
+ if original_mask is None:
928
+ raise gr.Error('Please generate mask first')
929
+
930
+ if original_mask.ndim == 2:
931
+ original_mask = original_mask[:,:,None]
932
+
933
+ dilation_type = np.random.choice(['bounding_box', 'bounding_ellipse'])
934
+ random_mask = random_mask_func(original_mask, dilation_type).squeeze()
935
+
936
+ mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
937
+
938
+ masked_image = original_image * (1 - (random_mask[:,:,None]>0))
939
+ masked_image = masked_image.astype(original_image.dtype)
940
+ masked_image = Image.fromarray(masked_image)
941
+
942
+
943
+ return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
944
+
945
+
946
+ def process_dilation_mask(input_image,
947
+ original_image,
948
+ original_mask,
949
+ resize_default,
950
+ aspect_ratio_name,
951
+ dilation_size=20):
952
+
953
+ alpha_mask = input_image["layers"][0].split()[3]
954
+ input_mask = np.asarray(alpha_mask)
955
+
956
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
957
+ if output_w == "" or output_h == "":
958
+ output_h, output_w = original_image.shape[:2]
959
+ if resize_default:
960
+ short_side = min(output_w, output_h)
961
+ scale_ratio = 640 / short_side
962
+ output_w = int(output_w * scale_ratio)
963
+ output_h = int(output_h * scale_ratio)
964
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
965
+ original_image = np.array(original_image)
966
+ if input_mask is not None:
967
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
968
+ input_mask = np.array(input_mask)
969
+ if original_mask is not None:
970
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
971
+ original_mask = np.array(original_mask)
972
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
973
+ else:
974
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
975
+ pass
976
+ else:
977
+ if resize_default:
978
+ short_side = min(output_w, output_h)
979
+ scale_ratio = 640 / short_side
980
+ output_w = int(output_w * scale_ratio)
981
+ output_h = int(output_h * scale_ratio)
982
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
983
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
984
+ original_image = np.array(original_image)
985
+ if input_mask is not None:
986
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
987
+ input_mask = np.array(input_mask)
988
+ if original_mask is not None:
989
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
990
+ original_mask = np.array(original_mask)
991
+
992
+ if input_mask.max() == 0:
993
+ original_mask = original_mask
994
+ else:
995
+ original_mask = input_mask
996
+
997
+ if original_mask is None:
998
+ raise gr.Error('Please generate mask first')
999
+
1000
+ if original_mask.ndim == 2:
1001
+ original_mask = original_mask[:,:,None]
1002
+
1003
+ dilation_type = np.random.choice(['square_dilation'])
1004
+ random_mask = random_mask_func(original_mask, dilation_type, dilation_size).squeeze()
1005
+
1006
+ mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
1007
+
1008
+ masked_image = original_image * (1 - (random_mask[:,:,None]>0))
1009
+ masked_image = masked_image.astype(original_image.dtype)
1010
+ masked_image = Image.fromarray(masked_image)
1011
+
1012
+ return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
1013
+
1014
+
1015
+ def process_erosion_mask(input_image,
1016
+ original_image,
1017
+ original_mask,
1018
+ resize_default,
1019
+ aspect_ratio_name,
1020
+ dilation_size=20):
1021
+ alpha_mask = input_image["layers"][0].split()[3]
1022
+ input_mask = np.asarray(alpha_mask)
1023
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
1024
+ if output_w == "" or output_h == "":
1025
+ output_h, output_w = original_image.shape[:2]
1026
+ if resize_default:
1027
+ short_side = min(output_w, output_h)
1028
+ scale_ratio = 640 / short_side
1029
+ output_w = int(output_w * scale_ratio)
1030
+ output_h = int(output_h * scale_ratio)
1031
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1032
+ original_image = np.array(original_image)
1033
+ if input_mask is not None:
1034
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1035
+ input_mask = np.array(input_mask)
1036
+ if original_mask is not None:
1037
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1038
+ original_mask = np.array(original_mask)
1039
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1040
+ else:
1041
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1042
+ pass
1043
+ else:
1044
+ if resize_default:
1045
+ short_side = min(output_w, output_h)
1046
+ scale_ratio = 640 / short_side
1047
+ output_w = int(output_w * scale_ratio)
1048
+ output_h = int(output_h * scale_ratio)
1049
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1050
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1051
+ original_image = np.array(original_image)
1052
+ if input_mask is not None:
1053
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1054
+ input_mask = np.array(input_mask)
1055
+ if original_mask is not None:
1056
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1057
+ original_mask = np.array(original_mask)
1058
+
1059
+ if input_mask.max() == 0:
1060
+ original_mask = original_mask
1061
+ else:
1062
+ original_mask = input_mask
1063
+
1064
+ if original_mask is None:
1065
+ raise gr.Error('Please generate mask first')
1066
+
1067
+ if original_mask.ndim == 2:
1068
+ original_mask = original_mask[:,:,None]
1069
+
1070
+ dilation_type = np.random.choice(['square_erosion'])
1071
+ random_mask = random_mask_func(original_mask, dilation_type, dilation_size).squeeze()
1072
+
1073
+ mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
1074
+
1075
+ masked_image = original_image * (1 - (random_mask[:,:,None]>0))
1076
+ masked_image = masked_image.astype(original_image.dtype)
1077
+ masked_image = Image.fromarray(masked_image)
1078
+
1079
+
1080
+ return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
1081
+
1082
+
1083
+ def move_mask_left(input_image,
1084
+ original_image,
1085
+ original_mask,
1086
+ moving_pixels,
1087
+ resize_default,
1088
+ aspect_ratio_name):
1089
+
1090
+ alpha_mask = input_image["layers"][0].split()[3]
1091
+ input_mask = np.asarray(alpha_mask)
1092
+
1093
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
1094
+ if output_w == "" or output_h == "":
1095
+ output_h, output_w = original_image.shape[:2]
1096
+ if resize_default:
1097
+ short_side = min(output_w, output_h)
1098
+ scale_ratio = 640 / short_side
1099
+ output_w = int(output_w * scale_ratio)
1100
+ output_h = int(output_h * scale_ratio)
1101
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1102
+ original_image = np.array(original_image)
1103
+ if input_mask is not None:
1104
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1105
+ input_mask = np.array(input_mask)
1106
+ if original_mask is not None:
1107
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1108
+ original_mask = np.array(original_mask)
1109
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1110
+ else:
1111
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1112
+ pass
1113
+ else:
1114
+ if resize_default:
1115
+ short_side = min(output_w, output_h)
1116
+ scale_ratio = 640 / short_side
1117
+ output_w = int(output_w * scale_ratio)
1118
+ output_h = int(output_h * scale_ratio)
1119
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1120
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1121
+ original_image = np.array(original_image)
1122
+ if input_mask is not None:
1123
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1124
+ input_mask = np.array(input_mask)
1125
+ if original_mask is not None:
1126
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1127
+ original_mask = np.array(original_mask)
1128
+
1129
+ if input_mask.max() == 0:
1130
+ original_mask = original_mask
1131
+ else:
1132
+ original_mask = input_mask
1133
+
1134
+ if original_mask is None:
1135
+ raise gr.Error('Please generate mask first')
1136
+
1137
+ if original_mask.ndim == 2:
1138
+ original_mask = original_mask[:,:,None]
1139
+
1140
+ moved_mask = move_mask_func(original_mask, 'left', int(moving_pixels)).squeeze()
1141
+ mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
1142
+
1143
+ masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
1144
+ masked_image = masked_image.astype(original_image.dtype)
1145
+ masked_image = Image.fromarray(masked_image)
1146
+
1147
+ if moved_mask.max() <= 1:
1148
+ moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
1149
+ original_mask = moved_mask
1150
+ return [masked_image], [mask_image], original_mask.astype(np.uint8)
1151
+
1152
+
1153
+ def move_mask_right(input_image,
1154
+ original_image,
1155
+ original_mask,
1156
+ moving_pixels,
1157
+ resize_default,
1158
+ aspect_ratio_name):
1159
+ alpha_mask = input_image["layers"][0].split()[3]
1160
+ input_mask = np.asarray(alpha_mask)
1161
+
1162
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
1163
+ if output_w == "" or output_h == "":
1164
+ output_h, output_w = original_image.shape[:2]
1165
+ if resize_default:
1166
+ short_side = min(output_w, output_h)
1167
+ scale_ratio = 640 / short_side
1168
+ output_w = int(output_w * scale_ratio)
1169
+ output_h = int(output_h * scale_ratio)
1170
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1171
+ original_image = np.array(original_image)
1172
+ if input_mask is not None:
1173
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1174
+ input_mask = np.array(input_mask)
1175
+ if original_mask is not None:
1176
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1177
+ original_mask = np.array(original_mask)
1178
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1179
+ else:
1180
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1181
+ pass
1182
+ else:
1183
+ if resize_default:
1184
+ short_side = min(output_w, output_h)
1185
+ scale_ratio = 640 / short_side
1186
+ output_w = int(output_w * scale_ratio)
1187
+ output_h = int(output_h * scale_ratio)
1188
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1189
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1190
+ original_image = np.array(original_image)
1191
+ if input_mask is not None:
1192
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1193
+ input_mask = np.array(input_mask)
1194
+ if original_mask is not None:
1195
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1196
+ original_mask = np.array(original_mask)
1197
+
1198
+ if input_mask.max() == 0:
1199
+ original_mask = original_mask
1200
+ else:
1201
+ original_mask = input_mask
1202
+
1203
+ if original_mask is None:
1204
+ raise gr.Error('Please generate mask first')
1205
+
1206
+ if original_mask.ndim == 2:
1207
+ original_mask = original_mask[:,:,None]
1208
+
1209
+ moved_mask = move_mask_func(original_mask, 'right', int(moving_pixels)).squeeze()
1210
+
1211
+ mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
1212
+
1213
+ masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
1214
+ masked_image = masked_image.astype(original_image.dtype)
1215
+ masked_image = Image.fromarray(masked_image)
1216
+
1217
+
1218
+ if moved_mask.max() <= 1:
1219
+ moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
1220
+ original_mask = moved_mask
1221
+
1222
+ return [masked_image], [mask_image], original_mask.astype(np.uint8)
1223
+
1224
+
1225
+ def move_mask_up(input_image,
1226
+ original_image,
1227
+ original_mask,
1228
+ moving_pixels,
1229
+ resize_default,
1230
+ aspect_ratio_name):
1231
+ alpha_mask = input_image["layers"][0].split()[3]
1232
+ input_mask = np.asarray(alpha_mask)
1233
+
1234
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
1235
+ if output_w == "" or output_h == "":
1236
+ output_h, output_w = original_image.shape[:2]
1237
+ if resize_default:
1238
+ short_side = min(output_w, output_h)
1239
+ scale_ratio = 640 / short_side
1240
+ output_w = int(output_w * scale_ratio)
1241
+ output_h = int(output_h * scale_ratio)
1242
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1243
+ original_image = np.array(original_image)
1244
+ if input_mask is not None:
1245
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1246
+ input_mask = np.array(input_mask)
1247
+ if original_mask is not None:
1248
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1249
+ original_mask = np.array(original_mask)
1250
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1251
+ else:
1252
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1253
+ pass
1254
+ else:
1255
+ if resize_default:
1256
+ short_side = min(output_w, output_h)
1257
+ scale_ratio = 640 / short_side
1258
+ output_w = int(output_w * scale_ratio)
1259
+ output_h = int(output_h * scale_ratio)
1260
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1261
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1262
+ original_image = np.array(original_image)
1263
+ if input_mask is not None:
1264
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1265
+ input_mask = np.array(input_mask)
1266
+ if original_mask is not None:
1267
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1268
+ original_mask = np.array(original_mask)
1269
+
1270
+ if input_mask.max() == 0:
1271
+ original_mask = original_mask
1272
+ else:
1273
+ original_mask = input_mask
1274
+
1275
+ if original_mask is None:
1276
+ raise gr.Error('Please generate mask first')
1277
+
1278
+ if original_mask.ndim == 2:
1279
+ original_mask = original_mask[:,:,None]
1280
+
1281
+ moved_mask = move_mask_func(original_mask, 'up', int(moving_pixels)).squeeze()
1282
+ mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
1283
+
1284
+ masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
1285
+ masked_image = masked_image.astype(original_image.dtype)
1286
+ masked_image = Image.fromarray(masked_image)
1287
+
1288
+ if moved_mask.max() <= 1:
1289
+ moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
1290
+ original_mask = moved_mask
1291
+
1292
+ return [masked_image], [mask_image], original_mask.astype(np.uint8)
1293
+
1294
+
1295
+ def move_mask_down(input_image,
1296
+ original_image,
1297
+ original_mask,
1298
+ moving_pixels,
1299
+ resize_default,
1300
+ aspect_ratio_name):
1301
+ alpha_mask = input_image["layers"][0].split()[3]
1302
+ input_mask = np.asarray(alpha_mask)
1303
+ output_w, output_h = aspect_ratios[aspect_ratio_name]
1304
+ if output_w == "" or output_h == "":
1305
+ output_h, output_w = original_image.shape[:2]
1306
+ if resize_default:
1307
+ short_side = min(output_w, output_h)
1308
+ scale_ratio = 640 / short_side
1309
+ output_w = int(output_w * scale_ratio)
1310
+ output_h = int(output_h * scale_ratio)
1311
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1312
+ original_image = np.array(original_image)
1313
+ if input_mask is not None:
1314
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1315
+ input_mask = np.array(input_mask)
1316
+ if original_mask is not None:
1317
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1318
+ original_mask = np.array(original_mask)
1319
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1320
+ else:
1321
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1322
+ pass
1323
+ else:
1324
+ if resize_default:
1325
+ short_side = min(output_w, output_h)
1326
+ scale_ratio = 640 / short_side
1327
+ output_w = int(output_w * scale_ratio)
1328
+ output_h = int(output_h * scale_ratio)
1329
+ gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
1330
+ original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
1331
+ original_image = np.array(original_image)
1332
+ if input_mask is not None:
1333
+ input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
1334
+ input_mask = np.array(input_mask)
1335
+ if original_mask is not None:
1336
+ original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
1337
+ original_mask = np.array(original_mask)
1338
+
1339
+ if input_mask.max() == 0:
1340
+ original_mask = original_mask
1341
+ else:
1342
+ original_mask = input_mask
1343
+
1344
+ if original_mask is None:
1345
+ raise gr.Error('Please generate mask first')
1346
+
1347
+ if original_mask.ndim == 2:
1348
+ original_mask = original_mask[:,:,None]
1349
+
1350
+ moved_mask = move_mask_func(original_mask, 'down', int(moving_pixels)).squeeze()
1351
+ mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
1352
+
1353
+ masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
1354
+ masked_image = masked_image.astype(original_image.dtype)
1355
+ masked_image = Image.fromarray(masked_image)
1356
+
1357
+ if moved_mask.max() <= 1:
1358
+ moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
1359
+ original_mask = moved_mask
1360
+
1361
+ return [masked_image], [mask_image], original_mask.astype(np.uint8)
1362
+
1363
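The four `move_mask_*` callbacks above differ only in the direction string they pass to `move_mask_func`. A minimal sketch of what such a shift can look like, assuming a zero-padded translation (illustrative only, not the repo's actual `move_mask_func`):

```python
import numpy as np

def shift_mask(mask, direction, pixels):
    # Translate a binary H x W mask by `pixels`, zero-filling the vacated border.
    mask = np.squeeze(mask)
    if pixels <= 0:
        return mask.copy()
    shifted = np.zeros_like(mask)
    if direction == "left":
        shifted[:, :-pixels] = mask[:, pixels:]
    elif direction == "right":
        shifted[:, pixels:] = mask[:, :-pixels]
    elif direction == "up":
        shifted[:-pixels, :] = mask[pixels:, :]
    elif direction == "down":
        shifted[pixels:, :] = mask[:-pixels, :]
    return shifted
```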
+
1364
+ def invert_mask(input_image,
1365
+ original_image,
1366
+ original_mask,
1367
+ ):
1368
+ alpha_mask = input_image["layers"][0].split()[3]
1369
+ input_mask = np.asarray(alpha_mask)
1370
+ if input_mask.max() == 0:
1371
+ original_mask = 1 - (original_mask>0).astype(np.uint8)
1372
+ else:
1373
+ original_mask = 1 - (input_mask>0).astype(np.uint8)
1374
+
1375
+ if original_mask is None:
1376
+ raise gr.Error('Please generate mask first')
1377
+
1378
+ original_mask = original_mask.squeeze()
1379
+ mask_image = Image.fromarray(original_mask*255).convert("RGB")
1380
+
1381
+ if original_mask.ndim == 2:
1382
+ original_mask = original_mask[:,:,None]
1383
+
1384
+ if original_mask.max() <= 1:
1385
+ original_mask = (original_mask * 255).astype(np.uint8)
1386
+
1387
+ masked_image = original_image * (1 - (original_mask>0))
1388
+ masked_image = masked_image.astype(original_image.dtype)
1389
+ masked_image = Image.fromarray(masked_image)
1390
+
1391
+ return [masked_image], [mask_image], original_mask, True
1392
+
1393
+
1394
+ def init_img(base,
1395
+ init_type,
1396
+ prompt,
1397
+ aspect_ratio,
1398
+ example_change_times
1399
+ ):
1400
+ image_pil = base["background"].convert("RGB")
1401
+ original_image = np.array(image_pil)
1402
+ if max(original_image.shape[0], original_image.shape[1]) * 1.0 / min(original_image.shape[0], original_image.shape[1])>2.0:
1403
+ raise gr.Error('Image aspect ratio cannot be larger than 2.0')
1404
+ if init_type in MASK_IMAGE_PATH.keys() and example_change_times < 2:
1405
+ mask_gallery = [Image.open(MASK_IMAGE_PATH[init_type]).convert("L")]
1406
+ masked_gallery = [Image.open(MASKED_IMAGE_PATH[init_type]).convert("RGB")]
1407
+ result_gallery = [Image.open(OUTPUT_IMAGE_PATH[init_type]).convert("RGB")]
1408
+ width, height = image_pil.size
1409
+ image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
1410
+ height_new, width_new = image_processor.get_default_height_width(image_pil, height, width)
1411
+ image_pil = image_pil.resize((width_new, height_new))
1412
+ mask_gallery[0] = mask_gallery[0].resize((width_new, height_new))
1413
+ masked_gallery[0] = masked_gallery[0].resize((width_new, height_new))
1414
+ result_gallery[0] = result_gallery[0].resize((width_new, height_new))
1415
+ original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
1416
+ return base, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, "", "", "", "Custom resolution", False, False, example_change_times
1417
+ else:
1418
+ return base, original_image, None, "", None, None, None, "", "", "", aspect_ratio, True, False, 0
1419
+
1420
+
1421
+ def reset_func(input_image,
1422
+ original_image,
1423
+ original_mask,
1424
+ prompt,
1425
+ target_prompt,
1426
+ target_prompt_output):
1427
+ input_image = None
1428
+ original_image = None
1429
+ original_mask = None
1430
+ prompt = ''
1431
+ mask_gallery = []
1432
+ masked_gallery = []
1433
+ result_gallery = []
1434
+ target_prompt = ''
1435
+ target_prompt_output = ''
1436
+ if torch.cuda.is_available():
1437
+ torch.cuda.empty_cache()
1438
+ return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, target_prompt_output, True, False
1439
+
1440
+
1441
+ def update_example(example_type,
1442
+ prompt,
1443
+ example_change_times):
1444
+ input_image = INPUT_IMAGE_PATH[example_type]
1445
+ image_pil = Image.open(input_image).convert("RGB")
1446
+ mask_gallery = [Image.open(MASK_IMAGE_PATH[example_type]).convert("L")]
1447
+ masked_gallery = [Image.open(MASKED_IMAGE_PATH[example_type]).convert("RGB")]
1448
+ result_gallery = [Image.open(OUTPUT_IMAGE_PATH[example_type]).convert("RGB")]
1449
+ width, height = image_pil.size
1450
+ image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
1451
+ height_new, width_new = image_processor.get_default_height_width(image_pil, height, width)
1452
+ image_pil = image_pil.resize((width_new, height_new))
1453
+ mask_gallery[0] = mask_gallery[0].resize((width_new, height_new))
1454
+ masked_gallery[0] = masked_gallery[0].resize((width_new, height_new))
1455
+ result_gallery[0] = result_gallery[0].resize((width_new, height_new))
1456
+
1457
+ original_image = np.array(image_pil)
1458
+ original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
1459
+ aspect_ratio = "Custom resolution"
1460
+ example_change_times += 1
1461
+ return input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, "", "", False, example_change_times
1462
+
1463
+ block = gr.Blocks(
1464
+ theme=gr.themes.Soft(
1465
+ radius_size=gr.themes.sizes.radius_none,
1466
+ text_size=gr.themes.sizes.text_md
1467
+ )
1468
+ ).queue()
1469
+ with block as demo:
1470
+ with gr.Row():
1471
+ with gr.Column():
1472
+ gr.HTML(head)
1473
+
1474
+ gr.Markdown(descriptions)
1475
+
1476
+ with gr.Accordion(label="🧭 Instructions:", open=True, elem_id="accordion"):
1477
+ with gr.Row(equal_height=True):
1478
+ gr.Markdown(instructions)
1479
+
1480
+ original_image = gr.State(value=None)
1481
+ original_mask = gr.State(value=None)
1482
+ category = gr.State(value=None)
1483
+ status = gr.State(value=None)
1484
+ invert_mask_state = gr.State(value=False)
1485
+ example_change_times = gr.State(value=0)
1486
+
1487
+
1488
+ with gr.Row():
1489
+ with gr.Column():
1490
+ with gr.Row():
1491
+ input_image = gr.ImageEditor(
1492
+ label="Input Image",
1493
+ type="pil",
1494
+ brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
1495
+ layers=False,
1496
+ interactive=True,
1497
+ height=1024,
1498
+ sources=["upload"],
1499
+ )
1500
+
1501
+
1502
+ vlm_model_dropdown = gr.Dropdown(label="VLM model", choices=VLM_MODEL_NAMES, value=DEFAULT_VLM_MODEL_NAME, interactive=True)
1503
+ with gr.Group():
1504
+ with gr.Row():
1505
+ GPT4o_KEY = gr.Textbox(label="GPT4o API Key", placeholder="Please input your GPT4o API Key when using the GPT4o VLM (highly recommended).", value="", lines=1)
1506
+
1507
+ GPT4o_KEY_submit = gr.Button("Submit and Verify")
1508
+
1509
+
1510
+ aspect_ratio = gr.Dropdown(label="Output aspect ratio", choices=ASPECT_RATIO_LABELS, value=DEFAULT_ASPECT_RATIO)
1511
+ resize_default = gr.Checkbox(label="Short edge resize to 640px", value=True)
1512
+
1513
+
1514
+ prompt = gr.Textbox(label="⌨️ Instruction", placeholder="Please input your instruction.", value="", lines=1)
1515
+
1516
+ run_button = gr.Button("💫 Run")
1517
+
1518
+
1519
+ with gr.Row():
1520
+ mask_button = gr.Button("Generate Mask")
1521
+ random_mask_button = gr.Button("Square/Circle Mask")
1522
+
1523
+
1524
+ with gr.Row():
1525
+ generate_target_prompt_button = gr.Button("Generate Target Prompt")
1526
+
1527
+ target_prompt = gr.Text(
1528
+ label="Input Target Prompt",
1529
+ max_lines=5,
1530
+ placeholder="VLM-generated target prompt, you can first generate if and then modify it (optional)",
1531
+ value='',
1532
+ lines=2
1533
+ )
1534
+
1535
+ with gr.Accordion("Advanced Options", open=False, elem_id="accordion1"):
1536
+ base_model_dropdown = gr.Dropdown(label="Base model", choices=BASE_MODELS, value=DEFAULT_BASE_MODEL, interactive=True)
1537
+ negative_prompt = gr.Text(
1538
+ label="Negative Prompt",
1539
+ max_lines=5,
1540
+ placeholder="Please input your negative prompt",
1541
+ value='ugly, low quality', lines=1
1542
+ )
1543
+
1544
+ control_strength = gr.Slider(
1545
+ label="Control Strength: ", show_label=True, minimum=0, maximum=1.1, value=1, step=0.01
1546
+ )
1547
+ with gr.Group():
1548
+ seed = gr.Slider(
1549
+ label="Seed: ", minimum=0, maximum=2147483647, step=1, value=648464818
1550
+ )
1551
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
1552
+
1553
+ blending = gr.Checkbox(label="Blending mode", value=True)
1554
+
1555
+
1556
+ num_samples = gr.Slider(
1557
+ label="Num samples", minimum=0, maximum=4, step=1, value=4
1558
+ )
1559
+
1560
+ with gr.Group():
1561
+ with gr.Row():
1562
+ guidance_scale = gr.Slider(
1563
+ label="Guidance scale",
1564
+ minimum=1,
1565
+ maximum=12,
1566
+ step=0.1,
1567
+ value=7.5,
1568
+ )
1569
+ num_inference_steps = gr.Slider(
1570
+ label="Number of inference steps",
1571
+ minimum=1,
1572
+ maximum=50,
1573
+ step=1,
1574
+ value=50,
1575
+ )
1576
+
1577
+
1578
+ with gr.Column():
1579
+ with gr.Row():
1580
+ with gr.Tab(elem_classes="feedback", label="Masked Image"):
1581
+ masked_gallery = gr.Gallery(label='Masked Image', show_label=True, elem_id="gallery", preview=True, height=360)
1582
+ with gr.Tab(elem_classes="feedback", label="Mask"):
1583
+ mask_gallery = gr.Gallery(label='Mask', show_label=True, elem_id="gallery", preview=True, height=360)
1584
+
1585
+ invert_mask_button = gr.Button("Invert Mask")
1586
+ dilation_size = gr.Slider(
1587
+ label="Dilation size: ", minimum=0, maximum=50, step=1, value=20
1588
+ )
1589
+ with gr.Row():
1590
+ dilation_mask_button = gr.Button("Dilate Generated Mask")
1591
+ erosion_mask_button = gr.Button("Erode Generated Mask")
1592
+
1593
+ moving_pixels = gr.Slider(
1594
+ label="Moving pixels:", show_label=True, minimum=0, maximum=50, value=4, step=1
1595
+ )
1596
+ with gr.Row():
1597
+ move_left_button = gr.Button("Move Left")
1598
+ move_right_button = gr.Button("Move Right")
1599
+ with gr.Row():
1600
+ move_up_button = gr.Button("Move Up")
1601
+ move_down_button = gr.Button("Move Down")
1602
+
1603
+ with gr.Tab(elem_classes="feedback", label="Output"):
1604
+ result_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", preview=True, height=400)
1605
+
1606
+ target_prompt_output = gr.Text(label="Output Target Prompt", value="", lines=1, interactive=False)
1607
+
1608
+ reset_button = gr.Button("Reset")
1609
+
1610
+ init_type = gr.Textbox(label="Init Name", value="", visible=False)
1611
+ example_type = gr.Textbox(label="Example Name", value="", visible=False)
1612
+
1613
+
1614
+
1615
+ with gr.Row():
1616
+ example = gr.Examples(
1617
+ label="Quick Example",
1618
+ examples=EXAMPLES,
1619
+ inputs=[input_image, prompt, seed, init_type, example_type, blending, resize_default, vlm_model_dropdown],
1620
+ examples_per_page=10,
1621
+ cache_examples=False,
1622
+ )
1623
+
1624
+
1625
+ with gr.Accordion(label="🎬 Feature Details:", open=True, elem_id="accordion"):
1626
+ with gr.Row(equal_height=True):
1627
+ gr.Markdown(tips)
1628
+
1629
+ with gr.Row():
1630
+ gr.Markdown(citation)
1631
+
1632
+ ## gr.Examples cannot be used to update the gr.Gallery, so the following two callbacks update the galleries instead.
1633
+ ## They also resolve the conflict between the image-upload and example-change handlers.
1634
+ input_image.upload(
1635
+ init_img,
1636
+ [input_image, init_type, prompt, aspect_ratio, example_change_times],
1637
+ [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, target_prompt_output, init_type, aspect_ratio, resize_default, invert_mask_state, example_change_times]
1638
+ )
1639
+ example_type.change(fn=update_example, inputs=[example_type, prompt, example_change_times], outputs=[input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, target_prompt, target_prompt_output, invert_mask_state, example_change_times])
1640
+
1641
+ ## vlm and base model dropdown
1642
+ vlm_model_dropdown.change(fn=update_vlm_model, inputs=[vlm_model_dropdown], outputs=[status])
1643
+ base_model_dropdown.change(fn=update_base_model, inputs=[base_model_dropdown], outputs=[status])
1644
+
1645
+
1646
+ GPT4o_KEY_submit.click(fn=submit_GPT4o_KEY, inputs=[GPT4o_KEY], outputs=[GPT4o_KEY, vlm_model_dropdown])
1647
+ invert_mask_button.click(fn=invert_mask, inputs=[input_image, original_image, original_mask], outputs=[masked_gallery, mask_gallery, original_mask, invert_mask_state])
1648
+
1649
+
1650
+ ips=[input_image,
1651
+ original_image,
1652
+ original_mask,
1653
+ prompt,
1654
+ negative_prompt,
1655
+ control_strength,
1656
+ seed,
1657
+ randomize_seed,
1658
+ guidance_scale,
1659
+ num_inference_steps,
1660
+ num_samples,
1661
+ blending,
1662
+ category,
1663
+ target_prompt,
1664
+ resize_default,
1665
+ aspect_ratio,
1666
+ invert_mask_state]
1667
+
1668
+ ## run brushedit
1669
+ run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, prompt, target_prompt, target_prompt_output, invert_mask_state])
1670
+
1671
+ ## mask func
1672
+ mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask, category])
1673
+ random_mask_button.click(fn=process_random_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
1674
+ dilation_mask_button.click(fn=process_dilation_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio, dilation_size], outputs=[ masked_gallery, mask_gallery, original_mask])
1675
+ erosion_mask_button.click(fn=process_erosion_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio, dilation_size], outputs=[ masked_gallery, mask_gallery, original_mask])
1676
+
1677
+ ## move mask func
1678
+ move_left_button.click(fn=move_mask_left, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
1679
+ move_right_button.click(fn=move_mask_right, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
1680
+ move_up_button.click(fn=move_mask_up, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
1681
+ move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
1682
+
1683
+ ## prompt func
1684
+ generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt, target_prompt_output])
1685
+
1686
+ ## reset func
1687
+ reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt, target_prompt_output], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, target_prompt_output, resize_default, invert_mask_state])
1688
+
1689
+
1690
+ demo.launch(server_name="0.0.0.0", server_port=12345, share=False)
app/{gpt4_o → src}/vlm_pipeline.py RENAMED
@@ -7,13 +7,28 @@ from io import BytesIO
7
  import numpy as np
8
  import gradio as gr
9
 
 
 
 
10
 
11
  from app.gpt4_o.instructions import (
12
- create_editing_category_messages,
13
- create_ori_object_messages,
14
- create_add_object_messages,
15
- create_apply_editing_messages)
16
-
 
 
 
 
 
 
 
 
 
 
 
 
17
  from app.utils.utils import run_grounded_sam
18
 
19
 
@@ -25,46 +40,96 @@ def encode_image(img):
25
  return base64.b64encode(img_bytes).decode('utf-8')
26
 
27
 
28
- def run_gpt4o_vl_inference(vlm,
29
  messages):
30
- response = vlm.chat.completions.create(
31
  model="gpt-4o-2024-08-06",
32
  messages=messages
33
  )
34
  response_str = response.choices[0].message.content
35
  return response_str
36
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- def vlm_response_editing_type(vlm,
39
- image,
40
- editing_prompt):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- base64_image = encode_image(image)
43
-
44
- messages = create_editing_category_messages(editing_prompt)
45
-
46
- response_str = run_gpt4o_vl_inference(vlm, messages)
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  for category_name in ["Addition","Remove","Local","Global","Background"]:
49
  if category_name.lower() in response_str.lower():
50
  return category_name
51
- raise ValueError("Please input correct commands, including add, delete, and modify commands.")
52
 
53
 
54
- def vlm_response_object_wait_for_edit(vlm,
 
 
 
55
  category,
56
- editing_prompt):
 
57
  if category in ["Background", "Global", "Addition"]:
58
  edit_object = "nan"
59
  return edit_object
60
 
61
- messages = create_ori_object_messages(editing_prompt)
62
-
63
- response_str = run_gpt4o_vl_inference(vlm, messages)
 
 
 
 
 
 
64
  return response_str
65
 
66
 
67
- def vlm_response_mask(vlm,
 
 
68
  category,
69
  image,
70
  editing_prompt,
@@ -73,16 +138,25 @@ def vlm_response_mask(vlm,
73
  sam_predictor=None,
74
  sam_automask_generator=None,
75
  groundingdino_model=None,
 
76
  ):
77
  mask = None
78
  if editing_prompt is None or len(editing_prompt)==0:
79
  raise gr.Error("Please input the editing instruction!")
80
  height, width = image.shape[:2]
81
  if category=="Addition":
82
- base64_image = encode_image(image)
83
- messages = create_add_object_messages(editing_prompt, base64_image, height=height, width=width)
84
  try:
85
- response_str = run_gpt4o_vl_inference(vlm, messages)
 
 
 
 
 
 
 
 
 
 
86
  pattern = r'\[\d{1,3}(?:,\s*\d{1,3}){3}\]'
87
  box = re.findall(pattern, response_str)
88
  box = box[0][1:-1].split(",")
@@ -92,7 +166,7 @@ def vlm_response_mask(vlm,
92
  cus_mask[box[1]: box[1]+box[3], box[0]: box[0]+box[2]]=255
93
  mask = cus_mask
94
  except:
95
- raise gr.Error("Please set the mask manually, MLLM cannot output the mask!")
96
 
97
  elif category=="Background":
98
  labels = "background"
@@ -104,7 +178,6 @@ def vlm_response_mask(vlm,
104
  if mask is None:
105
  for thresh in [0.3,0.25,0.2,0.15,0.1,0.05,0]:
106
  try:
107
- device = "cuda" if torch.cuda.is_available() else "cpu"
108
  detections = run_grounded_sam(
109
  input_image={"image":Image.fromarray(image.astype('uint8')),
110
  "mask":None},
@@ -128,11 +201,22 @@ def vlm_response_mask(vlm,
128
  return mask
129
 
130
 
131
- def vlm_response_prompt_after_apply_instruction(vlm,
 
132
  image,
133
- editing_prompt):
134
- base64_image = encode_image(image)
135
- messages = create_apply_editing_messages(editing_prompt, base64_image)
136
-
137
- response_str = run_gpt4o_vl_inference(vlm, messages)
138
- return response_str
 
 
 
 
 
 
 
 
 
 
 
7
  import numpy as np
8
  import gradio as gr
9
 
10
+ from openai import OpenAI
11
+ from transformers import (LlavaNextForConditionalGeneration, Qwen2VLForConditionalGeneration)
12
+ from qwen_vl_utils import process_vision_info
13
 
14
  from app.gpt4_o.instructions import (
15
+ create_editing_category_messages_gpt4o,
16
+ create_ori_object_messages_gpt4o,
17
+ create_add_object_messages_gpt4o,
18
+ create_apply_editing_messages_gpt4o)
19
+
20
+ from app.llava.instructions import (
21
+ create_editing_category_messages_llava,
22
+ create_ori_object_messages_llava,
23
+ create_add_object_messages_llava,
24
+ create_apply_editing_messages_llava)
25
+
26
+ from app.qwen2.instructions import (
27
+ create_editing_category_messages_qwen2,
28
+ create_ori_object_messages_qwen2,
29
+ create_add_object_messages_qwen2,
30
+ create_apply_editing_messages_qwen2)
31
+
32
  from app.utils.utils import run_grounded_sam
33
 
34
 
 
40
  return base64.b64encode(img_bytes).decode('utf-8')
41
 
42
 
43
+ def run_gpt4o_vl_inference(vlm_model,
44
  messages):
45
+ response = vlm_model.chat.completions.create(
46
  model="gpt-4o-2024-08-06",
47
  messages=messages
48
  )
49
  response_str = response.choices[0].message.content
50
  return response_str
51
 
52
+ def run_llava_next_inference(vlm_processor, vlm_model, messages, image, device="cuda"):
53
+ prompt = vlm_processor.apply_chat_template(messages, add_generation_prompt=True)
54
+ inputs = vlm_processor(images=image, text=prompt, return_tensors="pt").to(device)
55
+ output = vlm_model.generate(**inputs, max_new_tokens=200)
56
+ generated_ids_trimmed = [
57
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)
58
+ ]
59
+ response_str = vlm_processor.decode(generated_ids_trimmed[0], skip_special_tokens=True)
60
+
61
+ return response_str
62
 
63
+ def run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device="cuda"):
64
+ text = vlm_processor.apply_chat_template(
65
+ messages, tokenize=False, add_generation_prompt=True
66
+ )
67
+ image_inputs, video_inputs = process_vision_info(messages)
68
+ inputs = vlm_processor(
69
+ text=[text],
70
+ images=image_inputs,
71
+ videos=video_inputs,
72
+ padding=True,
73
+ return_tensors="pt",
74
+ )
75
+ inputs = inputs.to(device)
76
+ generated_ids = vlm_model.generate(**inputs, max_new_tokens=128)
77
+ generated_ids_trimmed = [
78
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
79
+ ]
80
+ response_str = vlm_processor.decode(generated_ids_trimmed[0], skip_special_tokens=True)
81
+ return response_str
82
 
 
 
 
 
 
83
 
84
+ ### response editing type
85
+ def vlm_response_editing_type(vlm_processor,
86
+ vlm_model,
87
+ image,
88
+ editing_prompt,
89
+ device):
90
+
91
+ if isinstance(vlm_model, OpenAI):
92
+ messages = create_editing_category_messages_gpt4o(editing_prompt)
93
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
94
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
95
+ messages = create_editing_category_messages_llava(editing_prompt)
96
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device=device)
97
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
98
+ messages = create_editing_category_messages_qwen2(editing_prompt)
99
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device=device)
100
+
101
  for category_name in ["Addition","Remove","Local","Global","Background"]:
102
  if category_name.lower() in response_str.lower():
103
  return category_name
104
+ raise gr.Error("Please input correct commands, including add, delete, and modify commands. If it still does not work, please switch to a more powerful VLM.")
105
 
106
 
107
+ ### response object to be edited
108
+ def vlm_response_object_wait_for_edit(vlm_processor,
109
+ vlm_model,
110
+ image,
111
  category,
112
+ editing_prompt,
113
+ device):
114
  if category in ["Background", "Global", "Addition"]:
115
  edit_object = "nan"
116
  return edit_object
117
 
118
+ if isinstance(vlm_model, OpenAI):
119
+ messages = create_ori_object_messages_gpt4o(editing_prompt)
120
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
121
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
122
+ messages = create_ori_object_messages_llava(editing_prompt)
123
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image , device)
124
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
125
+ messages = create_ori_object_messages_qwen2(editing_prompt)
126
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
127
  return response_str
128
 
129
 
130
+ ### response mask
131
+ def vlm_response_mask(vlm_processor,
132
+ vlm_model,
133
  category,
134
  image,
135
  editing_prompt,
 
138
  sam_predictor=None,
139
  sam_automask_generator=None,
140
  groundingdino_model=None,
141
+ device=None,
142
  ):
143
  mask = None
144
  if editing_prompt is None or len(editing_prompt)==0:
145
  raise gr.Error("Please input the editing instruction!")
146
  height, width = image.shape[:2]
147
  if category=="Addition":
 
 
148
  try:
149
+ if isinstance(vlm_model, OpenAI):
150
+ base64_image = encode_image(image)
151
+ messages = create_add_object_messages_gpt4o(editing_prompt, base64_image, height=height, width=width)
152
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
153
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
154
+ messages = create_add_object_messages_llava(editing_prompt, height=height, width=width)
155
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device)
156
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
157
+ base64_image = encode_image(image)
158
+ messages = create_add_object_messages_qwen2(editing_prompt, base64_image, height=height, width=width)
159
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
160
  pattern = r'\[\d{1,3}(?:,\s*\d{1,3}){3}\]'
161
  box = re.findall(pattern, response_str)
162
  box = box[0][1:-1].split(",")
 
166
  cus_mask[box[1]: box[1]+box[3], box[0]: box[0]+box[2]]=255
167
  mask = cus_mask
168
  except:
169
+ raise gr.Error("Please set the mask manually, currently the VLM cannot output the mask!")
170
 
171
  elif category=="Background":
172
  labels = "background"
 
178
  if mask is None:
179
  for thresh in [0.3,0.25,0.2,0.15,0.1,0.05,0]:
180
  try:
 
181
  detections = run_grounded_sam(
182
  input_image={"image":Image.fromarray(image.astype('uint8')),
183
  "mask":None},
 
201
  return mask
202
 
203
 
204
+ def vlm_response_prompt_after_apply_instruction(vlm_processor,
205
+ vlm_model,
206
  image,
207
+ editing_prompt,
208
+ device):
209
+ if isinstance(vlm_model, OpenAI):
210
+ base64_image = encode_image(image)
211
+ messages = create_apply_editing_messages_gpt4o(editing_prompt, base64_image)
212
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
213
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
214
+ messages = create_apply_editing_messages_llava(editing_prompt)
215
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device)
216
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
217
+ base64_image = encode_image(image)
218
+ messages = create_apply_editing_messages_qwen2(editing_prompt, base64_image)
219
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
220
+ else:
221
+ raise gr.Error("Please select the correct VLM model!")
222
+ return response_str
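All of the VLM helpers above share the same dispatch pattern: build backend-specific messages, then route to the GPT-4o, LLaVA-NeXT, or Qwen2-VL runner based on the model class. A minimal sketch of how a caller might chain them (a hypothetical `plan_edit` helper; the import path assumes the renamed `app/src/vlm_pipeline.py`):

```python
from app.src.vlm_pipeline import (
    vlm_response_editing_type,
    vlm_response_object_wait_for_edit,
    vlm_response_prompt_after_apply_instruction,
)

def plan_edit(vlm_processor, vlm_model, image, editing_prompt, device="cuda"):
    # 1. Classify the instruction (Addition / Remove / Local / Global / Background).
    category = vlm_response_editing_type(vlm_processor, vlm_model, image, editing_prompt, device)
    # 2. Identify which object the instruction targets ("nan" for global-style edits).
    target_object = vlm_response_object_wait_for_edit(
        vlm_processor, vlm_model, image, category, editing_prompt, device)
    # 3. Ask the VLM for a target prompt describing the edited image.
    target_prompt = vlm_response_prompt_after_apply_instruction(
        vlm_processor, vlm_model, image, editing_prompt, device)
    return category, target_object, target_prompt
```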
app/src/vlm_template.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ from openai import OpenAI
5
+ from transformers import (
6
+ LlavaNextProcessor, LlavaNextForConditionalGeneration,
7
+ Qwen2VLForConditionalGeneration, Qwen2VLProcessor
8
+ )
9
+ ## init device
10
+ device = "cpu"
11
+ torch_dtype = torch.float16
12
+
13
+
14
+ vlms_list = [
15
+ # {
16
+ # "type": "llava-next",
17
+ # "name": "llava-v1.6-mistral-7b-hf",
18
+ # "local_path": "models/vlms/llava-v1.6-mistral-7b-hf",
19
+ # "processor": LlavaNextProcessor.from_pretrained(
20
+ # "models/vlms/llava-v1.6-mistral-7b-hf"
21
+ # ) if os.path.exists("models/vlms/llava-v1.6-mistral-7b-hf") else LlavaNextProcessor.from_pretrained(
22
+ # "llava-hf/llava-v1.6-mistral-7b-hf"
23
+ # ),
24
+ # "model": LlavaNextForConditionalGeneration.from_pretrained(
25
+ # "models/vlms/llava-v1.6-mistral-7b-hf", torch_dtype=torch_dtype, device_map=device
26
+ # ).to("cpu") if os.path.exists("models/vlms/llava-v1.6-mistral-7b-hf") else
27
+ # LlavaNextForConditionalGeneration.from_pretrained(
28
+ # "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch_dtype, device_map=device
29
+ # ).to("cpu"),
30
+ # },
31
+ {
32
+ "type": "llava-next",
33
+ "name": "llama3-llava-next-8b-hf (Preload)",
34
+ "local_path": "models/vlms/llama3-llava-next-8b-hf",
35
+ "processor": LlavaNextProcessor.from_pretrained(
36
+ "models/vlms/llama3-llava-next-8b-hf"
37
+ ) if os.path.exists("models/vlms/llama3-llava-next-8b-hf") else LlavaNextProcessor.from_pretrained(
38
+ "llava-hf/llama3-llava-next-8b-hf"
39
+ ),
40
+ "model": LlavaNextForConditionalGeneration.from_pretrained(
41
+ "models/vlms/llama3-llava-next-8b-hf", torch_dtype=torch_dtype, device_map=device
42
+ ).to("cpu") if os.path.exists("models/vlms/llama3-llava-next-8b-hf") else
43
+ LlavaNextForConditionalGeneration.from_pretrained(
44
+ "llava-hf/llama3-llava-next-8b-hf", torch_dtype=torch_dtype, device_map=device
45
+ ).to("cpu"),
46
+ },
47
+ # {
48
+ # "type": "llava-next",
49
+ # "name": "llava-v1.6-vicuna-13b-hf",
50
+ # "local_path": "models/vlms/llava-v1.6-vicuna-13b-hf",
51
+ # "processor": LlavaNextProcessor.from_pretrained(
52
+ # "models/vlms/llava-v1.6-vicuna-13b-hf"
53
+ # ) if os.path.exists("models/vlms/llava-v1.6-vicuna-13b-hf") else LlavaNextProcessor.from_pretrained(
54
+ # "llava-hf/llava-v1.6-vicuna-13b-hf"
55
+ # ),
56
+ # "model": LlavaNextForConditionalGeneration.from_pretrained(
57
+ # "models/vlms/llava-v1.6-vicuna-13b-hf", torch_dtype=torch_dtype, device_map=device
58
+ # ).to("cpu") if os.path.exists("models/vlms/llava-v1.6-vicuna-13b-hf") else
59
+ # LlavaNextForConditionalGeneration.from_pretrained(
60
+ # "llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype=torch_dtype, device_map=device
61
+ # ).to("cpu"),
62
+ # },
63
+ # {
64
+ # "type": "llava-next",
65
+ # "name": "llava-v1.6-34b-hf",
66
+ # "local_path": "models/vlms/llava-v1.6-34b-hf",
67
+ # "processor": LlavaNextProcessor.from_pretrained(
68
+ # "models/vlms/llava-v1.6-34b-hf"
69
+ # ) if os.path.exists("models/vlms/llava-v1.6-34b-hf") else LlavaNextProcessor.from_pretrained(
70
+ # "llava-hf/llava-v1.6-34b-hf"
71
+ # ),
72
+ # "model": LlavaNextForConditionalGeneration.from_pretrained(
73
+ # "models/vlms/llava-v1.6-34b-hf", torch_dtype=torch_dtype, device_map=device
74
+ # ).to("cpu") if os.path.exists("models/vlms/llava-v1.6-34b-hf") else
75
+ # LlavaNextForConditionalGeneration.from_pretrained(
76
+ # "llava-hf/llava-v1.6-34b-hf", torch_dtype=torch_dtype, device_map=device
77
+ # ).to("cpu"),
78
+ # },
79
+ # {
80
+ # "type": "qwen2-vl",
81
+ # "name": "Qwen2-VL-2B-Instruct",
82
+ # "local_path": "models/vlms/Qwen2-VL-2B-Instruct",
83
+ # "processor": Qwen2VLProcessor.from_pretrained(
84
+ # "models/vlms/Qwen2-VL-2B-Instruct"
85
+ # ) if os.path.exists("models/vlms/Qwen2-VL-2B-Instruct") else Qwen2VLProcessor.from_pretrained(
86
+ # "Qwen/Qwen2-VL-2B-Instruct"
87
+ # ),
88
+ # "model": Qwen2VLForConditionalGeneration.from_pretrained(
89
+ # "models/vlms/Qwen2-VL-2B-Instruct", torch_dtype=torch_dtype, device_map=device
90
+ # ).to("cpu") if os.path.exists("models/vlms/Qwen2-VL-2B-Instruct") else
91
+ # Qwen2VLForConditionalGeneration.from_pretrained(
92
+ # "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch_dtype, device_map=device
93
+ # ).to("cpu"),
94
+ # },
95
+ {
96
+ "type": "qwen2-vl",
97
+ "name": "Qwen2-VL-7B-Instruct (Default)",
98
+ "local_path": "models/vlms/Qwen2-VL-7B-Instruct",
99
+ "processor": Qwen2VLProcessor.from_pretrained(
100
+ "models/vlms/Qwen2-VL-7B-Instruct"
101
+ ) if os.path.exists("models/vlms/Qwen2-VL-7B-Instruct") else Qwen2VLProcessor.from_pretrained(
102
+ "Qwen/Qwen2-VL-7B-Instruct"
103
+ ),
104
+ "model": Qwen2VLForConditionalGeneration.from_pretrained(
105
+ "models/vlms/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device
106
+ ).to("cpu") if os.path.exists("models/vlms/Qwen2-VL-7B-Instruct") else
107
+ Qwen2VLForConditionalGeneration.from_pretrained(
108
+ "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device
109
+ ).to("cpu"),
110
+ },
111
+ {
112
+ "type": "openai",
113
+ "name": "GPT4-o (Highly Recommended)",
114
+ "local_path": "",
115
+ "processor": "",
116
+ "model": ""
117
+ },
118
+ ]
119
+
120
+ vlms_template = {k["name"]: (k["type"], k["local_path"], k["processor"], k["model"]) for k in vlms_list}
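`vlms_template` maps each display name to a `(type, local_path, processor, model)` tuple, which is presumably how the demo's VLM dropdown resolves a selection. A minimal lookup sketch, assuming the default Qwen2-VL entry stays enabled:

```python
from app.src.vlm_template import vlms_template

vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[
    "Qwen2-VL-7B-Instruct (Default)"
]
# Entries are preloaded on CPU in float16; move to GPU before inference if available, e.g.:
# vlm_model = vlm_model.to("cuda")
```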
app/utils/GroundingDINO_SwinT_OGC.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size = 1
2
+ modelname = "groundingdino"
3
+ backbone = "swin_T_224_1k"
4
+ position_embedding = "sine"
5
+ pe_temperatureH = 20
6
+ pe_temperatureW = 20
7
+ return_interm_indices = [1, 2, 3]
8
+ backbone_freeze_keywords = None
9
+ enc_layers = 6
10
+ dec_layers = 6
11
+ pre_norm = False
12
+ dim_feedforward = 2048
13
+ hidden_dim = 256
14
+ dropout = 0.0
15
+ nheads = 8
16
+ num_queries = 900
17
+ query_dim = 4
18
+ num_patterns = 0
19
+ num_feature_levels = 4
20
+ enc_n_points = 4
21
+ dec_n_points = 4
22
+ two_stage_type = "standard"
23
+ two_stage_bbox_embed_share = False
24
+ two_stage_class_embed_share = False
25
+ transformer_activation = "relu"
26
+ dec_pred_bbox_embed_share = True
27
+ dn_box_noise_scale = 1.0
28
+ dn_label_noise_ratio = 0.5
29
+ dn_label_coef = 1.0
30
+ dn_bbox_coef = 1.0
31
+ embed_init_tgt = True
32
+ dn_labelbook_size = 2000
33
+ max_text_len = 256
34
+ text_encoder_type = "bert-base-uncased"
35
+ use_text_enhancer = True
36
+ use_fusion_layer = True
37
+ use_checkpoint = True
38
+ use_transformer_ckpt = True
39
+ use_text_cross_attention = True
40
+ text_dropout = 0.0
41
+ fusion_dropout = 0.0
42
+ fusion_droppath = 0.1
43
+ sub_sentence_present = True
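`GroundingDINO_SwinT_OGC.py` is a plain Python config module of hyperparameters for the Swin-T GroundingDINO detector used during mask generation. One common way such a config is consumed, sketched after the upstream GroundingDINO loader (an assumption, not necessarily this repo's exact code):

```python
import torch
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

def load_groundingdino(config_path, checkpoint_path, device="cuda"):
    # Parse the Python-style config above into an argument namespace.
    args = SLConfig.fromfile(config_path)
    args.device = device
    model = build_model(args)
    ckpt = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(clean_state_dict(ckpt["model"]), strict=False)
    return model.eval().to(device)
```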
assets/angel_christmas/angel_christmas.png ADDED

Git LFS Details

  • SHA256: 90efa52308e2dc16274ddaef147d89979bf6bdb2c1f2b06f639b4e43fb96f8db
  • Pointer size: 132 Bytes
  • Size of remote file: 1.47 MB
assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png ADDED

Git LFS Details

  • SHA256: a259c2958d665532dfdf459ccb8d808967eee2d2f6e87dadd51ca1a01b590b44
  • Pointer size: 132 Bytes
  • Size of remote file: 1.43 MB
assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png ADDED

Git LFS Details

  • SHA256: 14318679567d391ee5e08d96dae249ed1bca1a0f349b76f725cc70288ce04030
  • Pointer size: 129 Bytes
  • Size of remote file: 3.99 kB
assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png ADDED

Git LFS Details

  • SHA256: be7745d023596428d3ff449f48f3aad3aa8ae00a42c089a7b1311cdae3e39b70
  • Pointer size: 132 Bytes
  • Size of remote file: 1.43 MB
assets/angel_christmas/prompt.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89ed635310c87d5f2d8f32d813018acd5040edd745a29c5bf84a435916525789
3
+ size 27
assets/anime_flower/anime_flower.png ADDED

Git LFS Details

  • SHA256: 1adc101088a1428361410fe3c637155da48d0eb21b3782377dd258a0a5df576a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png ADDED

Git LFS Details

  • SHA256: 3912e75e44c89a7ec8d0f6e34c90d4ea2212f80e5c2a12e6ba3dac405ca7be6c
  • Pointer size: 131 Bytes
  • Size of remote file: 930 kB
assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png ADDED

Git LFS Details

  • SHA256: 9f07ca0c719ffc09f282c424c66c869a9a31a1fc6386dba679e994f0b34bf51c
  • Pointer size: 129 Bytes
  • Size of remote file: 4.22 kB
assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png ADDED

Git LFS Details

  • SHA256: 73cac864b127f287579a8d259ff1165841d4c5e63731b4ac54e872567137e5e6
  • Pointer size: 131 Bytes
  • Size of remote file: 967 kB
assets/anime_flower/prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 648464818: remove the flower.
assets/brushedit_teaser.png ADDED

Git LFS Details

  • SHA256: bcd1d0c9f6fc083a33ec4565d98120bf1099914a0d2c0247eaa462052911ea59
  • Pointer size: 132 Bytes
  • Size of remote file: 3.45 MB
assets/chenduling/chengduling.jpg ADDED

Git LFS Details

  • SHA256: 0df06c0394583181e5cdf92f997c1276deb27cf96dd36b6443fe9d347a1e013a
  • Pointer size: 131 Bytes
  • Size of remote file: 168 kB
assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png ADDED

Git LFS Details

  • SHA256: 7bb60b8093291c9720e61160f7e598aadfc02f62bc08ad825d1ba9f2e8431b6a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.39 MB
assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png ADDED

Git LFS Details

  • SHA256: db47c2d9f18b0e25041c894945f6b52d3fcff473a0f0496b89dc2ac7d36536fc
  • Pointer size: 129 Bytes
  • Size of remote file: 5.68 kB
assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png ADDED

Git LFS Details

  • SHA256: 0200a970fc55930f9bcc9910cea3126b96c28e2bddca10b2b1969cbc979092be
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
assets/chenduling/prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 648464818: replace the clothes to a delicated floral skirt
assets/chinese_girl/chinese_girl.png ADDED

Git LFS Details

  • SHA256: 52f7dfc3333b48f677035180506650fb4ee9911a31426adb83c7e13fd5ac6693
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png ADDED

Git LFS Details

  • SHA256: b330e926da856f1027dd09e2bb3dc5910bb0a2dc9bc4a402b107c3f7b18b7de0
  • Pointer size: 131 Bytes
  • Size of remote file: 881 kB
assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png ADDED

Git LFS Details

  • SHA256: d46957b4cd0af13f57ace1cf181a13c8da7feebf9a9f37e8e5d582086a337843
  • Pointer size: 130 Bytes
  • Size of remote file: 10.4 kB
assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png ADDED

Git LFS Details

  • SHA256: f26a9d923a432b80e91b09d711f4000e9b1afe7edece788c9b8b86a3cce45855
  • Pointer size: 131 Bytes
  • Size of remote file: 412 kB
assets/chinese_girl/prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 648464818: replace the background to ancient China.
assets/demo_vis.png ADDED

Git LFS Details

  • SHA256: 755ecfc61a70da9eb3abde0d4353590c0344a0d55e5d3622da4fe58837ca457b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/example.png ADDED

Git LFS Details

  • SHA256: 1e86dbd1cb8d4c787a910d17400b081fa1d0daac35645f808c088fd316d1861b
  • Pointer size: 129 Bytes
  • Size of remote file: 3.22 kB
assets/frog/frog.jpeg ADDED

Git LFS Details

  • SHA256: bff47418f10bcbebdced638256fce1e075d93ccedc3b44ca83d04f7c7145ab1e
  • Pointer size: 131 Bytes
  • Size of remote file: 896 kB
assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png ADDED

Git LFS Details

  • SHA256: c9c1dfe00fd70e1cee76037941876c03a64863b3d598f925e7d0a39f3065db89
  • Pointer size: 131 Bytes
  • Size of remote file: 923 kB
assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png ADDED

Git LFS Details

  • SHA256: 2df1f32f92028ef8dbd677d039af09acb82db62d60bf4dea7812eefab340f553
  • Pointer size: 129 Bytes
  • Size of remote file: 3.34 kB
assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png ADDED

Git LFS Details

  • SHA256: bd9730b6c718a44614cfc8873e65bf660183adb4fbf2352f6488a33be5d4d7a1
  • Pointer size: 131 Bytes
  • Size of remote file: 881 kB
assets/frog/prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 648464818: add a magic hat on frog head.
assets/girl_on_sun/girl_on_sun.png ADDED

Git LFS Details

  • SHA256: ec304a50b692e2b898b59b22c84cda84663738aacf5e9bf64cdfed1cde853e2a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.59 MB
assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png ADDED

Git LFS Details

  • SHA256: 800a5e5f953290472898247f974893506ca41a3c7acda02d7eb1a69844ad6d7c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png ADDED

Git LFS Details

  • SHA256: 9ebe3b43581718a525c258d7a1f28d7b4acc4e61150b008f5e909946717ce73f
  • Pointer size: 129 Bytes
  • Size of remote file: 3.91 kB
assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png ADDED

Git LFS Details

  • SHA256: a3c165dde2730c2648191e84fc2edb7d27babb8b80ffa159f567f2d670028b18
  • Pointer size: 132 Bytes
  • Size of remote file: 1.2 MB
assets/girl_on_sun/prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 648464818: add a butterfly fairy.