Spaces:
Paused
Paused
File size: 8,340 Bytes
b673263 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
import gradio as gr
from PIL import Image
import torch
import re
import os
import requests
from customization import customize_vae_decoder
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel, DDIMScheduler, EulerDiscreteScheduler
from torchvision import transforms
from attribution import MappingNetwork
import math
from typing import List
from PIL import Image, ImageChops
import numpy as np
import torch
PRETRAINED_MODEL_NAME_OR_PATH = "./checkpoints/"
def get_image_grid(images: List[Image.Image]) -> Image:
num_images = len(images)
cols = 3#int(math.ceil(math.sqrt(num_images)))
rows = 1#int(math.ceil(num_images / cols))
width, height = images[0].size
grid_image = Image.new('RGB', (cols * width, rows * height))
for i, img in enumerate(images):
x = i % cols
y = i // cols
grid_image.paste(img, (x * width, y * height))
return grid_image
class AttributionModel:
def __init__(self):
is_cuda = False
if torch.cuda.is_available():
is_cuda = True
scheduler = EulerDiscreteScheduler.from_pretrained('stabilityai/stable-diffusion-2', subfolder="scheduler")
self.pipe = StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2', scheduler=scheduler)#, safety_checker=None, torch_dtype=torch.float16)
if is_cuda:
self.pipe = self.pipe.to("cuda")
self.resize_transform = transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR)
self.vae = AutoencoderKL.from_pretrained(
'stabilityai/stable-diffusion-2', subfolder="vae"
)
self.vae = customize_vae_decoder(self.vae, 128, "deqkv", "all", False, 1.0)
self.mapping_network = MappingNetwork(32, 0, 128, None, num_layers=2, w_avg_beta=None, normalization = False)
from torchvision.models import resnet50, ResNet50_Weights
self.decoding_network = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
self.decoding_network.fc = torch.nn.Linear(2048,32)
self.vae.decoder.load_state_dict(torch.load(os.path.join(PRETRAINED_MODEL_NAME_OR_PATH, 'vae_decoder.pth')))
self.mapping_network.load_state_dict(torch.load(os.path.join(PRETRAINED_MODEL_NAME_OR_PATH, 'mapping_network.pth')))
self.decoding_network.load_state_dict(torch.load(os.path.join(PRETRAINED_MODEL_NAME_OR_PATH, 'decoding_network.pth')))
if is_cuda:
self.vae = self.vae.to("cuda")
self.mapping_network = self.mapping_network.to("cuda")
self.decoding_network = self.decoding_network.to("cuda")
self.test_norm = transforms.Compose(
[
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
]
)
def infer(self, prompt, negative, steps, guidance_scale):
with torch.no_grad():
out_latents = self.pipe([prompt], negative_prompt=[negative], output_type="latent", num_inference_steps=steps, guidance_scale=guidance_scale).images
image_attr = self.inference_with_attribution(out_latents)
image_attr_pil = self.pipe.numpy_to_pil(image_attr[0])
image_org = self.inference_without_attribution(out_latents)
image_org_pil = self.pipe.numpy_to_pil(image_org[0])
# image_diff_pil = self.pipe.numpy_to_pil(image_attr[0] - image_org[0])
diff_factor = 5
image_diff_pil = ImageChops.difference(image_org_pil[0], image_attr_pil[0]).convert("RGB", (diff_factor,0,0,0,0,diff_factor,0,0,0,0,diff_factor,0))
return image_org_pil[0], image_attr_pil[0], image_diff_pil
def inference_without_attribution(self, latents):
latents = 1 / 0.18215 * latents
with torch.no_grad():
image = self.pipe.vae.decode(latents).sample
image = image.clamp(-1,1)
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def get_phis(self, phi_dimension, batch_size ,eps = 1e-8):
phi_length = phi_dimension
b = batch_size
phi = torch.empty(b,phi_length).uniform_(0,1)
return torch.bernoulli(phi) + eps
def inference_with_attribution(self, latents, key=None):
if key==None:
key = self.get_phis(32, 1)
latents = 1 / 0.18215 * latents
with torch.no_grad():
image = self.vae.decode(latents, self.mapping_network(key.cuda())).sample
image = image.clamp(-1,1)
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def postprocess(self, image):
image = self.resize_transform(image)
return image
def detect_key(self, image):
reconstructed_keys = self.decoding_network(self.test_norm((image / 2 + 0.5).clamp(0, 1)))
return reconstructed_keys
attribution_model = AttributionModel()
def get_images(prompt, negative, steps, guidence_scale):
x1, x2, x3 = attribution_model.infer(prompt, negative, steps, guidence_scale)
return [x1, x2, x3]
image_examples = [
["A pikachu fine dining with a view to the Eiffel Tower", "low quality", 50, 10],
["A mecha robot in a favela in expressionist style", "low quality, 3d, photorealistic", 50, 10]
]
with gr.Blocks() as demo:
gr.Markdown(
"""<h1 style="text-align: center;"><b>WOUAF:
Weight Modulation for User Attribution and Fingerprinting in Text-to-Image Diffusion Models</b> <br> <a href="https://wouaf.vercel.app">Project Page</a></h1>""")
with gr.Row(elem_id="prompt-container").style(mobile_collapse=False, equal_height=True):
with gr.Column():
text = gr.Textbox(
label="Enter your prompt",
show_label=False,
max_lines=1,
placeholder="Enter your prompt",
elem_id="prompt-text-input",
).style(
border=(True, False, True, True),
rounded=(True, False, False, True),
container=False,
)
negative = gr.Textbox(
label="Enter your negative prompt",
show_label=False,
max_lines=1,
placeholder="Enter a negative prompt",
elem_id="negative-prompt-text-input",
).style(
border=(True, False, True, True),
rounded=(True, False, False, True),
container=False,
)
with gr.Row():
steps = gr.Slider(label="Steps", minimum=45, maximum=55, value=50, step=1)
guidance_scale = gr.Slider(
label="Guidance Scale", minimum=0, maximum=10, value=7.5, step=0.1
)
with gr.Row():
btn = gr.Button(value="Generate Image", full_width=False)
with gr.Row():
im_2 = gr.Image(type="pil", label="without attribution")
im_3 = gr.Image(type="pil", label="**with** attribution")
im_4 = gr.Image(type="pil", label="pixel-wise difference multiplied by 5")
btn.click(get_images, inputs=[text, negative, steps, guidance_scale], outputs=[im_2, im_3, im_4])
gr.Examples(
examples=image_examples,
inputs=[text, negative, steps, guidance_scale],
outputs=[im_2, im_3, im_4],
fn=get_images,
cache_examples=True,
)
gr.HTML(
"""
<div class="footer">
<p>Pre-trained model by <a href="https://huggingface.co/stabilityai" style="text-decoration: underline;" target="_blank">StabilityAI</a>
</p>
<p>
Fine-tuned by authors for research purpose.
</p>
</div>
"""
)
with gr.Accordion(label="Ethics & Privacy", open=False):
gr.HTML(
"""<div class="acknowledgments">
<p><h4>Privacy</h4>
We do not collect any images or key data. This demo is designed with sole purpose of fun and reducing misuse of AI.
<p><h4>Biases and content acknowledgment</h4>
This model will have the same biases as Stable Diffusion V2.1 </div>
"""
)
if __name__ == "__main__":
demo.launch()
|