Upload 7 files
- app.py +205 -0
- image_utils.py +108 -0
- inverse_stable_diffusion.py +205 -0
- modified_stable_diffusion.py +235 -0
- requirements.txt +14 -0
- run_gaussian_shading.py +106 -0
- watermark.py +95 -0
app.py
ADDED
@@ -0,0 +1,205 @@
import numpy as np
import gradio as gr
from run_gaussian_shading import *

examples = [
    "A photo of a cat",
    "A pizza with pineapple on it",
    "A photo of a dog",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 700px;
}
"""

MAX_SEED = np.iinfo(np.int32).max


# ---------------------------------------------------------------------------------------------------


with gr.Blocks(css=css) as demo:

    # ---------------------------------- Add Watermark -----------------------------------------

    with gr.Tab("Add watermark"):
        with gr.Column(elem_id="col-container"):
            gr.Markdown(" # Text-to-Image Watermark")
            with gr.Accordion("Instruction", open=False):
                gr.Markdown("""
                # Embedding Watermark
                ## 1. Generate watermarked image
                * Enter your prompt in the text box.
                * Click **Run** to generate an image with a random binary watermark.

                ## 2. Save Image
                Click **Download** to save the watermarked image in PNG format.

                ## 3. Advanced Settings
                - **Seed**: Different seeds produce different images.
                - **Guidance Scale**: Higher values make the image follow the prompt more closely.
                - **Num Inference Steps**: More steps enhance image detail and quality but increase computational cost.

                Source code: [Gaussian Shading](https://github.com/bsmhmmlf/Gaussian-Shading)""")
            with gr.Row():
                prompt = gr.Text(
                    label="Prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your prompt",
                    container=False,
                )
                run_button = gr.Button("Run", scale=0, variant="primary")
                download_button = gr.DownloadButton(visible=True)
            with gr.Row():
                result_original = gr.Image(label="Image without watermark", show_label=True)
                result = gr.Image(label="Watermarked Image", show_label=True)

            with gr.Accordion("Advanced Settings", open=False):
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                )
                with gr.Row():
                    guidance_scale = gr.Slider(
                        label="Guidance scale",
                        minimum=1.5,
                        maximum=10,
                        step=0.1,
                        value=7.5,
                    )
                    num_inference_steps = gr.Slider(
                        label="Num inference steps",
                        minimum=10,
                        maximum=100,
                        step=1,
                        value=50,
                    )
            gr.Examples(examples=examples, inputs=[prompt])

    # ---------------------------------- Extract Watermark -----------------------------------------
    with gr.Tab("Extract watermark"):
        with gr.Column(elem_id="col-container"):
            gr.Markdown(" # Watermark Extraction")
            with gr.Accordion("Instruction", open=False):
                gr.Markdown("""
                # Extracting Watermark
                **Note**: Ensure you generate an image first so that its watermark is added to the database.
                ## 1. Upload Image
                - Upload the image to the Image box.
                - Click the **Extract** button to extract the watermark.
                ## 2. Advanced Settings
                These settings are **optional** and can be used to simulate real-world attacks that try to erase the watermark.
                Click the **Attack** button to generate a distorted image.
                * **Seed**: Initializes the random number generator, ensuring reproducibility of the attack.
                * **Random crop ratio**: The proportion of the image that is kept; everything outside the crop is blacked out. A lower ratio removes more of the image.
                * **Random drop ratio**: The fraction of pixels to be randomly dropped. A higher ratio increases the number of dropped pixels.
                * **Resize ratio**: How much the image is downscaled before being restored to its original size. A lower ratio degrades the image more.
                * **Gaussian blur r**: The radius of the Gaussian blur applied to the image. A larger radius results in a more blurred image.
                * **Gaussian std**: The standard deviation of the Gaussian noise added to the image. A higher value results in stronger noise.
                * **Sp prob**: The probability of each pixel being replaced with either black or white noise. A higher probability adds more salt-and-pepper noise.
                ## Output Explanation
                - **Output watermark**: The binary bits extracted from the image.
                - **Accuracy bit**: The fraction of extracted bits that match the binary watermark stored in the database.
                """)
            with gr.Row():
                input_image = gr.Image(type='pil')
                extract_button = gr.Button("Extract", scale=0, variant="primary")

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # named attack_seed so it does not shadow the generation seed in the first tab
                    attack_seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0,
                    )
                    attack_button = gr.Button("Attack!", scale=0, variant="primary")
                with gr.Row():
                    random_crop_ratio = gr.Slider(
                        label="Random crop ratio",
                        minimum=0.5,
                        maximum=1,
                        step=0.1,
                        value=1,
                    )
                    random_drop_ratio = gr.Slider(
                        label="Random drop ratio",
                        minimum=0,
                        maximum=1,
                        step=0.1,
                        value=0,
                    )
                with gr.Row():
                    resize_ratio = gr.Slider(
                        label="Resize ratio",
                        minimum=0.2,
                        maximum=1,
                        step=0.1,
                        value=1,
                    )
                    gaussian_blur_r = gr.Slider(
                        label="Gaussian blur r",
                        minimum=0,
                        maximum=1,
                        step=0.1,
                        value=0,
                    )
                with gr.Row():
                    gaussian_std = gr.Slider(
                        label="Gaussian std",
                        minimum=0,
                        maximum=0.01,
                        step=0.0001,
                        value=0,
                    )
                    sp_prob = gr.Slider(
                        label="Sp prob",
                        minimum=0,
                        maximum=0.1,
                        step=0.001,
                        value=0,
                    )
                attack_image = gr.Image(label="Attacked Image")
            output = gr.Textbox(label="Output")
            with gr.Accordion("More Details", open=False):
                result_extract = gr.Textbox(label="Bit watermark")
                accuracy_bit = gr.Textbox(label="Accuracy bit")

    # ----------------------------- Embedding watermark -------------------------
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=generate_with_watermark,
        inputs=[
            seed,
            prompt,
            guidance_scale,
            num_inference_steps
        ],
        outputs=[result_original, result, download_button],
    )

    # ----------------------------- Extract watermark -------------------------
    gr.on(
        triggers=[extract_button.click, attack_button.click],
        fn=reverse_watermark,
        inputs=[
            input_image,
            attack_seed,
            random_crop_ratio,
            random_drop_ratio,
            resize_ratio,
            gaussian_blur_r,
            gaussian_std,
            sp_prob,
        ],
        outputs=[output, result_extract, accuracy_bit, attack_image],
    )

demo.launch(share=True)
image_utils.py
ADDED
@@ -0,0 +1,108 @@
import torch
import numpy as np
from torchvision import transforms
from PIL import Image, ImageFilter
import random


def set_random_seed(seed=0):
    torch.manual_seed(seed + 0)
    torch.cuda.manual_seed(seed + 1)
    torch.cuda.manual_seed_all(seed + 2)
    np.random.seed(seed + 3)
    torch.cuda.manual_seed_all(seed + 4)
    random.seed(seed + 5)


def transform_img(image, target_size=512):
    tform = transforms.Compose(
        [
            transforms.Resize(target_size),
            transforms.CenterCrop(target_size),
            transforms.ToTensor(),
        ]
    )
    image = tform(image)
    return 2.0 * image - 1.0


def latents_to_imgs(pipe, latents):
    x = pipe.decode_image(latents)
    x = pipe.torch_to_numpy(x)
    x = pipe.numpy_to_pil(x)
    return x


def image_distortion(img,
                     seed: int = 42,
                     random_crop_ratio: float = None,
                     random_drop_ratio: float = None,
                     resize_ratio: float = None,
                     gaussian_blur_r: int = None,
                     gaussian_std: float = None,
                     sp_prob: float = None):

    if random_crop_ratio is not None:
        set_random_seed(seed)
        width, height, c = np.array(img).shape
        img = np.array(img)
        new_width = int(width * random_crop_ratio)
        new_height = int(height * random_crop_ratio)
        start_x = np.random.randint(0, width - new_width + 1)
        start_y = np.random.randint(0, height - new_height + 1)
        end_x = start_x + new_width
        end_y = start_y + new_height
        padded_image = np.zeros_like(img)
        padded_image[start_y:end_y, start_x:end_x] = img[start_y:end_y, start_x:end_x]
        img = Image.fromarray(padded_image)

    if random_drop_ratio is not None:
        set_random_seed(seed)
        width, height, c = np.array(img).shape
        img = np.array(img)
        new_width = int(width * random_drop_ratio)
        new_height = int(height * random_drop_ratio)
        start_x = np.random.randint(0, width - new_width + 1)
        start_y = np.random.randint(0, height - new_height + 1)
        padded_image = np.zeros_like(img[start_y:start_y + new_height, start_x:start_x + new_width])
        img[start_y:start_y + new_height, start_x:start_x + new_width] = padded_image
        img = Image.fromarray(img)

    if resize_ratio is not None:
        img_shape = np.array(img).shape
        resize_size = int(img_shape[0] * resize_ratio)
        img = transforms.Resize(size=resize_size)(img)
        img = transforms.Resize(size=img_shape[0])(img)

    if gaussian_blur_r is not None:
        img = img.filter(ImageFilter.GaussianBlur(radius=gaussian_blur_r))

    if gaussian_std is not None:
        img_shape = np.array(img).shape
        g_noise = np.random.normal(0, gaussian_std, img_shape) * 255
        g_noise = g_noise.astype(np.uint8)
        img = Image.fromarray(np.clip(np.array(img) + g_noise, 0, 255))

    if sp_prob is not None:
        c, h, w = np.array(img).shape
        prob_zero = sp_prob / 2
        prob_one = 1 - prob_zero
        rdn = np.random.rand(c, h, w)
        img = np.where(rdn > prob_one, np.zeros_like(img), img)
        img = np.where(rdn < prob_zero, np.ones_like(img) * 255, img)
        img = Image.fromarray(img)

    return img


def measure_similarity(images, prompt, model, clip_preprocess, tokenizer, device):
    with torch.no_grad():
        img_batch = [clip_preprocess(i).unsqueeze(0) for i in images]
        img_batch = torch.concatenate(img_batch).to(device)
        image_features = model.encode_image(img_batch)

        text = tokenizer([prompt]).to(device)
        text_features = model.encode_text(text)

        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        return (image_features @ text_features.T).mean(-1)
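Note (illustrative, not part of the upload): a minimal sketch of how these helpers compose, assuming the dependencies above are installed and an arbitrary local RGB image exists at example.png (hypothetical path). It applies a blur plus salt-and-pepper noise, then maps the result to the [-1, 1] tensor range the VAE encoder expects.

from PIL import Image
from image_utils import image_distortion, transform_img

# assumption: example.png is any local RGB image, e.g. one saved by the app
img = Image.open("example.png").convert("RGB")

# apply two of the distortions; unset arguments default to None and are skipped
attacked = image_distortion(img, seed=0, gaussian_blur_r=4, sp_prob=0.05)
attacked.save("example_attacked.png")

# PIL image -> tensor of shape (3, 512, 512) scaled to [-1, 1]
x = transform_img(attacked)
print(x.shape, x.min().item(), x.max().item())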
inverse_stable_diffusion.py
ADDED
@@ -0,0 +1,205 @@
from functools import partial
from typing import Callable, List, Optional, Union, Tuple

import torch
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

from diffusers.models import AutoencoderKL, UNet2DConditionModel
# from diffusers import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion.safety_checker import \
    StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler

from modified_stable_diffusion import ModifiedStableDiffusionPipeline
from torchvision.transforms import ToPILImage
import matplotlib.pyplot as plt


### credit to: https://github.com/cccntu/efficient-prompt-to-prompt
def backward_ddim(x_t, alpha_t, alpha_tm1, eps_xt):
    """from noise to image"""
    return (
        alpha_tm1**0.5
        * (
            (alpha_t**-0.5 - alpha_tm1**-0.5) * x_t
            + ((1 / alpha_tm1 - 1) ** 0.5 - (1 / alpha_t - 1) ** 0.5) * eps_xt
        )
        + x_t
    )


def forward_ddim(x_t, alpha_t, alpha_tp1, eps_xt):
    """from image to noise; it is the same update as backward_ddim"""
    return backward_ddim(x_t, alpha_t, alpha_tp1, eps_xt)


class InversableStableDiffusionPipeline(ModifiedStableDiffusionPipeline):
    def __init__(self,
                 vae,
                 text_encoder,
                 tokenizer,
                 unet,
                 scheduler,
                 safety_checker,
                 feature_extractor,
                 requires_safety_checker: bool = False,
                 ):
        super(InversableStableDiffusionPipeline, self).__init__(vae,
                                                                text_encoder,
                                                                tokenizer,
                                                                unet,
                                                                scheduler,
                                                                safety_checker,
                                                                feature_extractor,
                                                                requires_safety_checker)

        self.forward_diffusion = partial(self.backward_diffusion, reverse_process=True)
        self.count = 0

    def get_random_latents(self, latents=None, height=512, width=512, generator=None):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        batch_size = 1
        device = self._execution_device

        num_channels_latents = self.unet.in_channels

        latents = self.prepare_latents(
            batch_size,
            num_channels_latents,
            height,
            width,
            self.text_encoder.dtype,
            device,
            generator,
            latents,
        )

        return latents

    @torch.inference_mode()
    def get_text_embedding(self, prompt):
        text_input_ids = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids
        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
        return text_embeddings

    @torch.inference_mode()
    def get_image_latents(self, image, sample=True, rng_generator=None):
        encoding_dist = self.vae.encode(image).latent_dist
        if sample:
            encoding = encoding_dist.sample(generator=rng_generator)
        else:
            encoding = encoding_dist.mode()
        latents = encoding * 0.18215
        return latents

    @torch.inference_mode()
    def backward_diffusion(
        self,
        use_old_emb_i=25,
        text_embeddings=None,
        old_text_embeddings=None,
        new_text_embeddings=None,
        latents: Optional[torch.FloatTensor] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        reverse_process: bool = False,
        **kwargs,
    ):
        """Generate an image from text prompt and latents; with reverse_process=True the loop runs backwards (image to noise)."""
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)
        # Some schedulers like PNDM have timesteps as arrays
        # It's more optimized to move all timesteps to the correct device beforehand
        timesteps_tensor = self.scheduler.timesteps.to(self.device)
        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma

        if old_text_embeddings is not None and new_text_embeddings is not None:
            prompt_to_prompt = True
        else:
            prompt_to_prompt = False

        for i, t in enumerate(self.progress_bar(timesteps_tensor if not reverse_process else reversed(timesteps_tensor))):
            if prompt_to_prompt:
                if i < use_old_emb_i:
                    text_embeddings = old_text_embeddings
                else:
                    text_embeddings = new_text_embeddings

            # expand the latents if we are doing classifier free guidance
            latent_model_input = (
                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            )
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input, t, encoder_hidden_states=text_embeddings
            ).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            prev_timestep = (
                t
                - self.scheduler.config.num_train_timesteps
                // self.scheduler.num_inference_steps
            )
            # call the callback, if provided
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

            # ddim
            alpha_prod_t = self.scheduler.alphas_cumprod[t]
            alpha_prod_t_prev = (
                self.scheduler.alphas_cumprod[prev_timestep]
                if prev_timestep >= 0
                else self.scheduler.final_alpha_cumprod
            )
            if reverse_process:
                alpha_prod_t, alpha_prod_t_prev = alpha_prod_t_prev, alpha_prod_t
            latents = backward_ddim(
                x_t=latents,
                alpha_t=alpha_prod_t,
                alpha_tm1=alpha_prod_t_prev,
                eps_xt=noise_pred,
            )
        return latents

    @torch.inference_mode()
    def decode_image(self, latents: torch.FloatTensor, **kwargs):
        scaled_latents = 1 / 0.18215 * latents
        image = [
            self.vae.decode(scaled_latents[i : i + 1]).sample for i in range(len(latents))
        ]
        image = torch.cat(image, dim=0)
        return image

    @torch.inference_mode()
    def torch_to_numpy(self, image):
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()
        return image
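Note (illustrative, not part of the upload): the reverse_process=True path relies on the deterministic DDIM update being exactly invertible when the same noise estimate is reused. A minimal sketch of that property on dummy tensors, with no model weights involved (assuming the dependencies above are installed so the module imports):

import torch
from inverse_stable_diffusion import backward_ddim

torch.manual_seed(0)
x_t = torch.randn(1, 4, 64, 64)   # a latent at timestep t
eps = torch.randn(1, 4, 64, 64)   # a fixed noise estimate, standing in for the UNet output
alpha_t, alpha_tm1 = 0.5, 0.7     # cumulative alphas at t and t-1

# one deterministic DDIM step from t to t-1
x_tm1 = backward_ddim(x_t, alpha_t, alpha_tm1, eps)

# the same update with the alphas swapped undoes it, as used when reverse_process=True
x_t_rec = backward_ddim(x_tm1, alpha_tm1, alpha_t, eps)

print(torch.allclose(x_t, x_t_rec, atol=1e-5))  # True

In the real forward_diffusion the noise estimate is re-predicted by the UNet at every step, so inverting a decoded image back to its initial latent is only approximate.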
modified_stable_diffusion.py
ADDED
@@ -0,0 +1,235 @@
from typing import Callable, List, Optional, Union, Any, Dict
import copy
import numpy as np
import PIL

import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, UNet2DConditionModel
from diffusers.utils import logging, BaseOutput
from torchvision.transforms import ToPILImage

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class ModifiedStableDiffusionPipelineOutput(BaseOutput):
    images: Union[List[PIL.Image.Image], np.ndarray]
    nsfw_content_detected: Optional[List[bool]]
    init_latents: Optional[torch.FloatTensor]


class ModifiedStableDiffusionPipeline(StableDiffusionPipeline):
    def __init__(self,
                 vae,
                 text_encoder,
                 tokenizer,
                 unet,
                 scheduler,
                 safety_checker,
                 feature_extractor,
                 requires_safety_checker: bool = False,
                 ):
        super(ModifiedStableDiffusionPipeline, self).__init__(vae,
                                                              text_encoder,
                                                              tokenizer,
                                                              unet,
                                                              scheduler,
                                                              safety_checker,
                                                              feature_extractor,
                                                              requires_safety_checker)

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        watermarking_gamma: float = None,
        watermarking_delta: float = None,
        watermarking_mask: Optional[torch.BoolTensor] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        self.count = 0

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(prompt, height, width, callback_steps)

        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        text_embeddings = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            text_embeddings.dtype,
            device,
            generator,
            latents,
        )

        init_latents = copy.deepcopy(latents)

        # watermarking mask
        if watermarking_gamma is not None:
            watermarking_mask = torch.rand(latents.shape, device=device) < watermarking_gamma

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # add watermark
                if watermarking_mask is not None:
                    # latents[watermarking_mask] += watermarking_delta
                    latents[watermarking_mask] += watermarking_delta * torch.sign(latents[watermarking_mask])

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        # 8. Post-processing
        image = self.decode_latents(latents)

        # 9. Run safety checker
        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 10. Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image, has_nsfw_concept)

        return ModifiedStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept, init_latents=init_latents)

    @torch.inference_mode()
    def decode_image(self, latents: torch.FloatTensor, **kwargs):
        scaled_latents = 1 / 0.18215 * latents
        image = [
            self.vae.decode(scaled_latents[i : i + 1]).sample for i in range(len(latents))
        ]
        image = torch.cat(image, dim=0)
        return image

    @torch.inference_mode()
    def torch_to_numpy(self, image):
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()
        return image

    @torch.inference_mode()
    def get_image_latents(self, image, sample=True, rng_generator=None):
        encoding_dist = self.vae.encode(image).latent_dist
        if sample:
            encoding = encoding_dist.sample(generator=rng_generator)
        else:
            encoding = encoding_dist.mode()
        latents = encoding * 0.18215
        return latents
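Note (illustrative, not part of the upload): the optional watermarking_gamma / watermarking_delta arguments are not used by the Gaussian Shading demo, which injects its watermark through the initial latents instead. A toy sketch of what that path does at each denoising step, on a dummy tensor:

import torch

torch.manual_seed(0)
latents = torch.randn(1, 4, 64, 64)
gamma, delta = 0.1, 0.05  # example values, not taken from the upload

# Bernoulli(gamma) mask over latent entries, as built once before the denoising loop
mask = torch.rand(latents.shape) < gamma
# push the selected entries further from zero, in the direction of their sign
latents[mask] += delta * torch.sign(latents[mask])
print(mask.float().mean().item())  # roughly 0.1 of the entries were perturbed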
requirements.txt
ADDED
@@ -0,0 +1,14 @@
albumentations
diffusers
einops
huggingface_hub
natsort
pillow
PyYAML
regex
requests
timm
torch
torchvision
tqdm
transformers
run_gaussian_shading.py
ADDED
@@ -0,0 +1,106 @@
from tqdm import tqdm
import torch
from transformers import CLIPModel, CLIPTokenizer
from inverse_stable_diffusion import InversableStableDiffusionPipeline
from diffusers import DPMSolverMultistepScheduler, DDIMScheduler
import os
import gradio as gr
from image_utils import *
from watermark import *


# Initialize the parameters
model_path = 'stabilityai/stable-diffusion-2-1-base'
channel_copy = 1
hw_copy = 8
fpr = 0.000001
user_number = 1000000
guidance_scale = 7.5
num_inference_steps = 50
image_length = 512


# """ ---------------------- Initialization ---------------------- """
device = 'cuda' if torch.cuda.is_available() else 'cpu'
scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder='scheduler')
pipe = InversableStableDiffusionPipeline.from_pretrained(
    model_path,
    scheduler=scheduler,
    torch_dtype=torch.float16,
    revision='fp16',
)
pipe.safety_checker = None
pipe = pipe.to(device)

# a simple implementation of the watermark
watermark = Gaussian_Shading(channel_copy, hw_copy, fpr, user_number)

# assume that at detection time the original prompt is unknown
tester_prompt = ''
text_embeddings = pipe.get_text_embedding(tester_prompt)


# generate with watermark
def generate_with_watermark(seed, prompt, guidance_scale=7.5, num_inference_steps=50):
    set_random_seed(seed)

    init_latents_w, key, wk = watermark.create_watermark_and_return_w()
    watermark_list = []
    torch.save(key, 'key.pt')
    if not os.path.exists('watermark.pt'):
        torch.save(wk, 'watermark.pt')
    else:
        watermark_list = torch.load('watermark.pt')
        if not isinstance(watermark_list, list):
            watermark_list = [watermark_list]
        watermark_list.append(wk)
        torch.save(watermark_list, 'watermark.pt')

    outputs = pipe(
        prompt,
        num_images_per_prompt=1,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        height=image_length,
        width=image_length,
        latents=init_latents_w,
    )
    image_w = outputs.images[0]

    # generate the same prompt without the watermarked latents, for comparison
    outputs_original = pipe(
        prompt,
        num_images_per_prompt=1,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        height=image_length,
        width=image_length
    )
    image_original = outputs_original.images[0]

    # save the file for the download button, removing any stale copy first
    image_path = 'output_image.png'
    if os.path.exists(image_path):
        os.remove(image_path)

    image_w.save('output_image.png', format='PNG')
    return image_original, image_w, 'output_image.png'


# reverse the image back to latents and evaluate the watermark
def reverse_watermark(image, *args, **kwargs):
    image_attacked = image_distortion(image, *args, **kwargs)
    image_w_distortion = transform_img(image_attacked).unsqueeze(0).to(text_embeddings.dtype).to(device)
    image_latents_w = pipe.get_image_latents(image_w_distortion, sample=False)
    reversed_latents_w = pipe.forward_diffusion(
        latents=image_latents_w,
        text_embeddings=text_embeddings,
        guidance_scale=1,
        num_inference_steps=50,
    )
    try:
        bit, accuracy = watermark.eval_watermark(reversed_latents_w)
    except FileNotFoundError:
        raise gr.Error("Database is empty. Please generate an image first!", duration=8)
    if accuracy > 0.7:
        output = 'This image has a watermark'
    else:
        output = "This image doesn't have a watermark"
    return output, bit, accuracy, image_attacked
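Note (illustrative, not part of the upload): outside Gradio, the two handlers can be called directly. This sketch assumes a CUDA GPU and that importing the module downloads stabilityai/stable-diffusion-2-1-base; the positional arguments passed to reverse_watermark follow the same order as the slider inputs wired up in app.py.

from run_gaussian_shading import generate_with_watermark, reverse_watermark

# generate: returns the unwatermarked image, the watermarked image, and the saved PNG path
image_original, image_w, png_path = generate_with_watermark(
    0, "A photo of a cat", guidance_scale=7.5, num_inference_steps=50
)

# extract: lightly blur the watermarked image, invert it, and compare against the stored watermark
verdict, bits, accuracy, attacked = reverse_watermark(
    image_w,
    0,                  # seed for the attack
    1, 0, 1, 1, 0, 0,   # crop, drop, resize ratios; blur radius; gaussian std; salt-and-pepper prob
)
print(verdict, accuracy)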
watermark.py
ADDED
@@ -0,0 +1,95 @@
import torch
from scipy.stats import norm, truncnorm
from functools import reduce
from scipy.special import betainc
import numpy as np


class Gaussian_Shading:
    def __init__(self, ch_factor, hw_factor, fpr, user_number):
        self.ch = ch_factor
        self.hw = hw_factor
        self.key = None
        self.watermark = None
        self.latentlength = 4 * 64 * 64
        self.marklength = self.latentlength // (self.ch * self.hw * self.hw)

        self.threshold = 1 if self.hw == 1 and self.ch == 1 else self.ch * self.hw * self.hw // 2
        self.tp_onebit_count = 0
        self.tp_bits_count = 0
        self.tau_onebit = None
        self.tau_bits = None

        for i in range(self.marklength):
            fpr_onebit = betainc(i + 1, self.marklength - i, 0.5)
            fpr_bits = betainc(i + 1, self.marklength - i, 0.5) * user_number
            if fpr_onebit <= fpr and self.tau_onebit is None:
                self.tau_onebit = i / self.marklength
            if fpr_bits <= fpr and self.tau_bits is None:
                self.tau_bits = i / self.marklength

    def truncSampling(self, message):
        z = np.zeros(self.latentlength)
        denominator = 2.0
        ppf = [norm.ppf(j / denominator) for j in range(int(denominator) + 1)]
        for i in range(self.latentlength):
            dec_mes = reduce(lambda a, b: 2 * a + b, message[i : i + 1])
            dec_mes = int(dec_mes)
            z[i] = truncnorm.rvs(ppf[dec_mes], ppf[dec_mes + 1])
        z = torch.from_numpy(z).reshape(1, 4, 64, 64).half()
        return z.cuda()

    def create_watermark_and_return_w(self):
        rng_state = torch.get_rng_state()
        torch.manual_seed(42)
        self.key = torch.randint(0, 2, [1, 4, 64, 64]).cuda()
        torch.set_rng_state(rng_state)

        self.watermark = torch.randint(0, 2, [1, 4 // self.ch, 64 // self.hw, 64 // self.hw]).cuda()
        sd = self.watermark.repeat(1, self.ch, self.hw, self.hw)
        m = ((sd + self.key) % 2).flatten().cpu().numpy()
        w = self.truncSampling(m)
        return w, self.key, self.watermark

    def diffusion_inverse(self, watermark_sd):
        ch_stride = 4 // self.ch
        hw_stride = 64 // self.hw
        ch_list = [ch_stride] * self.ch
        hw_list = [hw_stride] * self.hw
        split_dim1 = torch.cat(torch.split(watermark_sd, tuple(ch_list), dim=1), dim=0)
        split_dim2 = torch.cat(torch.split(split_dim1, tuple(hw_list), dim=2), dim=0)
        split_dim3 = torch.cat(torch.split(split_dim2, tuple(hw_list), dim=3), dim=0)
        vote = torch.sum(split_dim3, dim=0).clone()
        vote[vote <= self.threshold] = 0
        vote[vote > self.threshold] = 1
        return vote

    def sequence_binary_watermark(self, watermark):
        ls = watermark.view(-1).tolist()
        sequence = ''.join(str(i) for i in ls)
        return sequence

    def eval_watermark(self, reversed_m):
        key = torch.load('key.pt')
        reversed_m = (reversed_m > 0).int()
        # reversed_sd = (reversed_m + self.key) % 2
        reversed_sd = (reversed_m + key) % 2
        reversed_watermark = self.diffusion_inverse(reversed_sd)
        print(f"The extracted watermark is {self.sequence_binary_watermark(reversed_watermark)}")

        watermark = torch.load('watermark.pt')
        ls_accurate = []
        for i in watermark:
            ls_accurate.append((reversed_watermark == i).float().mean().item())

        correct = max(ls_accurate)
        if correct >= self.tau_onebit:
            self.tp_onebit_count = self.tp_onebit_count + 1
        if correct >= self.tau_bits:
            self.tp_bits_count = self.tp_bits_count + 1
        return self.sequence_binary_watermark(reversed_watermark), correct

    def get_tpr(self):
        return self.tp_onebit_count, self.tp_bits_count
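Note (illustrative, not part of the upload): a small self-test of the watermark codec in isolation. It is a sketch only; it requires a CUDA device and writes key.pt and watermark.pt into the working directory (the same files the app uses), so run it in a scratch folder. With no diffusion, inversion, or attack in between, extraction from the freshly sampled latent should be lossless.

import torch
from watermark import Gaussian_Shading

# same factors as run_gaussian_shading.py: 1 channel copy, 8x8 spatial copies
wm = Gaussian_Shading(ch_factor=1, hw_factor=8, fpr=0.000001, user_number=1000000)

# sample a watermarked latent w of shape (1, 4, 64, 64) and persist key/watermark,
# mirroring what generate_with_watermark does
w, key, wk = wm.create_watermark_and_return_w()
torch.save(key, 'key.pt')
torch.save([wk], 'watermark.pt')

# recover the watermark directly from the sampled latent
bits, accuracy = wm.eval_watermark(w)
print(accuracy)  # expected: 1.0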