Update app.py
app.py CHANGED
@@ -13,18 +13,15 @@ from torch.nn import functional as F
 from torchvision import transforms
 from torchvision.transforms import functional as TF
 from tqdm import trange
-from
-import
-from
-import train_latent_diffusion as train
+from transformers import CLIPProcessor, CLIPModel
+from vqvae import VQVAE2 # Autoencoder replacement
+from diffusion_models import Diffusion # Swapped Diffusion model for DALL·E 2 based model
 from huggingface_hub import hf_hub_url, cached_download
 import gradio as gr # 🎨 The magic canvas for AI-powered image generation!

-# 🖼️ Download the necessary model files
-
-
-ae_model_path = cached_download(hf_hub_url("huggan/ccld_wa", filename="ae_model.ckpt"))
-ae_config_path = cached_download(hf_hub_url("huggan/ccld_wa", filename="ae_model.yaml"))
+# 🖼️ Download the necessary model files from HuggingFace
+vqvae_model_path = cached_download(hf_hub_url("huggingface/vqvae-2", filename="vqvae_model.ckpt"))
+diffusion_model_path = cached_download(hf_hub_url("huggingface/dalle-2", filename="diffusion_model.ckpt"))

 # 📐 Utility Functions: Math and images, what could go wrong?
 # These functions help parse prompts and resize/crop images to fit nicely
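The two local modules pulled in above (`vqvae.py` and `diffusion_models.py`) are not part of this diff. The stubs below are only a sketch of the interface the rest of app.py appears to assume from them: the class names come from the imports, the method signatures from how the objects are used further down. Nothing here is code from the PR.

```python
# Hypothetical stubs; they only document the interface app.py assumes, not real implementations.
import torch
from torch import nn


class VQVAE2(nn.Module):
    """Autoencoder: decode() is expected to map latents to [N, 3, H, W] image tensors."""

    def decode(self, latents: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError


class Diffusion(nn.Module):
    """Denoiser: called as diffusion_model(x, t, embed) inside the sampling loop."""

    def forward(self, x: torch.Tensor, t: torch.Tensor, embed: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError
```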
@@ -33,11 +30,7 @@ def parse_prompt(prompt, default_weight=3.):
     """
     🎯 Parses a prompt into text and weight.
     """
-
-        vals = prompt.rsplit(':', 2)
-        vals = [vals[0] + ':' + vals[1], *vals[2:]]
-    else:
-        vals = prompt.rsplit(':', 1)
+    vals = prompt.rsplit(':', 1)
     vals = vals + ['', default_weight][len(vals):]
     return vals[0], float(vals[1])

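For reference, the simplified `parse_prompt` now splits only on the last colon, so prompts that themselves contain a colon (a URL, for instance) are no longer special-cased the way the removed two-step `rsplit` was. A minimal check of the new behaviour, using the default weight of 3.0 from the function signature:

```python
def parse_prompt(prompt, default_weight=3.):
    """Split 'text:weight' on the last colon, falling back to default_weight."""
    vals = prompt.rsplit(':', 1)
    vals = vals + ['', default_weight][len(vals):]
    return vals[0], float(vals[1])


print(parse_prompt('a red circle:2.5'))  # ('a red circle', 2.5)
print(parse_prompt('a red circle'))      # ('a red circle', 3.0)
```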
@@ -49,59 +42,51 @@ def resize_and_center_crop(image, size):
     image = image.resize((int(fac * image.size[0]), int(fac * image.size[1])), Image.LANCZOS)
     return TF.center_crop(image, size[::-1])

-
 # 🧠 Model loading: the brain of our operation! 🔥
-# Load all the models: autoencoder, diffusion, and CLOOB

 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 print('Using device:', device)
 print('loading models... 🛠️')

-#
-
-
-ae_model.eval().requires_grad_(False).to(device)
-ae_model.load_state_dict(torch.load(ae_model_path))
-n_ch, side_y, side_x = 4, 32, 32
-
-# 🌀 Diffusion Model Setup: The artist behind the scenes
-model = train.DiffusionModel(192, [1,1,2,2], autoencoder_scale=torch.tensor(4.3084))
-model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
-model = model.to(device).eval().requires_grad_(False)
+# Load CLIP model
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

-#
-
-
-
-cloob.load_state_dict(model_pt.get_pt_params(cloob_config, checkpoint))
-cloob.eval().requires_grad_(False).to(device)
+# Load VQ-VAE-2 Autoencoder
+vqvae = VQVAE2()
+vqvae.load_state_dict(torch.load(vqvae_model_path))
+vqvae.eval().requires_grad_(False).to(device)

+# Load Diffusion Model
+diffusion_model = Diffusion()
+diffusion_model.load_state_dict(torch.load(diffusion_model_path))
+diffusion_model = diffusion_model.to(device).eval().requires_grad_(False)

 # 🎨 The key function: Where the magic happens!
 # This is where we generate images based on text and image prompts

-def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method='
+def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method='ddim', eta=None):
     """
     🖼️ Generates a list of PIL images based on given text and image prompts.
     """
-    zero_embed = torch.zeros([1,
+    zero_embed = torch.zeros([1, clip_model.config.projection_dim], device=device)
     target_embeds, weights = [zero_embed], []

-    # Parse text prompts
+    # Parse text prompts and encode with CLIP
    for prompt in prompts:
-
-
-
+        inputs = clip_processor(text=prompt, return_tensors="pt").to(device)
+        text_embed = clip_model.get_text_features(**inputs).float()
+        target_embeds.append(text_embed)
+        weights.append(1.0)

     # Parse image prompts
     for prompt in images:
         path, weight = parse_prompt(prompt)
-        img = Image.open(
-
-
-
-
-        target_embeds.append(embed)
+        img = Image.open(path).convert('RGB')
+        img = resize_and_center_crop(img, (224, 224))
+        inputs = clip_processor(images=img, return_tensors="pt").to(device)
+        image_embed = clip_model.get_image_features(**inputs).float()
+        target_embeds.append(image_embed)
         weights.append(weight)

     # Adjust weights and set seed
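The unconditional `zero_embed` has to match the width of the CLIP embeddings it is concatenated with; for `openai/clip-vit-base-patch32` that projection dimension is 512. A small standalone check using the same `transformers` classes loaded above:

```python
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = clip_processor(text=["a red circle"], return_tensors="pt")
text_embed = clip_model.get_text_features(**inputs)

print(clip_model.config.projection_dim)  # 512
print(text_embed.shape)                  # torch.Size([1, 512])
```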
@@ -115,7 +100,7 @@ def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method
         x_in = x.repeat([n_conds, 1, 1, 1])
         t_in = t.repeat([n_conds])
         embed_in = torch.cat([*target_embeds]).repeat_interleave(n, 0)
-        vs =
+        vs = diffusion_model(x_in, t_in, embed_in).view([n_conds, n, *x.shape[1:]])
         v = vs.mul(weights[:, None, None, None, None]).sum(0)
         return v

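The `vs.mul(...).sum(0)` line above is the usual multi-prompt guidance trick: the model is evaluated once per condition (the zero embedding plus each prompt embedding), and the per-condition predictions are combined as a weighted sum along the condition axis. A shape-only sketch, with illustrative weights that are not taken from the PR:

```python
import torch

n_conds, n = 3, 2                         # zero embed + 2 prompts, batch of 2 images
vs = torch.randn(n_conds, n, 3, 64, 64)   # one model prediction per condition
weights = torch.tensor([-1.0, 1.5, 1.5])  # illustrative guidance weights

v = vs.mul(weights[:, None, None, None, None]).sum(0)
print(v.shape)  # torch.Size([2, 3, 64, 64])
```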
@@ -131,22 +116,19 @@ def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method

     # 🏃♂️ Generate the output images
     batch_size = n
-    x = torch.randn([n,
+    x = torch.randn([n, 3, 64, 64], device=device)
     t = torch.linspace(1, 0, steps + 1, device=device)[:-1]
     pil_ims = []
     for i in trange(0, n, batch_size):
         cur_batch_size = min(n - i, batch_size)
         out_latents = run(x[i:i + cur_batch_size], steps)
-        outs =
+        outs = vqvae.decode(out_latents)
         for j, out in enumerate(outs):
-            pil_ims.append(
+            pil_ims.append(transforms.ToPILImage()(out))

     return pil_ims

-
 # 🖌️ Interface: Gradio's brush to paint the UI
-# Gradio is used here to create a user-friendly interface for art generation.
-
 def gen_ims(prompt, im_prompt=None, seed=None, n_steps=10, method='plms'):
     """
     💡 Gradio function to wrap image generation.
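One practical caveat on the decode step above: `transforms.ToPILImage()` expects a CHW float tensor in [0, 1] (or a uint8 tensor), so depending on what `vqvae.decode` actually returns, its output may need to be clamped or rescaled first. A self-contained sketch of the tensor-to-PIL conversion, with a random tensor standing in for a decoded image:

```python
import torch
from torchvision import transforms

decoded = torch.rand(3, 64, 64)      # stand-in for one decoded image
decoded = decoded.clamp(0, 1).cpu()  # ToPILImage wants [0, 1] floats (or uint8) on CPU
pil_im = transforms.ToPILImage()(decoded)
pil_im.save("sample.png")
```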
@@ -169,56 +151,12 @@ iface = gr.Interface(
     ],
     outputs=gr.Image(type="pil", label="Generated Image"),
     examples=[
-
-
-
-        ["Abstract Art, in the style of M.C. Escher"],
-        ['Surrealism, in the style of Salvador Dali'],
-        ["Romanesque Art, in the style of Leonardo da Vinci"],
-        ["landscape"],
-        ["portrait"],
-        ["sculpture"],
-        ["photo"],
-        ["figurative"],
-        ["illustration"],
-        ["still life"],
-        ["cityscape"],
-        ["marina"],
-        ["animal painting"],
-        ["graffiti"],
-        ["mythological painting"],
-        ["battle painting"],
-        ["self-portrait"],
-        ["Impressionism, oil on canvas"],
-        ["Katsushika Hokusai, The Dragon of Smoke Escaping from Mount Fuji"],
-        ["Moon Light Sonata by Basuki Abdullah"],
-        ["Two Trees by M.C. Escher"],
-        ["Futurism, in the style of Wassily Kandinsky"],
-        ["Surrealism, in the style of Edgar Degas"],
-        ["Expressionism, in the style of Wassily Kandinsky"],
-        ["Futurism, in the style of Egon Schiele"],
-        ["Cubism, in the style of Gustav Klimt"],
-        ["Op Art, in the style of Marc Chagall"],
-        ["Romanticism, in the style of M.C. Escher"],
-        ["Futurism, in the style of M.C. Escher"],
-        ["Mannerism, in the style of Paul Klee"],
-        ["High Renaissance, in the style of Rembrandt"],
-        ["Magic Realism, in the style of Gustave Dore"],
-        ["Realism, in the style of Jean-Michel Basquiat"],
-        ["Art Nouveau, in the style of Paul Gauguin"],
-        ["Avant-garde, in the style of Pierre-Auguste Renoir"],
-        ["Baroque, in the style of Edward Hopper"],
-        ["Post-Impressionism, in the style of Wassily Kandinsky"],
-        ["Naturalism, in the style of Rene Magritte"],
-        ["Constructivism, in the style of Paul Cezanne"],
-        ["Abstract Expressionism, in the style of Henri Matisse"],
-        ["Pop Art, in the style of Vincent van Gogh"],
-        ["Futurism, in the style of Zdzislaw Beksinski"],
-        ["Aaron Wacker, oil on canvas"]
+        ["A beautiful sunset over the ocean"],
+        ["A futuristic cityscape at night"],
+        ["A surreal dream-like landscape"]
     ],
-    title='
-    description="
-    article='Model used is: [model card](https://huggingface.co/huggan/distill-ccld-wa).'
+    title='CLIP + Diffusion Model Image Generator',
+    description="Generate stunning images from text and image prompts using CLIP and a diffusion model.",
 )

 # 🚀 Launch the Gradio interface
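The changed hunks end just before the launch call, so it is not shown here; for context, a Gradio Space typically starts the interface defined above with nothing more than:

```python
iface.launch()  # standard Gradio launch; this line sits below the last hunk and is not touched by this commit
```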