Update app.py
app.py CHANGED
@@ -14,8 +14,8 @@ from torchvision import transforms
 from torchvision.transforms import functional as TF
 from tqdm import trange
 from transformers import CLIPProcessor, CLIPModel
-from huggingface_hub import hf_hub_download
-import gradio as gr
+from huggingface_hub import hf_hub_download
+import gradio as gr
 import math
 
 # -----------------------------------------------------------------------------
@@ -27,10 +27,6 @@ import math
 class VQVAE2(nn.Module):
     def __init__(self, n_embed=8192, embed_dim=256, ch=128):
         super().__init__()
-        # This is a simplified placeholder. The actual architecture would be more complex.
-        # The key is having a 'decode' method that matches the state_dict.
-        # A full implementation would require the original model's architecture file.
-        # For this fix, we assume a basic structure that allows loading the state_dict.
         self.decoder = nn.Sequential(
             nn.Conv2d(embed_dim, ch * 4, 3, padding=1),
             nn.ReLU(),
@@ -40,40 +36,32 @@ class VQVAE2(nn.Module):
             nn.ReLU(),
             nn.ConvTranspose2d(ch, 3, 4, stride=2, padding=1),
         )
-
+
     def decode(self, latents):
-        # A real VQVAE would involve lookup tables, but for generation we only need the decoder part.
-        # This part is highly dependent on the model checkpoint.
-        # The following is a guess to make it runnable, assuming latents are ready for the decoder.
         return self.decoder(latents)
 
 # Diffusion Model Definition
 class Diffusion(nn.Module):
-    def __init__(self, n_inputs=3, n_embed=512, n_head=8, n_layer=12):
+    # FIXED: Changed n_head default from 8 to 4 to make dimensions compatible
+    def __init__(self, n_inputs=3, n_embed=512, n_head=4, n_layer=12):
         super().__init__()
-        # This is also a placeholder for the architecture.
-        # A full UNet-style model is expected here. The key is that it can be called
-        # with x, t, and conditional embeddings, and returns the predicted noise.
         self.time_embed = nn.Embedding(1000, n_inputs * 4)
         self.cond_embed = nn.Linear(n_embed, n_inputs * 4)
-
+
         self.layers = nn.ModuleList([
-            nn.TransformerEncoderLayer(d_model=n_inputs*4, nhead=n_head, dim_feedforward=2048, dropout=0.1, activation='gelu')
+            nn.TransformerEncoderLayer(d_model=n_inputs*4, nhead=n_head, dim_feedforward=2048, dropout=0.1, activation='gelu', batch_first=True)
             for _ in range(n_layer)
         ])
         self.out = nn.Linear(n_inputs*4, n_inputs)
 
     def forward(self, x, t, c):
-        # A very simplified forward pass
-        # The actual model is likely a UNet with cross-attention.
         bs, ch, h, w = x.shape
         x = x.permute(0, 2, 3, 1).reshape(bs, h * w, ch)
-
+
         t_emb = self.time_embed(t.long())
         c_emb = self.cond_embed(c)
         emb = t_emb + c_emb
-
-        # This is a gross simplification; a real model would use cross-attention here.
+
         x_out = self.out(x + emb.unsqueeze(1))
         x_out = x_out.reshape(bs, h, w, ch).permute(0, 3, 1, 2)
         return x_out
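For context on the "FIXED" comment in the hunk above: `nn.TransformerEncoderLayer` builds an internal `MultiheadAttention`, which requires `d_model` to be divisible by `nhead`, and here `d_model = n_inputs * 4 = 12`, which 8 does not divide. A minimal standalone check (an illustration, not part of this commit):

import torch.nn as nn

d_model = 3 * 4  # n_inputs * 4, as in the Diffusion class above

# nhead=4 divides d_model=12, so this layer is constructed without error.
ok = nn.TransformerEncoderLayer(d_model=d_model, nhead=4, batch_first=True)

# nhead=8 does not divide 12, so construction fails
# ("embed_dim must be divisible by num_heads").
try:
    nn.TransformerEncoderLayer(d_model=d_model, nhead=8, batch_first=True)
except Exception as e:
    print("rejected:", e)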
@@ -81,42 +69,35 @@ class Diffusion(nn.Module):
 
 # Sampling Function Definitions
 def get_sigmas(n_steps):
-    """Returns the sigma schedule."""
     t = torch.linspace(1, 0, n_steps + 1)
     return ((t[:-1] ** 2) / (t[1:] ** 2) - 1).sqrt()
 
 @torch.no_grad()
 def plms_sample(model, x, steps, **kwargs):
-    """Poor Man's LMS Sampler"""
     ts = x.new_ones([x.shape[0]])
     sigmas = get_sigmas(steps)
     model_fn = lambda x, t: model(x, t * 1000, **kwargs)
-
-    x_outs = []
+
     old_denoised = None
-
+
     for i in trange(len(sigmas) -1, disable=True):
         denoised = model_fn(x, ts * sigmas[i])
-
+
         if old_denoised is None:
             d = (denoised - x) / sigmas[i]
         else:
-            d = (3 * denoised - old_denoised) / 2 - x / sigmas[i]
+            d = (3 * denoised - old_denoised) / 2 - x / sigmas[i]
 
         x = x + d * (sigmas[i+1] - sigmas[i])
         old_denoised = denoised
-        x_outs.append(x)
-    return x_outs[-1]
 
-
-
+    return x
+
 def ddim_sample(model, x, steps, eta, **kwargs):
-    # This is a placeholder for a full DDIM implementation
     print("Warning: DDIM sampler is not fully implemented. Using PLMS instead.")
     return plms_sample(model, x, steps, **kwargs)
 
 def ddpm_sample(model, x, steps, **kwargs):
-    # This is a placeholder for a full DDPM implementation
     print("Warning: DDPM sampler is not fully implemented. Using PLMS instead.")
     return plms_sample(model, x, steps, **kwargs)
 
@@ -125,16 +106,12 @@ def ddpm_sample(model, x, steps, **kwargs):
 # -----------------------------------------------------------------------------
 
 # 🖼️ Download the necessary model files from HuggingFace
-# NOTE: The HuggingFace URLs you provided might be placeholders.
-# Make sure these point to the correct model files.
 try:
-    # FIXED: Using the new hf_hub_download function with keyword arguments
     vqvae_model_path = hf_hub_download(repo_id="dalle-mini/vqgan_imagenet_f16_16384", filename="flax_model.msgpack")
     diffusion_model_path = hf_hub_download(repo_id="huggingface/dalle-2", filename="diffusion_model.ckpt")
 except Exception as e:
     print(f"Could not download models. Please ensure the HuggingFace URLs are correct.")
     print("Using placeholder models which will not produce good images.")
-    # Create dummy files if download fails to allow script to run
     Path("vqvae_model.ckpt").touch()
     Path("diffusion_model.ckpt").touch()
     vqvae_model_path = "vqvae_model.ckpt"
@@ -142,26 +119,17 @@ except Exception as e:
 
 
 # 📐 Utility Functions: Math and images, what could go wrong?
-# These functions help parse prompts and resize/crop images to fit nicely
-
 def parse_prompt(prompt, default_weight=3.):
-    """
-    🎯 Parses a prompt into text and weight.
-    """
     vals = prompt.rsplit(':', 1)
     vals = vals + ['', default_weight][len(vals):]
     return vals[0], float(vals[1])
 
 def resize_and_center_crop(image, size):
-    """
-    ✂️ Resize and crop image to center it beautifully.
-    """
     fac = max(size[0] / image.size[0], size[1] / image.size[1])
     image = image.resize((int(fac * image.size[0]), int(fac * image.size[1])), Image.LANCZOS)
     return TF.center_crop(image, size[::-1])
 
 # 🧠 Model loading: the brain of our operation! 🔥
-
 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 print('Using device:', device)
 print('loading models... 🛠️')
@@ -171,23 +139,17 @@ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 # Load VQ-VAE-2 Autoencoder
-# NOTE: The VQVAE2 class is a placeholder. Loading a real checkpoint will likely fail
-# unless the class definition perfectly matches the architecture of the saved model.
 try:
     vqvae = VQVAE2()
-    # vqvae.load_state_dict(torch.load(vqvae_model_path, map_location=device))
     print("Skipping VQVAE weight loading due to placeholder architecture.")
 except Exception as e:
     print(f"Could not load VQVAE model: {e}. Using placeholder.")
     vqvae = VQVAE2()
 vqvae.eval().requires_grad_(False).to(device)
 
-
 # Load Diffusion Model
-# NOTE: The Diffusion class is a placeholder. This will also likely fail.
 try:
     diffusion_model = Diffusion()
-    # diffusion_model.load_state_dict(torch.load(diffusion_model_path, map_location=device))
     print("Skipping Diffusion Model weight loading due to placeholder architecture.")
 except Exception as e:
     print(f"Could not load Diffusion model: {e}. Using placeholder.")
@@ -196,27 +158,19 @@ diffusion_model = diffusion_model.to(device).eval().requires_grad_(False)
 
 
 # 🎨 The key function: Where the magic happens!
-# This is where we generate images based on text and image prompts
-
 def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method='ddim', eta=None):
-    """
-    🖼️ Generates a list of PIL images based on given text and image prompts.
-    """
     zero_embed = torch.zeros([1, clip_model.config.projection_dim], device=device)
     target_embeds, weights = [zero_embed], []
 
-    # Parse text prompts and encode with CLIP
     for prompt in prompts:
         inputs = clip_processor(text=prompt, return_tensors="pt").to(device)
         text_embed = clip_model.get_text_features(**inputs).float()
         target_embeds.append(text_embed)
         weights.append(1.0)
 
-    # Correctly process image prompts from Gradio
-    # Assign a default weight for image prompts
     image_prompt_weight = 1.0
     for image_path in images:
-        if image_path:
+        if image_path:
             try:
                 img = Image.open(image_path).convert('RGB')
                 img = resize_and_center_crop(img, (224, 224))
@@ -227,28 +181,23 @@ def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method
             except Exception as e:
                 print(f"Warning: Could not process image prompt {image_path}. Error: {e}")
 
-
-    # Adjust weights and set seed
     weights = torch.tensor([1 - sum(weights), *weights], device=device)
     torch.manual_seed(seed)
 
-    # 💡 Model function with classifier-free guidance
    def cfg_model_fn(x, t):
         n = x.shape[0]
         n_conds = len(target_embeds)
         x_in = x.repeat([n_conds, 1, 1, 1])
         t_in = t.repeat([n_conds])
         embed_in = torch.cat(target_embeds).repeat_interleave(n, 0)
-
-        # Ensure correct dimensions for the placeholder Diffusion model
+
         if isinstance(diffusion_model, Diffusion):
-            embed_in = embed_in[:, :512]
-
+            embed_in = embed_in[:, :512]
+
         vs = diffusion_model(x_in, t_in, embed_in).view([n_conds, n, *x.shape[1:]])
         v = vs.mul(weights[:, None, None, None, None]).sum(0)
         return v
 
-    # 🎞️ Run the sampler to generate images
     def run(x, steps):
         if method == 'ddpm':
             return ddpm_sample(cfg_model_fn, x, steps)
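The weighting in `cfg_model_fn` above is a classifier-free-guidance-style blend: the unconditional prediction (the `zero_embed` slot) gets weight `1 - sum(weights)` and each prompt keeps its own weight. A small numeric sketch of that blend, using hypothetical tensors, as an illustration only and not part of this commit:

import torch

# Two "predictions": index 0 unconditional, index 1 conditioned on the text prompt.
vs = torch.stack([torch.full((1, 3, 2, 2), 5.0),   # unconditional
                  torch.full((1, 3, 2, 2), 2.0)])  # text-conditioned
# One prompt with weight 1.0 -> weights = [1 - 1.0, 1.0] = [0.0, 1.0],
# so the blend collapses to the text-conditioned prediction.
weights = torch.tensor([0.0, 1.0])
v = vs.mul(weights[:, None, None, None, None]).sum(0)
print(v.mean())  # tensor(2.)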
@@ -258,52 +207,39 @@ def generate(n=1, prompts=['a red circle'], images=[], seed=42, steps=15, method
             return plms_sample(cfg_model_fn, x, steps)
         assert False, f"Unknown method: {method}"
 
-    # 🏃♂️ Generate the output images
     batch_size = n
     x = torch.randn([n, 3, 64, 64], device=device)
-
     pil_ims = []
     for i in trange(0, n, batch_size):
         cur_batch_size = min(n - i, batch_size)
         out_latents = run(x[i:i + cur_batch_size], steps)
-
-        # The VQVAE expects specific dimensions. Adjusting for the placeholder.
-        # This will likely need tuning for the real model.
+
         if isinstance(vqvae, VQVAE2):
-
-
-
-            quant_guess = F.gumbel_softmax(out_latents, hard=True).permute(0, 2, 3, 1) # (B, H, W, C)
-            pil_ims.append(transforms.ToPILImage()(quant_guess[0].permute(2, 0, 1)))
+            outs = vqvae.decode(out_latents)
+            for j, out in enumerate(outs):
+                pil_ims.append(transforms.ToPILImage()(out.clamp(0, 1)))
         else:
             outs = vqvae.decode(out_latents)
             for j, out in enumerate(outs):
                 pil_ims.append(transforms.ToPILImage()(out.clamp(0, 1)))
-
     return pil_ims
 
 # 🖌️ Interface: Gradio's brush to paint the UI
 def gen_ims(prompt, im_prompt=None, seed=None, n_steps=10, method='plms'):
-    """
-    💡 Gradio function to wrap image generation.
-    """
     if seed is None:
         seed = random.randint(0, 10000)
     prompts = [prompt]
     im_prompts = []
     if im_prompt is not None:
         im_prompts = [im_prompt]
-
     try:
         pil_ims = generate(n=1, prompts=prompts, images=im_prompts, seed=seed, steps=n_steps, method=method)
         return pil_ims[0]
     except Exception as e:
         print(f"ERROR during generation: {e}")
-        # Return a blank image on failure
         return Image.new('RGB', (256, 256), color = 'red')
 
-
-# 🖼️ Gradio UI: The interface where users can input text or image prompts
+# 🖼️ Gradio UI
 iface = gr.Interface(
     fn=gen_ims,
     inputs=[