Keshabwi66 committed
Commit 8f0759c · verified · 1 Parent(s): d21101d

Update app.py

Files changed (1)
  1. app.py +195 -208
app.py CHANGED
@@ -1,243 +1,230 @@
  import sys
  import os
  sys.path.append('./')
- os.system("pip install huggingface_hub==0.24.7")
  os.system("pip install gradio accelerate==0.25.0 torchmetrics==1.2.1 tqdm==4.66.1 fastapi==0.111.0 transformers==4.36.2 diffusers==0.25 einops==0.7.0 bitsandbytes scipy==1.11.1 opencv-python gradio==4.24.0 fvcore cloudpickle omegaconf pycocotools basicsr av onnxruntime==1.16.2 peft==0.11.1 huggingface_hub==0.24.7 --no-deps")
- import gradio as gr
- import torch
  import spaces
  from fastapi import FastAPI
-
  app = FastAPI()
- from PIL import Image
- import torch.nn.functional as F
- from transformers import CLIPImageProcessor
-
- # Add necessary imports and initialize the model as in your code...
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Literal
- import matplotlib.pyplot as plt

- import torch.utils.data as data
- import torchvision
- import numpy as np
  import torch
- import torch.nn.functional as F
- from accelerate.logging import get_logger
- from accelerate.utils import set_seed
  from torchvision import transforms
-
- from diffusers import AutoencoderKL, DDPMScheduler
- from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModelWithProjection, CLIPTextModelWithProjection, CLIPTextModel
-
-
- from src.unet_hacked_tryon import UNet2DConditionModel
- from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
- from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline
- # Define a class to hold configuration arguments
- class Args:
-     def __init__(self):
-         self.pretrained_model_name_or_path = "yisol/IDM-VTON"
-         self.width = 768
-         self.height = 1024
-         self.num_inference_steps = 10
-         self.seed = 42
-         self.guidance_scale = 2.0
-         self.mixed_precision = None

  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

- def pil_to_tensor(images):
-     images = np.array(images).astype(np.float32) / 255.0
-     images = torch.from_numpy(images.transpose(2, 0, 1))
-     return images
-
-
-
- args = Args()

- # Define the data type for model weights
- weight_dtype = torch.float16

- if args.seed is not None:
-     set_seed(args.seed)
-
-
- # Load scheduler, tokenizer and models.
-
- noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
- vae = AutoencoderKL.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="vae",
-     torch_dtype=torch.float16,
- )
  unet = UNet2DConditionModel.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="unet",
-     torch_dtype=torch.float16,
- )
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="image_encoder",
-     torch_dtype=torch.float16,
- )
- unet_encoder = UNet2DConditionModel_ref.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="unet_encoder",
-     torch_dtype=torch.float16,
- )
  text_encoder_one = CLIPTextModel.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="text_encoder",
-     torch_dtype=torch.float16,
- )
  text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="text_encoder_2",
-     torch_dtype=torch.float16,
  )

- tokenizer_one = AutoTokenizer.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="tokenizer",
-     revision=None,
-     use_fast=False,
- )
- tokenizer_two = AutoTokenizer.from_pretrained(
-     args.pretrained_model_name_or_path,
-     subfolder="tokenizer_2",
-     revision=None,
-     use_fast=False,
- )
- # Freeze vae and text_encoder and set unet to trainable
- unet.requires_grad_(False)
- vae.requires_grad_(False)
  image_encoder.requires_grad_(False)
- unet_encoder.requires_grad_(False)
  text_encoder_one.requires_grad_(False)
  text_encoder_two.requires_grad_(False)
- unet_encoder.requires_grad_(False)
- unet.eval()
- unet_encoder.eval()
-

  pipe = TryonPipeline.from_pretrained(
-     args.pretrained_model_name_or_path,
-     unet=unet,
-     vae=vae,
-     feature_extractor=CLIPImageProcessor(),
-     text_encoder=text_encoder_one,
-     text_encoder_2=text_encoder_two,
-     tokenizer=tokenizer_one,
-     tokenizer_2=tokenizer_two,
-     scheduler=noise_scheduler,
-     image_encoder=image_encoder,
-     unet_encoder=unet_encoder,
-     torch_dtype=torch.float16,
- )
  @spaces.GPU
- def generate_virtual_try_on(person_image, cloth_image, mask_image, pose_image, cloth_des):
      pipe.to(device)
-     # Prepare the input images as tensors
-     person_image = person_image.resize((args.width, args.height))
-     cloth_image = cloth_image.resize((args.width, args.height))
-     mask_image = mask_image.resize((args.width, args.height))
-     pose_image = pose_image.resize((args.width, args.height))
-     # Define transformations
-     transform = transforms.Compose([
-         transforms.ToTensor(),
-         transforms.Normalize([0.5], [0.5]),
-     ])
-     guidance_scale = 2.0
-     seed = 42
-
-     to_tensor = transforms.ToTensor()
-
-     person_tensor = transform(person_image).unsqueeze(0).to(device)  # Add batch dimension
-     cloth_pure = transform(cloth_image).unsqueeze(0).to(device)
-     mask_tensor = to_tensor(mask_image)[:1].unsqueeze(0).to(device)  # Keep only one channel
-     pose_tensor = transform(pose_image).unsqueeze(0).to(device)

-     # Prepare text prompts
-     prompt = ["A person wearing the cloth" + cloth_des]  # Example prompt
-     negative_prompt = ["monochrome, lowres, bad anatomy, worst quality, low quality"]
-
-     # Encode prompts
-     with torch.inference_mode():
-         (
-             prompt_embeds,
-             negative_prompt_embeds,
-             pooled_prompt_embeds,
-             negative_pooled_prompt_embeds,
-         ) = pipe.encode_prompt(
-             prompt,
-             num_images_per_prompt=1,
-             do_classifier_free_guidance=True,
-             negative_prompt=negative_prompt,
-         )
-     prompt_cloth = ["a photo of" + cloth_des]
-     with torch.inference_mode():
-         (
-             prompt_embeds_c,
-             _,
-             _,
-             _,
-         ) = pipe.encode_prompt(
-             prompt_cloth,
-             num_images_per_prompt=1,
-             do_classifier_free_guidance=False,
-             negative_prompt=negative_prompt,
-         )

-     # Encode garment using IP-Adapter
-     clip_processor = CLIPImageProcessor()
-     image_embeds = clip_processor(images=cloth_image, return_tensors="pt").pixel_values.to(device)

-     # Generate the image
-     generator = torch.Generator(pipe.device).manual_seed(seed) if seed is not None else None

-     with torch.no_grad():
-         images = pipe(
-             prompt_embeds=prompt_embeds.to(device, torch.float16),
-             negative_prompt_embeds=negative_prompt_embeds.to(device, torch.float16),
-             pooled_prompt_embeds=pooled_prompt_embeds.to(device, torch.float16),
-             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device, torch.float16),
-             num_inference_steps=args.num_inference_steps,
-             generator=generator,
-             strength=1.0,
-             pose_img=pose_tensor.to(device, torch.float16),
-             text_embeds_cloth=prompt_embeds_c.to(device, torch.float16),
-             cloth=cloth_pure.to(device, torch.float16),
-             mask_image=mask_tensor.to(device, torch.float16),
-             image=(person_tensor + 1.0) / 2.0,
-             height=args.height,
-             width=args.width,
-             guidance_scale=guidance_scale,
-             ip_adapter_image=image_embeds.to(device, torch.float16),
-         )[0]
-
-     # Convert output image to PIL format for display
-     generated_image = transforms.ToPILImage()(images[0])
-     return generated_image
-
- # Create Gradio interface
- iface = gr.Interface(
-     fn=generate_virtual_try_on,
-     inputs=[
-         gr.Image(type="pil", label="Person Image"),
-         gr.Image(type="pil", label="Cloth Image"),
-         gr.Image(type="pil", label="Mask Image"),
-         gr.Image(type="pil", label="Pose Image"),
-         gr.Textbox(label="cloth_des"),  # Add text input
-     ],
-     outputs=gr.Image(type="pil", label="Generated Image"),
- )

- # Launch the interface
- iface.launch()
 
  import sys
  import os
+
  sys.path.append('./')
  os.system("pip install gradio accelerate==0.25.0 torchmetrics==1.2.1 tqdm==4.66.1 fastapi==0.111.0 transformers==4.36.2 diffusers==0.25 einops==0.7.0 bitsandbytes scipy==1.11.1 opencv-python gradio==4.24.0 fvcore cloudpickle omegaconf pycocotools basicsr av onnxruntime==1.16.2 peft==0.11.1 huggingface_hub==0.24.7 --no-deps")
  import spaces
  from fastapi import FastAPI
  app = FastAPI()

+ from PIL import Image
+ import gradio as gr
+ from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline
+ from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
+ from src.unet_hacked_tryon import UNet2DConditionModel
+ from transformers import (
+     CLIPImageProcessor,
+     CLIPVisionModelWithProjection,
+     CLIPTextModel,
+     CLIPTextModelWithProjection,
+ )
+ from diffusers import DDPMScheduler, AutoencoderKL
+ from typing import List

  import torch
+ import os
+ from transformers import AutoTokenizer
+ import numpy as np
+ from utils_mask import get_mask_location
  from torchvision import transforms
+ import apply_net
+ # Human-parsing and OpenPose wrappers instantiated below; assumed to ship with
+ # this Space alongside src/, as in the upstream yisol/IDM-VTON code.
+ from preprocess.humanparsing.run_parsing import Parsing
+ from preprocess.openpose.run_openpose import OpenPose

  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

+ def pil_to_binary_mask(pil_image, threshold=0):
+     np_image = np.array(pil_image)
+     grayscale_image = Image.fromarray(np_image).convert("L")
+     binary_mask = np.array(grayscale_image) > threshold
+     mask = np.zeros(binary_mask.shape, dtype=np.uint8)
+     for i in range(binary_mask.shape[0]):
+         for j in range(binary_mask.shape[1]):
+             if binary_mask[i, j] == True:
+                 mask[i, j] = 1
+     mask = (mask * 255).astype(np.uint8)
+     output_mask = Image.fromarray(mask)
+     return output_mask

+ base_path = 'yisol/IDM-VTON'

  unet = UNet2DConditionModel.from_pretrained(
+     base_path,
+     subfolder="unet",
+     torch_dtype=torch.float16,
+ )
+ unet.requires_grad_(False)
+ tokenizer_one = AutoTokenizer.from_pretrained(
+     base_path,
+     subfolder="tokenizer",
+     revision=None,
+     use_fast=False,
+ )
+ tokenizer_two = AutoTokenizer.from_pretrained(
+     base_path,
+     subfolder="tokenizer_2",
+     revision=None,
+     use_fast=False,
+ )
+ noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
+
  text_encoder_one = CLIPTextModel.from_pretrained(
+     base_path,
+     subfolder="text_encoder",
+     torch_dtype=torch.float16,
+ )
  text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
+     base_path,
+     subfolder="text_encoder_2",
+     torch_dtype=torch.float16,
+ )
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+     base_path,
+     subfolder="image_encoder",
+     torch_dtype=torch.float16,
  )
+ vae = AutoencoderKL.from_pretrained(base_path,
+     subfolder="vae",
+     torch_dtype=torch.float16,
+ )

+ # "stabilityai/stable-diffusion-xl-base-1.0",
+ UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
+     base_path,
+     subfolder="unet_encoder",
+     torch_dtype=torch.float16,
+ )
+
+ parsing_model = Parsing(0)
+ openpose_model = OpenPose(0)
+
+ UNet_Encoder.requires_grad_(False)
  image_encoder.requires_grad_(False)
+ vae.requires_grad_(False)
+ unet.requires_grad_(False)
  text_encoder_one.requires_grad_(False)
  text_encoder_two.requires_grad_(False)
+ tensor_transfrom = transforms.Compose(
+     [
+         transforms.ToTensor(),
+         transforms.Normalize([0.5], [0.5]),
+     ]
+ )

  pipe = TryonPipeline.from_pretrained(
+     base_path,
+     unet=unet,
+     vae=vae,
+     feature_extractor=CLIPImageProcessor(),
+     text_encoder=text_encoder_one,
+     text_encoder_2=text_encoder_two,
+     tokenizer=tokenizer_one,
+     tokenizer_2=tokenizer_two,
+     scheduler=noise_scheduler,
+     image_encoder=image_encoder,
+     torch_dtype=torch.float16,
+ )
+ pipe.unet_encoder = UNet_Encoder
+
  @spaces.GPU
+ def start_tryon(person_img, pose_img, mask_img, cloth_img, garment_des, denoise_steps, seed):
+     # Assuming device is set up (e.g., "cuda" or "cpu")
+     openpose_model.preprocessor.body_estimation.model.to(device)
      pipe.to(device)
+     pipe.unet_encoder.to(device)
+
+     # Resize and prepare images
+     garm_img = cloth_img.convert("RGB").resize((768, 1024))
+     human_img = person_img.convert("RGB").resize((768, 1024))
+     mask = mask_img.convert("RGB").resize((768, 1024))
+
+     # Prepare pose image (already uploaded)
+     pose_img = pose_img.resize((768, 1024))
+
+     # Generate text embeddings for garment description
+     prompt = f"model is wearing {garment_des}"
+     negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

+     # Embedding generation for prompts
+     with torch.no_grad():
+         with torch.cuda.amp.autocast():
+             (
+                 prompt_embeds,
+                 negative_prompt_embeds,
+                 pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds,
+             ) = pipe.encode_prompt(
+                 prompt,
+                 num_images_per_prompt=1,
+                 do_classifier_free_guidance=True,
+                 negative_prompt=negative_prompt,
+             )
+
+             # encode_prompt returns a 4-tuple; only the prompt embeddings are
+             # needed for the garment branch.
+             prompt_embeds_cloth, _, _, _ = pipe.encode_prompt(
+                 f"a photo of {garment_des}",
+                 num_images_per_prompt=1,
+                 do_classifier_free_guidance=False,
+                 negative_prompt=negative_prompt,
+             )
+
+             # Convert images to tensors for processing
+             pose_img_tensor = tensor_transfrom(pose_img).unsqueeze(0).to(device, torch.float16)
+             garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device, torch.float16)
+             mask_tensor = tensor_transfrom(mask).unsqueeze(0).to(device, torch.float16)
+
+             # Prepare the generator with optional seed
+             generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+
+             # Generate the virtual try-on output image
+             images = pipe(
+                 prompt_embeds=prompt_embeds.to(device, torch.float16),
+                 negative_prompt_embeds=negative_prompt_embeds.to(device, torch.float16),
+                 pooled_prompt_embeds=pooled_prompt_embeds.to(device, torch.float16),
+                 negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device, torch.float16),
+                 num_inference_steps=denoise_steps,
+                 generator=generator,
+                 strength=1.0,
+                 pose_img=pose_img_tensor.to(device, torch.float16),
+                 text_embeds_cloth=prompt_embeds_cloth.to(device, torch.float16),
+                 cloth=garm_tensor.to(device, torch.float16),
+                 mask_image=mask_tensor,
+                 image=human_img,
+                 height=1024,
+                 width=768,
+                 ip_adapter_image=garm_img.resize((768, 1024)),
+                 guidance_scale=2.0,
+             )[0]

+     return images

+ # Gradio interface for the virtual try-on model
+ image_blocks = gr.Blocks().queue()

+ with image_blocks as demo:
+     gr.Markdown("## SmartLuga")
+     with gr.Row():
+         with gr.Column():
+             imgs = gr.ImageEditor(sources='upload', type="pil", label='Human Image', interactive=True)
+             with gr.Row():
+                 is_checked_crop = gr.Checkbox(label="Use auto-crop & resizing", value=False)

+         with gr.Column():
+             garm_img = gr.Image(label="Garment", sources='upload', type="pil")
+             with gr.Row(elem_id="prompt-container"):
+                 prompt = gr.Textbox(placeholder="Description of garment ex) Short Sleeve Round Neck T-shirts", show_label=False, elem_id="prompt")
+
+         with gr.Column():
+             masked_img = gr.Image(label="Masked image output", elem_id="masked-img", show_share_button=False)
+
+         with gr.Column():
+             image_out = gr.Image(label="Output", elem_id="output-img", show_share_button=False)
+
+         with gr.Column():
+             try_button = gr.Button(value="Try-on")
+             with gr.Accordion(label="Advanced Settings", open=False):
+                 with gr.Row():
+                     denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
+                     seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)
+
+     try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, denoise_steps, seed], outputs=[image_out, masked_img], api_name='tryon')

+ image_blocks.launch()
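
Note on the event wiring: as committed, the Blocks layout has no pose or mask upload, try_button.click passes five inputs to the seven-parameter start_tryon, and the function returns the pipeline's image list while two output components are wired up. Below is a minimal sketch of one way to make the wiring consistent. It assumes start_tryon as defined above is in scope; the run_tryon adapter and the pose_in, mask_in, garm_in, and desc component names are hypothetical, and it assumes Gradio 4.x, where gr.ImageEditor (type="pil") yields a dict whose "background" entry holds the uploaded photo.

import gradio as gr

# Hypothetical adapter: maps the Blocks components onto start_tryon's seven
# parameters and returns two values so both output slots are filled.
def run_tryon(person_editor_value, pose_image, mask_image, garment_image,
              garment_desc, denoise_steps, seed):
    # Extract the uploaded photo from the ImageEditor's layered value.
    person_image = person_editor_value["background"]
    result = start_tryon(person_image, pose_image, mask_image, garment_image,
                         garment_desc, int(denoise_steps), int(seed))
    # start_tryon returns the pipeline's image list; show the first image and
    # echo the mask that was supplied.
    return result[0], mask_image

with gr.Blocks() as demo:
    gr.Markdown("## SmartLuga")
    with gr.Row():
        with gr.Column():
            imgs = gr.ImageEditor(sources='upload', type="pil", label='Human Image', interactive=True)
            pose_in = gr.Image(label="Pose Image", sources='upload', type="pil")
            mask_in = gr.Image(label="Mask Image", sources='upload', type="pil")
        with gr.Column():
            garm_in = gr.Image(label="Garment", sources='upload', type="pil")
            desc = gr.Textbox(placeholder="Description of garment ex) Short Sleeve Round Neck T-shirts", show_label=False)
        with gr.Column():
            masked_img = gr.Image(label="Masked image output", show_share_button=False)
            image_out = gr.Image(label="Output", show_share_button=False)
    with gr.Row():
        try_button = gr.Button(value="Try-on")
        denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
        seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)

    try_button.click(
        fn=run_tryon,
        inputs=[imgs, pose_in, mask_in, garm_in, desc, denoise_steps, seed],
        outputs=[image_out, masked_img],
        api_name="tryon",
    )

demo.launch()

Keeping start_tryon unchanged and adapting its inputs and outputs in a small wrapper keeps the sketch decoupled from the pipeline code; the same reconciliation could equally be done by editing the function signature and the click call directly.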