Spaces:

InstantX
/

InstantID

Starting on Zero

App Files Files Community

102

hysts HF Staff commited on Jan 24, 2024

Commit

b106068

1 Parent(s): 0add3fc

Apply formatter

Browse files

Files changed (1) hide show

app.py +187 -114

app.py CHANGED Viewed

@@ -1,25 +1,22 @@
-import os
-import cv2
 import math
-import spaces
-import torch
 import random
-import numpy as np
-import PIL
-from PIL import Image
 import diffusers
-from diffusers.utils import load_image
-from diffusers.models import ControlNetModel
 import insightface
 from insightface.app import FaceAnalysis
-from style_template import styles
 from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline
-import gradio as gr
 # global variable
 MAX_SEED = np.iinfo(np.int32).max
@@ -29,22 +26,27 @@ DEFAULT_STYLE_NAME = "Watercolor"
 # download checkpoints
 from huggingface_hub import hf_hub_download
 hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/config.json", local_dir="./checkpoints")
-hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/diffusion_pytorch_model.safetensors", local_dir="./checkpoints")
 hf_hub_download(repo_id="InstantX/InstantID", filename="ip-adapter.bin", local_dir="./checkpoints")
 # Load face encoder
-app = FaceAnalysis(name='antelopev2', root='./', providers=['CPUExecutionProvider'])
 app.prepare(ctx_id=0, det_size=(640, 640))
 # Path to InstantID models
-face_adapter = f'./checkpoints/ip-adapter.bin'
-controlnet_path = f'./checkpoints/ControlNetModel'
 # Load pipeline
 controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
-base_model_path = 'wangqixun/YamerMIX_v8'
 pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
     base_model_path,
@@ -55,54 +57,68 @@ pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
 )
 pipe.cuda()
 pipe.load_ip_adapter_instantid(face_adapter)
-pipe.image_proj_model.to('cuda')
-pipe.unet.to('cuda')
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     return seed
 def swap_to_gallery(images):
-    return gr.update(value=images, visible=True), gr.update(visible=True), gr.update(visible=False)
 def upload_example_to_gallery(images, prompt, style, negative_prompt):
-    return gr.update(value=images, visible=True), gr.update(visible=True), gr.update(visible=False)
 def remove_back_to_files():
     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
 def remove_tips():
     return gr.update(visible=False)
 def get_example():
     case = [
         [
-            ['./examples/yann-lecun_resize.jpg'],
             "a man",
             "Snow",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
         ],
         [
-            ['./examples/musk_resize.jpeg'],
             "a man",
             "Mars",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
         ],
         [
-            ['./examples/sam_resize.png'],
             "a man",
             "Jungle",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, gree",
         ],
         [
-            ['./examples/schmidhuber_resize.png'],
             "a man",
             "Neon",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
         ],
         [
-            ['./examples/kaifu_resize.png'],
             "a man",
             "Vibrant Color",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
@@ -110,16 +126,20 @@ def get_example():
     ]
     return case
 def run_for_examples(face_files, prompt, style, negative_prompt):
     return generate_image(face_files, None, prompt, negative_prompt, style, True, 30, 0.8, 0.8, 5, 42)
 def convert_from_cv2_to_image(img: np.ndarray) -> Image:
     return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
 def convert_from_image_to_cv2(img: Image) -> np.ndarray:
     return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
-def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)]):
     stickwidth = 4
     limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
     kps = np.array(kps)
@@ -135,7 +155,9 @@ def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,2
         y = kps[index][:, 1]
         length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
         angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
-        polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
         out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
     out_img = (out_img * 0.6).astype(np.uint8)
@@ -147,89 +169,115 @@ def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,2
     out_img_pil = Image.fromarray(out_img.astype(np.uint8))
     return out_img_pil
-def resize_img(input_image, max_side=1280, min_side=1024, size=None,
-               pad_to_max_side=False, mode=PIL.Image.BILINEAR, base_pixel_number=64):
-        w, h = input_image.size
-        if size is not None:
-            w_resize_new, h_resize_new = size
-        else:
-            ratio = min_side / min(h, w)
-            w, h = round(ratio*w), round(ratio*h)
-            ratio = max_side / max(h, w)
-            input_image = input_image.resize([round(ratio*w), round(ratio*h)], mode)
-            w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
-            h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
-        input_image = input_image.resize([w_resize_new, h_resize_new], mode)
-        if pad_to_max_side:
-            res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
-            offset_x = (max_side - w_resize_new) // 2
-            offset_y = (max_side - h_resize_new) // 2
-            res[offset_y:offset_y+h_resize_new, offset_x:offset_x+w_resize_new] = np.array(input_image)
-            input_image = Image.fromarray(res)
-        return input_image
 def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
     p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
-    return p.replace("{prompt}", positive), n + ' ' + negative
-@spaces.GPU
-def generate_image(face_image, pose_image, prompt, negative_prompt, style_name, enhance_face_region, num_steps, identitynet_strength_ratio, adapter_strength_ratio, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
     if face_image is None:
         raise gr.Error(f"Cannot find any input face image! Please upload the face image")
     if prompt is None:
         prompt = "a person"
     # apply the style template
     prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
     face_image = load_image(face_image[0])
     face_image = resize_img(face_image)
     face_image_cv2 = convert_from_image_to_cv2(face_image)
     height, width, _ = face_image_cv2.shape
     # Extract face features
     face_info = app.get(face_image_cv2)
     if len(face_info) == 0:
         raise gr.Error(f"Cannot find any face in the image! Please upload another person image")
-    face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1]  # only use the maximum face
-    face_emb = face_info['embedding']
-    face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info['kps'])
     if pose_image is not None:
         pose_image = load_image(pose_image[0])
         pose_image = resize_img(pose_image)
         pose_image_cv2 = convert_from_image_to_cv2(pose_image)
         face_info = app.get(pose_image_cv2)
         if len(face_info) == 0:
             raise gr.Error(f"Cannot find any face in the reference image! Please upload another person image")
         face_info = face_info[-1]
-        face_kps = draw_kps(pose_image, face_info['kps'])
         width, height = face_kps.size
     if enhance_face_region:
         control_mask = np.zeros([height, width, 3])
-        x1, y1, x2, y2 = face_info['bbox']
         x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
         control_mask[y1:y2, x1:x2] = 255
         control_mask = Image.fromarray(control_mask.astype(np.uint8))
     else:
         control_mask = None
     generator = torch.Generator(device=device).manual_seed(seed)
     print("Start inference...")
     print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")
     pipe.set_ip_adapter_scale(adapter_strength_ratio)
     images = pipe(
         prompt=prompt,
@@ -242,11 +290,12 @@ def generate_image(face_image, pose_image, prompt, negative_prompt, style_name,
         guidance_scale=guidance_scale,
         height=height,
         width=width,
-        generator=generator
     ).images
     return images, gr.update(visible=True)
 ### Description
 title = r"""
 <h1 align="center">InstantID: Zero-shot Identity-Preserving Generation in Seconds</h1>
@@ -289,46 +338,44 @@ tips = r"""
 4. Find a good base model always makes a difference.
 """
-css = '''
 .gradio-container {width: 85% !important}
-'''
 with gr.Blocks(css=css) as demo:
     # description
     gr.Markdown(title)
     gr.Markdown(description)
     with gr.Row():
         with gr.Column():
             # upload face image
-            face_files = gr.Files(
-                        label="Upload a photo of your face",
-                        file_types=["image"]
-                    )
             uploaded_faces = gr.Gallery(label="Your images", visible=False, columns=1, rows=1, height=512)
             with gr.Column(visible=False) as clear_button_face:
-                remove_and_reupload_faces = gr.ClearButton(value="Remove and upload new ones", components=face_files, size="sm")
             # optional: upload a reference pose image
-            pose_files = gr.Files(
-                        label="Upload a reference pose image (optional)",
-                        file_types=["image"]
-                    )
             uploaded_poses = gr.Gallery(label="Your images", visible=False, columns=1, rows=1, height=512)
             with gr.Column(visible=False) as clear_button_pose:
-                remove_and_reupload_poses = gr.ClearButton(value="Remove and upload new ones", components=pose_files, size="sm")
             # prompt
-            prompt = gr.Textbox(label="Prompt",
-                       info="Give simple prompt is enough to achieve good face fedility",
-                       placeholder="A photo of a person",
-                       value="")
             submit = gr.Button("Submit", variant="primary")
             style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
             # strength
             identitynet_strength_ratio = gr.Slider(
                 label="IdentityNet strength (for fedility)",
@@ -344,14 +391,14 @@ with gr.Blocks(css=css) as demo:
                 step=0.05,
                 value=0.80,
             )
             with gr.Accordion(open=False, label="Advanced Options"):
                 negative_prompt = gr.Textbox(
-                    label="Negative Prompt",
                     placeholder="low quality",
                     value="(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
                 )
-                num_steps = gr.Slider(
                     label="Number of sample steps",
                     minimum=20,
                     maximum=100,
@@ -377,17 +424,31 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             gallery = gr.Gallery(label="Generated Images")
-            usage_tips = gr.Markdown(label="Usage tips of InstantID", value=tips ,visible=False)
-        face_files.upload(fn=swap_to_gallery, inputs=face_files, outputs=[uploaded_faces, clear_button_face, face_files])
-        pose_files.upload(fn=swap_to_gallery, inputs=pose_files, outputs=[uploaded_poses, clear_button_pose, pose_files])
-        remove_and_reupload_faces.click(fn=remove_back_to_files, outputs=[uploaded_faces, clear_button_face, face_files])
-        remove_and_reupload_poses.click(fn=remove_back_to_files, outputs=[uploaded_poses, clear_button_pose, pose_files])
         submit.click(
             fn=remove_tips,
-            outputs=usage_tips,
         ).then(
             fn=randomize_seed_fn,
             inputs=[seed, randomize_seed],
@@ -396,21 +457,33 @@ with gr.Blocks(css=css) as demo:
             api_name=False,
         ).then(
             fn=generate_image,
-            inputs=[face_files, pose_files, prompt, negative_prompt, style, enhance_face_region, num_steps, identitynet_strength_ratio, adapter_strength_ratio, guidance_scale, seed],
-            outputs=[gallery, usage_tips]
         )
     gr.Examples(
         examples=get_example(),
         inputs=[face_files, prompt, style, negative_prompt],
         run_on_click=True,
         fn=upload_example_to_gallery,
         outputs=[uploaded_faces, clear_button_face, face_files],
-        cache_examples=True
     )
     gr.Markdown(article)
 demo.queue(api_open=False)
-demo.launch()

 import math
+import os
 import random
+import cv2
 import diffusers
+import gradio as gr
 import insightface
+import numpy as np
+import PIL
+import spaces
+import torch
+from diffusers.models import ControlNetModel
+from diffusers.utils import load_image
 from insightface.app import FaceAnalysis
+from PIL import Image
 from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline
+from style_template import styles
 # global variable
 MAX_SEED = np.iinfo(np.int32).max
 # download checkpoints
 from huggingface_hub import hf_hub_download
 hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/config.json", local_dir="./checkpoints")
+hf_hub_download(
+    repo_id="InstantX/InstantID",
+    filename="ControlNetModel/diffusion_pytorch_model.safetensors",
+    local_dir="./checkpoints",
+)
 hf_hub_download(repo_id="InstantX/InstantID", filename="ip-adapter.bin", local_dir="./checkpoints")
 # Load face encoder
+app = FaceAnalysis(name="antelopev2", root="./", providers=["CPUExecutionProvider"])
 app.prepare(ctx_id=0, det_size=(640, 640))
 # Path to InstantID models
+face_adapter = f"./checkpoints/ip-adapter.bin"
+controlnet_path = f"./checkpoints/ControlNetModel"
 # Load pipeline
 controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+base_model_path = "wangqixun/YamerMIX_v8"
 pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
     base_model_path,
 )
 pipe.cuda()
 pipe.load_ip_adapter_instantid(face_adapter)
+pipe.image_proj_model.to("cuda")
+pipe.unet.to("cuda")
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     return seed
 def swap_to_gallery(images):
+    return (
+        gr.update(value=images, visible=True),
+        gr.update(visible=True),
+        gr.update(visible=False),
+    )
 def upload_example_to_gallery(images, prompt, style, negative_prompt):
+    return (
+        gr.update(value=images, visible=True),
+        gr.update(visible=True),
+        gr.update(visible=False),
+    )
 def remove_back_to_files():
     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
 def remove_tips():
     return gr.update(visible=False)
 def get_example():
     case = [
         [
+            ["./examples/yann-lecun_resize.jpg"],
             "a man",
             "Snow",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
         ],
         [
+            ["./examples/musk_resize.jpeg"],
             "a man",
             "Mars",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
         ],
         [
+            ["./examples/sam_resize.png"],
             "a man",
             "Jungle",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, gree",
         ],
         [
+            ["./examples/schmidhuber_resize.png"],
             "a man",
             "Neon",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
         ],
         [
+            ["./examples/kaifu_resize.png"],
             "a man",
             "Vibrant Color",
             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
     ]
     return case
 def run_for_examples(face_files, prompt, style, negative_prompt):
     return generate_image(face_files, None, prompt, negative_prompt, style, True, 30, 0.8, 0.8, 5, 42)
 def convert_from_cv2_to_image(img: np.ndarray) -> Image:
     return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
 def convert_from_image_to_cv2(img: Image) -> np.ndarray:
     return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+def draw_kps(image_pil, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
     stickwidth = 4
     limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
     kps = np.array(kps)
         y = kps[index][:, 1]
         length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
         angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
+        polygon = cv2.ellipse2Poly(
+            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
+        )
         out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
     out_img = (out_img * 0.6).astype(np.uint8)
     out_img_pil = Image.fromarray(out_img.astype(np.uint8))
     return out_img_pil
+def resize_img(
+    input_image,
+    max_side=1280,
+    min_side=1024,
+    size=None,
+    pad_to_max_side=False,
+    mode=PIL.Image.BILINEAR,
+    base_pixel_number=64,
+):
+    w, h = input_image.size
+    if size is not None:
+        w_resize_new, h_resize_new = size
+    else:
+        ratio = min_side / min(h, w)
+        w, h = round(ratio * w), round(ratio * h)
+        ratio = max_side / max(h, w)
+        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
+        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
+        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
+    input_image = input_image.resize([w_resize_new, h_resize_new], mode)
+    if pad_to_max_side:
+        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
+        offset_x = (max_side - w_resize_new) // 2
+        offset_y = (max_side - h_resize_new) // 2
+        res[offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new] = np.array(input_image)
+        input_image = Image.fromarray(res)
+    return input_image
 def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
     p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
+    return p.replace("{prompt}", positive), n + " " + negative
+@spaces.GPU
+def generate_image(
+    face_image,
+    pose_image,
+    prompt,
+    negative_prompt,
+    style_name,
+    enhance_face_region,
+    num_steps,
+    identitynet_strength_ratio,
+    adapter_strength_ratio,
+    guidance_scale,
+    seed,
+    progress=gr.Progress(track_tqdm=True),
+):
     if face_image is None:
         raise gr.Error(f"Cannot find any input face image! Please upload the face image")
     if prompt is None:
         prompt = "a person"
     # apply the style template
     prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
     face_image = load_image(face_image[0])
     face_image = resize_img(face_image)
     face_image_cv2 = convert_from_image_to_cv2(face_image)
     height, width, _ = face_image_cv2.shape
     # Extract face features
     face_info = app.get(face_image_cv2)
     if len(face_info) == 0:
         raise gr.Error(f"Cannot find any face in the image! Please upload another person image")
+    face_info = sorted(
+        face_info,
+        key=lambda x: (x["bbox"][2] - x["bbox"][0]) * x["bbox"][3] - x["bbox"][1],
+    )[
+        -1
+    ]  # only use the maximum face
+    face_emb = face_info["embedding"]
+    face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"])
     if pose_image is not None:
         pose_image = load_image(pose_image[0])
         pose_image = resize_img(pose_image)
         pose_image_cv2 = convert_from_image_to_cv2(pose_image)
         face_info = app.get(pose_image_cv2)
         if len(face_info) == 0:
             raise gr.Error(f"Cannot find any face in the reference image! Please upload another person image")
         face_info = face_info[-1]
+        face_kps = draw_kps(pose_image, face_info["kps"])
         width, height = face_kps.size
     if enhance_face_region:
         control_mask = np.zeros([height, width, 3])
+        x1, y1, x2, y2 = face_info["bbox"]
         x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
         control_mask[y1:y2, x1:x2] = 255
         control_mask = Image.fromarray(control_mask.astype(np.uint8))
     else:
         control_mask = None
     generator = torch.Generator(device=device).manual_seed(seed)
     print("Start inference...")
     print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")
     pipe.set_ip_adapter_scale(adapter_strength_ratio)
     images = pipe(
         prompt=prompt,
         guidance_scale=guidance_scale,
         height=height,
         width=width,
+        generator=generator,
     ).images
     return images, gr.update(visible=True)
 ### Description
 title = r"""
 <h1 align="center">InstantID: Zero-shot Identity-Preserving Generation in Seconds</h1>
 4. Find a good base model always makes a difference.
 """
+css = """
 .gradio-container {width: 85% !important}
+"""
 with gr.Blocks(css=css) as demo:
     # description
     gr.Markdown(title)
     gr.Markdown(description)
     with gr.Row():
         with gr.Column():
             # upload face image
+            face_files = gr.Files(label="Upload a photo of your face", file_types=["image"])
             uploaded_faces = gr.Gallery(label="Your images", visible=False, columns=1, rows=1, height=512)
             with gr.Column(visible=False) as clear_button_face:
+                remove_and_reupload_faces = gr.ClearButton(
+                    value="Remove and upload new ones", components=face_files, size="sm"
+                )
             # optional: upload a reference pose image
+            pose_files = gr.Files(label="Upload a reference pose image (optional)", file_types=["image"])
             uploaded_poses = gr.Gallery(label="Your images", visible=False, columns=1, rows=1, height=512)
             with gr.Column(visible=False) as clear_button_pose:
+                remove_and_reupload_poses = gr.ClearButton(
+                    value="Remove and upload new ones", components=pose_files, size="sm"
+                )
             # prompt
+            prompt = gr.Textbox(
+                label="Prompt",
+                info="Give simple prompt is enough to achieve good face fedility",
+                placeholder="A photo of a person",
+                value="",
+            )
             submit = gr.Button("Submit", variant="primary")
             style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
             # strength
             identitynet_strength_ratio = gr.Slider(
                 label="IdentityNet strength (for fedility)",
                 step=0.05,
                 value=0.80,
             )
             with gr.Accordion(open=False, label="Advanced Options"):
                 negative_prompt = gr.Textbox(
+                    label="Negative Prompt",
                     placeholder="low quality",
                     value="(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
                 )
+                num_steps = gr.Slider(
                     label="Number of sample steps",
                     minimum=20,
                     maximum=100,
         with gr.Column():
             gallery = gr.Gallery(label="Generated Images")
+            usage_tips = gr.Markdown(label="Usage tips of InstantID", value=tips, visible=False)
+        face_files.upload(
+            fn=swap_to_gallery,
+            inputs=face_files,
+            outputs=[uploaded_faces, clear_button_face, face_files],
+        )
+        pose_files.upload(
+            fn=swap_to_gallery,
+            inputs=pose_files,
+            outputs=[uploaded_poses, clear_button_pose, pose_files],
+        )
+        remove_and_reupload_faces.click(
+            fn=remove_back_to_files,
+            outputs=[uploaded_faces, clear_button_face, face_files],
+        )
+        remove_and_reupload_poses.click(
+            fn=remove_back_to_files,
+            outputs=[uploaded_poses, clear_button_pose, pose_files],
+        )
         submit.click(
             fn=remove_tips,
+            outputs=usage_tips,
         ).then(
             fn=randomize_seed_fn,
             inputs=[seed, randomize_seed],
             api_name=False,
         ).then(
             fn=generate_image,
+            inputs=[
+                face_files,
+                pose_files,
+                prompt,
+                negative_prompt,
+                style,
+                enhance_face_region,
+                num_steps,
+                identitynet_strength_ratio,
+                adapter_strength_ratio,
+                guidance_scale,
+                seed,
+            ],
+            outputs=[gallery, usage_tips],
         )
     gr.Examples(
         examples=get_example(),
         inputs=[face_files, prompt, style, negative_prompt],
         run_on_click=True,
         fn=upload_example_to_gallery,
         outputs=[uploaded_faces, clear_button_face, face_files],
+        cache_examples=True,
     )
     gr.Markdown(article)
 demo.queue(api_open=False)
+demo.launch()