adaface-neurips committed
Commit f0b9ada (1 parent: 2a110ec)

Integrate do_neg_id_prompt_weight, fix bugs, various refinements

adaface/adaface_infer.py CHANGED
@@ -151,5 +151,5 @@ if __name__ == "__main__":
     adaface.prepare_adaface_embeddings(image_paths, init_id_embs,
                                        perturb_at_stage='img_prompt_emb',
                                        perturb_std=args.perturb_std, update_text_encoder=True)
-    images = adaface(noise, args.prompt, None, args.guidance_scale, args.out_image_count, verbose=True)
+    images = adaface(noise, args.prompt, None, 'append', args.guidance_scale, args.out_image_count, verbose=True)
     save_images(images, args.num_images_per_row, subject_name, f"guide{args.guidance_scale}", args.perturb_std)
adaface/adaface_translate.py CHANGED
@@ -195,7 +195,7 @@ if __name__ == "__main__":
     # A noise level of 0.08 could change gender, but 0.06 is usually safe.
     # The returned adaface_subj_embs are already incorporated in the text encoder, and not used explicitly.
    # NOTE: We assume out_count_per_input_image == 1, so that the output images are of the same number as the input images.
-    out_images = adaface(in_images, args.prompt, None, args.guidance_scale, num_out_images, ref_img_strength=args.ref_img_strength)
+    out_images = adaface(in_images, args.prompt, None, 'append', args.guidance_scale, num_out_images, ref_img_strength=args.ref_img_strength)

     for img_i, img in enumerate(out_images):
         # out_images: subj_1, subj_2, ..., subj_n, subj_1, subj_2, ..., subj_n, ...
adaface/adaface_wrapper.py CHANGED
@@ -217,6 +217,9 @@ class AdaFaceWrapper(nn.Module):
             self.placeholder_tokens_strs.append(placeholder_tokens_str)

         self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
+        # all_null_placeholder_tokens_str: ", , , , ..." (20 times).
+        # It contains only commas and spaces of the same length, but no actual tokens.
+        self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))

         # Add the new tokens to the tokenizer.
         num_added_tokens = tokenizer.add_tokens(self.all_placeholder_tokens)
@@ -226,7 +229,7 @@ class AdaFaceWrapper(nn.Module):
                             " `subject_string` that is not already in the tokenizer.")

         print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")
-
+
         # placeholder_token_ids: [49408, ..., 49423].
         self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
         #print("New tokens:", self.placeholder_token_ids)
@@ -247,22 +250,30 @@ class AdaFaceWrapper(nn.Module):
             token_embeds[token_id] = subj_embs[i]
         print(f"Updated {len(self.placeholder_token_ids)} tokens ({self.all_placeholder_tokens_str}) in the text encoder.")

-    def update_prompt(self, prompt, placeholder_tokens_pos='postpend'):
+    def update_prompt(self, prompt, placeholder_tokens_pos='append',
+                      use_null_placeholders=False):
         if prompt is None:
             prompt = ""

+        if use_null_placeholders:
+            all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
+        else:
+            all_placeholder_tokens_str = self.all_placeholder_tokens_str
+
         # Delete the subject_string from the prompt.
-        re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
-        re.sub(r'\b' + self.subject_string + r'\b,?', "", prompt)
+        prompt = re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
+        prompt = re.sub(r'\b' + self.subject_string + r'\b,?', "", prompt)
         # Previously, arc2face ada prompts work better if they are prepended to the prompt,
         # and consistentID ada prompts work better if they are appended to the prompt.
         # When we do joint training, it seems both work better if they are appended to the prompt.
         # Therefore we simply append all placeholder_tokens_str's to the prompt.
         # NOTE: Prepending them hurts compositional prompts.
         if placeholder_tokens_pos == 'prepend':
-            prompt = self.all_placeholder_tokens_str + " " + prompt
-        elif placeholder_tokens_pos == 'postpend':
-            prompt = prompt + " " + self.all_placeholder_tokens_str
+            prompt = all_placeholder_tokens_str + " " + prompt
+        elif placeholder_tokens_pos == 'append':
+            prompt = prompt + " " + all_placeholder_tokens_str
+        else:
+            breakpoint()

         return prompt

@@ -293,22 +304,7 @@ class AdaFaceWrapper(nn.Module):
         self.update_text_encoder_subj_embeddings(all_adaface_subj_embs)
         return all_adaface_subj_embs

-    def encode_prompt(self, prompt, negative_prompt=None,
-                      placeholder_tokens_pos='postpend',
-                      device=None, verbose=False):
-        if negative_prompt is None:
-            negative_prompt = self.negative_prompt
-
-        if device is None:
-            device = self.device
-
-        prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos)
-        if verbose:
-            print(f"Subject prompt: {prompt}")
-
-        # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
-        # So we manually move it to GPU here.
-        self.pipeline.text_encoder.to(device)
+    def diffusers_encode_prompts(self, prompt, negative_prompt, device):
         # pooled_prompt_embeds_, negative_pooled_prompt_embeds_ are used by text2img3 and flux.
         pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = None, None

@@ -317,7 +313,8 @@ class AdaFaceWrapper(nn.Module):
             # prompt_embeds_, negative_prompt_embeds_: [77, 768] -> [1, 77, 768].
             prompt_embeds_, negative_prompt_embeds_ = \
                 self.pipeline._encode_prompt(prompt, device=device, num_images_per_prompt=1,
-                                             do_classifier_free_guidance=True, negative_prompt=negative_prompt)
+                                             do_classifier_free_guidance=True,
+                                             negative_prompt=negative_prompt)
             prompt_embeds_ = prompt_embeds_.unsqueeze(0)
             negative_prompt_embeds_ = negative_prompt_embeds_.unsqueeze(0)
         else:
@@ -351,12 +348,58 @@ class AdaFaceWrapper(nn.Module):
                                               num_images_per_prompt=1,
                                               do_classifier_free_guidance=True,
                                               negative_prompt=negative_prompt)
-
+
+        return prompt_embeds_, negative_prompt_embeds_, \
+               pooled_prompt_embeds_, negative_pooled_prompt_embeds_
+
+    def encode_prompt(self, prompt, negative_prompt=None,
+                      placeholder_tokens_pos='append',
+                      do_neg_id_prompt_weight=0,
+                      device=None, verbose=False):
+        if negative_prompt is None:
+            negative_prompt = self.negative_prompt
+
+        if device is None:
+            device = self.device
+
+        prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos)
+        if verbose:
+            print(f"Subject prompt:\n{prompt}")
+
+        if do_neg_id_prompt_weight > 0:
+            # Use 'prepend' for the negative prompt, since it's long and we want to make sure
+            # the placeholder tokens are not cut off.
+            negative_prompt0 = negative_prompt
+            negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend')
+            null_negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend',
+                                                      use_null_placeholders=True)
+            if verbose:
+                print(f"do_neg_id_prompt_weight: {do_neg_id_prompt_weight}")
+                #print(f"Negative prompt:\n{negative_prompt}")
+                #print(f"Null negative prompt:\n{null_negative_prompt}")
+
+        else:
+            null_negative_prompt = None
+
+        # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
+        # So we manually move it to GPU here.
+        self.pipeline.text_encoder.to(device)
+
+        prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
+            self.diffusers_encode_prompts(prompt, negative_prompt, device)
+
+        if 0 < do_neg_id_prompt_weight < 1:
+            _, negative_prompt_embeds_null, _, _ = \
+                self.diffusers_encode_prompts(prompt, null_negative_prompt, device)
+            negative_prompt_embeds_ = negative_prompt_embeds_ * do_neg_id_prompt_weight + \
+                                      negative_prompt_embeds_null * (1 - do_neg_id_prompt_weight)
+
         return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_

     # ref_img_strength is used only in the img2img pipeline.
     def forward(self, noise, prompt, negative_prompt=None,
-                placeholder_tokens_pos='postpend',
+                placeholder_tokens_pos='append',
+                do_neg_id_prompt_weight=0,
                 guidance_scale=6.0, out_image_count=4,
                 ref_img_strength=0.8, generator=None, verbose=False):
         noise = noise.to(device=self.device, dtype=torch.float16)
@@ -368,6 +411,7 @@ class AdaFaceWrapper(nn.Module):
         negative_pooled_prompt_embeds_ = \
             self.encode_prompt(prompt, negative_prompt,
                                placeholder_tokens_pos=placeholder_tokens_pos,
+                               do_neg_id_prompt_weight=do_neg_id_prompt_weight,
                                device=self.device, verbose=verbose)
         # Repeat the prompt embeddings for all images in the batch.
         prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
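For reference, a minimal sketch of the new do_neg_id_prompt_weight blending introduced in encode_prompt() above, using toy tensors. The tensor names and random values here are illustrative only; the arithmetic mirrors the changed lines (assuming SD1.5-style CLIP embeddings of shape [1, 77, 768]):

```python
import torch

# Hypothetical precomputed negative-prompt embeddings, shaped like SD1.5 CLIP outputs: [1, 77, 768].
neg_embeds_with_id = torch.randn(1, 77, 768)  # negative prompt with the ID placeholder tokens prepended
neg_embeds_null_id = torch.randn(1, 77, 768)  # same prompt, but with null (", ") placeholders instead

w = 0.2  # do_neg_id_prompt_weight; 0 disables the mixing, values in (0, 1) blend the two.
# Linear interpolation, as in encode_prompt():
# negative_prompt_embeds_ = with_id * w + null_id * (1 - w)
neg_embeds_mixed = neg_embeds_with_id * w + neg_embeds_null_id * (1 - w)
print(neg_embeds_mixed.shape)  # torch.Size([1, 77, 768])
```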
adaface/face_id_to_ada_prompt.py CHANGED
@@ -672,6 +672,10 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
        # are not used and will be released soon.
        # Only the consistentID modules and bise_net are used.
        assert base_model_path is not None, "base_model_path should be provided."
+       # Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
+       # because we've overloaded .to() to convert consistentID specific modules as well,
+       # but diffusers will call .to(dtype) in .from_single_file(),
+       # and at that moment, the consistentID specific modules are not loaded yet.
        pipe = ConsistentIDPipeline.from_single_file(base_model_path)
        pipe.load_ConsistentID_model(consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
                                     bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
adaface/test_img_prompt_model.py CHANGED
@@ -159,7 +159,7 @@ if __name__ == "__main__":
         pipeline.encode_prompt(comp_prompt, device='cuda', num_images_per_prompt=args.out_image_count,
                                do_classifier_free_guidance=True, negative_prompt=negative_prompt)
     #pipeline.text_encoder = text_encoder
-    # Postpend the id prompt embeddings to the prompt embeddings.
+    # Append the id prompt embeddings to the prompt embeddings.
     # For arc2face, id_prompt_emb can be either pre- or post-pended.
     # But for ConsistentID, id_prompt_emb has to be **post-pended**. Otherwise, the result images are blank.
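As context for the comment above, appending the ID prompt embeddings to the text prompt embeddings is a concatenation along the token dimension. A rough sketch (the shapes and names here are assumptions for illustration, not taken from the script):

```python
import torch

prompt_embeds = torch.randn(4, 77, 768)   # [batch, tokens, dim], e.g. from pipeline.encode_prompt()
id_prompt_emb = torch.randn(1, 16, 768)   # hypothetical ID prompt embeddings with 16 tokens

# "Post-pend": concatenate the ID embeddings after the text tokens (required by ConsistentID).
appended = torch.cat([prompt_embeds, id_prompt_emb.expand(4, -1, -1)], dim=1)
# "Pre-pend" would be torch.cat([id_prompt_emb.expand(4, -1, -1), prompt_embeds], dim=1) (fine for arc2face).
print(appended.shape)  # torch.Size([4, 93, 768])
```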
 
adaface/util.py CHANGED
@@ -225,16 +225,18 @@ class UNetEnsemble(nn.Module):

 def create_consistentid_pipeline(base_model_path="models/sd15-dste8-vae.safetensors",
                                  dtype=torch.float16, unet_only=False):
-    pipe = ConsistentIDPipeline.from_single_file(
-        base_model_path,
-        torch_dtype=dtype,
-    )
+    pipe = ConsistentIDPipeline.from_single_file(base_model_path)
     # consistentID specific modules are still in fp32. Will be converted to fp16
     # later with .to(device, torch_dtype) by the caller.
     pipe.load_ConsistentID_model(
         consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
         bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
     )
+    # Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
+    # because we've overloaded .to() to convert consistentID specific modules as well,
+    # but diffusers will call .to(dtype) in .from_single_file(),
+    # and at that moment, the consistentID specific modules are not loaded yet.
+    pipe.to(dtype=dtype)
     # We load the pipeline first, then use the unet in the pipeline.
     # Since the pipeline initialization will load LoRA into the unet,
     # now we have the unet with LoRA loaded.
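The comments above describe a load-then-convert pattern: the pipeline's .to() is (per the comment) overloaded to also convert the ConsistentID-specific modules, so the dtype conversion must wait until those modules exist. A generic, hedged sketch of that pattern with a toy class (the class and attribute names are illustrative, not the repo's API):

```python
import torch
import torch.nn as nn

class ToyPipeline:
    """Toy stand-in: .to() must also convert modules that are loaded after construction."""
    def __init__(self):
        self.unet = nn.Linear(4, 4)   # present from the start
        self.extra = None             # analogous to the ConsistentID-specific modules

    def load_extras(self):
        self.extra = nn.Linear(4, 4)

    def to(self, dtype=None):
        if dtype is not None:
            self.unet.to(dtype)
            if self.extra is not None:   # silently skipped if the extras are not loaded yet
                self.extra.to(dtype)
        return self

pipe = ToyPipeline()            # analogous to from_single_file(base_model_path) without torch_dtype
pipe.load_extras()              # analogous to pipe.load_ConsistentID_model(...)
pipe.to(dtype=torch.float16)    # now both unet and the extras end up in fp16
print(pipe.extra.weight.dtype)  # torch.float16
```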
app.py CHANGED
@@ -24,9 +24,14 @@ parser = argparse.ArgumentParser()
 parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                     choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
 parser.add_argument('--adaface_ckpt_path', type=str,
-                    default='models/adaface/VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt')
+                    default='models/adaface/VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-30000.pt')
 parser.add_argument('--model_style_type', type=str, default='realistic',
                     choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
+parser.add_argument("--guidance_scale", type=float, default=8.0,
+                    help="The guidance scale for the diffusion model. Default: 8.0")
+parser.add_argument("--do_neg_id_prompt_weight", type=float, default=0.2,
+                    help="The weight of the ID prompt embeddings added into the negative prompt (0 disables it). Default: 0.2")
+
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
 args = parser.parse_args()
@@ -116,7 +121,7 @@ def gen_init_images(uploaded_image_paths, model_style_type, prompt, out_image_co
     # samples: A list of PIL Image instances.
     with torch.no_grad():
         samples = adaface(noise, prompt,
-                          placeholder_tokens_pos='prepend',
+                          placeholder_tokens_pos='append',
                           out_image_count=out_image_count, verbose=True)

     face_paths = []
@@ -133,8 +138,8 @@
 @spaces.GPU(duration=90)
 def generate_video(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
                    init_image_strength, init_image_final_weight, model_style_type,
-                   prompt, negative_prompt, num_steps, video_length, guidance_scale, seed,
-                   attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
+                   prompt, negative_prompt, num_steps, video_length, guidance_scale, do_neg_id_prompt_weight,
+                   seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                    is_adaface_enabled, adaface_ckpt_path, adaface_power_scale,
                    id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):

@@ -169,7 +174,7 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
         prompt_img_lists.append(load_image(face_path).resize((224,224)))

     if adaface is None or not is_adaface_enabled:
-        adaface_prompt_embeds = None
+        adaface_prompt_embeds, negative_prompt_embeds = None, None
         image_embed_cfg_scales = (1, 1)
     else:
         if (adaface_ckpt_path is not None and adaface_ckpt_path.strip() != '') \
@@ -184,9 +189,10 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
                                                update_text_encoder=True)

             # adaface_prompt_embeds: [1, 77, 768].
-            adaface_prompt_embeds, _, _, _ = adaface.encode_prompt(prompt,
-                                                 placeholder_tokens_pos='prepend',
-                                                 verbose=True)
+            adaface_prompt_embeds, negative_prompt_embeds, _, _ = \
+                adaface.encode_prompt(prompt, placeholder_tokens_pos='append',
+                                      do_neg_id_prompt_weight=do_neg_id_prompt_weight,
+                                      verbose=True)

         image_embed_cfg_scales = (image_embed_cfg_begin_scale, image_embed_cfg_end_scale)

@@ -206,7 +212,7 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
         init_image_strength = (init_image_strength, init_image_final_weight),
         prompt = prompt,
         negative_prompt = negative_prompt,
-        adaface_prompt_embeds = adaface_prompt_embeds,
+        adaface_prompt_embeds = (adaface_prompt_embeds, negative_prompt_embeds),
         # adaface_power_scale is not so useful, and when it's set >= 2, weird artifacts appear.
         # Here it's limited to 0.7~1.3.
         adaface_power_scale = adaface_power_scale,
@@ -241,7 +247,9 @@ with gr.Blocks(css=css) as demo:

     ❗️**Tips**❗️
     - You can upload one or more subject images for generating ID-specific video.
-    - Try different parameter combinations for the best generation quality.
+    - If the face dominates the video frames, try increasing the Weight of ID prompt in the negative prompt, at the cost of a slight drop in ID authenticity.
+    - If the face loses focus, try increasing the guidance scale. At the same time, increase the Weight of ID prompt in the negative prompt proportionally.
+    - If the motion looks off, e.g., running, try increasing the number of sampling steps.
     - Usage explanations and demos: [Readme](https://huggingface.co/spaces/adaface-neurips/adaface-animate/blob/main/README2.md).
     - AdaFace Text-to-Image: <a href="https://huggingface.co/spaces/adaface-neurips/adaface" style="display: inline-flex; align-items: center;">
     AdaFace
@@ -284,20 +292,20 @@ with gr.Blocks(css=css) as demo:

             prompt = gr.Dropdown(label="Prompt",
                         info="Try something like 'man/woman walking on the beach'.",
-                        value="portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                        value="((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
                         allow_custom_value=True,
                         filterable=False,
                         choices=[
-                            "portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
-                            "portrait, walking on the beach, sunset, orange sky, eye level shot",
-                            "portrait, in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
-                            "portrait, dancing pose among folks in a park, waving hands",
-                            "portrait, in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
-                            "portrait, jedi wielding a lightsaber, star wars, full body view, eye level shot",
-                            "portrait, playing guitar on a boat, ocean waves",
-                            "portrait, with a passion for reading, curled up with a book in a cozy nook near a window",
-                            "portrait, running pose in a park, eye level shot",
-                            "portrait, in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
+                            "((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                            "walking on the beach, sunset, orange sky, eye level shot",
+                            "in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
+                            "dancing pose among folks in a park, waving hands",
+                            "in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
+                            "jedi wielding a lightsaber, star wars, full body view, eye level shot",
+                            "playing guitar on a boat, ocean waves",
+                            "with a passion for reading, curled up with a book in a cozy nook near a window",
+                            #"running pose in a park, full body view, eye level shot",
+                            "in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
                         ])

             init_image_strength = gr.Slider(
@@ -321,16 +329,25 @@ with gr.Blocks(css=css) as demo:
                 label="Base Model Style Type",
                 info="Switching the base model type will take 10~20 seconds to reload the model",
                 value=args.model_style_type.capitalize(),
-                choices=["Rrealistic", "Anime", "Photorealistic"],
+                choices=["Realistic", "Anime"], #"Photorealistic"],
                 allow_custom_value=False,
                 filterable=False,
             )
             guidance_scale = gr.Slider(
                 label="Guidance scale",
+                info="If > 10, there may be artifacts.",
                 minimum=1.0,
-                maximum=8.0,
-                step=0.5,
-                value=6,
+                maximum=12.0,
+                step=1,
+                value=args.guidance_scale,
+            )
+
+            do_neg_id_prompt_weight = gr.Slider(
+                label="Weight of ID prompt in the negative prompt",
+                minimum=0.0,
+                maximum=0.9,
+                step=0.1,
+                value=args.do_neg_id_prompt_weight,
             )

             seed = gr.Slider(
@@ -351,10 +368,10 @@ with gr.Blocks(css=css) as demo:
                 value="(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime), text, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, bare breasts, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, long neck, UnrealisticDream",
             )
             num_steps = gr.Slider(
-                label="Number of sampling steps",
+                label="Number of sampling steps. More steps give a better composition, but take longer.",
                 minimum=30,
-                maximum=80,
-                step=1,
+                maximum=70,
+                step=10,
                 value=40,
             )

@@ -448,7 +465,7 @@ with gr.Blocks(css=css) as demo:
         fn=generate_video,
         inputs=[image_container, files, init_img_files, init_img_selected_idx, init_image_strength,
                 init_image_final_weight, model_style_type,
-                prompt, negative_prompt, num_steps, video_length, guidance_scale,
+                prompt, negative_prompt, num_steps, video_length, guidance_scale, do_neg_id_prompt_weight,
                 seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                 is_adaface_enabled, adaface_ckpt_path, adaface_power_scale, id_animator_anneal_steps],
         outputs=[result_video]
faceadapter/face_adapter.py CHANGED
@@ -307,10 +307,15 @@ class FaceAdapterPlusForVideoLora(FaceAdapterLora):
             negative_prompt=negative_prompt,
         )

-        if adaface_prompt_embeds is not None:
+        if adaface_prompt_embeds is not None and adaface_prompt_embeds[0] is not None:
+            negative_prompt_embeds0 = negative_prompt_embeds_
+            adaface_prompt_embeds, negative_prompt_embeds_ = adaface_prompt_embeds
             # self.torch_type == torch.float16. adaface_prompt_embeds is torch.float32.
-            prompt_embeds_ = adaface_prompt_embeds.repeat(num_samples, 1, 1).to(dtype=self.torch_type) \
-                             * adaface_power_scale
+            prompt_embeds_ = adaface_prompt_embeds.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
+            negative_prompt_embeds_ = negative_prompt_embeds_.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
+            if adaface_power_scale != 1.0:
+                prompt_embeds_ = prompt_embeds_ * adaface_power_scale - negative_prompt_embeds0 * (1 - adaface_power_scale)
+
         # Note to balance image_prompt_embeds with uncond_image_prompt_embeds after scaling.
         image_prompt_embeds_begin = image_prompt_embeds * image_embed_cfg_scales[0] + uncond_image_prompt_embeds * (1 - image_embed_cfg_scales[0])
         image_prompt_embeds_end = image_prompt_embeds * image_embed_cfg_scales[1] + uncond_image_prompt_embeds * (1 - image_embed_cfg_scales[1])
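A small numeric sketch of the adaface_power_scale step in the hunk above, using toy tensors. The shapes and names are illustrative; this only reproduces the arithmetic of the changed lines, under the comment's suggested 0.7~1.3 range:

```python
import torch

prompt_embeds_          = torch.randn(1, 77, 768)  # AdaFace positive prompt embeddings
negative_prompt_embeds0 = torch.randn(1, 77, 768)  # the original (non-AdaFace) negative embeddings

adaface_power_scale = 1.2  # kept within roughly 0.7~1.3 per the comment above
if adaface_power_scale != 1.0:
    # Same arithmetic as the diff: scale the positive embeddings and counter-shift
    # by the original negative embeddings.
    prompt_embeds_ = prompt_embeds_ * adaface_power_scale \
                     - negative_prompt_embeds0 * (1 - adaface_power_scale)
print(prompt_embeds_.shape)  # torch.Size([1, 77, 768])
```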
models/adaface/{VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt → VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-30000.pt} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c66b1847072c66deaa38b9ec91c0d76ac5274dec8d02444fc9672f0defa4d156
+oid sha256:34bbdaa97fb2da9e2aae4204bfd2f5c1565a84c664520a5f537129419ecb53fa
 size 1814921594