adaface-neurips committed
Commit f0b9ada (1 parent: 2a110ec)

Integrate do_neg_id_prompt_weight, fix bugs, various refinements

adaface/adaface_infer.py CHANGED
@@ -151,5 +151,5 @@ if __name__ == "__main__":
     adaface.prepare_adaface_embeddings(image_paths, init_id_embs,
                                        perturb_at_stage='img_prompt_emb',
                                        perturb_std=args.perturb_std, update_text_encoder=True)
-    images = adaface(noise, args.prompt, None, args.guidance_scale, args.out_image_count, verbose=True)
+    images = adaface(noise, args.prompt, None, 'append', args.guidance_scale, args.out_image_count, verbose=True)
     save_images(images, args.num_images_per_row, subject_name, f"guide{args.guidance_scale}", args.perturb_std)
adaface/adaface_translate.py CHANGED
@@ -195,7 +195,7 @@ if __name__ == "__main__":
     # A noise level of 0.08 could change gender, but 0.06 is usually safe.
     # The returned adaface_subj_embs are already incorporated in the text encoder, and not used explicitly.
    # NOTE: We assume out_count_per_input_image == 1, so that the output images are of the same number as the input images.
-    out_images = adaface(in_images, args.prompt, None, args.guidance_scale, num_out_images, ref_img_strength=args.ref_img_strength)
+    out_images = adaface(in_images, args.prompt, None, 'append', args.guidance_scale, num_out_images, ref_img_strength=args.ref_img_strength)

     for img_i, img in enumerate(out_images):
         # out_images: subj_1, subj_2, ..., subj_n, subj_1, subj_2, ..., subj_n, ...
adaface/adaface_wrapper.py CHANGED
@@ -217,6 +217,9 @@ class AdaFaceWrapper(nn.Module):
             self.placeholder_tokens_strs.append(placeholder_tokens_str)

         self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
+        # all_null_placeholder_tokens_str: ", , , , ..." (20 times).
+        # It contains only commas and spaces of the same length, but no actual tokens.
+        self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))

         # Add the new tokens to the tokenizer.
         num_added_tokens = tokenizer.add_tokens(self.all_placeholder_tokens)
@@ -226,7 +229,7 @@ class AdaFaceWrapper(nn.Module):
                             " `subject_string` that is not already in the tokenizer.")

         print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")
-
+
         # placeholder_token_ids: [49408, ..., 49423].
         self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
         #print("New tokens:", self.placeholder_token_ids)
@@ -247,22 +250,30 @@ class AdaFaceWrapper(nn.Module):
             token_embeds[token_id] = subj_embs[i]
         print(f"Updated {len(self.placeholder_token_ids)} tokens ({self.all_placeholder_tokens_str}) in the text encoder.")

-    def update_prompt(self, prompt, placeholder_tokens_pos='postpend'):
+    def update_prompt(self, prompt, placeholder_tokens_pos='append',
+                      use_null_placeholders=False):
         if prompt is None:
             prompt = ""

+        if use_null_placeholders:
+            all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
+        else:
+            all_placeholder_tokens_str = self.all_placeholder_tokens_str
+
         # Delete the subject_string from the prompt.
-        re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
-        re.sub(r'\b' + self.subject_string + r'\b,?', "", prompt)
+        prompt = re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
+        prompt = re.sub(r'\b' + self.subject_string + r'\b,?', "", prompt)
         # Previously, arc2face ada prompts work better if they are prepended to the prompt,
         # and consistentID ada prompts work better if they are appended to the prompt.
         # When we do joint training, it seems both work better if they are appended to the prompt.
         # Therefore we simply append all placeholder_tokens_str's to the prompt.
         # NOTE: Prepending them hurts compositional prompts.
         if placeholder_tokens_pos == 'prepend':
-            prompt = self.all_placeholder_tokens_str + " " + prompt
-        elif placeholder_tokens_pos == 'postpend':
-            prompt = prompt + " " + self.all_placeholder_tokens_str
+            prompt = all_placeholder_tokens_str + " " + prompt
+        elif placeholder_tokens_pos == 'append':
+            prompt = prompt + " " + all_placeholder_tokens_str
+        else:
+            breakpoint()

         return prompt

@@ -293,22 +304,7 @@ class AdaFaceWrapper(nn.Module):
         self.update_text_encoder_subj_embeddings(all_adaface_subj_embs)
         return all_adaface_subj_embs

-    def encode_prompt(self, prompt, negative_prompt=None,
-                      placeholder_tokens_pos='postpend',
-                      device=None, verbose=False):
-        if negative_prompt is None:
-            negative_prompt = self.negative_prompt
-
-        if device is None:
-            device = self.device
-
-        prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos)
-        if verbose:
-            print(f"Subject prompt: {prompt}")
-
-        # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
-        # So we manually move it to GPU here.
-        self.pipeline.text_encoder.to(device)
+    def diffusers_encode_prompts(self, prompt, negative_prompt, device):
         # pooled_prompt_embeds_, negative_pooled_prompt_embeds_ are used by text2img3 and flux.
         pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = None, None

@@ -317,7 +313,8 @@ class AdaFaceWrapper(nn.Module):
             # prompt_embeds_, negative_prompt_embeds_: [77, 768] -> [1, 77, 768].
             prompt_embeds_, negative_prompt_embeds_ = \
                 self.pipeline._encode_prompt(prompt, device=device, num_images_per_prompt=1,
-                                             do_classifier_free_guidance=True, negative_prompt=negative_prompt)
+                                             do_classifier_free_guidance=True,
+                                             negative_prompt=negative_prompt)
             prompt_embeds_ = prompt_embeds_.unsqueeze(0)
             negative_prompt_embeds_ = negative_prompt_embeds_.unsqueeze(0)
         else:
@@ -351,12 +348,58 @@ class AdaFaceWrapper(nn.Module):
                                               num_images_per_prompt=1,
                                               do_classifier_free_guidance=True,
                                               negative_prompt=negative_prompt)
-
+
+        return prompt_embeds_, negative_prompt_embeds_, \
+               pooled_prompt_embeds_, negative_pooled_prompt_embeds_
+
+    def encode_prompt(self, prompt, negative_prompt=None,
+                      placeholder_tokens_pos='append',
+                      do_neg_id_prompt_weight=0,
+                      device=None, verbose=False):
+        if negative_prompt is None:
+            negative_prompt = self.negative_prompt
+
+        if device is None:
+            device = self.device
+
+        prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos)
+        if verbose:
+            print(f"Subject prompt:\n{prompt}")
+
+        if do_neg_id_prompt_weight > 0:
+            # Use 'prepend' for the negative prompt, since it's long and we want to make sure
+            # the placeholder tokens are not cut off.
+            negative_prompt0 = negative_prompt
+            negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend')
+            null_negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend',
+                                                      use_null_placeholders=True)
+            if verbose:
+                print(f"do_neg_id_prompt_weight: {do_neg_id_prompt_weight}")
+                #print(f"Negative prompt:\n{negative_prompt}")
+                #print(f"Null negative prompt:\n{null_negative_prompt}")
+
+        else:
+            null_negative_prompt = None
+
+        # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
+        # So we manually move it to GPU here.
+        self.pipeline.text_encoder.to(device)
+
+        prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
+            self.diffusers_encode_prompts(prompt, negative_prompt, device)
+
+        if 0 < do_neg_id_prompt_weight < 1:
+            _, negative_prompt_embeds_null, _, _ = \
+                self.diffusers_encode_prompts(prompt, null_negative_prompt, device)
+            negative_prompt_embeds_ = negative_prompt_embeds_ * do_neg_id_prompt_weight + \
+                                      negative_prompt_embeds_null * (1 - do_neg_id_prompt_weight)
+
         return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_

     # ref_img_strength is used only in the img2img pipeline.
     def forward(self, noise, prompt, negative_prompt=None,
-                placeholder_tokens_pos='postpend',
+                placeholder_tokens_pos='append',
+                do_neg_id_prompt_weight=0,
                 guidance_scale=6.0, out_image_count=4,
                 ref_img_strength=0.8, generator=None, verbose=False):
         noise = noise.to(device=self.device, dtype=torch.float16)
@@ -368,6 +411,7 @@ class AdaFaceWrapper(nn.Module):
         negative_pooled_prompt_embeds_ = \
             self.encode_prompt(prompt, negative_prompt,
                                placeholder_tokens_pos=placeholder_tokens_pos,
+                               do_neg_id_prompt_weight=do_neg_id_prompt_weight,
                                device=self.device, verbose=verbose)
         # Repeat the prompt embeddings for all images in the batch.
         prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
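For reference, a minimal sketch of the new do_neg_id_prompt_weight blending introduced in encode_prompt() above, using toy tensors. The tensor names and random values here are illustrative only; the arithmetic mirrors the changed lines (assuming SD1.5-style CLIP embeddings of shape [1, 77, 768]):

```python
import torch

# Hypothetical precomputed negative-prompt embeddings, shaped like SD1.5 CLIP outputs: [1, 77, 768].
neg_embeds_with_id = torch.randn(1, 77, 768)  # negative prompt with the ID placeholder tokens prepended
neg_embeds_null_id = torch.randn(1, 77, 768)  # same prompt, but with null (", ") placeholders instead

w = 0.2  # do_neg_id_prompt_weight; 0 disables the mixing, values in (0, 1) blend the two.
# Linear interpolation, as in encode_prompt():
# negative_prompt_embeds_ = with_id * w + null_id * (1 - w)
neg_embeds_mixed = neg_embeds_with_id * w + neg_embeds_null_id * (1 - w)
print(neg_embeds_mixed.shape)  # torch.Size([1, 77, 768])
```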
adaface/face_id_to_ada_prompt.py CHANGED
@@ -672,6 +672,10 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
        # are not used and will be released soon.
        # Only the consistentID modules and bise_net are used.
        assert base_model_path is not None, "base_model_path should be provided."
+       # Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
+       # because we've overloaded .to() to convert consistentID specific modules as well,
+       # but diffusers will call .to(dtype) in .from_single_file(),
+       # and at that moment, the consistentID specific modules are not loaded yet.
        pipe = ConsistentIDPipeline.from_single_file(base_model_path)
        pipe.load_ConsistentID_model(consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
                                     bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
adaface/test_img_prompt_model.py CHANGED
@@ -159,7 +159,7 @@ if __name__ == "__main__":
         pipeline.encode_prompt(comp_prompt, device='cuda', num_images_per_prompt=args.out_image_count,
                                do_classifier_free_guidance=True, negative_prompt=negative_prompt)
     #pipeline.text_encoder = text_encoder
-    # Postpend the id prompt embeddings to the prompt embeddings.
+    # Append the id prompt embeddings to the prompt embeddings.
     # For arc2face, id_prompt_emb can be either pre- or post-pended.
     # But for ConsistentID, id_prompt_emb has to be **post-pended**. Otherwise, the result images are blank.
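As context for the comment above, appending the ID prompt embeddings to the text prompt embeddings is a concatenation along the token dimension. A rough sketch (the shapes and names here are assumptions for illustration, not taken from the script):

```python
import torch

prompt_embeds = torch.randn(4, 77, 768)   # [batch, tokens, dim], e.g. from pipeline.encode_prompt()
id_prompt_emb = torch.randn(1, 16, 768)   # hypothetical ID prompt embeddings with 16 tokens

# "Post-pend": concatenate the ID embeddings after the text tokens (required by ConsistentID).
appended = torch.cat([prompt_embeds, id_prompt_emb.expand(4, -1, -1)], dim=1)
# "Pre-pend" would be torch.cat([id_prompt_emb.expand(4, -1, -1), prompt_embeds], dim=1) (fine for arc2face).
print(appended.shape)  # torch.Size([4, 93, 768])
```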
 
adaface/util.py CHANGED
@@ -225,16 +225,18 @@ class UNetEnsemble(nn.Module):

 def create_consistentid_pipeline(base_model_path="models/sd15-dste8-vae.safetensors",
                                  dtype=torch.float16, unet_only=False):
-    pipe = ConsistentIDPipeline.from_single_file(
-        base_model_path,
-        torch_dtype=dtype,
-    )
+    pipe = ConsistentIDPipeline.from_single_file(base_model_path)
     # consistentID specific modules are still in fp32. Will be converted to fp16
     # later with .to(device, torch_dtype) by the caller.
     pipe.load_ConsistentID_model(
         consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
         bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
     )
+    # Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
+    # because we've overloaded .to() to convert consistentID specific modules as well,
+    # but diffusers will call .to(dtype) in .from_single_file(),
+    # and at that moment, the consistentID specific modules are not loaded yet.
+    pipe.to(dtype=dtype)
     # We load the pipeline first, then use the unet in the pipeline.
     # Since the pipeline initialization will load LoRA into the unet,
     # now we have the unet with LoRA loaded.
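The comments above describe a load-then-convert pattern: the pipeline's .to() is (per the comment) overloaded to also convert the ConsistentID-specific modules, so the dtype conversion must wait until those modules exist. A generic, hedged sketch of that pattern with a toy class (the class and attribute names are illustrative, not the repo's API):

```python
import torch
import torch.nn as nn

class ToyPipeline:
    """Toy stand-in: .to() must also convert modules that are loaded after construction."""
    def __init__(self):
        self.unet = nn.Linear(4, 4)   # present from the start
        self.extra = None             # analogous to the ConsistentID-specific modules

    def load_extras(self):
        self.extra = nn.Linear(4, 4)

    def to(self, dtype=None):
        if dtype is not None:
            self.unet.to(dtype)
            if self.extra is not None:   # silently skipped if the extras are not loaded yet
                self.extra.to(dtype)
        return self

pipe = ToyPipeline()            # analogous to from_single_file(base_model_path) without torch_dtype
pipe.load_extras()              # analogous to pipe.load_ConsistentID_model(...)
pipe.to(dtype=torch.float16)    # now both unet and the extras end up in fp16
print(pipe.extra.weight.dtype)  # torch.float16
```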
app.py CHANGED
@@ -24,9 +24,14 @@ parser = argparse.ArgumentParser()
 parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                     choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
 parser.add_argument('--adaface_ckpt_path', type=str,
-                    default='models/adaface/VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt')
+                    default='models/adaface/VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-30000.pt')
 parser.add_argument('--model_style_type', type=str, default='realistic',
                     choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
+parser.add_argument("--guidance_scale", type=float, default=8.0,
+                    help="The guidance scale for the diffusion model. Default: 8.0")
+parser.add_argument("--do_neg_id_prompt_weight", type=float, default=0.2,
+                    help="The weight of the ID prompt embeddings added into the negative prompt (0 disables it). Default: 0.2")
+
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
 args = parser.parse_args()
@@ -116,7 +121,7 @@ def gen_init_images(uploaded_image_paths, model_style_type, prompt, out_image_co
     # samples: A list of PIL Image instances.
     with torch.no_grad():
         samples = adaface(noise, prompt,
-                          placeholder_tokens_pos='prepend',
+                          placeholder_tokens_pos='append',
                           out_image_count=out_image_count, verbose=True)

     face_paths = []
@@ -133,8 +138,8 @@
 @spaces.GPU(duration=90)
 def generate_video(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
                    init_image_strength, init_image_final_weight, model_style_type,
-                   prompt, negative_prompt, num_steps, video_length, guidance_scale, seed,
-                   attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
+                   prompt, negative_prompt, num_steps, video_length, guidance_scale, do_neg_id_prompt_weight,
+                   seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                    is_adaface_enabled, adaface_ckpt_path, adaface_power_scale,
                    id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):

@@ -169,7 +174,7 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
         prompt_img_lists.append(load_image(face_path).resize((224,224)))

     if adaface is None or not is_adaface_enabled:
-        adaface_prompt_embeds = None
+        adaface_prompt_embeds, negative_prompt_embeds = None, None
         image_embed_cfg_scales = (1, 1)
     else:
         if (adaface_ckpt_path is not None and adaface_ckpt_path.strip() != '') \
@@ -184,9 +189,10 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
                                                update_text_encoder=True)

             # adaface_prompt_embeds: [1, 77, 768].
-            adaface_prompt_embeds, _, _, _ = adaface.encode_prompt(prompt,
-                                                 placeholder_tokens_pos='prepend',
-                                                 verbose=True)
+            adaface_prompt_embeds, negative_prompt_embeds, _, _ = \
+                adaface.encode_prompt(prompt, placeholder_tokens_pos='append',
+                                      do_neg_id_prompt_weight=do_neg_id_prompt_weight,
+                                      verbose=True)

         image_embed_cfg_scales = (image_embed_cfg_begin_scale, image_embed_cfg_end_scale)

@@ -206,7 +212,7 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
         init_image_strength = (init_image_strength, init_image_final_weight),
         prompt = prompt,
         negative_prompt = negative_prompt,
-        adaface_prompt_embeds = adaface_prompt_embeds,
+        adaface_prompt_embeds = (adaface_prompt_embeds, negative_prompt_embeds),
         # adaface_power_scale is not so useful, and when it's set >= 2, weird artifacts appear.
         # Here it's limited to 0.7~1.3.
         adaface_power_scale = adaface_power_scale,
@@ -241,7 +247,9 @@ with gr.Blocks(css=css) as demo:

     ❗️**Tips**❗️
     - You can upload one or more subject images for generating ID-specific video.
-    - Try different parameter combinations for the best generation quality.
+    - If the face dominates the video frames, try increasing the Weight of ID prompt in the negative prompt, at the cost of a slight drop in ID authenticity.
+    - If the face loses focus, try increasing the guidance scale. At the same time, increase the Weight of ID prompt in the negative prompt proportionally.
+    - If the motion looks off, e.g., running, try increasing the number of sampling steps.
     - Usage explanations and demos: [Readme](https://huggingface.co/spaces/adaface-neurips/adaface-animate/blob/main/README2.md).
     - AdaFace Text-to-Image: <a href="https://huggingface.co/spaces/adaface-neurips/adaface" style="display: inline-flex; align-items: center;">
     AdaFace
@@ -284,20 +292,20 @@ with gr.Blocks(css=css) as demo:

             prompt = gr.Dropdown(label="Prompt",
                         info="Try something like 'man/woman walking on the beach'.",
-                        value="portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                        value="((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
                         allow_custom_value=True,
                         filterable=False,
                         choices=[
-                            "portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
-                            "portrait, walking on the beach, sunset, orange sky, eye level shot",
-                            "portrait, in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
-                            "portrait, dancing pose among folks in a park, waving hands",
-                            "portrait, in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
-                            "portrait, jedi wielding a lightsaber, star wars, full body view, eye level shot",
-                            "portrait, playing guitar on a boat, ocean waves",
-                            "portrait, with a passion for reading, curled up with a book in a cozy nook near a window",
-                            "portrait, running pose in a park, eye level shot",
-                            "portrait, in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
+                            "((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                            "walking on the beach, sunset, orange sky, eye level shot",
+                            "in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
+                            "dancing pose among folks in a park, waving hands",
+                            "in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
+                            "jedi wielding a lightsaber, star wars, full body view, eye level shot",
+                            "playing guitar on a boat, ocean waves",
+                            "with a passion for reading, curled up with a book in a cozy nook near a window",
+                            #"running pose in a park, full body view, eye level shot",
+                            "in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
                         ])

             init_image_strength = gr.Slider(
@@ -321,16 +329,25 @@ with gr.Blocks(css=css) as demo:
                 label="Base Model Style Type",
                 info="Switching the base model type will take 10~20 seconds to reload the model",
                 value=args.model_style_type.capitalize(),
-                choices=["Rrealistic", "Anime", "Photorealistic"],
+                choices=["Realistic", "Anime"], #"Photorealistic"],
                 allow_custom_value=False,
                 filterable=False,
             )
             guidance_scale = gr.Slider(
                 label="Guidance scale",
+                info="If > 10, there may be artifacts.",
                 minimum=1.0,
-                maximum=8.0,
-                step=0.5,
-                value=6,
+                maximum=12.0,
+                step=1,
+                value=args.guidance_scale,
+            )
+
+            do_neg_id_prompt_weight = gr.Slider(
+                label="Weight of ID prompt in the negative prompt",
+                minimum=0.0,
+                maximum=0.9,
+                step=0.1,
+                value=args.do_neg_id_prompt_weight,
             )

             seed = gr.Slider(
@@ -351,10 +368,10 @@ with gr.Blocks(css=css) as demo:
                 value="(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime), text, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, bare breasts, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, long neck, UnrealisticDream",
             )
             num_steps = gr.Slider(
-                label="Number of sampling steps",
+                label="Number of sampling steps. More steps give a better composition, but take longer.",
                 minimum=30,
-                maximum=80,
-                step=1,
+                maximum=70,
+                step=10,
                 value=40,
             )

@@ -448,7 +465,7 @@ with gr.Blocks(css=css) as demo:
         fn=generate_video,
         inputs=[image_container, files, init_img_files, init_img_selected_idx, init_image_strength,
                 init_image_final_weight, model_style_type,
-                prompt, negative_prompt, num_steps, video_length, guidance_scale,
+                prompt, negative_prompt, num_steps, video_length, guidance_scale, do_neg_id_prompt_weight,
                 seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                 is_adaface_enabled, adaface_ckpt_path, adaface_power_scale, id_animator_anneal_steps],
         outputs=[result_video]
faceadapter/face_adapter.py CHANGED
@@ -307,10 +307,15 @@ class FaceAdapterPlusForVideoLora(FaceAdapterLora):
             negative_prompt=negative_prompt,
         )

-        if adaface_prompt_embeds is not None:
+        if adaface_prompt_embeds is not None and adaface_prompt_embeds[0] is not None:
+            negative_prompt_embeds0 = negative_prompt_embeds_
+            adaface_prompt_embeds, negative_prompt_embeds_ = adaface_prompt_embeds
             # self.torch_type == torch.float16. adaface_prompt_embeds is torch.float32.
-            prompt_embeds_ = adaface_prompt_embeds.repeat(num_samples, 1, 1).to(dtype=self.torch_type) \
-                             * adaface_power_scale
+            prompt_embeds_ = adaface_prompt_embeds.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
+            negative_prompt_embeds_ = negative_prompt_embeds_.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
+            if adaface_power_scale != 1.0:
+                prompt_embeds_ = prompt_embeds_ * adaface_power_scale - negative_prompt_embeds0 * (1 - adaface_power_scale)
+
         # Note to balance image_prompt_embeds with uncond_image_prompt_embeds after scaling.
         image_prompt_embeds_begin = image_prompt_embeds * image_embed_cfg_scales[0] + uncond_image_prompt_embeds * (1 - image_embed_cfg_scales[0])
         image_prompt_embeds_end = image_prompt_embeds * image_embed_cfg_scales[1] + uncond_image_prompt_embeds * (1 - image_embed_cfg_scales[1])
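A small numeric sketch of the adaface_power_scale step in the hunk above, using toy tensors. The shapes and names are illustrative; this only reproduces the arithmetic of the changed lines, under the comment's suggested 0.7~1.3 range:

```python
import torch

prompt_embeds_          = torch.randn(1, 77, 768)  # AdaFace positive prompt embeddings
negative_prompt_embeds0 = torch.randn(1, 77, 768)  # the original (non-AdaFace) negative embeddings

adaface_power_scale = 1.2  # kept within roughly 0.7~1.3 per the comment above
if adaface_power_scale != 1.0:
    # Same arithmetic as the diff: scale the positive embeddings and counter-shift
    # by the original negative embeddings.
    prompt_embeds_ = prompt_embeds_ * adaface_power_scale \
                     - negative_prompt_embeds0 * (1 - adaface_power_scale)
print(prompt_embeds_.shape)  # torch.Size([1, 77, 768])
```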
models/adaface/{VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt → VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-30000.pt} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c66b1847072c66deaa38b9ec91c0d76ac5274dec8d02444fc9672f0defa4d156
+oid sha256:34bbdaa97fb2da9e2aae4204bfd2f5c1565a84c664520a5f537129419ecb53fa
 size 1814921594