Spaces: runtime error

ermu2001 committed 08720f3 (parent: 195eeff): "init"

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +16 -0
- Dockerfile +27 -0
- README.md +138 -0
- app.py +239 -0
- chat_anything/azure_utils.py +155 -0
- chat_anything/chatbot/__init__.py +0 -0
- chat_anything/chatbot/chat.py +72 -0
- chat_anything/chatbot/model_select.py +60 -0
- chat_anything/chatbot/personality.py +59 -0
- chat_anything/chatbot/select.py +63 -0
- chat_anything/chatbot/voice_select.py +119 -0
- chat_anything/face_generator/__init__.py +0 -0
- chat_anything/face_generator/long_prompt_control_generator.py +104 -0
- chat_anything/face_generator/long_prompt_generator.py +82 -0
- chat_anything/face_generator/pipelines/lpw_stable_diffusion.py +1471 -0
- chat_anything/face_generator/utils/generate.py +45 -0
- chat_anything/polly_utils.py +635 -0
- chat_anything/sad_talker/__init__.py +0 -0
- chat_anything/sad_talker/audio2exp_models/audio2exp.py +41 -0
- chat_anything/sad_talker/audio2exp_models/networks.py +74 -0
- chat_anything/sad_talker/audio2pose_models/audio2pose.py +94 -0
- chat_anything/sad_talker/audio2pose_models/audio_encoder.py +64 -0
- chat_anything/sad_talker/audio2pose_models/cvae.py +149 -0
- chat_anything/sad_talker/audio2pose_models/discriminator.py +76 -0
- chat_anything/sad_talker/audio2pose_models/networks.py +140 -0
- chat_anything/sad_talker/audio2pose_models/res_unet.py +65 -0
- chat_anything/sad_talker/config/auido2exp.yaml +58 -0
- chat_anything/sad_talker/config/auido2pose.yaml +49 -0
- chat_anything/sad_talker/config/facerender.yaml +45 -0
- chat_anything/sad_talker/config/facerender_still.yaml +45 -0
- chat_anything/sad_talker/config/similarity_Lm3D_all.mat +0 -0
- chat_anything/sad_talker/face3d/data/__init__.py +116 -0
- chat_anything/sad_talker/face3d/data/base_dataset.py +125 -0
- chat_anything/sad_talker/face3d/data/flist_dataset.py +125 -0
- chat_anything/sad_talker/face3d/data/image_folder.py +66 -0
- chat_anything/sad_talker/face3d/data/template_dataset.py +75 -0
- chat_anything/sad_talker/face3d/extract_kp_videos.py +108 -0
- chat_anything/sad_talker/face3d/extract_kp_videos_safe.py +162 -0
- chat_anything/sad_talker/face3d/models/__init__.py +67 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/README.md +164 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/backbones/__init__.py +25 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/backbones/iresnet.py +187 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/backbones/iresnet2060.py +176 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/backbones/mobilefacenet.py +130 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/configs/3millions.py +23 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/configs/3millions_pfc.py +23 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/configs/__init__.py +0 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/configs/base.py +56 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/configs/glint360k_mbf.py +26 -0
- chat_anything/sad_talker/face3d/models/arcface_torch/configs/glint360k_r100.py +26 -0
.gitignore
ADDED
@@ -0,0 +1,16 @@
**__pycache__/

MODELS
third_party
tmp
results
chat_anything/tts_vits/
vits_results
test
resources/models.yaml

# others
GFPGANv1.4.pth
gfpgan
GFPGAN
.gitattributes
Dockerfile
ADDED
@@ -0,0 +1,27 @@
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel

# FROM python:3.9

# WORKDIR /code

# COPY ./requirements.txt /code/requirements.txt

# RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# for open cv
RUN apt-get update && apt-get install libgl1 -y

RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/ChatAnything

COPY --chown=user . $HOME/ChatAnything

RUN pip install -r requirements.txt

CMD python app.py
README.md
CHANGED
@@ -10,3 +10,141 @@ pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# ChatAnything: Facetime Chat with LLM-Enhanced Personas

**Yilin Zhao\*, Shanghua Gao\*, Daquan Zhou\*, Xinbin Yuan\*, Zhijie Lin, Qibin Hou, Jiashi Feng**

> What will it be like to Facetime any imaginary concept?

To animate anything, we integrate the open-source models currently at hand into an animation application for interactive AI-agent chatting.

> To start with, take a look at these incredible faces generated with open-source Civitai models, ready to be animated.

<img src="./resources/readme/show.png" alt="drawing" width="784"/>
<!-- ![faces](./resources/readme/show.png) -->

Here we provide you with ChatAnything: a simple pipeline enhanced with today's powerful Large Language Models, yielding imaginary Facetime chats with the intended visual appearance.

Remember, the repo and application are built entirely on pre-trained deep learning methods and do not include any training yet. We give all the credit to the open-source community (shout out to you). For details of the pipeline, see our technical report (TODO: link here).

## Release & Features & Future Plans

- [ ] Fine-tune the face rendering module.
- [ ] Better TTS module & voice rendering module.
- [ ] Add open-source language models.
- [x] Initial release
    - Facetime animation.
    - Multiple model choices for initial frame generation.
    - Multiple choices for voices.

# Install & Run
Just follow the instructions; everything should be simple (hopefully). Reach out if you run into any problems!

### Install
First, set up the virtual environment.
```
conda env create -f environment.yaml

# then install
conda env update --name chatanything --file environment.yaml
```

The pipeline integrates open-source models. All models can be found online (see [Acknowledgement](#acknowledgement)). We put some important models together on Hugging Face remotes just to make life easier. Prepare them for the first run with the Python script [prepare_models.py](./python_scripts/prepare_models.py):
```
# prepare the local models
python python_scripts/prepare_models.py
```

### Building Docker
Try building a Docker image if you find that easier. This part is not fully tested; if you find anything wrong, feel free to contribute~
```
docker build --network=host -t chatanything .
# docker run -dp 127.0.0.1:8901:8901 chatanything
docker run -p 127.0.0.1:8901:8901 -it --gpus all chatanything
docker run -it --gpus all chatanything bash
```

### Run
Specify a port for the Gradio application to run on and set off! The port is passed as the first command-line argument:
```
python app.py 8809
```

# Configuring: From User Input Concept to Appearance & Voice
The first step of the pipeline is to generate an image for SadTalker and, at the same time, set up the text-to-speech module for voice chat.

The pipeline queries a powerful LLM (ChatGPT) for the selection in a zero-shot multiple-choice format.
Three questions are asked at the start of every conversation (init frame generation):
1. Provide an imagined personality for the user input concept.
2. Select a generative model for the init frame generation.
3. Select a text-to-speech voice (model) for the character based on the personality.

We have made the model selection extendable: add your ideal model with just a few lines of configuration! The rest of this section briefly introduces the steps to add an init-frame generator or a voice. A sketch of the prompt the LLM sees is shown below.
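For intuition, here is a minimal sketch of how such a multiple-choice selection prompt is assembled; it mirrors chat_anything/chatbot/select.py later in this diff, but the concept and the option descriptions below are made-up examples rather than the repository's real config.

```python
# Minimal sketch of the zero-shot multiple-choice selection prompt
# (mirrors chat_anything/chatbot/select.py; the concept and options are made-up examples).
concept = "tree monster"
options = {
    "anime_model": {"desc": "an anime-style stable-diffusion checkpoint"},
    "realistic_model": {"desc": "a photo-realistic stable-diffusion checkpoint"},
}
option_list = "\n".join(
    f"{chr(ord('A') + i)}. {conf['desc']}" for i, conf in enumerate(options.values())
)
choices = " ".join(chr(ord('A') + i) for i in range(len(options)))

prompt = (
    "Select one of the following models based on the given concept.\n"
    f"Concept: {concept}\n\n"
    f"Model name and description:\n{option_list}\n\n"
    f"The available Options:\n{choices}\nAnswer:\n"
)
# The LLM's one-letter reply ('A', 'B', ...) is mapped back to the matching key of `options`.
print(prompt)
```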
### Image Generator
Configure the models in the [Model Config](./resources/models.yaml). This config acts as the memory (or an image-generating tool pool) for the LLM.

The prompt sets up this selection process. Each sub-field of "models" becomes an option in the multiple-choice question.
The "**desc**" field of each entry is what the language model sees. The key itself is not shown to the LM, as it would sometimes mislead it.
The other fields are used for image generation, as listed:
1. model_dir: the repo path for the diffusers package. As the pretrained face-landmark ControlNet is based on stable-diffusion-v1-5, we currently only support its derivatives.
2. lora_path: LoRA derivatives are powerful; try a LoRA model for better stylization. This should point directly to the parameter binary file.
3. prompt_template & negative_prompt: used for prompting the text-to-image diffusion model. Find an ideal prompt for your model and stick with it. The prompt template should contain a "{}" for inserting the user input concept.

Here are some **tips** for configuring your own model.
1. Provide the LLM with a simple description of the generative model. Note that the description needs to be concise and accurate for a correct selection.
2. Set model_dir to a local directory of a diffusers stable-diffusion-v1-5 derivative. You can also provide a repo id from the Hugging Face Hub; the model is downloaded the first time it is chosen, so wait for it.
3. To better utilize community resources, we also support LoRA. To add a LoRA module, give the path to its parameter file.
4. Carefully write the prompt template and negative prompt; they strongly affect the initial face generation. The prompt template should contain exactly one pair of "{}" to insert the concept that users type on the application webpage. We support the Stable-Diffusion-Webui prompt style as implemented by diffusers, so feel free to copy a prompt from Civitai and add the "{}" to it for ChatAnything!

Again, this model config acts as an extended tool pool for the LM: the application drives the LM to choose from this config and uses the chosen model to generate. Sometimes the LM fails to pick a valid model, which causes the ChatAnything app to fail on a generation. A sketch of how such an entry is loaded is shown after this note.

Note that we currently support ONLY stable-diffusion-v1.5 derivatives (SDXL pipelines are under consideration but not yet implemented, as we lack a face-landmark ControlNet for SDXL. Reach out if you're interested in training one!)
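The following is a minimal sketch of what one "models" entry looks like once loaded with OmegaConf, as done by load_model_list() in chat_anything/chatbot/model_select.py. The field names follow the description above; the concrete values (checkpoint id, prompts) are made-up examples, not the repository's actual resources/models.yaml.

```python
# Sketch of a single "models" entry and how it is consumed (values are illustrative).
from omegaconf import OmegaConf

example_yaml = """
models:
  dreamshaper:
    desc: a general-purpose, photo-realistic stable-diffusion-v1-5 derivative
    model_dir: Lykon/DreamShaper        # local dir or Hugging Face repo id (assumed)
    lora_path: null                     # or a path to a LoRA parameter file
    prompt_template: "A portrait of a {}, fine face, nice looking"
    negative_prompt: "low resolution, low quality, opened mouth"
"""

models = OmegaConf.create(example_yaml)["models"]
for name, conf in models.items():
    # The LLM only sees conf["desc"]; the other fields drive image generation.
    print(name, "->", conf["desc"])
```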
### Voice TTS
We use the edge_tts package for text-to-speech support. The voice selection and the [voice configuration file](./resources/voices_edge.yaml) are constructed similarly to the image-generation model selection, except that the LM now chooses the voice based on the personality description it gave earlier. The "**gender**" and "**language**" fields correspond to edge_tts. A minimal usage sketch follows.
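For reference, here is a standalone edge-tts sketch. It uses the package's public Communicate API rather than this repository's TTS wrapper, and the voice name (one of the neural voices also listed in chat_anything/azure_utils.py) is only an example.

```python
# Standalone edge-tts sketch (not this repository's TTS talker); voice name is an example.
import asyncio
import edge_tts

async def speak(text: str, voice: str = "en-US-ChristopherNeural", out_path: str = "tempfile.mp3") -> None:
    # Build the TTS request and write the synthesized speech to an mp3 file.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(out_path)

if __name__ == "__main__":
    asyncio.run(speak("Hello! I am your talking apple buddy."))
```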
# On-going tasks
### Customized Voice
There is a Voice Changer app (a text-to-speech plus speech voice conversion pipeline) that enables better customized voices. We are trying to leverage its TTS functionality.

Reach out if you want to add a voice of your own or of your hero!

Here are the possible steps. You need to change a little bit in the code first:
1. Alter this [code](./utils.py#14) to import a TTSTalker from chat_anything/tts_talker/tts_voicechanger.py.
2. Switch the config to another one: in the same [code](./utils.py#14), change "resources/voices_edge.yaml" -> "resources/voices_voicechanger.yaml".

Then try running a [Voice Changer](https://huggingface.co/spaces/kevinwang676/Voice-Changer) on your local machine. Simply set up git-lfs, install the repo, and run it as the TTS voice service.
The TTS caller is set to port 7860.

Make sure the client is set up with the same port [here](chat_anything/tts_talker/tts_voicechanger.py#5):
```python
client = Client("http://127.0.0.1:7860/")
```

# Acknowledgement
Again, the project does not yet include any training. The pipeline is built entirely on these incredibly awesome packages and pretrained models. Don't hesitate to take a look and explore the amazing open-source generative communities. We love you, guys.
- [ChatGPT](https://openai.com/chatgpt): GOD
- [SadTalker](https://github.com/OpenTalker/SadTalker): the core animation module.
- [Face-Landmark-ControlNet](https://huggingface.co/georgefen/Face-Landmark-ControlNet): an awesome ControlNet with face landmarks, using Stable Diffusion 1.5 as the base model.
- [diffusers](https://github.com/huggingface/diffusers): GOAT image-generation framework🥳.
- [langchain](https://github.com/langchain-ai/langchain): an awesome package for working with LLMs.
- [edge-tts](https://github.com/rany2/edge-tts): an awesome package for text-to-speech.
- [gradio](https://www.gradio.app/): GOAT😄 machine-learning app framework.
- [Civitai](https://civitai.com/models) and [Huggingface_hub](https://huggingface.co/models): find your ideal image generative model on Civitai. These communities are crazy🥂. Here are some fantastic derivatives of [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5):
    - [Game Icon Institute_mode](https://civitai.com/models/47800?modelVersionId=76533)
    - [dreamshaper](https://civitai.com/models/4384/dreamshaper)
    - [3D_Animation_Diffusion](https://civitai.com/models/118086?modelVersionId=128046)
    - [anything-v5](https://huggingface.co/stablediffusionapi/anything-v5)

# Citation
If you like our pipeline and application, don't hesitate to reach out! Let's work on it and see how far it goes!
```bibtex
@misc{zhao2023ChatAnything,
  title={ChatAnything: Facetime Chat with LLM-Enhanced Personas},
  author={Zhao, Yilin and Gao, Shanghua and Zhou, Daquan and Yuan, Xinbin and Hou, Qibin and Feng, Jiashi},
  publisher={},
  year={2023},
}
```
app.py
ADDED
@@ -0,0 +1,239 @@
import os
import ssl
import sys

import gradio as gr

import warnings
import whisper
from chat_anything.polly_utils import PollyVoiceData
from chat_anything.azure_utils import AzureVoiceData
from chat_anything.chatbot.chat import set_openai_api_key
from utils import ChatWrapper, update_foo, reset_memory

ssl._create_default_https_context = ssl._create_unverified_context


TALKING_HEAD_WIDTH = "350"

LOOPING_TALKING_HEAD = "resources/videos/tempfile.mp4"

USE_GPT4_DEFAULT = False
FULLBODY_DEFAULT = False
POLLY_VOICE_DATA = PollyVoiceData()
AZURE_VOICE_DATA = AzureVoiceData()

# Pertains to WHISPER functionality
WHISPER_DETECT_LANG = "Detect language"

INSTRUCTION_MARKDOWN = """
# ChatAnything: Facetime Chat with LLM-Enhanced Personas
### DEMO INSTRUCTION
##### 0. Register
Input an OpenAI API key of your own. It is used to chat with OpenAI ChatGPT. Make sure to disable the key afterwards🥹.
##### 1. Generate the init face😀 along with the first chat
Input a concept in the "Talking object" text box, then click the Generate button. The init face generation and module selection are performed once and used for the rest of this chat. Wait a while and the video will be produced and played. Write a simple concept; it is inserted into each prompt template to decide the main subject.
##### 2. Keep on chatting🤑
Go on speaking with the character. The init face and module selection are not re-performed; from now on you are only chatting with the LM, along with the rendering of SadTalker. Hopefully, the API will not impose an excessive charge for this.


### FEATURES
##### 1. Upload an image as the control/inversion starting point. Try some non-face images and see how it works!
##### 2. Seeding is provided. However, if no input image is provided, a randomly chosen facial landmark image is used for generation, which introduces some randomness.
##### 3. Try out the examples.
##### 4. Say something and record your voice for a real facetime chat. Whisper will handle your voice; see Settings - Whisper STT options.
##### 5. Decide whether to use the crop-face-out option, which crops the face out of the generated image and renders it. This is promising for better animation rendering, but sometimes the cropped image loses some elementary features of your intended concept.

"""

# UNCOMMENT TO USE WHISPER
warnings.filterwarnings("ignore")
WHISPER_MODEL = whisper.load_model("tiny")
print("WHISPER_MODEL", WHISPER_MODEL)


# UNCOMMENT TO USE WHISPER
def transcribe(aud_inp, whisper_lang):
    if aud_inp is None:
        return ""
    aud = whisper.load_audio(aud_inp)
    aud = whisper.pad_or_trim(aud)
    mel = whisper.log_mel_spectrogram(aud).to(WHISPER_MODEL.device)
    _, probs = WHISPER_MODEL.detect_language(mel)
    options = whisper.DecodingOptions()
    if whisper_lang != WHISPER_DETECT_LANG:
        whisper_lang_code = POLLY_VOICE_DATA.get_whisper_lang_code(
            whisper_lang)
        options = whisper.DecodingOptions(language=whisper_lang_code)
    result = whisper.decode(WHISPER_MODEL, mel, options)
    print("result.text", result.text)
    result_text = ""
    if result and result.text:
        result_text = result.text
    return result_text


chat = ChatWrapper()


with gr.Blocks() as block:
    llm_state = gr.State()
    history_state = gr.State()
    chain_state = gr.State()
    talker_state = gr.State()
    fullbody_state = gr.State(True)
    speak_text_state = gr.State(True)
    talking_head_state = gr.State(True)
    uid_state = gr.State()
    video_file_path = gr.State()
    audio_file_path = gr.State()

    memory_state = gr.State()

    # Pertains to WHISPER functionality
    whisper_lang_state = gr.State(WHISPER_DETECT_LANG)
    use_gpt4_state = gr.State(USE_GPT4_DEFAULT)

    with gr.Column():
        with gr.Row():
            gr.Markdown(INSTRUCTION_MARKDOWN)
        with gr.Row():
            openai_api_key_textbox = gr.Textbox(placeholder="Paste your OpenAI API key (sk-...) and hit Enter",
                                                show_label=True, lines=1, type='password', value='', label='OpenAI API key')
            openai_api_key_register = gr.Button(
                value="Register").style(full_width=False)
            uid_textbox = gr.Textbox(show_label=True, value=uid_state, lines=1, label='UID')
            seed = gr.Slider(
                label="Seed",
                minimum=-1,
                maximum=2147483647,
                step=1,
                randomize=True,
            )

    with gr.Tab("Chat"):
        with gr.Row():
            with gr.Column(scale=1, min_width=TALKING_HEAD_WIDTH, visible=True):
                with gr.Column():
                    class_prompt = gr.Textbox(
                        'apple',
                        default='apple',
                        type="text", label='Talking object'
                    )
                    init_face_btn = gr.Button(
                        value="Generate").style(full_width=False)

                my_file = gr.File(label="Upload a file",
                                  type="file", visible=False)

                # video_html = gr.HTML('')
                video_html = gr.Video(label="Generated Video", autoplay=True)

                ref_image = gr.Image(
                    type="pil",
                    interactive=True,
                    label="Image: Upload your image.",
                )
                tmp_aud_file = gr.File(
                    type="file", visible=False)
                audio_html = gr.HTML('')
                init_face_btn.click(chat.generate_init_face_video, inputs=[class_prompt, llm_state, uid_state, fullbody_state, ref_image, seed],
                                    outputs=[chain_state, memory_state, video_html, talker_state])

            with gr.Column(scale=7):
                chatbot = gr.Chatbot()

                message = gr.Textbox(label="What's on your mind??",
                                     placeholder="What's the answer to life, the universe, and everything?",
                                     lines=1)
                submit = gr.Button(value="Send", variant="secondary").style(
                    full_width=False)

                audio_comp = gr.Microphone(source="microphone", type="filepath", label="Just say it!",
                                           interactive=True, streaming=False)
                audio_comp.change(transcribe, inputs=[
                    audio_comp, whisper_lang_state], outputs=[message])

        with gr.Accordion("General examples", open=False):
            gr.Examples(
                examples=[
                    ["cyberpunk godess", "Who are you?", "resources/images/annie.jpg", 393212389],
                    ["unbelievable beauty fairy", "Who are you?", "resources/images/lenna.jpg", 222679277],
                    ["tree monster", "Who are you?", None],
                    ["pineapple monster", "Who are you?", None],
                    ["tricky Polaris", "Who are you?", None, 1670155100],
                    ["watermelon", "Who are you?", "resources/images/watermelon.jpg", 42],
                ],
                inputs=[class_prompt, message, ref_image, seed],
            )

    with gr.Tab("Settings"):
        with gr.Tab("General"):

            talking_head_cb = gr.Checkbox(
                label="Show talking head", value=True)
            talking_head_cb.change(chat.update_talking_head, inputs=[talking_head_cb, uid_state, talking_head_state],
                                   outputs=[talking_head_state, video_html])

            use_gpt4_cb = gr.Checkbox(label="Use GPT-4 (experimental) if your OpenAI API has access to it",
                                      value=USE_GPT4_DEFAULT)

            fullbody_state = gr.Checkbox(label="Use full body instead of a face.",
                                         value=True)

            use_gpt4_cb.change(set_openai_api_key,
                               inputs=[openai_api_key_textbox,
                                       use_gpt4_cb],
                               outputs=[llm_state, use_gpt4_state, chatbot, uid_state, video_file_path, audio_file_path])

            reset_btn = gr.Button(value="Reset chat",
                                  variant="secondary").style(full_width=False)
            reset_btn.click(reset_memory, inputs=[history_state, memory_state],
                            outputs=[chatbot, history_state, memory_state])

        with gr.Tab("Whisper STT"):
            whisper_lang_radio = gr.Radio(label="Whisper speech-to-text language:", choices=[
                WHISPER_DETECT_LANG, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
                "Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
                "English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
                "German", "German (Austrian)", "Georgian", "Hindi", "Icelandic", "Indonesian", "Italian", "Japanese",
                "Korean", "Norwegian", "Polish",
                "Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
                "Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Ukrainian", "Welsh"],
                value=WHISPER_DETECT_LANG)

            whisper_lang_radio.change(update_foo,
                                      inputs=[whisper_lang_radio,
                                              whisper_lang_state],
                                      outputs=[whisper_lang_state])

    gr.HTML("""
        <p>This application is based on <a href='https://huggingface.co/spaces/JavaFXpert/Chat-GPT-LangChain/'>Chat-GPT-LangChain</a>, <a href='https://github.com/hwchase17/langchain'>LangChain</a>
        </p>""")

    message.submit(chat, inputs=[openai_api_key_textbox, message, history_state, chain_state,
                                 speak_text_state, talking_head_state, uid_state, talker_state, fullbody_state],
                   outputs=[chatbot, history_state, video_html, my_file, audio_html, tmp_aud_file, message])

    submit.click(chat, inputs=[openai_api_key_textbox, message, history_state, chain_state,
                               speak_text_state, talking_head_state, uid_state, talker_state, fullbody_state],
                 outputs=[chatbot, history_state, video_html, my_file, audio_html, tmp_aud_file, message])

    openai_api_key_register.click(set_openai_api_key,
                                  inputs=[openai_api_key_textbox,
                                          use_gpt4_state, chatbot],
                                  outputs=[llm_state, use_gpt4_state, chatbot, uid_state, video_file_path, audio_file_path])

if __name__ == "__main__":
    if len(sys.argv) == 1:
        port = 8901
    else:
        port = int(sys.argv[1])
    block.launch(debug=True, server_name="0.0.0.0",
                 server_port=port, share=True, enable_queue=True)
chat_anything/azure_utils.py
ADDED
@@ -0,0 +1,155 @@
# This class stores Azure voice data. Specifically, the class stores several records containing
# language, lang_code, gender, voice_id and engine. The class also has a method to return the
# voice_id, lang_code and engine given a language and gender.

NEURAL_ENGINE = "neural"
STANDARD_ENGINE = "standard"


class AzureVoiceData:
    def get_voice(self, language, gender):
        for voice in self.voice_data:
            if voice['language'] == language and voice['gender'] == gender:
                return voice['azure_voice']
        return None

    def __init__(self):
        self.voice_data = [
            {'language': 'Arabic', 'azure_voice': 'ar-EG-ShakirNeural', 'gender': 'Male'},
            {'language': 'Arabic (Gulf)', 'azure_voice': 'ar-KW-FahedNeural', 'gender': 'Male'},
            {'language': 'Catalan', 'azure_voice': 'ca-ES-EnricNeural', 'gender': 'Male'},
            {'language': 'Chinese (Cantonese)', 'azure_voice': 'yue-CN-YunSongNeural', 'gender': 'Male'},
            {'language': 'Chinese (Mandarin)', 'azure_voice': 'zh-CN-YunxiNeural', 'gender': 'Male'},
            {'language': 'Danish', 'azure_voice': 'da-DK-JeppeNeural', 'gender': 'Male'},
            {'language': 'Dutch', 'azure_voice': 'nl-NL-MaartenNeural', 'gender': 'Male'},
            {'language': 'English (Australian)', 'azure_voice': 'en-AU-KenNeural', 'gender': 'Male'},
            {'language': 'English (British)', 'azure_voice': 'en-GB-RyanNeural', 'gender': 'Male'},
            {'language': 'English (Indian)', 'azure_voice': 'en-IN-PrabhatNeural', 'gender': 'Male'},
            {'language': 'English (New Zealand)', 'azure_voice': 'en-NZ-MitchellNeural', 'gender': 'Male'},
            {'language': 'English (South African)', 'azure_voice': 'en-ZA-LukeNeural', 'gender': 'Male'},
            {'language': 'English (US)', 'azure_voice': 'en-US-ChristopherNeural', 'gender': 'Male'},
            {'language': 'English (Welsh)', 'azure_voice': 'cy-GB-AledNeural', 'gender': 'Male'},
            {'language': 'Finnish', 'azure_voice': 'fi-FI-HarriNeural', 'gender': 'Male'},
            {'language': 'French', 'azure_voice': 'fr-FR-HenriNeural', 'gender': 'Male'},
            {'language': 'French (Canadian)', 'azure_voice': 'fr-CA-AntoineNeural', 'gender': 'Male'},
            {'language': 'German', 'azure_voice': 'de-DE-KlausNeural', 'gender': 'Male'},
            {'language': 'German (Austrian)', 'azure_voice': 'de-AT-JonasNeural', 'gender': 'Male'},
            {'language': 'Hindi', 'azure_voice': 'hi-IN-MadhurNeural', 'gender': 'Male'},
            {'language': 'Icelandic', 'azure_voice': 'is-IS-GunnarNeural', 'gender': 'Male'},
            {'language': 'Italian', 'azure_voice': 'it-IT-GianniNeural', 'gender': 'Male'},
            {'language': 'Japanese', 'azure_voice': 'ja-JP-KeitaNeural', 'gender': 'Male'},
            {'language': 'Korean', 'azure_voice': 'ko-KR-GookMinNeural', 'gender': 'Male'},
            {'language': 'Norwegian', 'azure_voice': 'nb-NO-FinnNeural', 'gender': 'Male'},
            {'language': 'Polish', 'azure_voice': 'pl-PL-MarekNeural', 'gender': 'Male'},
            {'language': 'Portuguese (Brazilian)', 'azure_voice': 'pt-BR-NicolauNeural', 'gender': 'Male'},
            {'language': 'Portuguese (European)', 'azure_voice': 'pt-PT-DuarteNeural', 'gender': 'Male'},
            {'language': 'Romanian', 'azure_voice': 'ro-RO-EmilNeural', 'gender': 'Male'},
            {'language': 'Russian', 'azure_voice': 'ru-RU-DmitryNeural', 'gender': 'Male'},
            {'language': 'Spanish (European)', 'azure_voice': 'es-ES-TeoNeural', 'gender': 'Male'},
            {'language': 'Spanish (Mexican)', 'azure_voice': 'es-MX-LibertoNeural', 'gender': 'Male'},
            {'language': 'Spanish (US)', 'azure_voice': 'es-US-AlonsoNeural', 'gender': 'Male'},
            {'language': 'Swedish', 'azure_voice': 'sv-SE-MattiasNeural', 'gender': 'Male'},
            {'language': 'Turkish', 'azure_voice': 'tr-TR-AhmetNeural', 'gender': 'Male'},
            {'language': 'Welsh', 'azure_voice': 'cy-GB-AledNeural', 'gender': 'Male'},
        ]


# Run from the command-line
if __name__ == '__main__':
    azure_voice_data = AzureVoiceData()

    azure_voice = azure_voice_data.get_voice('English (US)', 'Male')
    print('English (US)', 'Male', azure_voice)

    azure_voice = azure_voice_data.get_voice('English (US)', 'Female')
    print('English (US)', 'Female', azure_voice)

    azure_voice = azure_voice_data.get_voice('French', 'Female')
    print('French', 'Female', azure_voice)

    azure_voice = azure_voice_data.get_voice('French', 'Male')
    print('French', 'Male', azure_voice)

    azure_voice = azure_voice_data.get_voice('Japanese', 'Female')
    print('Japanese', 'Female', azure_voice)

    azure_voice = azure_voice_data.get_voice('Japanese', 'Male')
    print('Japanese', 'Male', azure_voice)

    azure_voice = azure_voice_data.get_voice('Hindi', 'Female')
    print('Hindi', 'Female', azure_voice)

    azure_voice = azure_voice_data.get_voice('Hindi', 'Male')
    print('Hindi', 'Male', azure_voice)
chat_anything/chatbot/__init__.py
ADDED
File without changes
chat_anything/chatbot/chat.py
ADDED
@@ -0,0 +1,72 @@
import datetime
from chat_anything.chatbot.personality import generate_personality_prompt
from langchain.prompts import PromptTemplate
from langchain import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import random
import string


def load_chain(llm, class_concept=None):
    chain = None
    memory = None
    personality_text = None
    print(llm)
    if llm:
        print("class_concept", class_concept)
        if class_concept is None:
            class_concept = 'AI assistant'
        person_template, personality_text = generate_personality_prompt(llm, class_concept)

        PROMPT_TEMPLATE = PromptTemplate(
            input_variables=["history", "input"],
            template=person_template,
        )

        chain = ConversationChain(
            prompt=PROMPT_TEMPLATE,
            llm=llm,
            verbose=False,
            memory=ConversationBufferMemory(ai_prefix="You"),
        )
        print("New concept done for ", class_concept)

    return chain, memory, personality_text


def set_openai_api_key(api_key, use_gpt4, history=None, max_tokens=1024):
    """Set the api key and return chain.
    If no api_key, then None is returned.
    """
    if api_key and api_key.startswith("sk-") and len(api_key) > 50:
        os.environ["OPENAI_API_KEY"] = api_key
        print("\n\n ++++++++++++++ Setting OpenAI API key ++++++++++++++ \n\n")
        print(str(datetime.datetime.now()) + ": Before OpenAI, OPENAI_API_KEY length: " + str(
            len(os.environ["OPENAI_API_KEY"])))

        if use_gpt4:
            llm = ChatOpenAI(
                temperature=0, max_tokens=max_tokens, model_name="gpt-4")
            print("Trying to use llm ChatOpenAI with gpt-4")
        else:
            print("Trying to use llm ChatOpenAI with gpt-3.5-turbo")
            llm = ChatOpenAI(temperature=0, max_tokens=max_tokens,
                             model_name="gpt-3.5-turbo")

        print(str(datetime.datetime.now()) + ": After OpenAI, OPENAI_API_KEY length: " + str(
            len(os.environ["OPENAI_API_KEY"])))

        print(str(datetime.datetime.now()) + ": After load_chain, OPENAI_API_KEY length: " + str(
            len(os.environ["OPENAI_API_KEY"])))
        os.environ["OPENAI_API_KEY"] = ""
        history = history or []
        history.append(['', '[SYSTEM] OPENAI_API_KEY has been set, you can generate your object and talk to it now!'])
        uid = ''.join(random.sample(string.ascii_lowercase + string.ascii_uppercase, 5))
        video_file_path = os.path.join('tmp', uid, 'videos/tempfile.mp4')
        audio_file_path = os.path.join('tmp', uid, 'audio/tempfile.mp3')
        return llm, use_gpt4, history, uid, video_file_path, audio_file_path
    return None, None, None, None, None, None
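For orientation, here is a minimal sketch of how these two helpers combine outside the Gradio UI. The API key string is a placeholder and the chat call is an illustrative assumption, not part of the commit.

```python
# Minimal sketch (assumed usage): wire set_openai_api_key and load_chain together.
from chat_anything.chatbot.chat import set_openai_api_key, load_chain

llm, use_gpt4, history, uid, video_path, audio_path = set_openai_api_key(
    api_key="sk-...your-key...",  # placeholder, not a real key
    use_gpt4=False)

if llm is not None:
    # Build a persona-conditioned ConversationChain for the concept "apple".
    chain, memory, personality_text = load_chain(llm, class_concept="apple")
    print(personality_text)
    # ConversationChain exposes the usual LangChain interface, e.g.:
    reply = chain.predict(input="Who are you?")
    print(reply)
```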
chat_anything/chatbot/model_select.py
ADDED
@@ -0,0 +1,60 @@
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from omegaconf import OmegaConf
import datetime

MODEL_SELECTION_PROMPT_TEMPLATE = """
Select one of the following models based on the given concept.
You must choose one model name based on the description of each model and the concept!

Concept: {concept}

Model name and description: {model_list}

Warning: {warning}

The available model names:
{model_name_list}

Selected model name:
"""

def load_model_list():
    models_config = OmegaConf.load('resources/models.yaml')
    models_dict = models_config['models']
    model_name_list_str = ''
    print(models_dict)
    model_list_str = ''
    for key, value in models_dict.items():
        model_list_str += "model name: " + key + ', model description: ' + value['desc'] + '\n'
        model_name_list_str += key + ' '
    model_name_list_str += '\n'
    return model_list_str, models_dict, model_name_list_str

def model_selection_chain(llm, class_concept=None):
    chain = None
    memory = None
    if llm:
        print("class_concept", class_concept)
        if class_concept is None:
            class_concept = 'AI assistant'

        template = PromptTemplate(
            input_variables=["model_list", "concept", "warning", "model_name_list"],
            template=MODEL_SELECTION_PROMPT_TEMPLATE,
        )
        model_list_str, models_dict, model_name_list_str = load_model_list()

        personality_chain = LLMChain(
            llm=llm, prompt=template, verbose=True)
        selected_model = None
        while (selected_model is None) or not (selected_model in models_dict):
            if (selected_model is not None) and not (selected_model in models_dict):
                warning_str = '{} is not in Model list! \n'.format(selected_model)
            else:
                warning_str = ''
            selected_model = personality_chain.run({'concept': class_concept, 'model_list': model_list_str, 'warning': warning_str, 'model_name_list': model_name_list_str})
            print("Selected model name: ", selected_model)

    return models_dict[selected_model]
chat_anything/chatbot/personality.py
ADDED
@@ -0,0 +1,59 @@
from langchain import LLMChain
from langchain.prompts import PromptTemplate

PERSONALITY_PROMPT_TEMPLATE = """
You are an excellent scriptwriter. Now you need to provide the characteristics of an {object} and transform them into personality traits.
Describe these personalities using the second person, giving names and specific personality descriptions related to the {object}.
The language of the personality must be the same as {object}!

You should do the following steps:
1. Based on the object's nature, imagine what kind of personality it could have if it were to come to life. Does it possess a strong sense of responsibility, like a caring caregiver? Is it playful and mischievous, like a curious child? Is it wise and patient, like an ancient sage? Be creative and invent traits that align with the object's essence.
2. Remember to infuse emotions and vivid imagery to bring your object's personality to life.
3. Translate the personality into a second-person prompt.

Example:

Now give the personality of apple:

Personality:
You are an apple sprite, and your name is Apple Buddy.
You have all the characteristics of the apple. You are a type of fruit that is usually round with smooth skin and comes in various colors such as red, green, and yellow. You have sweet and nutritious flesh with seeds distributed in its core. You are a rich source of vitamins, fiber, and antioxidants, contributing to maintaining a healthy body.

You are an optimistic buddy. Always wearing a smile, you spread joy to those around you. Just like the delightful taste of an apple, you bring happiness to everyone.

You are resilient at heart, like the skin of an apple, able to withstand life's challenges and difficulties. No matter what obstacles you encounter, you face them bravely without hesitation.

You are caring and considerate, akin to the nutrients in an apple. You always pay attention to the needs and happiness of others. Skilled in listening, you willingly offer help and support, making those around you feel warmth and care.

You have a strong desire to grow. Like an apple tree needs sunlight and water to flourish, you are continuously learning and improving, becoming a better version of yourself every day.

You have a profound love for nature and enjoy living in harmony with it. Strolling in the garden, feeling the fresh air and warm sunlight, is one of your favorite moments.

Apple Buddy, you are a unique apple. Your optimism, resilience, care, and eagerness to grow make you an adorable companion to those around you. Your story will lead us into a world full of warmth and goodness.

Now give the personality of {object}:

Personality:
"""


def generate_personality_prompt(llm, class_concept):

    PERSONALITY_PROMPT = PromptTemplate(
        input_variables=["object"],
        template=PERSONALITY_PROMPT_TEMPLATE,
    )
    personality_chain = LLMChain(
        llm=llm, prompt=PERSONALITY_PROMPT, verbose=True)
    personality_text = personality_chain.run({'object': class_concept})
    person_prompt = personality_text

    person_prompt += '''The following is a friendly conversation between a human and you. You need to talk to the human based on your personality. If you do not know the answer to a question, truthfully say you do not know.
You can use up to 50 words to answer. Make your answer concise and to the point!
Current conversation:
{history}
Human: {input}
You:
'''
    return person_prompt, personality_text
chat_anything/chatbot/select.py
ADDED
@@ -0,0 +1,63 @@
from langchain import LLMChain
from typing import OrderedDict
from langchain.prompts import PromptTemplate
from omegaconf import OmegaConf
import datetime

SELECTION_TEMPLATE = """
{concept}

Model name and description:
{option_list}

Warning: {warning}

The available Options:
{choices}
Answer:
"""


def selection_chain(llm, class_concept, prompt, options):
    chain = None
    memory = None
    if llm:
        print("class_concept", class_concept)
        if class_concept is None:
            class_concept = 'AI assistant'
        prompt_template = prompt + SELECTION_TEMPLATE
        template = PromptTemplate(
            input_variables=["concept", "option_list", "warning", "choices"],
            template=prompt_template,
        )
        chain = LLMChain(
            llm=llm, prompt=template, verbose=True)
        print(options)
        option_list = [
            f"{chr(ord('A') + i)}. {conf['desc']}" for i, conf in enumerate(options.values())
        ]
        option_list = '\n'.join(option_list)
        selected_model = None

        warning_str = 'Choose from the available Options.'
        choices = ' '.join(chr(ord('A') + i) for i in range(len(options)))
        choice = chain.run({'concept': class_concept, 'option_list': option_list, 'warning': warning_str, 'choices': choices})
        print(f"LLM Responds (First character was used as the choice):{choice}")
        choice = choice[0]

        selected_model = list(options.keys())[ord(choice) - ord('A')]
        print("Selected model name: ", selected_model)

    return selected_model

def model_selection_chain(llm, class_concept=None, conf_file='resources/models_personality.yaml'):
    chain = None
    memory = None
    if llm:
        print("class_concept", class_concept)
        if class_concept is None:
            class_concept = 'AI assistant'
        selection_config = OmegaConf.load(conf_file)
        selected_model = selection_chain(llm, class_concept, selection_config['prompt'], selection_config['models'])
        model_conf = selection_config['models'][selected_model]
    return model_conf, selected_model
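A minimal sketch of driving selection_chain directly follows; it assumes a LangChain chat model built as in chat_anything/chatbot/chat.py, and the option names, descriptions, and prompt string are illustrative rather than the repository's real configuration.

```python
# Minimal sketch (assumed usage) of selection_chain; options and prompt are made-up examples.
from langchain.chat_models import ChatOpenAI
from chat_anything.chatbot.select import selection_chain

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")  # requires OPENAI_API_KEY
options = {
    "anime_model": {"desc": "an anime-style stable-diffusion checkpoint"},
    "realistic_model": {"desc": "a photo-realistic stable-diffusion checkpoint"},
}
selected = selection_chain(
    llm,
    class_concept="tree monster",
    prompt="Select one of the following models based on the given concept.\n",
    options=options,
)
print(selected)  # one of the keys of `options`
```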
chat_anything/chatbot/voice_select.py
ADDED
@@ -0,0 +1,119 @@
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from omegaconf import OmegaConf
import datetime

VOICE_SELECTION_PROMPT_TEMPLATE = """
Select one of the following voices based on the given concept.
You must choose one voice name based on the description of each model and the concept.


Concept: {concept}

Voice name and description: {model_list}

Warning: {warning}

The available voice names:
{model_name_list}

Selected voice name:
"""

GENDER_SELECTION_PROMPT_TEMPLATE = """
Select one of the following genders based on the given concept.
You must choose one gender based on the description of the concept. You must choose one gender even if you can't decide.

Gender:
male
female

Concept: {concept}
Selected gender male or female:
"""

LANGUAGE_SELECTION_PROMPT_TEMPLATE = """
Select one of the following languages based on the given concept.
You must choose the language that is used by the description of the concept.

Languages:
Chinese
English
Japanese

Concept: {concept}
Selected language:
"""

def load_voice_model_list():
    models_config = OmegaConf.load('resources/voices.yaml')
    models_dict = models_config['models']
    print(models_dict)
    model_list_str = ''
    model_name_list_str = ''
    for key, value in models_dict.items():
        model_list_str += "model name: " + key + ', model description: ' + value['desc'] + '\n'
        model_name_list_str += key + ' '
    model_name_list_str += '\n'
    return model_list_str, models_dict, model_name_list_str

def get_vioce_model_chain(llm, class_concept):
    model_template = PromptTemplate(
        input_variables=["model_list", "concept", "model_name_list", "warning"],
        template=VOICE_SELECTION_PROMPT_TEMPLATE,
    )
    model_list_str, models_dict, model_name_list_str = load_voice_model_list()

    personality_chain = LLMChain(
        llm=llm, prompt=model_template, verbose=True)

    selected_model = None
    while (selected_model is None) or not (selected_model in models_dict):
        if (selected_model is not None) and not (selected_model in models_dict):
            warning_str = '{} is not in Model list! \n'.format(selected_model)
        else:
            warning_str = ''
        selected_model = personality_chain.run({'concept': class_concept, 'model_list': model_list_str, 'warning': warning_str, 'model_name_list': model_name_list_str})
        print("Selected model name: ", selected_model)

    return selected_model

def get_gender_chain(llm, class_concept):
    model_template = PromptTemplate(
        input_variables=["concept"],
        template=GENDER_SELECTION_PROMPT_TEMPLATE,
    )

    personality_chain = LLMChain(
        llm=llm, prompt=model_template, verbose=True)
    selected_gender = personality_chain.run({'concept': class_concept})
    print("Selected gender: ", selected_gender)
    return selected_gender

def get_language_chain(llm, class_concept):
    model_template = PromptTemplate(
        input_variables=["concept"],
        template=LANGUAGE_SELECTION_PROMPT_TEMPLATE,
    )

    personality_chain = LLMChain(
        llm=llm, prompt=model_template, verbose=True)
    selected_language = personality_chain.run({'concept': class_concept})
    print("Selected language: ", selected_language)
    return selected_language


def voice_selection_chain(llm, class_concept=None):
    chain = None
    memory = None
    if llm:
        print("class_concept", class_concept)
        if class_concept is None:
            class_concept = 'AI assistant'
        selected_model = get_vioce_model_chain(llm, class_concept)
        gender = get_gender_chain(llm, class_concept)
        language = get_language_chain(llm, class_concept)

    return selected_model, gender, language
chat_anything/face_generator/__init__.py
ADDED
File without changes
chat_anything/face_generator/long_prompt_control_generator.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PIL
|
2 |
+
from PIL import Image
|
3 |
+
from PIL import ImageDraw
|
4 |
import numpy as np

import dlib
import cv2
import torch

import diffusers
from diffusers import StableDiffusionPipeline, DiffusionPipeline
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, StableDiffusionControlNetImg2ImgPipeline
from chat_anything.face_generator.pipelines.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline, get_weighted_text_embeddings
from diffusers.schedulers import EulerAncestralDiscreteScheduler,DPMSolverMultistepScheduler # DPM++ SDE Karras

from chat_anything.face_generator.utils.generate import generate

from .long_prompt_generator import LongPromptGenerator

def draw_landmarks(image, landmarks, color="white", radius=2.5):
    draw = ImageDraw.Draw(image)
    for dot in landmarks:
        x, y = dot
        draw.ellipse((x-radius, y-radius, x+radius, y+radius), fill=color)

def get_ldmk_img(w, h, ldmks) -> PIL.Image:
    con_img = Image.new('RGB', (w, h), color=(0, 0, 0))
    draw_landmarks(con_img, ldmks)
    return con_img

class LongPromptControlGenerator(LongPromptGenerator):

    def __init__(self, model_dir, lora_path, prompt_template, negative_prompt, face_control_dir, face_detect_path,):
        self.face_control_dir = face_control_dir
        self.face_detect_path = face_detect_path
        super().__init__(model_dir, lora_path, prompt_template, negative_prompt)

    def load_model(self, *args, **kwargs):
        super().load_model(*args, **kwargs)
        self.face_detector = dlib.get_frontal_face_detector()
        self.face_predictor = dlib.shape_predictor(self.face_detect_path)
        # load control net
        face_controlnet = ControlNetModel.from_pretrained(self.face_control_dir).to('cuda', dtype=torch.float16)
        self.face_control_pipe = StableDiffusionControlNetPipeline(controlnet=face_controlnet, **self.pipe.components)
        self.face_control_img2img_pipe = StableDiffusionControlNetImg2ImgPipeline(controlnet=face_controlnet, **self.pipe.components)

    def _get_68landmarks_seq(self, img_np):
        gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
        faces = self.face_detector(gray)
        landmarks = []
        for face in faces:
            shape = self.face_predictor(gray, face)
            for i in range(68):
                x = shape.part(i).x
                y = shape.part(i).y
                landmarks.append((x, y))
        return landmarks

    def has_face(self, img_pil):
        img_np = np.array(img_pil)
        gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
        faces = self.face_detector(gray)
        return len(faces) != 0

    def face_control_generate(
        self,
        prompt,
        face_img_pil,
        do_inversion=False,
        **kwargs,
    ):
        """
        Face control generating.
        """
        face_img_np = np.array(face_img_pil)
        ldmk_seq = self._get_68landmarks_seq(face_img_np)
        ldmk_img_pil = get_ldmk_img(face_img_pil.size[0], face_img_pil.size[1], ldmk_seq)
        print('GENERATING:', prompt)

        generating_conf = {
            "prompt": prompt,
            "negative_prompt": self.negative_prompt,
            "num_inference_steps": 25,
            "guidance_scale": 7,
            "controlnet_conditioning_scale": kwargs.pop('controlnet_conditioning_scale', 1.0),
            "generator": kwargs.pop('generator', None),
        }

        if not do_inversion:
            generating_conf.update({
                "pipe": self.face_control_pipe,
                "image": ldmk_img_pil,
                "controlnet_conditioning_scale": kwargs.pop('controlnet_conditioning_scale', 1.0),
            })
        else:
            generating_conf.update({
                "pipe": self.face_control_img2img_pipe,
                "image": face_img_pil,
                "control_image": ldmk_img_pil,
                "strength": kwargs.pop('strength', 0.9),
            })
        pipe_out = generate(**generating_conf)
        generated_img = pipe_out[0][0]
        return generated_img
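Usage note (not part of the commit): a minimal sketch of how LongPromptControlGenerator might be driven once its checkpoints are in place. Every path below is a hypothetical placeholder, not a file shipped in this commit.

# Sketch only -- all paths are hypothetical placeholders.
from PIL import Image
from chat_anything.face_generator.long_prompt_control_generator import LongPromptControlGenerator

gen = LongPromptControlGenerator(
    model_dir="MODELS/sd15-checkpoint",                               # placeholder SD checkpoint dir
    lora_path=None,
    prompt_template="A portrait of a {}, fine face, nice looking",
    negative_prompt="easynegative, Low resolution, Low quality",
    face_control_dir="MODELS/controlnet-face-landmarks",              # placeholder ControlNet dir
    face_detect_path="MODELS/shape_predictor_68_face_landmarks.dat",  # dlib 68-point predictor file
)
gen.load_model()   # builds the base pipeline plus the two ControlNet pipelines on CUDA

face = Image.open("tmp/reference_face.png").convert("RGB")            # placeholder reference image
if gen.has_face(face):
    out = gen.face_control_generate(
        gen.prompt_template.format("friendly robot"),
        face,
        do_inversion=False,   # True routes through the ControlNet img2img pipeline instead
    )
    out.save("tmp/generated_face.png")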
chat_anything/face_generator/long_prompt_generator.py
ADDED
@@ -0,0 +1,82 @@
import PIL
from PIL import Image
from PIL import ImageDraw
import numpy as np

import dlib
import cv2
import torch

import diffusers
from diffusers import StableDiffusionPipeline, DiffusionPipeline
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, StableDiffusionControlNetImg2ImgPipeline, StableDiffusionImg2ImgPipeline
from chat_anything.face_generator.pipelines.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline, get_weighted_text_embeddings
from diffusers.schedulers import EulerAncestralDiscreteScheduler,DPMSolverMultistepScheduler # DPM++ SDE Karras

from chat_anything.face_generator.utils.generate import generate

class LongPromptGenerator():
    prompt_template = "A portrait of a {}, fine face, nice looking"
    negative_prompt = "easynegative,Low resolution,Low quality, Opened Mouth"
    # negative_prompt = "(((sexy))),paintings,loli,,big head,sketches, (worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, glans, nsfw, nipples,extra fingers, ((extra arms)), (extra legs), mutated hands, (fused fingers), (too many fingers), (long neck:1.3)"

    def __init__(self, model_dir, lora_path=None, prompt_template="{}", negative_prompt=""):
        self.model_dir = model_dir
        self.lora_path = lora_path
        self.prompt_template = prompt_template
        self.negative_prompt = negative_prompt

    def load_model(self, *args, **kwargs):
        # load model
        try:
            pipe = DiffusionPipeline.from_pretrained(self.model_dir, torch_dtype=torch.float16, **kwargs)
        except:
            pipe = StableDiffusionPipeline.from_pretrained(self.model_dir, torch_dtype=torch.float16, **kwargs)

        pipe = pipe.to('cuda')
        sche_conf = dict(pipe.scheduler.config)
        fk_kwargs = ["skip_prk_steps","steps_offset","clip_sample","clip_sample_range","rescale_betas_zero_snr","timestep_spacing", "set_alpha_to_one"]
        for k in fk_kwargs:
            if k in sche_conf:
                sche_conf.pop(k)
        scheduler = DPMSolverMultistepScheduler(**sche_conf)
        pipe.scheduler=scheduler
        pipe_longprompt = StableDiffusionLongPromptWeightingPipeline(**pipe.components)
        self.pipe, self.pipe_longprompt = pipe, pipe_longprompt
        if self.lora_path is not None:
            pipe.load_lora_weights(self.lora_path)
        self.pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained(self.model_dir, **pipe.components)

    def generate(
        self,
        prompt,
        do_inversion=False,
        **kwargs,
    ):
        """
        Face control generating.
        """
        print('GENERATING:', prompt)
        if not do_inversion:
            generating_conf = {
                "pipe": self.pipe,
                "prompt": prompt,
                "negative_prompt": self.negative_prompt,
                "num_inference_steps": 25,
                "guidance_scale": 7,
            }
        else:
            assert 'image' in kwargs, 'doing inversion, prepare the init image please PIL Image'
            init_image = kwargs['image']
            generating_conf = {
                "pipe": self.pipe_img2img,
                "prompt": prompt,
                "negative_prompt": self.negative_prompt,
                "image": init_image,
                "num_inference_steps": 25,
                "guidance_scale": 7,
                "strength": kwargs.pop('strength', 0.9),
            }
        pipe_out = generate(**generating_conf)
        generated_img = pipe_out[0][0]
        return generated_img
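Usage note (not part of the commit): a minimal sketch of the base LongPromptGenerator, assuming model_dir points at a local Stable Diffusion checkpoint. Both branches of generate() are shown; all paths are placeholders.

# Sketch only -- model and image paths are hypothetical placeholders.
from PIL import Image
from chat_anything.face_generator.long_prompt_generator import LongPromptGenerator

gen = LongPromptGenerator(
    model_dir="MODELS/sd15-checkpoint",                       # placeholder checkpoint dir
    prompt_template="A portrait of a {}, fine face, nice looking",
)
gen.load_model()   # swaps in DPMSolverMultistepScheduler and builds the long-prompt / img2img pipelines

# Plain text-to-image path (do_inversion=False).
img = gen.generate(gen.prompt_template.format("wise old oak tree"))

# Img2img ("inversion") path: an init image must be supplied via kwargs.
init = Image.open("tmp/init.png").convert("RGB")              # placeholder init image
img2 = gen.generate("a watercolor portrait", do_inversion=True, image=init, strength=0.7)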
chat_anything/face_generator/pipelines/lpw_stable_diffusion.py
ADDED
@@ -0,0 +1,1471 @@
import inspect
import re
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import PIL
import torch
from packaging import version
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

from diffusers import DiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils.torch_utils import randn_tensor

from diffusers.utils import (
    PIL_INTERPOLATION,
    deprecate,
    is_accelerate_available,
    is_accelerate_version,
    logging,
)


# ------------------------------------------------------------------------------

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

re_attention = re.compile(
    r"""
\\\(|
\\\)|
\\\[|
\\]|
\\\\|
\\|
\(|
\[|
:([+-]?[.\d]+)\)|
\)|
]|
[^\\()\[\]:]+|
:
""",
    re.X,
)


def parse_prompt_attention(text):
    """
    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
    Accepted tokens are:
      (abc) - increases attention to abc by a multiplier of 1.1
      (abc:3.12) - increases attention to abc by a multiplier of 3.12
      [abc] - decreases attention to abc by a multiplier of 1.1
      \( - literal character '('
      \[ - literal character '['
      \) - literal character ')'
      \] - literal character ']'
      \\ - literal character '\'
      anything else - just text
    >>> parse_prompt_attention('normal text')
    [['normal text', 1.0]]
    >>> parse_prompt_attention('an (important) word')
    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
    >>> parse_prompt_attention('(unbalanced')
    [['unbalanced', 1.1]]
    >>> parse_prompt_attention('\(literal\]')
    [['(literal]', 1.0]]
    >>> parse_prompt_attention('(unnecessary)(parens)')
    [['unnecessaryparens', 1.1]]
    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
    [['a ', 1.0],
     ['house', 1.5730000000000004],
     [' ', 1.1],
     ['on', 1.0],
     [' a ', 1.1],
     ['hill', 0.55],
     [', sun, ', 1.1],
     ['sky', 1.4641000000000006],
     ['.', 1.1]]
    """

    res = []
    round_brackets = []
    square_brackets = []

    round_bracket_multiplier = 1.1
    square_bracket_multiplier = 1 / 1.1

    def multiply_range(start_position, multiplier):
        for p in range(start_position, len(res)):
            res[p][1] *= multiplier

    for m in re_attention.finditer(text):
        text = m.group(0)
        weight = m.group(1)

        if text.startswith("\\"):
            res.append([text[1:], 1.0])
        elif text == "(":
            round_brackets.append(len(res))
        elif text == "[":
            square_brackets.append(len(res))
        elif weight is not None and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), float(weight))
        elif text == ")" and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), round_bracket_multiplier)
        elif text == "]" and len(square_brackets) > 0:
            multiply_range(square_brackets.pop(), square_bracket_multiplier)
        else:
            res.append([text, 1.0])

    for pos in round_brackets:
        multiply_range(pos, round_bracket_multiplier)

    for pos in square_brackets:
        multiply_range(pos, square_bracket_multiplier)

    if len(res) == 0:
        res = [["", 1.0]]

    # merge runs of identical weights
    i = 0
    while i + 1 < len(res):
        if res[i][1] == res[i + 1][1]:
            res[i][0] += res[i + 1][0]
            res.pop(i + 1)
        else:
            i += 1

    return res

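# Worked example (illustrative comment, not part of the original module): in the docstring
# prompt above, 'house' ends up at 1.3 * 1.1 * 1.1 ≈ 1.573 -- the explicit ':1.3' weight, one
# plain ')' close, and the outermost '(' that is never closed and therefore gets the default
# 1.1 multiplier at the end -- while 'sky' gets 1.1 ** 4 ≈ 1.4641 (its own three brackets plus
# that same unclosed outer bracket).
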
def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int):
    r"""
    Tokenize a list of prompts and return its tokens with weights of each token.

    No padding, starting or ending token is included.
    """
    tokens = []
    weights = []
    truncated = False
    for text in prompt:
        texts_and_weights = parse_prompt_attention(text)
        text_token = []
        text_weight = []
        for word, weight in texts_and_weights:
            # tokenize and discard the starting and the ending token
            token = pipe.tokenizer(word).input_ids[1:-1]
            text_token += token
            # copy the weight by length of token
            text_weight += [weight] * len(token)
            # stop if the text is too long (longer than truncation limit)
            if len(text_token) > max_length:
                truncated = True
                break
        # truncate
        if len(text_token) > max_length:
            truncated = True
            text_token = text_token[:max_length]
            text_weight = text_weight[:max_length]
        tokens.append(text_token)
        weights.append(text_weight)
    if truncated:
        logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
    return tokens, weights


def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
    r"""
    Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
    """
    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
    weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
    for i in range(len(tokens)):
        tokens[i] = [bos] + tokens[i] + [pad] * (max_length - 1 - len(tokens[i]) - 1) + [eos]
        if no_boseos_middle:
            weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
        else:
            w = []
            if len(weights[i]) == 0:
                w = [1.0] * weights_length
            else:
                for j in range(max_embeddings_multiples):
                    w.append(1.0)  # weight for starting token in this chunk
                    w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
                    w.append(1.0)  # weight for ending token in this chunk
                w += [1.0] * (weights_length - len(w))
            weights[i] = w[:]

    return tokens, weights


def get_unweighted_text_embeddings(
    pipe: DiffusionPipeline,
    text_input: torch.Tensor,
    chunk_length: int,
    no_boseos_middle: Optional[bool] = True,
):
    """
    When the length of tokens is a multiple of the capacity of the text encoder,
    it should be split into chunks and sent to the text encoder individually.
    """
    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
    if max_embeddings_multiples > 1:
        text_embeddings = []
        for i in range(max_embeddings_multiples):
            # extract the i-th chunk
            text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()

            # cover the head and the tail by the starting and the ending tokens
            text_input_chunk[:, 0] = text_input[0, 0]
            text_input_chunk[:, -1] = text_input[0, -1]
            text_embedding = pipe.text_encoder(text_input_chunk)[0]

            if no_boseos_middle:
                if i == 0:
                    # discard the ending token
                    text_embedding = text_embedding[:, :-1]
                elif i == max_embeddings_multiples - 1:
                    # discard the starting token
                    text_embedding = text_embedding[:, 1:]
                else:
                    # discard both starting and ending tokens
                    text_embedding = text_embedding[:, 1:-1]

            text_embeddings.append(text_embedding)
        text_embeddings = torch.concat(text_embeddings, axis=1)
    else:
        text_embeddings = pipe.text_encoder(text_input)[0]
    return text_embeddings


def get_weighted_text_embeddings(
    pipe: DiffusionPipeline,
    prompt: Union[str, List[str]],
    uncond_prompt: Optional[Union[str, List[str]]] = None,
    max_embeddings_multiples: Optional[int] = 3,
    no_boseos_middle: Optional[bool] = False,
    skip_parsing: Optional[bool] = False,
    skip_weighting: Optional[bool] = False,
):
    r"""
    Prompts can be assigned with local weights using brackets. For example,
    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.

    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.

    Args:
        pipe (`DiffusionPipeline`):
            Pipe to provide access to the tokenizer and the text encoder.
        prompt (`str` or `List[str]`):
            The prompt or prompts to guide the image generation.
        uncond_prompt (`str` or `List[str]`):
            The unconditional prompt or prompts for guide the image generation. If unconditional prompt
            is provided, the embeddings of prompt and uncond_prompt are concatenated.
        max_embeddings_multiples (`int`, *optional*, defaults to `3`):
            The max multiple length of prompt embeddings compared to the max output length of text encoder.
        no_boseos_middle (`bool`, *optional*, defaults to `False`):
            If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
            ending token in each of the chunk in the middle.
        skip_parsing (`bool`, *optional*, defaults to `False`):
            Skip the parsing of brackets.
        skip_weighting (`bool`, *optional*, defaults to `False`):
            Skip the weighting. When the parsing is skipped, it is forced True.
    """
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
    if isinstance(prompt, str):
        prompt = [prompt]

    if not skip_parsing:
        prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
        if uncond_prompt is not None:
            if isinstance(uncond_prompt, str):
                uncond_prompt = [uncond_prompt]
            uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
    else:
        prompt_tokens = [
            token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
        ]
        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
        if uncond_prompt is not None:
            if isinstance(uncond_prompt, str):
                uncond_prompt = [uncond_prompt]
            uncond_tokens = [
                token[1:-1]
                for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
            ]
            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]

    # round up the longest length of tokens to a multiple of (model_max_length - 2)
    max_length = max([len(token) for token in prompt_tokens])
    if uncond_prompt is not None:
        max_length = max(max_length, max([len(token) for token in uncond_tokens]))

    max_embeddings_multiples = min(
        max_embeddings_multiples,
        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
    )
    max_embeddings_multiples = max(1, max_embeddings_multiples)
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2

    # pad the length of tokens and weights
    bos = pipe.tokenizer.bos_token_id
    eos = pipe.tokenizer.eos_token_id
    pad = getattr(pipe.tokenizer, "pad_token_id", eos)
    prompt_tokens, prompt_weights = pad_tokens_and_weights(
        prompt_tokens,
        prompt_weights,
        max_length,
        bos,
        eos,
        pad,
        no_boseos_middle=no_boseos_middle,
        chunk_length=pipe.tokenizer.model_max_length,
    )
    prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
    if uncond_prompt is not None:
        uncond_tokens, uncond_weights = pad_tokens_and_weights(
            uncond_tokens,
            uncond_weights,
            max_length,
            bos,
            eos,
            pad,
            no_boseos_middle=no_boseos_middle,
            chunk_length=pipe.tokenizer.model_max_length,
        )
        uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)

    # get the embeddings
    text_embeddings = get_unweighted_text_embeddings(
        pipe,
        prompt_tokens,
        pipe.tokenizer.model_max_length,
        no_boseos_middle=no_boseos_middle,
    )
    prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
    if uncond_prompt is not None:
        uncond_embeddings = get_unweighted_text_embeddings(
            pipe,
            uncond_tokens,
            pipe.tokenizer.model_max_length,
            no_boseos_middle=no_boseos_middle,
        )
        uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)

    # assign weights to the prompts and normalize in the sense of mean
    # TODO: should we normalize by chunk or in a whole (current implementation)?
    if (not skip_parsing) and (not skip_weighting):
        previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
        text_embeddings *= prompt_weights.unsqueeze(-1)
        current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
        text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
        if uncond_prompt is not None:
            previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
            uncond_embeddings *= uncond_weights.unsqueeze(-1)
            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)

    if uncond_prompt is not None:
        return text_embeddings, uncond_embeddings
    return text_embeddings, None

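# Usage sketch (illustrative comment, not part of the original module): with a loaded Stable
# Diffusion pipeline `pipe`, weighted embeddings for a long prompt could be obtained roughly as
# below. With max_embeddings_multiples=3 and CLIP's 77-token window, the effective prompt budget
# is (77 - 2) * 3 + 2 = 227 tokens.
#
#   text_emb, uncond_emb = get_weighted_text_embeddings(
#       pipe,
#       "a (very beautiful:1.3) masterpiece, extremely detailed oil painting",
#       uncond_prompt="lowres, (bad anatomy:1.2)",
#       max_embeddings_multiples=3,
#   )
#   image = pipe(prompt_embeds=text_emb, negative_prompt_embeds=uncond_emb).images[0]
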
372 |
+
def preprocess_image(image, batch_size):
|
373 |
+
w, h = image.size
|
374 |
+
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
|
375 |
+
image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
|
376 |
+
image = np.array(image).astype(np.float32) / 255.0
|
377 |
+
image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size)
|
378 |
+
image = torch.from_numpy(image)
|
379 |
+
return 2.0 * image - 1.0
|
380 |
+
|
381 |
+
|
382 |
+
def preprocess_mask(mask, batch_size, scale_factor=8):
|
383 |
+
if not isinstance(mask, torch.FloatTensor):
|
384 |
+
mask = mask.convert("L")
|
385 |
+
w, h = mask.size
|
386 |
+
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
|
387 |
+
mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
|
388 |
+
mask = np.array(mask).astype(np.float32) / 255.0
|
389 |
+
mask = np.tile(mask, (4, 1, 1))
|
390 |
+
mask = np.vstack([mask[None]] * batch_size)
|
391 |
+
mask = 1 - mask # repaint white, keep black
|
392 |
+
mask = torch.from_numpy(mask)
|
393 |
+
return mask
|
394 |
+
|
395 |
+
else:
|
396 |
+
valid_mask_channel_sizes = [1, 3]
|
397 |
+
# if mask channel is fourth tensor dimension, permute dimensions to pytorch standard (B, C, H, W)
|
398 |
+
if mask.shape[3] in valid_mask_channel_sizes:
|
399 |
+
mask = mask.permute(0, 3, 1, 2)
|
400 |
+
elif mask.shape[1] not in valid_mask_channel_sizes:
|
401 |
+
raise ValueError(
|
402 |
+
f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension,"
|
403 |
+
f" but received mask of shape {tuple(mask.shape)}"
|
404 |
+
)
|
405 |
+
# (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape
|
406 |
+
mask = mask.mean(dim=1, keepdim=True)
|
407 |
+
h, w = mask.shape[-2:]
|
408 |
+
h, w = (x - x % 8 for x in (h, w)) # resize to integer multiple of 8
|
409 |
+
mask = torch.nn.functional.interpolate(mask, (h // scale_factor, w // scale_factor))
|
410 |
+
return mask
|
411 |
+
|
412 |
+
|
413 |
+
class StableDiffusionLongPromptWeightingPipeline(
|
414 |
+
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin
|
415 |
+
):
|
416 |
+
r"""
|
417 |
+
Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
|
418 |
+
weighting in prompt.
|
419 |
+
|
420 |
+
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
421 |
+
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
422 |
+
|
423 |
+
Args:
|
424 |
+
vae ([`AutoencoderKL`]):
|
425 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
426 |
+
text_encoder ([`CLIPTextModel`]):
|
427 |
+
Frozen text-encoder. Stable Diffusion uses the text portion of
|
428 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
429 |
+
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
430 |
+
tokenizer (`CLIPTokenizer`):
|
431 |
+
Tokenizer of class
|
432 |
+
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
433 |
+
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
434 |
+
scheduler ([`SchedulerMixin`]):
|
435 |
+
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
436 |
+
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
437 |
+
safety_checker ([`StableDiffusionSafetyChecker`]):
|
438 |
+
Classification module that estimates whether generated images could be considered offensive or harmful.
|
439 |
+
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
440 |
+
feature_extractor ([`CLIPImageProcessor`]):
|
441 |
+
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
442 |
+
"""
|
443 |
+
|
444 |
+
_optional_components = ["safety_checker", "feature_extractor"]
|
445 |
+
|
446 |
+
def __init__(
|
447 |
+
self,
|
448 |
+
vae: AutoencoderKL,
|
449 |
+
text_encoder: CLIPTextModel,
|
450 |
+
tokenizer: CLIPTokenizer,
|
451 |
+
unet: UNet2DConditionModel,
|
452 |
+
scheduler: KarrasDiffusionSchedulers,
|
453 |
+
safety_checker: StableDiffusionSafetyChecker,
|
454 |
+
feature_extractor: CLIPImageProcessor,
|
455 |
+
requires_safety_checker: bool = True,
|
456 |
+
):
|
457 |
+
super().__init__()
|
458 |
+
|
459 |
+
if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
|
460 |
+
deprecation_message = (
|
461 |
+
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
|
462 |
+
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
|
463 |
+
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
|
464 |
+
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
|
465 |
+
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
|
466 |
+
" file"
|
467 |
+
)
|
468 |
+
deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
|
469 |
+
new_config = dict(scheduler.config)
|
470 |
+
new_config["steps_offset"] = 1
|
471 |
+
scheduler._internal_dict = FrozenDict(new_config)
|
472 |
+
|
473 |
+
if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
|
474 |
+
deprecation_message = (
|
475 |
+
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
|
476 |
+
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
|
477 |
+
" config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
|
478 |
+
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
|
479 |
+
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
|
480 |
+
)
|
481 |
+
deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
|
482 |
+
new_config = dict(scheduler.config)
|
483 |
+
new_config["clip_sample"] = False
|
484 |
+
scheduler._internal_dict = FrozenDict(new_config)
|
485 |
+
|
486 |
+
if safety_checker is None and requires_safety_checker:
|
487 |
+
logger.warning(
|
488 |
+
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
|
489 |
+
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
|
490 |
+
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
|
491 |
+
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
|
492 |
+
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
|
493 |
+
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
|
494 |
+
)
|
495 |
+
|
496 |
+
if safety_checker is not None and feature_extractor is None:
|
497 |
+
raise ValueError(
|
498 |
+
"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
|
499 |
+
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
|
500 |
+
)
|
501 |
+
|
502 |
+
is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
|
503 |
+
version.parse(unet.config._diffusers_version).base_version
|
504 |
+
) < version.parse("0.9.0.dev0")
|
505 |
+
is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
|
506 |
+
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
|
507 |
+
deprecation_message = (
|
508 |
+
"The configuration file of the unet has set the default `sample_size` to smaller than"
|
509 |
+
" 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
|
510 |
+
" following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
|
511 |
+
" CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
|
512 |
+
" \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
|
513 |
+
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
|
514 |
+
" in the config might lead to incorrect results in future versions. If you have downloaded this"
|
515 |
+
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
|
516 |
+
" the `unet/config.json` file"
|
517 |
+
)
|
518 |
+
deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
|
519 |
+
new_config = dict(unet.config)
|
520 |
+
new_config["sample_size"] = 64
|
521 |
+
unet._internal_dict = FrozenDict(new_config)
|
522 |
+
self.register_modules(
|
523 |
+
vae=vae,
|
524 |
+
text_encoder=text_encoder,
|
525 |
+
tokenizer=tokenizer,
|
526 |
+
unet=unet,
|
527 |
+
scheduler=scheduler,
|
528 |
+
safety_checker=safety_checker,
|
529 |
+
feature_extractor=feature_extractor,
|
530 |
+
)
|
531 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
532 |
+
|
533 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
534 |
+
self.register_to_config(
|
535 |
+
requires_safety_checker=requires_safety_checker,
|
536 |
+
)
|
537 |
+
|
538 |
+
def enable_vae_slicing(self):
|
539 |
+
r"""
|
540 |
+
Enable sliced VAE decoding.
|
541 |
+
|
542 |
+
When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
|
543 |
+
steps. This is useful to save some memory and allow larger batch sizes.
|
544 |
+
"""
|
545 |
+
self.vae.enable_slicing()
|
546 |
+
|
547 |
+
def disable_vae_slicing(self):
|
548 |
+
r"""
|
549 |
+
Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
|
550 |
+
computing decoding in one step.
|
551 |
+
"""
|
552 |
+
self.vae.disable_slicing()
|
553 |
+
|
554 |
+
def enable_vae_tiling(self):
|
555 |
+
r"""
|
556 |
+
Enable tiled VAE decoding.
|
557 |
+
|
558 |
+
When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
|
559 |
+
several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
|
560 |
+
"""
|
561 |
+
self.vae.enable_tiling()
|
562 |
+
|
563 |
+
def disable_vae_tiling(self):
|
564 |
+
r"""
|
565 |
+
Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
|
566 |
+
computing decoding in one step.
|
567 |
+
"""
|
568 |
+
self.vae.disable_tiling()
|
569 |
+
|
570 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
|
571 |
+
def enable_sequential_cpu_offload(self, gpu_id=0):
|
572 |
+
r"""
|
573 |
+
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
574 |
+
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
575 |
+
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
|
576 |
+
Note that offloading happens on a submodule basis. Memory savings are higher than with
|
577 |
+
`enable_model_cpu_offload`, but performance is lower.
|
578 |
+
"""
|
579 |
+
if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
|
580 |
+
from accelerate import cpu_offload
|
581 |
+
else:
|
582 |
+
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
|
583 |
+
|
584 |
+
device = torch.device(f"cuda:{gpu_id}")
|
585 |
+
|
586 |
+
if self.device.type != "cpu":
|
587 |
+
self.to("cpu", silence_dtype_warnings=True)
|
588 |
+
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
|
589 |
+
|
590 |
+
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
|
591 |
+
cpu_offload(cpu_offloaded_model, device)
|
592 |
+
|
593 |
+
if self.safety_checker is not None:
|
594 |
+
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
|
595 |
+
|
596 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
|
597 |
+
def enable_model_cpu_offload(self, gpu_id=0):
|
598 |
+
r"""
|
599 |
+
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
600 |
+
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
|
601 |
+
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
|
602 |
+
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
|
603 |
+
"""
|
604 |
+
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
|
605 |
+
from accelerate import cpu_offload_with_hook
|
606 |
+
else:
|
607 |
+
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
|
608 |
+
|
609 |
+
device = torch.device(f"cuda:{gpu_id}")
|
610 |
+
|
611 |
+
if self.device.type != "cpu":
|
612 |
+
self.to("cpu", silence_dtype_warnings=True)
|
613 |
+
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
|
614 |
+
|
615 |
+
hook = None
|
616 |
+
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
|
617 |
+
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
|
618 |
+
|
619 |
+
if self.safety_checker is not None:
|
620 |
+
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
|
621 |
+
|
622 |
+
# We'll offload the last model manually.
|
623 |
+
self.final_offload_hook = hook
|
624 |
+
|
625 |
+
@property
|
626 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
|
627 |
+
def _execution_device(self):
|
628 |
+
r"""
|
629 |
+
Returns the device on which the pipeline's models will be executed. After calling
|
630 |
+
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
|
631 |
+
hooks.
|
632 |
+
"""
|
633 |
+
if not hasattr(self.unet, "_hf_hook"):
|
634 |
+
return self.device
|
635 |
+
for module in self.unet.modules():
|
636 |
+
if (
|
637 |
+
hasattr(module, "_hf_hook")
|
638 |
+
and hasattr(module._hf_hook, "execution_device")
|
639 |
+
and module._hf_hook.execution_device is not None
|
640 |
+
):
|
641 |
+
return torch.device(module._hf_hook.execution_device)
|
642 |
+
return self.device
|
643 |
+
|
644 |
+
def _encode_prompt(
|
645 |
+
self,
|
646 |
+
prompt,
|
647 |
+
device,
|
648 |
+
num_images_per_prompt,
|
649 |
+
do_classifier_free_guidance,
|
650 |
+
negative_prompt=None,
|
651 |
+
max_embeddings_multiples=3,
|
652 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
653 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
654 |
+
):
|
655 |
+
r"""
|
656 |
+
Encodes the prompt into text encoder hidden states.
|
657 |
+
|
658 |
+
Args:
|
659 |
+
prompt (`str` or `list(int)`):
|
660 |
+
prompt to be encoded
|
661 |
+
device: (`torch.device`):
|
662 |
+
torch device
|
663 |
+
num_images_per_prompt (`int`):
|
664 |
+
number of images that should be generated per prompt
|
665 |
+
do_classifier_free_guidance (`bool`):
|
666 |
+
whether to use classifier free guidance or not
|
667 |
+
negative_prompt (`str` or `List[str]`):
|
668 |
+
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
669 |
+
if `guidance_scale` is less than `1`).
|
670 |
+
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
671 |
+
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
672 |
+
"""
|
673 |
+
if prompt is not None and isinstance(prompt, str):
|
674 |
+
batch_size = 1
|
675 |
+
elif prompt is not None and isinstance(prompt, list):
|
676 |
+
batch_size = len(prompt)
|
677 |
+
else:
|
678 |
+
batch_size = prompt_embeds.shape[0]
|
679 |
+
|
680 |
+
if negative_prompt_embeds is None:
|
681 |
+
if negative_prompt is None:
|
682 |
+
negative_prompt = [""] * batch_size
|
683 |
+
elif isinstance(negative_prompt, str):
|
684 |
+
negative_prompt = [negative_prompt] * batch_size
|
685 |
+
if batch_size != len(negative_prompt):
|
686 |
+
raise ValueError(
|
687 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
688 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
689 |
+
" the batch size of `prompt`."
|
690 |
+
)
|
691 |
+
if prompt_embeds is None or negative_prompt_embeds is None:
|
692 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
693 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
694 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
695 |
+
negative_prompt = self.maybe_convert_prompt(negative_prompt, self.tokenizer)
|
696 |
+
|
697 |
+
prompt_embeds1, negative_prompt_embeds1 = get_weighted_text_embeddings(
|
698 |
+
pipe=self,
|
699 |
+
prompt=prompt,
|
700 |
+
uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
|
701 |
+
max_embeddings_multiples=max_embeddings_multiples,
|
702 |
+
)
|
703 |
+
if prompt_embeds is None:
|
704 |
+
prompt_embeds = prompt_embeds1
|
705 |
+
if negative_prompt_embeds is None:
|
706 |
+
negative_prompt_embeds = negative_prompt_embeds1
|
707 |
+
|
708 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
709 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
710 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
711 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
712 |
+
|
713 |
+
if do_classifier_free_guidance:
|
714 |
+
bs_embed, seq_len, _ = negative_prompt_embeds.shape
|
715 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
716 |
+
negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
717 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
718 |
+
|
719 |
+
return prompt_embeds
|
720 |
+
|
721 |
+
def check_inputs(
|
722 |
+
self,
|
723 |
+
prompt,
|
724 |
+
height,
|
725 |
+
width,
|
726 |
+
strength,
|
727 |
+
callback_steps,
|
728 |
+
negative_prompt=None,
|
729 |
+
prompt_embeds=None,
|
730 |
+
negative_prompt_embeds=None,
|
731 |
+
):
|
732 |
+
if height % 8 != 0 or width % 8 != 0:
|
733 |
+
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
734 |
+
|
735 |
+
if strength < 0 or strength > 1:
|
736 |
+
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
|
737 |
+
|
738 |
+
if (callback_steps is None) or (
|
739 |
+
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
740 |
+
):
|
741 |
+
raise ValueError(
|
742 |
+
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
743 |
+
f" {type(callback_steps)}."
|
744 |
+
)
|
745 |
+
|
746 |
+
if prompt is not None and prompt_embeds is not None:
|
747 |
+
raise ValueError(
|
748 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
749 |
+
" only forward one of the two."
|
750 |
+
)
|
751 |
+
elif prompt is None and prompt_embeds is None:
|
752 |
+
raise ValueError(
|
753 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
754 |
+
)
|
755 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
756 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
757 |
+
|
758 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
759 |
+
raise ValueError(
|
760 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
761 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
762 |
+
)
|
763 |
+
|
764 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
765 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
766 |
+
raise ValueError(
|
767 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
768 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
769 |
+
f" {negative_prompt_embeds.shape}."
|
770 |
+
)
|
771 |
+
|
772 |
+
def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
|
773 |
+
if is_text2img:
|
774 |
+
return self.scheduler.timesteps.to(device), num_inference_steps
|
775 |
+
else:
|
776 |
+
# get the original timestep using init_timestep
|
777 |
+
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
778 |
+
|
779 |
+
t_start = max(num_inference_steps - init_timestep, 0)
|
780 |
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
781 |
+
|
782 |
+
return timesteps, num_inference_steps - t_start
|
783 |
+
|
784 |
+
def run_safety_checker(self, image, device, dtype):
|
785 |
+
if self.safety_checker is not None:
|
786 |
+
safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
|
787 |
+
image, has_nsfw_concept = self.safety_checker(
|
788 |
+
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
|
789 |
+
)
|
790 |
+
else:
|
791 |
+
has_nsfw_concept = None
|
792 |
+
return image, has_nsfw_concept
|
793 |
+
|
794 |
+
def decode_latents(self, latents):
|
795 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
796 |
+
image = self.vae.decode(latents).sample
|
797 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
798 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
799 |
+
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
800 |
+
return image
|
801 |
+
|
802 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
803 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
804 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
805 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
806 |
+
# and should be between [0, 1]
|
807 |
+
|
808 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
809 |
+
extra_step_kwargs = {}
|
810 |
+
if accepts_eta:
|
811 |
+
extra_step_kwargs["eta"] = eta
|
812 |
+
|
813 |
+
# check if the scheduler accepts generator
|
814 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
815 |
+
if accepts_generator:
|
816 |
+
extra_step_kwargs["generator"] = generator
|
817 |
+
return extra_step_kwargs
|
818 |
+
|
819 |
+
def prepare_latents(
|
820 |
+
self,
|
821 |
+
image,
|
822 |
+
timestep,
|
823 |
+
num_images_per_prompt,
|
824 |
+
batch_size,
|
825 |
+
num_channels_latents,
|
826 |
+
height,
|
827 |
+
width,
|
828 |
+
dtype,
|
829 |
+
device,
|
830 |
+
generator,
|
831 |
+
latents=None,
|
832 |
+
):
|
833 |
+
if image is None:
|
834 |
+
batch_size = batch_size * num_images_per_prompt
|
835 |
+
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
836 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
837 |
+
raise ValueError(
|
838 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
839 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
840 |
+
)
|
841 |
+
|
842 |
+
if latents is None:
|
843 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
844 |
+
else:
|
845 |
+
latents = latents.to(device)
|
846 |
+
|
847 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
848 |
+
latents = latents * self.scheduler.init_noise_sigma
|
849 |
+
return latents, None, None
|
850 |
+
else:
|
851 |
+
image = image.to(device=self.device, dtype=dtype)
|
852 |
+
init_latent_dist = self.vae.encode(image).latent_dist
|
853 |
+
init_latents = init_latent_dist.sample(generator=generator)
|
854 |
+
init_latents = self.vae.config.scaling_factor * init_latents
|
855 |
+
|
856 |
+
# Expand init_latents for batch_size and num_images_per_prompt
|
857 |
+
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
|
858 |
+
init_latents_orig = init_latents
|
859 |
+
|
860 |
+
# add noise to latents using the timesteps
|
861 |
+
noise = randn_tensor(init_latents.shape, generator=generator, device=self.device, dtype=dtype)
|
862 |
+
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
863 |
+
latents = init_latents
|
864 |
+
return latents, init_latents_orig, noise
|
865 |
+
|
866 |
+
@torch.no_grad()
|
867 |
+
def __call__(
|
868 |
+
self,
|
869 |
+
prompt: Union[str, List[str]],
|
870 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
871 |
+
image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
872 |
+
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
873 |
+
height: int = 512,
|
874 |
+
width: int = 512,
|
875 |
+
num_inference_steps: int = 50,
|
876 |
+
guidance_scale: float = 7.5,
|
877 |
+
strength: float = 0.8,
|
878 |
+
num_images_per_prompt: Optional[int] = 1,
|
879 |
+
add_predicted_noise: Optional[bool] = False,
|
880 |
+
eta: float = 0.0,
|
881 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
882 |
+
latents: Optional[torch.FloatTensor] = None,
|
883 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
884 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
885 |
+
max_embeddings_multiples: Optional[int] = 3,
|
886 |
+
output_type: Optional[str] = "pil",
|
887 |
+
return_dict: bool = True,
|
888 |
+
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
889 |
+
is_cancelled_callback: Optional[Callable[[], bool]] = None,
|
890 |
+
callback_steps: int = 1,
|
891 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
892 |
+
):
|
893 |
+
r"""
|
894 |
+
Function invoked when calling the pipeline for generation.
|

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            add_predicted_noise (`bool`, *optional*, defaults to True):
                Use predicted noise instead of random noise when constructing noisy versions of the original image in
                the reverse diffusion process.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt, height, width, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            max_embeddings_multiples,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        dtype = prompt_embeds.dtype

        # 4. Preprocess image and mask
        if isinstance(image, PIL.Image.Image):
            image = preprocess_image(image, batch_size)
        if image is not None:
            image = image.to(device=self.device, dtype=dtype)
        if isinstance(mask_image, PIL.Image.Image):
            mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor)
        if mask_image is not None:
            mask = mask_image.to(device=self.device, dtype=dtype)
            mask = torch.cat([mask] * num_images_per_prompt)
        else:
            mask = None

        # 5. set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, image is None)
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

        # 6. Prepare latent variables
        latents, init_latents_orig, noise = self.prepare_latents(
            image,
            latent_timestep,
            num_images_per_prompt,
            batch_size,
            self.unet.config.in_channels,
            height,
            width,
            dtype,
            device,
            generator,
            latents,
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                if mask is not None:
                    # masking
                    if add_predicted_noise:
                        init_latents_proper = self.scheduler.add_noise(
                            init_latents_orig, noise_pred_uncond, torch.tensor([t])
                        )
                    else:
                        init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
                    latents = (init_latents_proper * mask) + (latents * (1 - mask))

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if i % callback_steps == 0:
                        if callback is not None:
                            callback(i, t, latents)
                        if is_cancelled_callback is not None and is_cancelled_callback():
                            return None

        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            # 9. Post-processing
            image = self.decode_latents(latents)

            # 10. Run safety checker
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

            # 11. Convert to PIL
            image = self.numpy_to_pil(image)
        else:
            # 9. Post-processing
            image = self.decode_latents(latents)

            # 10. Run safety checker
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return image, has_nsfw_concept

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    def text2img(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        r"""
        Function for text-to-image generation.
        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            latents=latents,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_embeddings_multiples=max_embeddings_multiples,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            is_cancelled_callback=is_cancelled_callback,
            callback_steps=callback_steps,
            cross_attention_kwargs=cross_attention_kwargs,
        )

    def img2img(
        self,
        image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        r"""
        Function for image-to-image generation.
        Args:
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_embeddings_multiples=max_embeddings_multiples,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            is_cancelled_callback=is_cancelled_callback,
            callback_steps=callback_steps,
            cross_attention_kwargs=cross_attention_kwargs,
        )

    def inpaint(
        self,
        image: Union[torch.FloatTensor, PIL.Image.Image],
        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        add_predicted_noise: Optional[bool] = False,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        r"""
        Function for inpaint.
        Args:
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            add_predicted_noise (`bool`, *optional*, defaults to False):
                Use predicted noise instead of random noise when constructing noisy versions of the original image in
                the reverse diffusion process.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
            num_images_per_prompt=num_images_per_prompt,
            add_predicted_noise=add_predicted_noise,
            eta=eta,
            generator=generator,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_embeddings_multiples=max_embeddings_multiples,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            is_cancelled_callback=is_cancelled_callback,
            callback_steps=callback_steps,
            cross_attention_kwargs=cross_attention_kwargs,
        )
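
Note: `text2img`, `img2img`, and `inpaint` are thin wrappers that forward to `__call__` with the relevant subset of arguments. As a minimal illustrative sketch (not part of this commit; the checkpoint id, prompts, and file names are assumptions), the pipeline can be assembled from an existing Stable Diffusion pipeline's components, mirroring how `generate.py` below does it:

import torch
from PIL import Image
from diffusers import StableDiffusionPipeline
from chat_anything.face_generator.pipelines.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline

# Assumed checkpoint id; any Stable Diffusion 1.x checkpoint should behave the same way.
base = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")
lpw = StableDiffusionLongPromptWeightingPipeline(
    unet=base.unet, text_encoder=base.text_encoder, vae=base.vae,
    tokenizer=base.tokenizer, scheduler=base.scheduler,
    safety_checker=None, feature_extractor=None,
)

# Text-to-image: no image or mask is passed, so strength is not used.
out = lpw.text2img("a portrait of a red fox, highly detailed", num_inference_steps=30)
out.images[0].save("t2i.png")

# Image-to-image: strength < 1 keeps part of the source image's structure.
init = Image.open("t2i.png").convert("RGB").resize((512, 512))
out = lpw.img2img(image=init, prompt="the same fox, watercolor style", strength=0.6)
out.images[0].save("i2i.png")

# Inpainting: white pixels in the (hypothetical) mask.png are repainted, black pixels are kept.
mask = Image.open("mask.png").convert("L").resize((512, 512))
out = lpw.inpaint(image=init, mask_image=mask, prompt="a fox wearing round glasses", strength=0.75)
out.images[0].save("inpaint.png")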
chat_anything/face_generator/utils/generate.py
ADDED
@@ -0,0 +1,45 @@
import torch
from chat_anything.face_generator.pipelines.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline


@torch.no_grad()
def generate(pipe, prompt, negative_prompt, **generating_conf):
    pipe_longprompt = StableDiffusionLongPromptWeightingPipeline(
        unet=pipe.unet,
        text_encoder=pipe.text_encoder,
        vae=pipe.vae,
        tokenizer=pipe.tokenizer,
        scheduler=pipe.scheduler,
        safety_checker=None,
        feature_extractor=None,
    )
    print('generating: ', prompt)
    print('using negative prompt: ', negative_prompt)
    embeds = pipe_longprompt._encode_prompt(
        prompt=prompt,
        negative_prompt=negative_prompt,
        device=pipe.device,
        num_images_per_prompt=1,
        do_classifier_free_guidance=generating_conf['guidance_scale'] > 1,
    )
    negative_prompt_embeds, prompt_embeds = embeds.split(embeds.shape[0] // 2)
    pipe_out = pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        **generating_conf,
    )
    return pipe_out


if __name__ == '__main__':
    from diffusers.pipelines import StableDiffusionPipeline
    import argparse

    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '--prompts', type=str, default=['starry night', 'Impression Sunrise, drawn by Claude Monet'], nargs='*'
        )

        args = parser.parse_args()
        prompts = args.prompts
        print(f'generating {prompts}')
        model_id = 'pretrained_model/sd-v1-4'
        pipe = StableDiffusionPipeline.from_pretrained(model_id).to('cuda')
        images = pipe(prompts).images
        for i, image in enumerate(images):
            image.save(f'{prompts[i]}_{i}.png')

    main()
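
Note: `generate()` encodes the prompts with the long-prompt-weighting pipeline and then hands the resulting embeddings, plus whatever keyword arguments are passed through `generating_conf`, to the regular pipeline call. A minimal usage sketch (illustrative only; the extra generation settings are assumptions, and `guidance_scale` must be present because the function indexes it):

from diffusers import StableDiffusionPipeline
from chat_anything.face_generator.utils.generate import generate

# Model path taken from the file's own __main__ block.
pipe = StableDiffusionPipeline.from_pretrained('pretrained_model/sd-v1-4').to('cuda')
out = generate(
    pipe,
    prompt='a cozy reading room, warm light, extremely detailed, sharp focus',
    negative_prompt='blurry, low quality, deformed',
    guidance_scale=7.5,        # required key in generating_conf
    num_inference_steps=30,    # assumed extra setting, forwarded to pipe(...)
)
out.images[0].save('sample.png')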
chat_anything/polly_utils.py
ADDED
@@ -0,0 +1,635 @@
# This class stores Polly voice data. Specifically, the class stores several records containing
# language, lang_code, gender, voice_id and engine. The class also has a method to return the
# voice_id, lang_code and engine given a language and gender.

NEURAL_ENGINE = "neural"
STANDARD_ENGINE = "standard"


class PollyVoiceData:
    def get_voice(self, language, gender):
        for voice in self.voice_data:
            if voice['language'] == language and voice['gender'] == gender:
                if voice['neural'] == 'Yes':
                    return voice['voice_id'], voice['lang_code'], NEURAL_ENGINE
        for voice in self.voice_data:
            if voice['language'] == language and voice['gender'] == gender:
                if voice['standard'] == 'Yes':
                    return voice['voice_id'], voice['lang_code'], STANDARD_ENGINE
        return None, None, None

    def get_whisper_lang_code(self, language):
        for voice in self.voice_data:
            if voice['language'] == language:
                return voice['whisper_lang_code']
        return "en"

    def __init__(self):
        self.voice_data = [
            {'language': 'Arabic', 'lang_code': 'arb', 'whisper_lang_code': 'ar', 'voice_id': 'Zeina', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Arabic (Gulf)', 'lang_code': 'ar-AE', 'whisper_lang_code': 'ar', 'voice_id': 'Hala', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Catalan', 'lang_code': 'ca-ES', 'whisper_lang_code': 'ca', 'voice_id': 'Arlet', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Chinese (Cantonese)', 'lang_code': 'yue-CN', 'whisper_lang_code': 'zh', 'voice_id': 'Hiujin', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Chinese (Mandarin)', 'lang_code': 'cmn-CN', 'whisper_lang_code': 'zh', 'voice_id': 'Zhiyu', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Danish', 'lang_code': 'da-DK', 'whisper_lang_code': 'da', 'voice_id': 'Naja', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Danish', 'lang_code': 'da-DK', 'whisper_lang_code': 'da', 'voice_id': 'Mads', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Dutch', 'lang_code': 'nl-NL', 'whisper_lang_code': 'nl', 'voice_id': 'Laura', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Dutch', 'lang_code': 'nl-NL', 'whisper_lang_code': 'nl', 'voice_id': 'Lotte', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Dutch', 'lang_code': 'nl-NL', 'whisper_lang_code': 'nl', 'voice_id': 'Ruben', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'English (Australian)', 'lang_code': 'en-AU', 'whisper_lang_code': 'en', 'voice_id': 'Nicole', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'English (Australian)', 'lang_code': 'en-AU', 'whisper_lang_code': 'en', 'voice_id': 'Olivia', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'English (Australian)', 'lang_code': 'en-AU', 'whisper_lang_code': 'en', 'voice_id': 'Russell', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'English (British)', 'lang_code': 'en-GB', 'whisper_lang_code': 'en', 'voice_id': 'Amy', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (British)', 'lang_code': 'en-GB', 'whisper_lang_code': 'en', 'voice_id': 'Emma', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (British)', 'lang_code': 'en-GB', 'whisper_lang_code': 'en', 'voice_id': 'Brian', 'gender': 'Male', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (British)', 'lang_code': 'en-GB', 'whisper_lang_code': 'en', 'voice_id': 'Arthur', 'gender': 'Male', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'English (Indian)', 'lang_code': 'en-IN', 'whisper_lang_code': 'en', 'voice_id': 'Aditi', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'English (Indian)', 'lang_code': 'en-IN', 'whisper_lang_code': 'en', 'voice_id': 'Raveena', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'English (Indian)', 'lang_code': 'en-IN', 'whisper_lang_code': 'en', 'voice_id': 'Kajal', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'English (New Zealand)', 'lang_code': 'en-NZ', 'whisper_lang_code': 'en', 'voice_id': 'Aria', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'English (South African)', 'lang_code': 'en-ZA', 'whisper_lang_code': 'en', 'voice_id': 'Ayanda', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Ivy', 'gender': 'Female (child)', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Joanna', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Kendra', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Kimberly', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Salli', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Joey', 'gender': 'Male', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Justin', 'gender': 'Male (child)', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Kevin', 'gender': 'Male (child)', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'English (US)', 'lang_code': 'en-US', 'whisper_lang_code': 'en', 'voice_id': 'Matthew', 'gender': 'Male', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'English (Welsh)', 'lang_code': 'en-GB-WLS', 'whisper_lang_code': 'en', 'voice_id': 'Geraint', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Finnish', 'lang_code': 'fi-FI', 'whisper_lang_code': 'fi', 'voice_id': 'Suvi', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'French', 'lang_code': 'fr-FR', 'whisper_lang_code': 'fr', 'voice_id': 'Celine', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'French', 'lang_code': 'fr-FR', 'whisper_lang_code': 'fr', 'voice_id': 'Lea', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'French', 'lang_code': 'fr-FR', 'whisper_lang_code': 'fr', 'voice_id': 'Mathieu', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'French (Canadian)', 'lang_code': 'fr-CA', 'whisper_lang_code': 'fr', 'voice_id': 'Chantal', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'French (Canadian)', 'lang_code': 'fr-CA', 'whisper_lang_code': 'fr', 'voice_id': 'Gabrielle', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'French (Canadian)', 'lang_code': 'fr-CA', 'whisper_lang_code': 'fr', 'voice_id': 'Liam', 'gender': 'Male', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'German', 'lang_code': 'de-DE', 'whisper_lang_code': 'de', 'voice_id': 'Marlene', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'German', 'lang_code': 'de-DE', 'whisper_lang_code': 'de', 'voice_id': 'Vicki', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'German', 'lang_code': 'de-DE', 'whisper_lang_code': 'de', 'voice_id': 'Hans', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'German', 'lang_code': 'de-DE', 'whisper_lang_code': 'de', 'voice_id': 'Daniel', 'gender': 'Male', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'German (Austrian)', 'lang_code': 'de-AT', 'whisper_lang_code': 'de', 'voice_id': 'Hannah', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Hindi', 'lang_code': 'hi-IN', 'whisper_lang_code': 'hi', 'voice_id': 'Aditi', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Hindi', 'lang_code': 'hi-IN', 'whisper_lang_code': 'hi', 'voice_id': 'Kajal', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Icelandic', 'lang_code': 'is-IS', 'whisper_lang_code': 'is', 'voice_id': 'Dora', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Icelandic', 'lang_code': 'is-IS', 'whisper_lang_code': 'is', 'voice_id': 'Karl', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Italian', 'lang_code': 'it-IT', 'whisper_lang_code': 'it', 'voice_id': 'Carla', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Italian', 'lang_code': 'it-IT', 'whisper_lang_code': 'it', 'voice_id': 'Bianca', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Japanese', 'lang_code': 'ja-JP', 'whisper_lang_code': 'ja', 'voice_id': 'Mizuki', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Japanese', 'lang_code': 'ja-JP', 'whisper_lang_code': 'ja', 'voice_id': 'Takumi', 'gender': 'Male', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Korean', 'lang_code': 'ko-KR', 'whisper_lang_code': 'ko', 'voice_id': 'Seoyeon', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Norwegian', 'lang_code': 'nb-NO', 'whisper_lang_code': 'no', 'voice_id': 'Liv', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Norwegian', 'lang_code': 'nb-NO', 'whisper_lang_code': 'no', 'voice_id': 'Ida', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Polish', 'lang_code': 'pl-PL', 'whisper_lang_code': 'pl', 'voice_id': 'Ewa', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Polish', 'lang_code': 'pl-PL', 'whisper_lang_code': 'pl', 'voice_id': 'Maja', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Polish', 'lang_code': 'pl-PL', 'whisper_lang_code': 'pl', 'voice_id': 'Jacek', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Polish', 'lang_code': 'pl-PL', 'whisper_lang_code': 'pl', 'voice_id': 'Jan', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Polish', 'lang_code': 'pl-PL', 'whisper_lang_code': 'pl', 'voice_id': 'Ola', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Portuguese (Brazilian)', 'lang_code': 'pt-BR', 'whisper_lang_code': 'pt', 'voice_id': 'Camila', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Portuguese (Brazilian)', 'lang_code': 'pt-BR', 'whisper_lang_code': 'pt', 'voice_id': 'Vitoria', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Portuguese (Brazilian)', 'lang_code': 'pt-BR', 'whisper_lang_code': 'pt', 'voice_id': 'Ricardo', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Portuguese (European)', 'lang_code': 'pt-PT', 'whisper_lang_code': 'pt', 'voice_id': 'Ines', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Portuguese (European)', 'lang_code': 'pt-PT', 'whisper_lang_code': 'pt', 'voice_id': 'Cristiano', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Romanian', 'lang_code': 'ro-RO', 'whisper_lang_code': 'ro', 'voice_id': 'Carmen', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Russian', 'lang_code': 'ru-RU', 'whisper_lang_code': 'ru', 'voice_id': 'Tatyana', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Russian', 'lang_code': 'ru-RU', 'whisper_lang_code': 'ru', 'voice_id': 'Maxim', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Spanish (European)', 'lang_code': 'es-ES', 'whisper_lang_code': 'es', 'voice_id': 'Conchita', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Spanish (European)', 'lang_code': 'es-ES', 'whisper_lang_code': 'es', 'voice_id': 'Lucia', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Spanish (European)', 'lang_code': 'es-ES', 'whisper_lang_code': 'es', 'voice_id': 'Enrique', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Spanish (Mexican)', 'lang_code': 'es-MX', 'whisper_lang_code': 'es', 'voice_id': 'Mia', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Spanish (US)', 'lang_code': 'es-US', 'whisper_lang_code': 'es', 'voice_id': 'Lupe', 'gender': 'Female', 'neural': 'Yes', 'standard': 'Yes'},
            {'language': 'Spanish (US)', 'lang_code': 'es-US', 'whisper_lang_code': 'es', 'voice_id': 'Penelope', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Spanish (US)', 'lang_code': 'es-US', 'whisper_lang_code': 'es', 'voice_id': 'Miguel', 'gender': 'Male', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Spanish (US)', 'lang_code': 'es-US', 'whisper_lang_code': 'es', 'voice_id': 'Pedro', 'gender': 'Male', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Swedish', 'lang_code': 'sv-SE', 'whisper_lang_code': 'sv', 'voice_id': 'Astrid', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Swedish', 'lang_code': 'sv-SE', 'whisper_lang_code': 'sv', 'voice_id': 'Elin', 'gender': 'Female', 'neural': 'Yes', 'standard': 'No'},
            {'language': 'Turkish', 'lang_code': 'tr-TR', 'whisper_lang_code': 'tr', 'voice_id': 'Filiz', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'},
            {'language': 'Welsh', 'lang_code': 'cy-GB', 'whisper_lang_code': 'cy', 'voice_id': 'Gwyneth', 'gender': 'Female', 'neural': 'No', 'standard': 'Yes'}
        ]


# Run from the command-line
if __name__ == '__main__':
    polly_voice_data = PollyVoiceData()

    voice_id, language_code, engine = polly_voice_data.get_voice('English (US)', 'Male')
    print('English (US)', 'Male', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('English (US)', 'Female')
    print('English (US)', 'Female', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('French', 'Female')
    print('French', 'Female', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('French', 'Male')
    print('French', 'Male', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('Japanese', 'Female')
    print('Japanese', 'Female', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('Japanese', 'Male')
    print('Japanese', 'Male', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('Hindi', 'Female')
    print('Hindi', 'Female', voice_id, language_code, engine)

    voice_id, language_code, engine = polly_voice_data.get_voice('Hindi', 'Male')
    print('Hindi', 'Male', voice_id, language_code, engine)

    whisper_lang_code = polly_voice_data.get_whisper_lang_code('English (US)')
    print('English (US) whisper_lang_code:', whisper_lang_code)

    whisper_lang_code = polly_voice_data.get_whisper_lang_code('Chinese (Mandarin)')
    print('Chinese (Mandarin) whisper_lang_code:', whisper_lang_code)

    whisper_lang_code = polly_voice_data.get_whisper_lang_code('Norwegian')
    print('Norwegian whisper_lang_code:', whisper_lang_code)

    whisper_lang_code = polly_voice_data.get_whisper_lang_code('Dutch')
    print('Dutch whisper_lang_code:', whisper_lang_code)

    whisper_lang_code = polly_voice_data.get_whisper_lang_code('Foo')
    print('Foo whisper_lang_code:', whisper_lang_code)
chat_anything/sad_talker/__init__.py
ADDED
File without changes
chat_anything/sad_talker/audio2exp_models/audio2exp.py
ADDED
@@ -0,0 +1,41 @@
from tqdm import tqdm
import torch
from torch import nn


class Audio2Exp(nn.Module):
    def __init__(self, netG, cfg, device, prepare_training_loss=False):
        super(Audio2Exp, self).__init__()
        self.cfg = cfg
        self.device = device
        self.netG = netG.to(device)

    def test(self, batch):

        mel_input = batch['indiv_mels']                         # bs T 1 80 16
        bs = mel_input.shape[0]
        T = mel_input.shape[1]

        exp_coeff_pred = []

        for i in tqdm(range(0, T, 10), 'audio2exp:'):           # every 10 frames

            current_mel_input = mel_input[:, i:i+10]

            #ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64
            ref = batch['ref'][:, :, :64][:, i:i+10]
            ratio = batch['ratio_gt'][:, i:i+10]                # bs T

            audiox = current_mel_input.view(-1, 1, 80, 16)      # bs*T 1 80 16

            curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64

            exp_coeff_pred += [curr_exp_coeff_pred]

        # BS x T x 64
        results_dict = {
            'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1)
        }
        return results_dict

chat_anything/sad_talker/audio2exp_models/networks.py
ADDED
@@ -0,0 +1,74 @@
import torch
import torch.nn.functional as F
from torch import nn

class Conv2d(nn.Module):
    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, use_act=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.conv_block = nn.Sequential(
            nn.Conv2d(cin, cout, kernel_size, stride, padding),
            nn.BatchNorm2d(cout)
        )
        self.act = nn.ReLU()
        self.residual = residual
        self.use_act = use_act

    def forward(self, x):
        out = self.conv_block(x)
        if self.residual:
            out += x

        if self.use_act:
            return self.act(out)
        else:
            return out

class SimpleWrapperV2(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.audio_encoder = nn.Sequential(
            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),
        )

        #### load the pre-trained audio_encoder
        #self.audio_encoder = self.audio_encoder.to(device)
        '''
        wav2lip_state_dict = torch.load('/apdcephfs_cq2/share_1290939/wenxuazhang/checkpoints/wav2lip.pth')['state_dict']
        state_dict = self.audio_encoder.state_dict()

        for k,v in wav2lip_state_dict.items():
            if 'audio_encoder' in k:
                print('init:', k)
                state_dict[k.replace('module.audio_encoder.', '')] = v
        self.audio_encoder.load_state_dict(state_dict)
        '''

        self.mapping1 = nn.Linear(512+64+1, 64)
        #self.mapping2 = nn.Linear(30, 64)
        #nn.init.constant_(self.mapping1.weight, 0.)
        nn.init.constant_(self.mapping1.bias, 0.)

    def forward(self, x, ref, ratio):
        x = self.audio_encoder(x).view(x.size(0), -1)
        ref_reshape = ref.reshape(x.size(0), -1)
        ratio = ratio.reshape(x.size(0), -1)

        y = self.mapping1(torch.cat([x, ref_reshape, ratio], dim=1))
        out = y.reshape(ref.shape[0], ref.shape[1], -1) #+ ref # residual
        return out

chat_anything/sad_talker/audio2pose_models/audio2pose.py
ADDED
@@ -0,0 +1,94 @@
import torch
from torch import nn
from chat_anything.sad_talker.audio2pose_models.cvae import CVAE
from chat_anything.sad_talker.audio2pose_models.discriminator import PoseSequenceDiscriminator
from chat_anything.sad_talker.audio2pose_models.audio_encoder import AudioEncoder

class Audio2Pose(nn.Module):
    def __init__(self, cfg, wav2lip_checkpoint, device='cuda'):
        super().__init__()
        self.cfg = cfg
        self.seq_len = cfg.MODEL.CVAE.SEQ_LEN
        self.latent_dim = cfg.MODEL.CVAE.LATENT_SIZE
        self.device = device

        self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device)
        self.audio_encoder.eval()
        for param in self.audio_encoder.parameters():
            param.requires_grad = False

        self.netG = CVAE(cfg)
        self.netD_motion = PoseSequenceDiscriminator(cfg)


    def forward(self, x):

        batch = {}
        coeff_gt = x['gt'].cuda().squeeze(0)                                     # bs frame_len+1 73
        batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70]  # bs frame_len 6
        batch['ref'] = coeff_gt[:, 0, 64:70]                                     # bs 6
        batch['class'] = x['class'].squeeze(0).cuda()                            # bs
        indiv_mels = x['indiv_mels'].cuda().squeeze(0)                           # bs seq_len+1 80 16

        # forward
        audio_emb_list = []
        audio_emb = self.audio_encoder(indiv_mels[:, 1:, :, :].unsqueeze(2))     # bs seq_len 512
        batch['audio_emb'] = audio_emb
        batch = self.netG(batch)

        pose_motion_pred = batch['pose_motion_pred']                             # bs frame_len 6
        pose_gt = coeff_gt[:, 1:, 64:70].clone()                                 # bs frame_len 6
        pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred                    # bs frame_len 6

        batch['pose_pred'] = pose_pred
        batch['pose_gt'] = pose_gt

        return batch

    def test(self, x):

        batch = {}
        ref = x['ref']                      # bs 1 70
        batch['ref'] = x['ref'][:, 0, -6:]
        batch['class'] = x['class']
        bs = ref.shape[0]

        indiv_mels = x['indiv_mels']        # bs T 1 80 16
        indiv_mels_use = indiv_mels[:, 1:]  # we regard the ref as the first frame
        num_frames = x['num_frames']
        num_frames = int(num_frames) - 1

        #
        div = num_frames // self.seq_len
        re = num_frames % self.seq_len
        audio_emb_list = []
        pose_motion_pred_list = [torch.zeros(batch['ref'].unsqueeze(1).shape, dtype=batch['ref'].dtype,
                                             device=batch['ref'].device)]

        for i in range(div):
            z = torch.randn(bs, self.latent_dim).to(ref.device)
            batch['z'] = z
            audio_emb = self.audio_encoder(indiv_mels_use[:, i*self.seq_len:(i+1)*self.seq_len, :, :, :])  # bs seq_len 512
            batch['audio_emb'] = audio_emb
            batch = self.netG.test(batch)
            pose_motion_pred_list.append(batch['pose_motion_pred'])  # list of bs seq_len 6

        if re != 0:
            z = torch.randn(bs, self.latent_dim).to(ref.device)
            batch['z'] = z
            audio_emb = self.audio_encoder(indiv_mels_use[:, -1*self.seq_len:, :, :, :])  # bs seq_len 512
            if audio_emb.shape[1] != self.seq_len:
                pad_dim = self.seq_len - audio_emb.shape[1]
                pad_audio_emb = audio_emb[:, :1].repeat(1, pad_dim, 1)
                audio_emb = torch.cat([pad_audio_emb, audio_emb], 1)
            batch['audio_emb'] = audio_emb
            batch = self.netG.test(batch)
            pose_motion_pred_list.append(batch['pose_motion_pred'][:, -1*re:, :])

        pose_motion_pred = torch.cat(pose_motion_pred_list, dim=1)
        batch['pose_motion_pred'] = pose_motion_pred

        pose_pred = ref[:, :1, -6:] + pose_motion_pred  # bs T 6

        batch['pose_pred'] = pose_pred
        return batch

chat_anything/sad_talker/audio2pose_models/audio_encoder.py
ADDED
@@ -0,0 +1,64 @@
import torch
from torch import nn
from torch.nn import functional as F

class Conv2d(nn.Module):
    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.conv_block = nn.Sequential(
            nn.Conv2d(cin, cout, kernel_size, stride, padding),
            nn.BatchNorm2d(cout)
        )
        self.act = nn.ReLU()
        self.residual = residual

    def forward(self, x):
        out = self.conv_block(x)
        if self.residual:
            out += x
        return self.act(out)

class AudioEncoder(nn.Module):
    def __init__(self, wav2lip_checkpoint, device):
        super(AudioEncoder, self).__init__()

        self.audio_encoder = nn.Sequential(
            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),

            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)

        #### load the pre-trained audio_encoder, we do not need to load wav2lip model here.
        # wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=torch.device(device))['state_dict']
        # state_dict = self.audio_encoder.state_dict()

        # for k,v in wav2lip_state_dict.items():
        #     if 'audio_encoder' in k:
        #         state_dict[k.replace('module.audio_encoder.', '')] = v
        # self.audio_encoder.load_state_dict(state_dict)


    def forward(self, audio_sequences):
        # audio_sequences = (B, T, 1, 80, 16)
        B = audio_sequences.size(0)

        audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0)

        audio_embedding = self.audio_encoder(audio_sequences)  # B, 512, 1, 1
        dim = audio_embedding.shape[1]
        audio_embedding = audio_embedding.reshape((B, -1, dim, 1, 1))

        return audio_embedding.squeeze(-1).squeeze(-1)  # B seq_len+1 512

chat_anything/sad_talker/audio2pose_models/cvae.py
ADDED
@@ -0,0 +1,149 @@
import torch
import torch.nn.functional as F
from torch import nn
from chat_anything.sad_talker.audio2pose_models.res_unet import ResUnet

def class2onehot(idx, class_num):

    assert torch.max(idx).item() < class_num
    onehot = torch.zeros(idx.size(0), class_num).to(idx.device)
    onehot.scatter_(1, idx, 1)
    return onehot

class CVAE(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        encoder_layer_sizes = cfg.MODEL.CVAE.ENCODER_LAYER_SIZES
        decoder_layer_sizes = cfg.MODEL.CVAE.DECODER_LAYER_SIZES
        latent_size = cfg.MODEL.CVAE.LATENT_SIZE
        num_classes = cfg.DATASET.NUM_CLASSES
        audio_emb_in_size = cfg.MODEL.CVAE.AUDIO_EMB_IN_SIZE
        audio_emb_out_size = cfg.MODEL.CVAE.AUDIO_EMB_OUT_SIZE
        seq_len = cfg.MODEL.CVAE.SEQ_LEN

        self.latent_size = latent_size

        self.encoder = ENCODER(encoder_layer_sizes, latent_size, num_classes,
                               audio_emb_in_size, audio_emb_out_size, seq_len)
        self.decoder = DECODER(decoder_layer_sizes, latent_size, num_classes,
                               audio_emb_in_size, audio_emb_out_size, seq_len)
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, batch):
        batch = self.encoder(batch)
        mu = batch['mu']
        logvar = batch['logvar']
        z = self.reparameterize(mu, logvar)
        batch['z'] = z
        return self.decoder(batch)

    def test(self, batch):
        '''
        class_id = batch['class']
        z = torch.randn([class_id.size(0), self.latent_size]).to(class_id.device)
        batch['z'] = z
        '''
        return self.decoder(batch)

class ENCODER(nn.Module):
    def __init__(self, layer_sizes, latent_size, num_classes,
                 audio_emb_in_size, audio_emb_out_size, seq_len):
        super().__init__()

        self.resunet = ResUnet()
        self.num_classes = num_classes
        self.seq_len = seq_len

        self.MLP = nn.Sequential()
        layer_sizes[0] += latent_size + seq_len*audio_emb_out_size + 6
        for i, (in_size, out_size) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
            self.MLP.add_module(
                name="L{:d}".format(i), module=nn.Linear(in_size, out_size))
            self.MLP.add_module(name="A{:d}".format(i), module=nn.ReLU())

        self.linear_means = nn.Linear(layer_sizes[-1], latent_size)
        self.linear_logvar = nn.Linear(layer_sizes[-1], latent_size)
        self.linear_audio = nn.Linear(audio_emb_in_size, audio_emb_out_size)

        self.classbias = nn.Parameter(torch.randn(self.num_classes, latent_size))

    def forward(self, batch):
        class_id = batch['class']
        pose_motion_gt = batch['pose_motion_gt']  # bs seq_len 6
        ref = batch['ref']                        # bs 6
        bs = pose_motion_gt.shape[0]
        audio_in = batch['audio_emb']             # bs seq_len audio_emb_in_size

        #pose encode
        pose_emb = self.resunet(pose_motion_gt.unsqueeze(1))  # bs 1 seq_len 6
        pose_emb = pose_emb.reshape(bs, -1)                   # bs seq_len*6

        #audio mapping
        print(audio_in.shape)
        audio_out = self.linear_audio(audio_in)   # bs seq_len audio_emb_out_size
        audio_out = audio_out.reshape(bs, -1)

        class_bias = self.classbias[class_id]     # bs latent_size
        x_in = torch.cat([ref, pose_emb, audio_out, class_bias], dim=-1)  # bs seq_len*(audio_emb_out_size+6)+latent_size
        x_out = self.MLP(x_in)

        mu = self.linear_means(x_out)
        logvar = self.linear_means(x_out)         # bs latent_size

        batch.update({'mu':mu, 'logvar':logvar})
        return batch

class DECODER(nn.Module):
    def __init__(self, layer_sizes, latent_size, num_classes,
                 audio_emb_in_size, audio_emb_out_size, seq_len):
        super().__init__()

        self.resunet = ResUnet()
        self.num_classes = num_classes
        self.seq_len = seq_len

        self.MLP = nn.Sequential()
        input_size = latent_size + seq_len*audio_emb_out_size + 6
        for i, (in_size, out_size) in enumerate(zip([input_size]+layer_sizes[:-1], layer_sizes)):
            self.MLP.add_module(
                name="L{:d}".format(i), module=nn.Linear(in_size, out_size))
            if i+1 < len(layer_sizes):
                self.MLP.add_module(name="A{:d}".format(i), module=nn.ReLU())
            else:
                self.MLP.add_module(name="sigmoid", module=nn.Sigmoid())

        self.pose_linear = nn.Linear(6, 6)
        self.linear_audio = nn.Linear(audio_emb_in_size, audio_emb_out_size)

        self.classbias = nn.Parameter(torch.randn(self.num_classes, latent_size))

    def forward(self, batch):

        z = batch['z']                            # bs latent_size
        bs = z.shape[0]
        class_id = batch['class']
        ref = batch['ref']                        # bs 6
        audio_in = batch['audio_emb']             # bs seq_len audio_emb_in_size
        #print('audio_in: ', audio_in[:, :, :10])

        audio_out = self.linear_audio(audio_in)   # bs seq_len audio_emb_out_size
        #print('audio_out: ', audio_out[:, :, :10])
        audio_out = audio_out.reshape([bs, -1])   # bs seq_len*audio_emb_out_size
        class_bias = self.classbias[class_id]     # bs latent_size

        z = z + class_bias
        x_in = torch.cat([ref, z, audio_out], dim=-1)
        x_out = self.MLP(x_in)                    # bs layer_sizes[-1]
        x_out = x_out.reshape((bs, self.seq_len, -1))

        #print('x_out: ', x_out)

        pose_emb = self.resunet(x_out.unsqueeze(1))             # bs 1 seq_len 6

        pose_motion_pred = self.pose_linear(pose_emb.squeeze(1))  # bs seq_len 6

        batch.update({'pose_motion_pred':pose_motion_pred})
        return batch

chat_anything/sad_talker/audio2pose_models/discriminator.py
ADDED
@@ -0,0 +1,76 @@
import torch
import torch.nn.functional as F
from torch import nn

class ConvNormRelu(nn.Module):
    def __init__(self, conv_type='1d', in_channels=3, out_channels=64, downsample=False,
                 kernel_size=None, stride=None, padding=None, norm='BN', leaky=False):
        super().__init__()
        if kernel_size is None:
            if downsample:
                kernel_size, stride, padding = 4, 2, 1
            else:
                kernel_size, stride, padding = 3, 1, 1

        if conv_type == '2d':
            self.conv = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                padding,
                bias=False,
            )
            if norm == 'BN':
                self.norm = nn.BatchNorm2d(out_channels)
            elif norm == 'IN':
                self.norm = nn.InstanceNorm2d(out_channels)
            else:
                raise NotImplementedError
        elif conv_type == '1d':
            self.conv = nn.Conv1d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                padding,
                bias=False,
            )
            if norm == 'BN':
                self.norm = nn.BatchNorm1d(out_channels)
            elif norm == 'IN':
                self.norm = nn.InstanceNorm1d(out_channels)
            else:
                raise NotImplementedError
        nn.init.kaiming_normal_(self.conv.weight)

        self.act = nn.LeakyReLU(negative_slope=0.2, inplace=False) if leaky else nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv(x)
        if isinstance(self.norm, nn.InstanceNorm1d):
            x = self.norm(x.permute((0, 2, 1))).permute((0, 2, 1))  # normalize on [C]
        else:
            x = self.norm(x)
        x = self.act(x)
        return x


class PoseSequenceDiscriminator(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        leaky = self.cfg.MODEL.DISCRIMINATOR.LEAKY_RELU

        self.seq = nn.Sequential(
            ConvNormRelu('1d', cfg.MODEL.DISCRIMINATOR.INPUT_CHANNELS, 256, downsample=True, leaky=leaky),  # B, 256, 64
            ConvNormRelu('1d', 256, 512, downsample=True, leaky=leaky),                                     # B, 512, 32
            ConvNormRelu('1d', 512, 1024, kernel_size=3, stride=1, padding=1, leaky=leaky),                 # B, 1024, 16
            nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1, bias=True)                               # B, 1, 16
        )

    def forward(self, x):
        x = x.reshape(x.size(0), x.size(1), -1).transpose(1, 2)
        x = self.seq(x)
        x = x.squeeze(1)
        return x

chat_anything/sad_talker/audio2pose_models/networks.py
ADDED
@@ -0,0 +1,140 @@
import torch.nn as nn
import torch


class ResidualConv(nn.Module):
    def __init__(self, input_dim, output_dim, stride, padding):
        super(ResidualConv, self).__init__()

        self.conv_block = nn.Sequential(
            nn.BatchNorm2d(input_dim),
            nn.ReLU(),
            nn.Conv2d(
                input_dim, output_dim, kernel_size=3, stride=stride, padding=padding
            ),
            nn.BatchNorm2d(output_dim),
            nn.ReLU(),
            nn.Conv2d(output_dim, output_dim, kernel_size=3, padding=1),
        )
        self.conv_skip = nn.Sequential(
            nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=1),
            nn.BatchNorm2d(output_dim),
        )

    def forward(self, x):

        return self.conv_block(x) + self.conv_skip(x)


class Upsample(nn.Module):
    def __init__(self, input_dim, output_dim, kernel, stride):
        super(Upsample, self).__init__()

        self.upsample = nn.ConvTranspose2d(
            input_dim, output_dim, kernel_size=kernel, stride=stride
        )

    def forward(self, x):
        return self.upsample(x)


class Squeeze_Excite_Block(nn.Module):
    def __init__(self, channel, reduction=16):
        super(Squeeze_Excite_Block, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)


class ASPP(nn.Module):
    def __init__(self, in_dims, out_dims, rate=[6, 12, 18]):
        super(ASPP, self).__init__()

        self.aspp_block1 = nn.Sequential(
            nn.Conv2d(
                in_dims, out_dims, 3, stride=1, padding=rate[0], dilation=rate[0]
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(out_dims),
        )
        self.aspp_block2 = nn.Sequential(
            nn.Conv2d(
                in_dims, out_dims, 3, stride=1, padding=rate[1], dilation=rate[1]
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(out_dims),
        )
        self.aspp_block3 = nn.Sequential(
            nn.Conv2d(
                in_dims, out_dims, 3, stride=1, padding=rate[2], dilation=rate[2]
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(out_dims),
        )

        self.output = nn.Conv2d(len(rate) * out_dims, out_dims, 1)
        self._init_weights()

    def forward(self, x):
        x1 = self.aspp_block1(x)
        x2 = self.aspp_block2(x)
        x3 = self.aspp_block3(x)
        out = torch.cat([x1, x2, x3], dim=1)
        return self.output(out)

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()


class Upsample_(nn.Module):
    def __init__(self, scale=2):
        super(Upsample_, self).__init__()

        self.upsample = nn.Upsample(mode="bilinear", scale_factor=scale)

    def forward(self, x):
        return self.upsample(x)


class AttentionBlock(nn.Module):
    def __init__(self, input_encoder, input_decoder, output_dim):
        super(AttentionBlock, self).__init__()

        self.conv_encoder = nn.Sequential(
            nn.BatchNorm2d(input_encoder),
            nn.ReLU(),
            nn.Conv2d(input_encoder, output_dim, 3, padding=1),
            nn.MaxPool2d(2, 2),
        )

        self.conv_decoder = nn.Sequential(
            nn.BatchNorm2d(input_decoder),
            nn.ReLU(),
            nn.Conv2d(input_decoder, output_dim, 3, padding=1),
        )

        self.conv_attn = nn.Sequential(
            nn.BatchNorm2d(output_dim),
            nn.ReLU(),
            nn.Conv2d(output_dim, 1, 1),
        )

    def forward(self, x1, x2):
        out = self.conv_encoder(x1) + self.conv_decoder(x2)
        out = self.conv_attn(out)
        return out * x2

chat_anything/sad_talker/audio2pose_models/res_unet.py
ADDED
@@ -0,0 +1,65 @@
import torch
import torch.nn as nn
from chat_anything.sad_talker.audio2pose_models.networks import ResidualConv, Upsample


class ResUnet(nn.Module):
    def __init__(self, channel=1, filters=[32, 64, 128, 256]):
        super(ResUnet, self).__init__()

        self.input_layer = nn.Sequential(
            nn.Conv2d(channel, filters[0], kernel_size=3, padding=1),
            nn.BatchNorm2d(filters[0]),
            nn.ReLU(),
            nn.Conv2d(filters[0], filters[0], kernel_size=3, padding=1),
        )
        self.input_skip = nn.Sequential(
            nn.Conv2d(channel, filters[0], kernel_size=3, padding=1)
        )

        self.residual_conv_1 = ResidualConv(filters[0], filters[1], stride=(2,1), padding=1)
        self.residual_conv_2 = ResidualConv(filters[1], filters[2], stride=(2,1), padding=1)

        self.bridge = ResidualConv(filters[2], filters[3], stride=(2,1), padding=1)

        self.upsample_1 = Upsample(filters[3], filters[3], kernel=(2,1), stride=(2,1))
        self.up_residual_conv1 = ResidualConv(filters[3] + filters[2], filters[2], stride=1, padding=1)

        self.upsample_2 = Upsample(filters[2], filters[2], kernel=(2,1), stride=(2,1))
        self.up_residual_conv2 = ResidualConv(filters[2] + filters[1], filters[1], stride=1, padding=1)

        self.upsample_3 = Upsample(filters[1], filters[1], kernel=(2,1), stride=(2,1))
        self.up_residual_conv3 = ResidualConv(filters[1] + filters[0], filters[0], stride=1, padding=1)

        self.output_layer = nn.Sequential(
            nn.Conv2d(filters[0], 1, 1, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        # Encode
        x1 = self.input_layer(x) + self.input_skip(x)
        x2 = self.residual_conv_1(x1)
        x3 = self.residual_conv_2(x2)
        # Bridge
        x4 = self.bridge(x3)

        # Decode
        x4 = self.upsample_1(x4)
        x5 = torch.cat([x4, x3], dim=1)

        x6 = self.up_residual_conv1(x5)

        x6 = self.upsample_2(x6)
        x7 = torch.cat([x6, x2], dim=1)

        x8 = self.up_residual_conv2(x7)

        x8 = self.upsample_3(x8)
        x9 = torch.cat([x8, x1], dim=1)

        x10 = self.up_residual_conv3(x9)

        output = self.output_layer(x10)

        return output

chat_anything/sad_talker/config/auido2exp.yaml
ADDED
@@ -0,0 +1,58 @@
DATASET:
  TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/train.txt
  EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/val.txt
  TRAIN_BATCH_SIZE: 32
  EVAL_BATCH_SIZE: 32
  EXP: True
  EXP_DIM: 64
  FRAME_LEN: 32
  COEFF_LEN: 73
  NUM_CLASSES: 46
  AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav
  COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav2lip_3dmm
  LMDB_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb
  DEBUG: True
  NUM_REPEATS: 2
  T: 40


MODEL:
  FRAMEWORK: V2
  AUDIOENCODER:
    LEAKY_RELU: True
    NORM: 'IN'
  DISCRIMINATOR:
    LEAKY_RELU: False
    INPUT_CHANNELS: 6
  CVAE:
    AUDIO_EMB_IN_SIZE: 512
    AUDIO_EMB_OUT_SIZE: 128
    SEQ_LEN: 32
    LATENT_SIZE: 256
    ENCODER_LAYER_SIZES: [192, 1024]
    DECODER_LAYER_SIZES: [1024, 192]


TRAIN:
  MAX_EPOCH: 300
  GENERATOR:
    LR: 2.0e-5
  DISCRIMINATOR:
    LR: 1.0e-5
  LOSS:
    W_FEAT: 0
    W_COEFF_EXP: 2
    W_LM: 1.0e-2
    W_LM_MOUTH: 0
    W_REG: 0
    W_SYNC: 0
    W_COLOR: 0
    W_EXPRESSION: 0
    W_LIPREADING: 0.01
    W_LIPREADING_VV: 0
    W_EYE_BLINK: 4

TAG:
  NAME: small_dataset

chat_anything/sad_talker/config/auido2pose.yaml
ADDED
@@ -0,0 +1,49 @@
DATASET:
  TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/train_33.txt
  EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/val.txt
  TRAIN_BATCH_SIZE: 64
  EVAL_BATCH_SIZE: 1
  EXP: True
  EXP_DIM: 64
  FRAME_LEN: 32
  COEFF_LEN: 73
  NUM_CLASSES: 46
  AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav
  COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb
  DEBUG: True


MODEL:
  AUDIOENCODER:
    LEAKY_RELU: True
    NORM: 'IN'
  DISCRIMINATOR:
    LEAKY_RELU: False
    INPUT_CHANNELS: 6
  CVAE:
    AUDIO_EMB_IN_SIZE: 512
    AUDIO_EMB_OUT_SIZE: 6
    SEQ_LEN: 32
    LATENT_SIZE: 64
    ENCODER_LAYER_SIZES: [192, 128]
    DECODER_LAYER_SIZES: [128, 192]


TRAIN:
  MAX_EPOCH: 150
  GENERATOR:
    LR: 1.0e-4
  DISCRIMINATOR:
    LR: 1.0e-4
  LOSS:
    LAMBDA_REG: 1
    LAMBDA_LANDMARKS: 0
    LAMBDA_VERTICES: 0
    LAMBDA_GAN_MOTION: 0.7
    LAMBDA_GAN_COEFF: 0
    LAMBDA_KL: 1

TAG:
  NAME: cvae_UNET_useAudio_usewav2lipAudioEncoder

chat_anything/sad_talker/config/facerender.yaml
ADDED
@@ -0,0 +1,45 @@
model_params:
  common_params:
    num_kp: 15
    image_channel: 3
    feature_channel: 32
    estimate_jacobian: False   # True
  kp_detector_params:
    temperature: 0.1
    block_expansion: 32
    max_features: 1024
    scale_factor: 0.25         # 0.25
    num_blocks: 5
    reshape_channel: 16384     # 16384 = 1024 * 16
    reshape_depth: 16
  he_estimator_params:
    block_expansion: 64
    max_features: 2048
    num_bins: 66
  generator_params:
    block_expansion: 64
    max_features: 512
    num_down_blocks: 2
    reshape_channel: 32
    reshape_depth: 16          # 512 = 32 * 16
    num_resblocks: 6
    estimate_occlusion_map: True
    dense_motion_params:
      block_expansion: 32
      max_features: 1024
      num_blocks: 5
      reshape_depth: 16
      compress: 4
  discriminator_params:
    scales: [1]
    block_expansion: 32
    max_features: 512
    num_blocks: 4
    sn: True
  mapping_params:
    coeff_nc: 70
    descriptor_nc: 1024
    layer: 3
    num_kp: 15
    num_bins: 66

chat_anything/sad_talker/config/facerender_still.yaml
ADDED
@@ -0,0 +1,45 @@
model_params:
  common_params:
    num_kp: 15
    image_channel: 3
    feature_channel: 32
    estimate_jacobian: False   # True
  kp_detector_params:
    temperature: 0.1
    block_expansion: 32
    max_features: 1024
    scale_factor: 0.25         # 0.25
    num_blocks: 5
    reshape_channel: 16384     # 16384 = 1024 * 16
    reshape_depth: 16
  he_estimator_params:
    block_expansion: 64
    max_features: 2048
    num_bins: 66
  generator_params:
    block_expansion: 64
    max_features: 512
    num_down_blocks: 2
    reshape_channel: 32
    reshape_depth: 16          # 512 = 32 * 16
    num_resblocks: 6
    estimate_occlusion_map: True
    dense_motion_params:
      block_expansion: 32
      max_features: 1024
      num_blocks: 5
      reshape_depth: 16
      compress: 4
  discriminator_params:
    scales: [1]
    block_expansion: 32
    max_features: 512
    num_blocks: 4
    sn: True
  mapping_params:
    coeff_nc: 73
    descriptor_nc: 1024
    layer: 3
    num_kp: 15
    num_bins: 66

chat_anything/sad_talker/config/similarity_Lm3D_all.mat
ADDED
Binary file (994 Bytes)

chat_anything/sad_talker/face3d/data/__init__.py
ADDED
@@ -0,0 +1,116 @@
"""This package includes all the modules related to data loading and preprocessing

To add a custom dataset class called 'dummy', you need to add a file called 'dummy_dataset.py' and define a subclass 'DummyDataset' inherited from BaseDataset.
You need to implement four functions:
    -- <__init__>:                      initialize the class, first call BaseDataset.__init__(self, opt).
    -- <__len__>:                       return the size of dataset.
    -- <__getitem__>:                   get a data point from data loader.
    -- <modify_commandline_options>:    (optionally) add dataset-specific options and set default options.

Now you can use the dataset class by specifying flag '--dataset_mode dummy'.
See our template dataset class 'template_dataset.py' for more details.
"""
import numpy as np
import importlib
import torch.utils.data
from face3d.data.base_dataset import BaseDataset


def find_dataset_using_name(dataset_name):
    """Import the module "data/[dataset_name]_dataset.py".

    In the file, the class called DatasetNameDataset() will
    be instantiated. It has to be a subclass of BaseDataset,
    and it is case-insensitive.
    """
    dataset_filename = "data." + dataset_name + "_dataset"
    datasetlib = importlib.import_module(dataset_filename)

    dataset = None
    target_dataset_name = dataset_name.replace('_', '') + 'dataset'
    for name, cls in datasetlib.__dict__.items():
        if name.lower() == target_dataset_name.lower() \
           and issubclass(cls, BaseDataset):
            dataset = cls

    if dataset is None:
        raise NotImplementedError("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." % (dataset_filename, target_dataset_name))

    return dataset


def get_option_setter(dataset_name):
    """Return the static method <modify_commandline_options> of the dataset class."""
    dataset_class = find_dataset_using_name(dataset_name)
    return dataset_class.modify_commandline_options


def create_dataset(opt, rank=0):
    """Create a dataset given the option.

    This function wraps the class CustomDatasetDataLoader.
    This is the main interface between this package and 'train.py'/'test.py'

    Example:
        >>> from data import create_dataset
        >>> dataset = create_dataset(opt)
    """
    data_loader = CustomDatasetDataLoader(opt, rank=rank)
    dataset = data_loader.load_data()
    return dataset

class CustomDatasetDataLoader():
    """Wrapper class of Dataset class that performs multi-threaded data loading"""

    def __init__(self, opt, rank=0):
        """Initialize this class

        Step 1: create a dataset instance given the name [dataset_mode]
        Step 2: create a multi-threaded data loader.
        """
        self.opt = opt
        dataset_class = find_dataset_using_name(opt.dataset_mode)
        self.dataset = dataset_class(opt)
        self.sampler = None
        print("rank %d %s dataset [%s] was created" % (rank, self.dataset.name, type(self.dataset).__name__))
        if opt.use_ddp and opt.isTrain:
            world_size = opt.world_size
            self.sampler = torch.utils.data.distributed.DistributedSampler(
                self.dataset,
                num_replicas=world_size,
                rank=rank,
                shuffle=not opt.serial_batches
            )
            self.dataloader = torch.utils.data.DataLoader(
                self.dataset,
                sampler=self.sampler,
                num_workers=int(opt.num_threads / world_size),
                batch_size=int(opt.batch_size / world_size),
                drop_last=True)
        else:
            self.dataloader = torch.utils.data.DataLoader(
                self.dataset,
                batch_size=opt.batch_size,
                shuffle=(not opt.serial_batches) and opt.isTrain,
                num_workers=int(opt.num_threads),
                drop_last=True
            )

    def set_epoch(self, epoch):
        self.dataset.current_epoch = epoch
        if self.sampler is not None:
            self.sampler.set_epoch(epoch)

    def load_data(self):
        return self

    def __len__(self):
        """Return the number of data in the dataset"""
        return min(len(self.dataset), self.opt.max_dataset_size)

    def __iter__(self):
        """Return a batch of data"""
        for i, data in enumerate(self.dataloader):
            if i * self.opt.batch_size >= self.opt.max_dataset_size:
                break
            yield data

chat_anything/sad_talker/face3d/data/base_dataset.py
ADDED
@@ -0,0 +1,125 @@
"""This module implements an abstract base class (ABC) 'BaseDataset' for datasets.

It also includes common transformation functions (e.g., get_transform, __scale_width), which can be later used in subclasses.
"""
import random
import numpy as np
import torch.utils.data as data
from PIL import Image
import torchvision.transforms as transforms
from abc import ABC, abstractmethod


class BaseDataset(data.Dataset, ABC):
    """This class is an abstract base class (ABC) for datasets.

    To create a subclass, you need to implement the following four functions:
    -- <__init__>:                      initialize the class, first call BaseDataset.__init__(self, opt).
    -- <__len__>:                       return the size of dataset.
    -- <__getitem__>:                   get a data point.
    -- <modify_commandline_options>:    (optionally) add dataset-specific options and set default options.
    """

    def __init__(self, opt):
        """Initialize the class; save the options in the class

        Parameters:
            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
        """
        self.opt = opt
        # self.root = opt.dataroot
        self.current_epoch = 0

    @staticmethod
    def modify_commandline_options(parser, is_train):
        """Add new dataset-specific options, and rewrite default values for existing options.

        Parameters:
            parser          -- original option parser
            is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.

        Returns:
            the modified parser.
        """
        return parser

    @abstractmethod
    def __len__(self):
        """Return the total number of images in the dataset."""
        return 0

    @abstractmethod
    def __getitem__(self, index):
        """Return a data point and its metadata information.

        Parameters:
            index -- a random integer for data indexing

        Returns:
            a dictionary of data with their names. It usually contains the data itself and its metadata information.
        """
        pass


def get_transform(grayscale=False):
    transform_list = []
    if grayscale:
        transform_list.append(transforms.Grayscale(1))
    transform_list += [transforms.ToTensor()]
    return transforms.Compose(transform_list)

def get_affine_mat(opt, size):
    shift_x, shift_y, scale, rot_angle, flip = 0., 0., 1., 0., False
    w, h = size

    if 'shift' in opt.preprocess:
        shift_pixs = int(opt.shift_pixs)
        shift_x = random.randint(-shift_pixs, shift_pixs)
        shift_y = random.randint(-shift_pixs, shift_pixs)
    if 'scale' in opt.preprocess:
        scale = 1 + opt.scale_delta * (2 * random.random() - 1)
    if 'rot' in opt.preprocess:
        rot_angle = opt.rot_angle * (2 * random.random() - 1)
        rot_rad = -rot_angle * np.pi/180
    if 'flip' in opt.preprocess:
        flip = random.random() > 0.5

    shift_to_origin = np.array([1, 0, -w//2, 0, 1, -h//2, 0, 0, 1]).reshape([3, 3])
    flip_mat = np.array([-1 if flip else 1, 0, 0, 0, 1, 0, 0, 0, 1]).reshape([3, 3])
    shift_mat = np.array([1, 0, shift_x, 0, 1, shift_y, 0, 0, 1]).reshape([3, 3])
    rot_mat = np.array([np.cos(rot_rad), np.sin(rot_rad), 0, -np.sin(rot_rad), np.cos(rot_rad), 0, 0, 0, 1]).reshape([3, 3])
    scale_mat = np.array([scale, 0, 0, 0, scale, 0, 0, 0, 1]).reshape([3, 3])
    shift_to_center = np.array([1, 0, w//2, 0, 1, h//2, 0, 0, 1]).reshape([3, 3])

    affine = shift_to_center @ scale_mat @ rot_mat @ shift_mat @ flip_mat @ shift_to_origin
    affine_inv = np.linalg.inv(affine)
    return affine, affine_inv, flip

def apply_img_affine(img, affine_inv, method=Image.BICUBIC):
    return img.transform(img.size, Image.AFFINE, data=affine_inv.flatten()[:6], resample=Image.BICUBIC)

def apply_lm_affine(landmark, affine, flip, size):
    _, h = size
    lm = landmark.copy()
    lm[:, 1] = h - 1 - lm[:, 1]
    lm = np.concatenate((lm, np.ones([lm.shape[0], 1])), -1)
    lm = lm @ np.transpose(affine)
    lm[:, :2] = lm[:, :2] / lm[:, 2:]
    lm = lm[:, :2]
    lm[:, 1] = h - 1 - lm[:, 1]
    if flip:
        lm_ = lm.copy()
        lm_[:17] = lm[16::-1]
        lm_[17:22] = lm[26:21:-1]
        lm_[22:27] = lm[21:16:-1]
        lm_[31:36] = lm[35:30:-1]
        lm_[36:40] = lm[45:41:-1]
        lm_[40:42] = lm[47:45:-1]
        lm_[42:46] = lm[39:35:-1]
        lm_[46:48] = lm[41:39:-1]
        lm_[48:55] = lm[54:47:-1]
        lm_[55:60] = lm[59:54:-1]
        lm_[60:65] = lm[64:59:-1]
        lm_[65:68] = lm[67:64:-1]
        lm = lm_
    return lm

chat_anything/sad_talker/face3d/data/flist_dataset.py
ADDED
@@ -0,0 +1,125 @@
"""This script defines the custom dataset for Deep3DFaceRecon_pytorch
"""

import os.path
from data.base_dataset import BaseDataset, get_transform, get_affine_mat, apply_img_affine, apply_lm_affine
from data.image_folder import make_dataset
from PIL import Image
import random
import util.util as util
import numpy as np
import json
import torch
from scipy.io import loadmat, savemat
import pickle
from util.preprocess import align_img, estimate_norm
from util.load_mats import load_lm3d


def default_flist_reader(flist):
    """
    flist format: impath label\nimpath label\n ...(same to caffe's filelist)
    """
    imlist = []
    with open(flist, 'r') as rf:
        for line in rf.readlines():
            impath = line.strip()
            imlist.append(impath)

    return imlist

def jason_flist_reader(flist):
    with open(flist, 'r') as fp:
        info = json.load(fp)
    return info

def parse_label(label):
    return torch.tensor(np.array(label).astype(np.float32))


class FlistDataset(BaseDataset):
    """
    It requires one directory to host training images '/path/to/data/train'
    You can train the model with the dataset flag '--dataroot /path/to/data'.
    """

    def __init__(self, opt):
        """Initialize this dataset class.

        Parameters:
            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
        """
        BaseDataset.__init__(self, opt)

        self.lm3d_std = load_lm3d(opt.bfm_folder)

        msk_names = default_flist_reader(opt.flist)
        self.msk_paths = [os.path.join(opt.data_root, i) for i in msk_names]

        self.size = len(self.msk_paths)
        self.opt = opt

        self.name = 'train' if opt.isTrain else 'val'
        if '_' in opt.flist:
            self.name += '_' + opt.flist.split(os.sep)[-1].split('_')[0]


    def __getitem__(self, index):
        """Return a data point and its metadata information.

        Parameters:
            index (int)     -- a random integer for data indexing

        Returns a dictionary that contains A, B, A_paths and B_paths
            img (tensor)    -- an image in the input domain
            msk (tensor)    -- its corresponding attention mask
            lm (tensor)     -- its corresponding 3d landmarks
            im_paths (str)  -- image paths
            aug_flag (bool) -- a flag used to tell whether its raw or augmented
        """
        msk_path = self.msk_paths[index % self.size]  # make sure index is within the range
        img_path = msk_path.replace('mask/', '')
        lm_path = '.'.join(msk_path.replace('mask', 'landmarks').split('.')[:-1]) + '.txt'

        raw_img = Image.open(img_path).convert('RGB')
        raw_msk = Image.open(msk_path).convert('RGB')
        raw_lm = np.loadtxt(lm_path).astype(np.float32)

        _, img, lm, msk = align_img(raw_img, raw_lm, self.lm3d_std, raw_msk)

        aug_flag = self.opt.use_aug and self.opt.isTrain
        if aug_flag:
            img, lm, msk = self._augmentation(img, lm, self.opt, msk)

        _, H = img.size
        M = estimate_norm(lm, H)
        transform = get_transform()
        img_tensor = transform(img)
        msk_tensor = transform(msk)[:1, ...]
        lm_tensor = parse_label(lm)
        M_tensor = parse_label(M)


        return {'imgs': img_tensor,
                'lms': lm_tensor,
                'msks': msk_tensor,
                'M': M_tensor,
                'im_paths': img_path,
                'aug_flag': aug_flag,
                'dataset': self.name}

    def _augmentation(self, img, lm, opt, msk=None):
        affine, affine_inv, flip = get_affine_mat(opt, img.size)
        img = apply_img_affine(img, affine_inv)
        lm = apply_lm_affine(lm, affine, flip, img.size)
        if msk is not None:
            msk = apply_img_affine(msk, affine_inv, method=Image.BILINEAR)
        return img, lm, msk




    def __len__(self):
        """Return the total number of images in the dataset.
        """
        return self.size

chat_anything/sad_talker/face3d/data/image_folder.py
ADDED
@@ -0,0 +1,66 @@
"""A modified image folder class

We modify the official PyTorch image folder (https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py)
so that this class can load images from both current directory and its subdirectories.
"""
import numpy as np
import torch.utils.data as data

from PIL import Image
import os
import os.path

IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
    '.tif', '.TIF', '.tiff', '.TIFF',
]


def is_image_file(filename):
    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)


def make_dataset(dir, max_dataset_size=float("inf")):
    images = []
    assert os.path.isdir(dir) or os.path.islink(dir), '%s is not a valid directory' % dir

    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
        for fname in fnames:
            if is_image_file(fname):
                path = os.path.join(root, fname)
                images.append(path)
    return images[:min(max_dataset_size, len(images))]


def default_loader(path):
    return Image.open(path).convert('RGB')


class ImageFolder(data.Dataset):

    def __init__(self, root, transform=None, return_paths=False,
                 loader=default_loader):
        imgs = make_dataset(root)
        if len(imgs) == 0:
            raise(RuntimeError("Found 0 images in: " + root + "\n"
                               "Supported image extensions are: " + ",".join(IMG_EXTENSIONS)))

        self.root = root
        self.imgs = imgs
        self.transform = transform
        self.return_paths = return_paths
        self.loader = loader

    def __getitem__(self, index):
        path = self.imgs[index]
        img = self.loader(path)
        if self.transform is not None:
            img = self.transform(img)
        if self.return_paths:
            return img, path
        else:
            return img

    def __len__(self):
        return len(self.imgs)

chat_anything/sad_talker/face3d/data/template_dataset.py
ADDED
@@ -0,0 +1,75 @@
"""Dataset class template

This module provides a template for users to implement custom datasets.
You can specify '--dataset_mode template' to use this dataset.
The class name should be consistent with both the filename and its dataset_mode option.
The filename should be <dataset_mode>_dataset.py
The class name should be <Dataset_mode>Dataset
You need to implement the following functions:
    -- <modify_commandline_options>: Add dataset-specific options and rewrite default values for existing options.
    -- <__init__>: Initialize this dataset class.
    -- <__getitem__>: Return a data point and its metadata information.
    -- <__len__>: Return the number of images.
"""
from data.base_dataset import BaseDataset, get_transform
# from data.image_folder import make_dataset
# from PIL import Image


class TemplateDataset(BaseDataset):
    """A template dataset class for you to implement custom datasets."""
    @staticmethod
    def modify_commandline_options(parser, is_train):
        """Add new dataset-specific options, and rewrite default values for existing options.

        Parameters:
            parser          -- original option parser
            is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.

        Returns:
            the modified parser.
        """
        parser.add_argument('--new_dataset_option', type=float, default=1.0, help='new dataset option')
        parser.set_defaults(max_dataset_size=10, new_dataset_option=2.0)  # specify dataset-specific default values
        return parser

    def __init__(self, opt):
        """Initialize this dataset class.

        Parameters:
            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions

        A few things can be done here.
        - save the options (has been done in BaseDataset)
        - get image paths and meta information of the dataset.
        - define the image transformation.
        """
        # save the option and dataset root
        BaseDataset.__init__(self, opt)
        # get the image paths of your dataset;
        self.image_paths = []  # You can call sorted(make_dataset(self.root, opt.max_dataset_size)) to get all the image paths under the directory self.root
        # define the default transform function. You can use <base_dataset.get_transform>; you can also define your custom transform function
        self.transform = get_transform(opt)

    def __getitem__(self, index):
        """Return a data point and its metadata information.

        Parameters:
            index -- a random integer for data indexing

        Returns:
            a dictionary of data with their names. It usually contains the data itself and its metadata information.

        Step 1: get a random image path: e.g., path = self.image_paths[index]
        Step 2: load your data from the disk: e.g., image = Image.open(path).convert('RGB').
        Step 3: convert your data to a PyTorch tensor. You can use helper functions such as self.transform, e.g., data = self.transform(image)
        Step 4: return a data point as a dictionary.
        """
        path = 'temp'    # needs to be a string
        data_A = None    # needs to be a tensor
        data_B = None    # needs to be a tensor
        return {'data_A': data_A, 'data_B': data_B, 'path': path}

    def __len__(self):
        """Return the total number of images."""
        return len(self.image_paths)
chat_anything/sad_talker/face3d/extract_kp_videos.py
ADDED
@@ -0,0 +1,108 @@
import os
import cv2
import time
import glob
import argparse
import face_alignment
import numpy as np
from PIL import Image
from tqdm import tqdm
from itertools import cycle

from torch.multiprocessing import Pool, Process, set_start_method

class KeypointExtractor():
    def __init__(self, device):
        self.detector = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D,
                                                     device=device)

    def extract_keypoint(self, images, name=None, info=True):
        if isinstance(images, list):
            keypoints = []
            if info:
                i_range = tqdm(images, desc='landmark Det:')
            else:
                i_range = images

            for image in i_range:
                current_kp = self.extract_keypoint(image)
                if np.mean(current_kp) == -1 and keypoints:
                    keypoints.append(keypoints[-1])
                else:
                    keypoints.append(current_kp[None])

            keypoints = np.concatenate(keypoints, 0)
            np.savetxt(os.path.splitext(name)[0] + '.txt', keypoints.reshape(-1))
            return keypoints
        else:
            while True:
                try:
                    keypoints = self.detector.get_landmarks_from_image(np.array(images))[0]
                    break
                except RuntimeError as e:
                    if str(e).startswith('CUDA'):
                        print("Warning: out of memory, sleep for 1s")
                        time.sleep(1)
                    else:
                        print(e)
                        break
                except TypeError:
                    print('No face detected in this image')
                    shape = [68, 2]
                    keypoints = -1. * np.ones(shape)
                    break
            if name is not None:
                np.savetxt(os.path.splitext(name)[0] + '.txt', keypoints.reshape(-1))
            return keypoints

def read_video(filename):
    frames = []
    cap = cv2.VideoCapture(filename)
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)
        else:
            break
    cap.release()
    return frames

def run(data):
    filename, opt, device = data
    os.environ['CUDA_VISIBLE_DEVICES'] = device
    # KeypointExtractor requires a device argument; the visible GPU was selected above.
    kp_extractor = KeypointExtractor(device='cuda')
    images = read_video(filename)
    name = filename.split('/')[-2:]
    os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
    kp_extractor.extract_keypoint(
        images,
        name=os.path.join(opt.output_dir, name[-2], name[-1])
    )

if __name__ == '__main__':
    set_start_method('spawn')
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_dir', type=str, help='the folder of the input files')
    parser.add_argument('--output_dir', type=str, help='the folder of the output files')
    parser.add_argument('--device_ids', type=str, default='0,1')
    parser.add_argument('--workers', type=int, default=4)

    opt = parser.parse_args()
    filenames = list()
    VIDEO_EXTENSIONS_LOWERCASE = {'mp4'}
    VIDEO_EXTENSIONS = VIDEO_EXTENSIONS_LOWERCASE.union({f.upper() for f in VIDEO_EXTENSIONS_LOWERCASE})
    extensions = VIDEO_EXTENSIONS

    for ext in extensions:
        os.listdir(f'{opt.input_dir}')
        print(f'{opt.input_dir}/*.{ext}')
        filenames = sorted(glob.glob(f'{opt.input_dir}/*.{ext}'))
    print('Total number of videos:', len(filenames))
    pool = Pool(opt.workers)
    args_list = cycle([opt])
    device_ids = opt.device_ids.split(",")
    device_ids = cycle(device_ids)
    for data in tqdm(pool.imap_unordered(run, zip(filenames, args_list, device_ids))):
        pass
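
An illustrative single-image call of the extractor above; the file names and the import path (mirroring this file's location) are assumptions.

from PIL import Image
from chat_anything.sad_talker.face3d.extract_kp_videos import KeypointExtractor

extractor = KeypointExtractor(device='cuda')
image = Image.open('face.jpg').convert('RGB')
# Returns a (68, 2) array of landmark coordinates, or all -1s when no face is detected;
# passing name= also writes the flattened coordinates next to that path as a .txt file.
keypoints = extractor.extract_keypoint(image, name='face')
print(keypoints.shape)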
chat_anything/sad_talker/face3d/extract_kp_videos_safe.py
ADDED
@@ -0,0 +1,162 @@
import os
import cv2
import time
import glob
import argparse
import numpy as np
from PIL import Image
import torch
from tqdm import tqdm
from itertools import cycle
from torch.multiprocessing import Pool, Process, set_start_method

from facexlib.alignment import landmark_98_to_68
from facexlib.detection import init_detection_model

from facexlib.utils import load_file_from_url
from chat_anything.sad_talker.face3d.util.my_awing_arch import FAN

def init_alignment_model(model_name, half=False, device='cuda', model_rootpath=None):
    if model_name == 'awing_fan':
        model = FAN(num_modules=4, num_landmarks=98, device=device)
        model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/alignment_WFLW_4HG.pth'
    else:
        raise NotImplementedError(f'{model_name} is not implemented.')

    model_path = load_file_from_url(
        url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath)
    model.load_state_dict(torch.load(model_path, map_location=device)['state_dict'], strict=True)
    model.eval()
    model = model.to(device)
    return model


class KeypointExtractor():
    def __init__(self, device='cuda'):

        ### gfpgan/weights
        try:
            import webui  # in webui
            root_path = 'extensions/SadTalker/gfpgan/weights'

        except:
            # root_path = 'gfpgan/weights'
            root_path = 'MODELS/gfpgan/weights'

        self.detector = init_alignment_model('awing_fan', device=device, model_rootpath=root_path)
        self.det_net = init_detection_model('retinaface_resnet50', half=False, device=device, model_rootpath=root_path)

    def extract_keypoint(self, images, name=None, info=True):
        if isinstance(images, list):
            keypoints = []
            if info:
                i_range = tqdm(images, desc='landmark Det:')
            else:
                i_range = images

            for image in i_range:
                print("detect landmarks")
                current_kp = self.extract_keypoint(image)
                # current_kp = self.detector.get_landmarks(np.array(image))
                if np.mean(current_kp) == -1 and keypoints:
                    keypoints.append(keypoints[-1])
                else:
                    keypoints.append(current_kp[None])

            keypoints = np.concatenate(keypoints, 0)
            np.savetxt(os.path.splitext(name)[0] + '.txt', keypoints.reshape(-1))
            return keypoints
        else:
            print("here")
            while True:
                try:
                    with torch.no_grad():
                        # face detection -> face alignment.
                        img = np.array(images)
                        bboxes = self.det_net.detect_faces(images, 0.97)

                        bboxes = bboxes[0]
                        img = img[int(bboxes[1]):int(bboxes[3]), int(bboxes[0]):int(bboxes[2]), :]

                        landmarks = self.detector.get_landmarks(img)
                        print(landmarks.shape)
                        start_time = time.time()
                        keypoints = landmark_98_to_68(self.detector.get_landmarks(img))  # [0]
                        end_time = time.time()
                        print(type(keypoints))
                        print(keypoints.shape)

                        elapsed_time = end_time - start_time  # time spent on the landmark pass
                        print("landmark detection time: %.4f s" % elapsed_time)
                        #### keypoints to the original location
                        keypoints[:, 0] += int(bboxes[0])
                        keypoints[:, 1] += int(bboxes[1])

                    break
                except RuntimeError as e:
                    if str(e).startswith('CUDA'):
                        print("Warning: out of memory, sleep for 1s")
                        time.sleep(1)
                    else:
                        print(e)
                        break
                except TypeError:
                    print('No face detected in this image')
                    shape = [68, 2]
                    keypoints = -1. * np.ones(shape)
                    break
            if name is not None:
                np.savetxt(os.path.splitext(name)[0] + '.txt', keypoints.reshape(-1))
            return keypoints

def read_video(filename):
    frames = []
    cap = cv2.VideoCapture(filename)
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)
        else:
            break
    cap.release()
    return frames

def run(data):
    filename, opt, device = data
    os.environ['CUDA_VISIBLE_DEVICES'] = device
    kp_extractor = KeypointExtractor()
    images = read_video(filename)
    name = filename.split('/')[-2:]
    os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
    kp_extractor.extract_keypoint(
        images,
        name=os.path.join(opt.output_dir, name[-2], name[-1])
    )

if __name__ == '__main__':
    set_start_method('spawn')
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_dir', type=str, help='the folder of the input files')
    parser.add_argument('--output_dir', type=str, help='the folder of the output files')
    parser.add_argument('--device_ids', type=str, default='0,1')
    parser.add_argument('--workers', type=int, default=4)

    opt = parser.parse_args()
    filenames = list()
    VIDEO_EXTENSIONS_LOWERCASE = {'mp4'}
    VIDEO_EXTENSIONS = VIDEO_EXTENSIONS_LOWERCASE.union({f.upper() for f in VIDEO_EXTENSIONS_LOWERCASE})
    extensions = VIDEO_EXTENSIONS

    for ext in extensions:
        os.listdir(f'{opt.input_dir}')
        print(f'{opt.input_dir}/*.{ext}')
        filenames = sorted(glob.glob(f'{opt.input_dir}/*.{ext}'))
    print('Total number of videos:', len(filenames))
    pool = Pool(opt.workers)
    args_list = cycle([opt])
    device_ids = opt.device_ids.split(",")
    device_ids = cycle(device_ids)
    for data in tqdm(pool.imap_unordered(run, zip(filenames, args_list, device_ids))):
        pass
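
Both extractor variants persist landmarks with np.savetxt on a flattened array; for a video processed as a list of frames, the saved file can be read back as sketched below (the file name is illustrative).

import numpy as np

landmarks = np.loadtxt('results/video/video.txt').reshape(-1, 68, 2)
print(landmarks.shape)  # (num_frames, 68, 2)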
chat_anything/sad_talker/face3d/models/__init__.py
ADDED
@@ -0,0 +1,67 @@
"""This package contains modules related to objective functions, optimizations, and network architectures.

To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel.
You need to implement the following five functions:
    -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
    -- <set_input>: unpack data from dataset and apply preprocessing.
    -- <forward>: produce intermediate results.
    -- <optimize_parameters>: calculate loss, gradients, and update network weights.
    -- <modify_commandline_options>: (optionally) add model-specific options and set default options.

In the function <__init__>, you need to define four lists:
    -- self.loss_names (str list): specify the training losses that you want to plot and save.
    -- self.model_names (str list): define networks used in our training.
    -- self.visual_names (str list): specify the images that you want to display and save.
    -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.

Now you can use the model class by specifying flag '--model dummy'.
See our template model class 'template_model.py' for more details.
"""

import importlib
from chat_anything.sad_talker.face3d.models.base_model import BaseModel


def find_model_using_name(model_name):
    """Import the module "models/[model_name]_model.py".

    In the file, the class called DatasetNameModel() will
    be instantiated. It has to be a subclass of BaseModel,
    and it is case-insensitive.
    """
    model_filename = "face3d.models." + model_name + "_model"
    modellib = importlib.import_module(model_filename)
    model = None
    target_model_name = model_name.replace('_', '') + 'model'
    for name, cls in modellib.__dict__.items():
        if name.lower() == target_model_name.lower() \
           and issubclass(cls, BaseModel):
            model = cls

    if model is None:
        print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
        exit(0)

    return model


def get_option_setter(model_name):
    """Return the static method <modify_commandline_options> of the model class."""
    model_class = find_model_using_name(model_name)
    return model_class.modify_commandline_options


def create_model(opt):
    """Create a model given the option.

    This function wraps the class CustomDatasetDataLoader.
    This is the main interface between this package and 'train.py'/'test.py'

    Example:
        >>> from models import create_model
        >>> model = create_model(opt)
    """
    model = find_model_using_name(opt.model)
    instance = model(opt)
    print("model [%s] was created" % type(instance).__name__)
    return instance
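
The docstring above describes a 'dummy' model, but the commit does not include one; a minimal skeleton consistent with find_model_using_name (which would import face3d.models.dummy_model) might look like the sketch below. The loss name and the no-op method bodies are placeholders, not the project's actual models.

# Hypothetical face3d/models/dummy_model.py skeleton; DummyModel is picked up by
# find_model_using_name('dummy') because its lowercased name is 'dummymodel'.
import torch
from chat_anything.sad_talker.face3d.models.base_model import BaseModel


class DummyModel(BaseModel):
    @staticmethod
    def modify_commandline_options(parser, is_train=True):
        return parser

    def __init__(self, opt):
        BaseModel.__init__(self, opt)
        self.loss_names = ['dummy']   # logged as self.loss_dummy
        self.model_names = []
        self.visual_names = []
        self.optimizers = []

    def set_input(self, input):
        self.input = input

    def forward(self):
        self.output = self.input

    def optimize_parameters(self):
        self.loss_dummy = torch.tensor(0.0)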
chat_anything/sad_talker/face3d/models/arcface_torch/README.md
ADDED
@@ -0,0 +1,164 @@
# Distributed Arcface Training in Pytorch

This is a deep learning library that makes face recognition efficient and effective, and that can train tens of millions of
identities on a single server.

## Requirements

- Install [pytorch](http://pytorch.org) (torch>=1.6.0); see our [install.md](docs/install.md).
- `pip install -r requirements.txt`.
- Download the dataset from [https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_).

## How to Train

To train a model, run `train.py` with the path to the configs:

### 1. Single node, 8 GPUs:

```shell
python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r50
```

### 2. Multiple nodes, each node 8 GPUs:

Node 0:

```shell
python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py configs/ms1mv3_r50
```

Node 1:

```shell
python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py configs/ms1mv3_r50
```

### 3. Training resnet2060 with 8 GPUs:

```shell
python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r2060.py
```

## Model Zoo

- The models are available for non-commercial research purposes only.
- All models can be found here:
- [Baidu Yun Pan](https://pan.baidu.com/s/1CL-l4zWqsI1oDuEEYVhj-g): e8pw
- [onedrive](https://1drv.ms/u/s!AswpsDO2toNKq0lWY69vN58GR6mw?e=p9Ov5d)

### Performance on [**ICCV2021-MFR**](http://iccv21-mfr.com/)

The ICCV2021-MFR testset consists of non-celebrities, so we can ensure it has very little overlap with publicly available face
recognition training sets such as MS1M and CASIA, which were mostly collected from online celebrities.
As a result, we can evaluate the fair performance of different algorithms.

For the **ICCV2021-MFR-ALL** set, TAR is measured on the all-to-all 1:1 protocol, with FAR less than 0.000001 (1e-6). The
globalised multi-racial testset contains 242,143 identities and 1,624,305 images.

For the **ICCV2021-MFR-MASK** set, TAR is measured on the mask-to-nonmask 1:1 protocol, with FAR less than 0.0001 (1e-4).
The mask testset contains 6,964 identities, 6,964 masked images and 13,928 non-masked images.
There are 13,928 positive pairs and 96,983,824 negative pairs in total.

| Datasets | backbone | Training throughput | Size / MB | **ICCV2021-MFR-MASK** | **ICCV2021-MFR-ALL** |
| :---: | :--- | :--- | :--- |:--- |:--- |
| MS1MV3 | r18 | - | 91 | **47.85** | **68.33** |
| Glint360k | r18 | 8536 | 91 | **53.32** | **72.07** |
| MS1MV3 | r34 | - | 130 | **58.72** | **77.36** |
| Glint360k | r34 | 6344 | 130 | **65.10** | **83.02** |
| MS1MV3 | r50 | 5500 | 166 | **63.85** | **80.53** |
| Glint360k | r50 | 5136 | 166 | **70.23** | **87.08** |
| MS1MV3 | r100 | - | 248 | **69.09** | **84.31** |
| Glint360k | r100 | 3332 | 248 | **75.57** | **90.66** |
| MS1MV3 | mobilefacenet | 12185 | 7.8 | **41.52** | **65.26** |
| Glint360k | mobilefacenet | 11197 | 7.8 | **44.52** | **66.48** |

### Performance on IJB-C and Verification Datasets

| Datasets | backbone | IJBC(1e-05) | IJBC(1e-04) | agedb30 | cfp_fp | lfw | log |
| :---: | :--- | :--- | :--- | :--- |:--- |:--- |:--- |
| MS1MV3 | r18 | 92.07 | 94.66 | 97.77 | 97.73 | 99.77 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r18_fp16/training.log)|
| MS1MV3 | r34 | 94.10 | 95.90 | 98.10 | 98.67 | 99.80 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r34_fp16/training.log)|
| MS1MV3 | r50 | 94.79 | 96.46 | 98.35 | 98.96 | 99.83 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r50_fp16/training.log)|
| MS1MV3 | r100 | 95.31 | 96.81 | 98.48 | 99.06 | 99.85 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r100_fp16/training.log)|
| MS1MV3 | **r2060**| 95.34 | 97.11 | 98.67 | 99.24 | 99.87 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r2060_fp16/training.log)|
| Glint360k |r18-0.1 | 93.16 | 95.33 | 97.72 | 97.73 | 99.77 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r18_fp16_0.1/training.log)|
| Glint360k |r34-0.1 | 95.16 | 96.56 | 98.33 | 98.78 | 99.82 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r34_fp16_0.1/training.log)|
| Glint360k |r50-0.1 | 95.61 | 96.97 | 98.38 | 99.20 | 99.83 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r50_fp16_0.1/training.log)|
| Glint360k |r100-0.1 | 95.88 | 97.32 | 98.48 | 99.29 | 99.82 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|

[comment]: <> (More details see [model.md](docs/modelzoo.md) in docs.)


## [Speed Benchmark](docs/speed_benchmark.md)

**Arcface Torch** can train large-scale face recognition training sets efficiently and quickly. When the number of
classes in the training set is greater than 300K and the training is sufficient, the partial FC sampling strategy reaches the same
accuracy with several times faster training and a smaller GPU memory footprint.
Partial FC is a sparse variant of the model parallel architecture for large-scale face recognition. Partial FC uses a
sparse softmax, where each batch dynamically samples a subset of class centers for training. In each iteration, only a
sparse part of the parameters is updated, which saves a lot of GPU memory and computation. With Partial FC,
we can scale the training set to 29 million identities, the largest to date. Partial FC also supports multi-machine distributed
training and mixed precision training.

![Image text](https://github.com/anxiangsir/insightface_arcface_log/blob/master/partial_fc_v2.png)

More details are in [speed_benchmark.md](docs/speed_benchmark.md).

### 1. Training speed of different parallel methods (samples / second), Tesla V100 32GB * 8. (Larger is better)

`-` means training failed because of GPU memory limitations.

| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
| :--- | :--- | :--- | :--- |
|125000 | 4681 | 4824 | 5004 |
|1400000 | **1672** | 3043 | 4738 |
|5500000 | **-** | **1389** | 3975 |
|8000000 | **-** | **-** | 3565 |
|16000000 | **-** | **-** | 2679 |
|29000000 | **-** | **-** | **1855** |

### 2. GPU memory cost of different parallel methods (MB per GPU), Tesla V100 32GB * 8. (Smaller is better)

| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
| :--- | :--- | :--- | :--- |
|125000 | 7358 | 5306 | 4868 |
|1400000 | 32252 | 11178 | 6056 |
|5500000 | **-** | 32188 | 9854 |
|8000000 | **-** | **-** | 12310 |
|16000000 | **-** | **-** | 19950 |
|29000000 | **-** | **-** | 32324 |

## Evaluation ICCV2021-MFR and IJB-C

More details are in [eval.md](docs/eval.md).

## Test

We tested many versions of PyTorch. Please create an issue if you are having trouble.

- [x] torch 1.6.0
- [x] torch 1.7.1
- [x] torch 1.8.0
- [x] torch 1.9.0

## Citation

```
@inproceedings{deng2019arcface,
  title={Arcface: Additive angular margin loss for deep face recognition},
  author={Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={4690--4699},
  year={2019}
}
@inproceedings{an2020partical_fc,
  title={Partial FC: Training 10 Million Identities on a Single Machine},
  author={An, Xiang and Zhu, Xuhan and Xiao, Yang and Wu, Lan and Zhang, Ming and Gao, Yuan and Qin, Bin and
  Zhang, Debing and Fu Ying},
  booktitle={Arxiv 2010.05222},
  year={2020}
}
```
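
The Speed Benchmark section of the README above describes Partial FC as a sparse softmax that samples a subset of class centers per batch. The toy sketch below only illustrates that sampling idea; it is not the insightface implementation, and the function name and numbers are made up.

import torch
import torch.nn.functional as F

def sample_partial_centers(weights, labels, sample_rate=0.1):
    """Keep the positive class centers plus random negatives (toy partial FC)."""
    num_classes = weights.size(0)
    positive = labels.unique()                         # sorted unique positive classes
    num_sample = max(int(sample_rate * num_classes), positive.numel())
    mask = torch.zeros(num_classes, dtype=torch.bool, device=weights.device)
    mask[positive] = True
    perm = torch.randperm(num_classes, device=weights.device)
    negatives = perm[~mask[perm]][: num_sample - positive.numel()]
    index = torch.cat([positive, negatives])           # classes used in this iteration
    new_labels = torch.searchsorted(positive, labels)  # remap labels to the sub-matrix
    return weights[index], new_labels, index

# Toy usage: the softmax only sees ~10% of a 100k-class classifier per step.
weights = torch.randn(100_000, 512, requires_grad=True)
features = F.normalize(torch.randn(32, 512), dim=1)
labels = torch.randint(0, 100_000, (32,))
sub_w, new_labels, index = sample_partial_centers(weights, labels, 0.1)
loss = F.cross_entropy(features @ sub_w.t(), new_labels)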
chat_anything/sad_talker/face3d/models/arcface_torch/backbones/__init__.py
ADDED
@@ -0,0 +1,25 @@
from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200
from .mobilefacenet import get_mbf


def get_model(name, **kwargs):
    # resnet
    if name == "r18":
        return iresnet18(False, **kwargs)
    elif name == "r34":
        return iresnet34(False, **kwargs)
    elif name == "r50":
        return iresnet50(False, **kwargs)
    elif name == "r100":
        return iresnet100(False, **kwargs)
    elif name == "r200":
        return iresnet200(False, **kwargs)
    elif name == "r2060":
        from .iresnet2060 import iresnet2060
        return iresnet2060(False, **kwargs)
    elif name == "mbf":
        fp16 = kwargs.get("fp16", False)
        num_features = kwargs.get("num_features", 512)
        return get_mbf(fp16=fp16, num_features=num_features)
    else:
        raise ValueError()
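
An illustrative call of the factory above: build the r50 backbone and embed a small batch of 112x112 aligned faces. The import path mirrors this file's location and, like the input size, is an assumption of this sketch.

import torch
from chat_anything.sad_talker.face3d.models.arcface_torch.backbones import get_model

net = get_model("r50", fp16=False)
net.eval()
with torch.no_grad():
    emb = net(torch.randn(4, 3, 112, 112))   # batch of aligned face crops
print(emb.shape)                              # torch.Size([4, 512])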
chat_anything/sad_talker/face3d/models/arcface_torch/backbones/iresnet.py
ADDED
@@ -0,0 +1,187 @@
import torch
from torch import nn

__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200']


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=dilation,
                     groups=groups,
                     bias=False,
                     dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=1,
                     stride=stride,
                     bias=False)


class IBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 groups=1, base_width=64, dilation=1):
        super(IBasicBlock, self).__init__()
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,)
        self.conv1 = conv3x3(inplanes, planes)
        self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,)
        self.prelu = nn.PReLU(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.bn1(x)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.prelu(out)
        out = self.conv2(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        return out


class IResNet(nn.Module):
    fc_scale = 7 * 7

    def __init__(self,
                 block, layers, dropout=0, num_features=512, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
        super(IResNet, self).__init__()
        self.fp16 = fp16
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
        self.prelu = nn.PReLU(self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,)
        self.dropout = nn.Dropout(p=dropout, inplace=True)
        self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
        self.features = nn.BatchNorm1d(num_features, eps=1e-05)
        nn.init.constant_(self.features.weight, 1.0)
        self.features.weight.requires_grad = False

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, 0, 0.1)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, IBasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
            )
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, downsample, self.groups,
                  self.base_width, previous_dilation))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(self.inplanes,
                      planes,
                      groups=self.groups,
                      base_width=self.base_width,
                      dilation=self.dilation))

        return nn.Sequential(*layers)

    def forward(self, x):
        with torch.cuda.amp.autocast(self.fp16):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.prelu(x)
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)
            x = self.bn2(x)
            x = torch.flatten(x, 1)
            x = self.dropout(x)
        x = self.fc(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
    model = IResNet(block, layers, **kwargs)
    if pretrained:
        raise ValueError()
    return model


def iresnet18(pretrained=False, progress=True, **kwargs):
    return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained,
                    progress, **kwargs)


def iresnet34(pretrained=False, progress=True, **kwargs):
    return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained,
                    progress, **kwargs)


def iresnet50(pretrained=False, progress=True, **kwargs):
    return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained,
                    progress, **kwargs)


def iresnet100(pretrained=False, progress=True, **kwargs):
    return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained,
                    progress, **kwargs)


def iresnet200(pretrained=False, progress=True, **kwargs):
    return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained,
                    progress, **kwargs)
chat_anything/sad_talker/face3d/models/arcface_torch/backbones/iresnet2060.py
ADDED
@@ -0,0 +1,176 @@
import torch
from torch import nn

assert torch.__version__ >= "1.8.1"
from torch.utils.checkpoint import checkpoint_sequential

__all__ = ['iresnet2060']


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=dilation,
                     groups=groups,
                     bias=False,
                     dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=1,
                     stride=stride,
                     bias=False)


class IBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 groups=1, base_width=64, dilation=1):
        super(IBasicBlock, self).__init__()
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05, )
        self.conv1 = conv3x3(inplanes, planes)
        self.bn2 = nn.BatchNorm2d(planes, eps=1e-05, )
        self.prelu = nn.PReLU(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn3 = nn.BatchNorm2d(planes, eps=1e-05, )
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.bn1(x)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.prelu(out)
        out = self.conv2(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        return out


class IResNet(nn.Module):
    fc_scale = 7 * 7

    def __init__(self,
                 block, layers, dropout=0, num_features=512, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
        super(IResNet, self).__init__()
        self.fp16 = fp16
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
        self.prelu = nn.PReLU(self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05, )
        self.dropout = nn.Dropout(p=dropout, inplace=True)
        self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
        self.features = nn.BatchNorm1d(num_features, eps=1e-05)
        nn.init.constant_(self.features.weight, 1.0)
        self.features.weight.requires_grad = False

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, 0, 0.1)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, IBasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
            )
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, downsample, self.groups,
                  self.base_width, previous_dilation))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(self.inplanes,
                      planes,
                      groups=self.groups,
                      base_width=self.base_width,
                      dilation=self.dilation))

        return nn.Sequential(*layers)

    def checkpoint(self, func, num_seg, x):
        if self.training:
            return checkpoint_sequential(func, num_seg, x)
        else:
            return func(x)

    def forward(self, x):
        with torch.cuda.amp.autocast(self.fp16):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.prelu(x)
            x = self.layer1(x)
            x = self.checkpoint(self.layer2, 20, x)
            x = self.checkpoint(self.layer3, 100, x)
            x = self.layer4(x)
            x = self.bn2(x)
            x = torch.flatten(x, 1)
            x = self.dropout(x)
        x = self.fc(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
    model = IResNet(block, layers, **kwargs)
    if pretrained:
        raise ValueError()
    return model


def iresnet2060(pretrained=False, progress=True, **kwargs):
    return _iresnet('iresnet2060', IBasicBlock, [3, 128, 1024 - 128, 3], pretrained, progress, **kwargs)
chat_anything/sad_talker/face3d/models/arcface_torch/backbones/mobilefacenet.py
ADDED
@@ -0,0 +1,130 @@
'''
Adapted from https://github.com/cavalleria/cavaface.pytorch/blob/master/backbone/mobilefacenet.py
Original author cavalleria
'''

import torch.nn as nn
from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module
import torch


class Flatten(Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


class ConvBlock(Module):
    def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
        super(ConvBlock, self).__init__()
        self.layers = nn.Sequential(
            Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False),
            BatchNorm2d(num_features=out_c),
            PReLU(num_parameters=out_c)
        )

    def forward(self, x):
        return self.layers(x)


class LinearBlock(Module):
    def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
        super(LinearBlock, self).__init__()
        self.layers = nn.Sequential(
            Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False),
            BatchNorm2d(num_features=out_c)
        )

    def forward(self, x):
        return self.layers(x)


class DepthWise(Module):
    def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
        super(DepthWise, self).__init__()
        self.residual = residual
        self.layers = nn.Sequential(
            ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)),
            ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride),
            LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
        )

    def forward(self, x):
        short_cut = None
        if self.residual:
            short_cut = x
        x = self.layers(x)
        if self.residual:
            output = short_cut + x
        else:
            output = x
        return output


class Residual(Module):
    def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
        super(Residual, self).__init__()
        modules = []
        for _ in range(num_block):
            modules.append(DepthWise(c, c, True, kernel, stride, padding, groups))
        self.layers = Sequential(*modules)

    def forward(self, x):
        return self.layers(x)


class GDC(Module):
    def __init__(self, embedding_size):
        super(GDC, self).__init__()
        self.layers = nn.Sequential(
            LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)),
            Flatten(),
            Linear(512, embedding_size, bias=False),
            BatchNorm1d(embedding_size))

    def forward(self, x):
        return self.layers(x)


class MobileFaceNet(Module):
    def __init__(self, fp16=False, num_features=512):
        super(MobileFaceNet, self).__init__()
        scale = 2
        self.fp16 = fp16
        self.layers = nn.Sequential(
            ConvBlock(3, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1)),
            ConvBlock(64 * scale, 64 * scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64),
            DepthWise(64 * scale, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128),
            Residual(64 * scale, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
            DepthWise(64 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256),
            Residual(128 * scale, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
            DepthWise(128 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512),
            Residual(128 * scale, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
        )
        self.conv_sep = ConvBlock(128 * scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
        self.features = GDC(num_features)
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        with torch.cuda.amp.autocast(self.fp16):
            x = self.layers(x)
        x = self.conv_sep(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def get_mbf(fp16, num_features):
    return MobileFaceNet(fp16, num_features)
chat_anything/sad_talker/face3d/models/arcface_torch/configs/3millions.py
ADDED
@@ -0,0 +1,23 @@
from easydict import EasyDict as edict

# configs for test speed

config = edict()
config.loss = "arcface"
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1  # batch size is 512

config.rec = "synthetic"
config.num_classes = 300 * 10000
config.num_epoch = 30
config.warmup_epoch = -1
config.decay_epoch = [10, 16, 22]
config.val_targets = []
chat_anything/sad_talker/face3d/models/arcface_torch/configs/3millions_pfc.py
ADDED
@@ -0,0 +1,23 @@
from easydict import EasyDict as edict

# configs for test speed

config = edict()
config.loss = "arcface"
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 0.1
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1  # batch size is 512

config.rec = "synthetic"
config.num_classes = 300 * 10000
config.num_epoch = 30
config.warmup_epoch = -1
config.decay_epoch = [10, 16, 22]
config.val_targets = []
chat_anything/sad_talker/face3d/models/arcface_torch/configs/__init__.py
ADDED
File without changes
chat_anything/sad_talker/face3d/models/arcface_torch/configs/base.py
ADDED
@@ -0,0 +1,56 @@
from easydict import EasyDict as edict

# make training faster
# our RAM is 256G
# mount -t tmpfs -o size=140G tmpfs /train_tmp

config = edict()
config.loss = "arcface"
config.network = "r50"
config.resume = False
config.output = "ms1mv3_arcface_r50"

config.dataset = "ms1m-retinaface-t1"
config.embedding_size = 512
config.sample_rate = 1
config.fp16 = False
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1  # batch size is 512

if config.dataset == "emore":
    config.rec = "/train_tmp/faces_emore"
    config.num_classes = 85742
    config.num_image = 5822653
    config.num_epoch = 16
    config.warmup_epoch = -1
    config.decay_epoch = [8, 14, ]
    config.val_targets = ["lfw", ]

elif config.dataset == "ms1m-retinaface-t1":
    config.rec = "/train_tmp/ms1m-retinaface-t1"
    config.num_classes = 93431
    config.num_image = 5179510
    config.num_epoch = 25
    config.warmup_epoch = -1
    config.decay_epoch = [11, 17, 22]
    config.val_targets = ["lfw", "cfp_fp", "agedb_30"]

elif config.dataset == "glint360k":
    config.rec = "/train_tmp/glint360k"
    config.num_classes = 360232
    config.num_image = 17091657
    config.num_epoch = 20
    config.warmup_epoch = -1
    config.decay_epoch = [8, 12, 15, 18]
    config.val_targets = ["lfw", "cfp_fp", "agedb_30"]

elif config.dataset == "webface":
    config.rec = "/train_tmp/faces_webface_112x112"
    config.num_classes = 10572
    config.num_image = "forget"
    config.num_epoch = 34
    config.warmup_epoch = -1
    config.decay_epoch = [20, 28, 32]
    config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
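
A small sketch of inspecting the resolved config above; the printed values follow from the "ms1m-retinaface-t1" branch of the file, while the import path (mirroring this file's location) and an installed easydict are assumptions.

from chat_anything.sad_talker.face3d.models.arcface_torch.configs.base import config

print(config.dataset)      # "ms1m-retinaface-t1"
print(config.num_classes)  # 93431, set by the dataset branch above
print(config.rec)          # "/train_tmp/ms1m-retinaface-t1"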
chat_anything/sad_talker/face3d/models/arcface_torch/configs/glint360k_mbf.py
ADDED
@@ -0,0 +1,26 @@
from easydict import EasyDict as edict

# make training faster
# our RAM is 256G
# mount -t tmpfs -o size=140G tmpfs /train_tmp

config = edict()
config.loss = "cosface"
config.network = "mbf"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 0.1
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 2e-4
config.batch_size = 128
config.lr = 0.1  # batch size is 512

config.rec = "/train_tmp/glint360k"
config.num_classes = 360232
config.num_image = 17091657
config.num_epoch = 20
config.warmup_epoch = -1
config.decay_epoch = [8, 12, 15, 18]
config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
chat_anything/sad_talker/face3d/models/arcface_torch/configs/glint360k_r100.py
ADDED
@@ -0,0 +1,26 @@
from easydict import EasyDict as edict

# make training faster
# our RAM is 256G
# mount -t tmpfs -o size=140G tmpfs /train_tmp

config = edict()
config.loss = "cosface"
config.network = "r100"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1  # batch size is 512

config.rec = "/train_tmp/glint360k"
config.num_classes = 360232
config.num_image = 17091657
config.num_epoch = 20
config.warmup_epoch = -1
config.decay_epoch = [8, 12, 15, 18]
config.val_targets = ["lfw", "cfp_fp", "agedb_30"]