sirikan committed on
Commit
d74182a
·
verified ·
1 Parent(s): fc757fa

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +79 -0
  2. .github/workflows/update_space.yml +28 -0
  3. .gitignore +4 -0
  4. LICENSE +201 -0
  5. MODEL_LICENSE +94 -0
  6. README.md +380 -7
  7. README_CN.md +387 -0
  8. build/lib/kolors/__init__.py +0 -0
  9. build/lib/kolors/models/__init__.py +0 -0
  10. build/lib/kolors/models/configuration_chatglm.py +61 -0
  11. build/lib/kolors/models/controlnet.py +887 -0
  12. build/lib/kolors/models/ipa_faceid_plus/__init__.py +0 -0
  13. build/lib/kolors/models/ipa_faceid_plus/attention_processor.py +215 -0
  14. build/lib/kolors/models/ipa_faceid_plus/ipa_faceid_plus.py +137 -0
  15. build/lib/kolors/models/modeling_chatglm.py +1298 -0
  16. build/lib/kolors/models/tokenization_chatglm.py +300 -0
  17. build/lib/kolors/models/unet_2d_condition.py +1318 -0
  18. build/lib/kolors/pipelines/__init__.py +0 -0
  19. build/lib/kolors/pipelines/pipeline_controlnet_xl_kolors_img2img.py +1365 -0
  20. build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256.py +841 -0
  21. build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256_inpainting.py +1790 -0
  22. build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256_ipadapter.py +948 -0
  23. build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256_ipadapter_FaceID.py +951 -0
  24. controlnet/README.md +233 -0
  25. controlnet/annotator/__init__.py +0 -0
  26. controlnet/annotator/canny/__init__.py +6 -0
  27. controlnet/annotator/dwpose/__init__.py +91 -0
  28. controlnet/annotator/dwpose/onnxdet.py +125 -0
  29. controlnet/annotator/dwpose/onnxpose.py +360 -0
  30. controlnet/annotator/dwpose/util.py +297 -0
  31. controlnet/annotator/dwpose/wholebody.py +49 -0
  32. controlnet/annotator/midas/LICENSE +21 -0
  33. controlnet/annotator/midas/__init__.py +35 -0
  34. controlnet/annotator/midas/api.py +169 -0
  35. controlnet/annotator/midas/midas/__init__.py +0 -0
  36. controlnet/annotator/midas/midas/base_model.py +16 -0
  37. controlnet/annotator/midas/midas/blocks.py +342 -0
  38. controlnet/annotator/midas/midas/dpt_depth.py +109 -0
  39. controlnet/annotator/midas/midas/midas_net.py +76 -0
  40. controlnet/annotator/midas/midas/midas_net_custom.py +128 -0
  41. controlnet/annotator/midas/midas/transforms.py +234 -0
  42. controlnet/annotator/midas/midas/vit.py +491 -0
  43. controlnet/annotator/midas/utils.py +189 -0
  44. controlnet/annotator/util.py +129 -0
  45. controlnet/assets/bird.png +3 -0
  46. controlnet/assets/dog.png +3 -0
  47. controlnet/assets/woman_1.png +0 -0
  48. controlnet/assets/woman_2.png +3 -0
  49. controlnet/assets/woman_3.png +3 -0
  50. controlnet/assets/woman_4.png +0 -0
.gitattributes CHANGED
@@ -33,3 +33,82 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ controlnet/assets/bird.png filter=lfs diff=lfs merge=lfs -text
37
+ controlnet/assets/dog.png filter=lfs diff=lfs merge=lfs -text
38
+ controlnet/assets/woman_2.png filter=lfs diff=lfs merge=lfs -text
39
+ controlnet/assets/woman_3.png filter=lfs diff=lfs merge=lfs -text
40
+ controlnet/outputs/Canny_dog_condition.jpg filter=lfs diff=lfs merge=lfs -text
41
+ controlnet/outputs/Canny_woman_1_condition.jpg filter=lfs diff=lfs merge=lfs -text
42
+ controlnet/outputs/Canny_woman_1_sdxl.jpg filter=lfs diff=lfs merge=lfs -text
43
+ controlnet/outputs/Depth_bird_sdxl.jpg filter=lfs diff=lfs merge=lfs -text
44
+ controlnet/outputs/Depth_ipadapter_1.jpg filter=lfs diff=lfs merge=lfs -text
45
+ controlnet/outputs/Depth_ipadapter_woman_2.jpg filter=lfs diff=lfs merge=lfs -text
46
+ controlnet/outputs/Depth_woman_2.jpg filter=lfs diff=lfs merge=lfs -text
47
+ controlnet/outputs/Pose_woman_3.jpg filter=lfs diff=lfs merge=lfs -text
48
+ controlnet/outputs/Pose_woman_3_sdxl.jpg filter=lfs diff=lfs merge=lfs -text
49
+ controlnet/outputs/Pose_woman_4.jpg filter=lfs diff=lfs merge=lfs -text
50
+ dist/kolors-0.1-py3.8.egg filter=lfs diff=lfs merge=lfs -text
51
+ dreambooth/ktxl_test_image.png filter=lfs diff=lfs merge=lfs -text
52
+ imgs/Kolors_paper.pdf filter=lfs diff=lfs merge=lfs -text
53
+ imgs/cn_all.png filter=lfs diff=lfs merge=lfs -text
54
+ imgs/fz_all.png filter=lfs diff=lfs merge=lfs -text
55
+ imgs/head_final3.png filter=lfs diff=lfs merge=lfs -text
56
+ imgs/wz_all.png filter=lfs diff=lfs merge=lfs -text
57
+ imgs/zl8.png filter=lfs diff=lfs merge=lfs -text
58
+ inpainting/asset/1.png filter=lfs diff=lfs merge=lfs -text
59
+ inpainting/asset/1_kolors.png filter=lfs diff=lfs merge=lfs -text
60
+ inpainting/asset/1_masked.png filter=lfs diff=lfs merge=lfs -text
61
+ inpainting/asset/1_sdxl.png filter=lfs diff=lfs merge=lfs -text
62
+ inpainting/asset/2.png filter=lfs diff=lfs merge=lfs -text
63
+ inpainting/asset/2_kolors.png filter=lfs diff=lfs merge=lfs -text
64
+ inpainting/asset/2_masked.png filter=lfs diff=lfs merge=lfs -text
65
+ inpainting/asset/2_sdxl.png filter=lfs diff=lfs merge=lfs -text
66
+ inpainting/asset/3.png filter=lfs diff=lfs merge=lfs -text
67
+ inpainting/asset/3_masked.png filter=lfs diff=lfs merge=lfs -text
68
+ inpainting/asset/3_sdxl.png filter=lfs diff=lfs merge=lfs -text
69
+ inpainting/asset/4.png filter=lfs diff=lfs merge=lfs -text
70
+ inpainting/asset/4_kolors.png filter=lfs diff=lfs merge=lfs -text
71
+ inpainting/asset/4_masked.png filter=lfs diff=lfs merge=lfs -text
72
+ inpainting/asset/4_sdxl.png filter=lfs diff=lfs merge=lfs -text
73
+ inpainting/asset/5.png filter=lfs diff=lfs merge=lfs -text
74
+ inpainting/asset/5_kolors.png filter=lfs diff=lfs merge=lfs -text
75
+ inpainting/asset/5_masked.png filter=lfs diff=lfs merge=lfs -text
76
+ inpainting/asset/5_sdxl.png filter=lfs diff=lfs merge=lfs -text
77
+ ipadapter/asset/1.png filter=lfs diff=lfs merge=lfs -text
78
+ ipadapter/asset/1_kolors_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
79
+ ipadapter/asset/1_mj_cw_result.png filter=lfs diff=lfs merge=lfs -text
80
+ ipadapter/asset/1_sdxl_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
81
+ ipadapter/asset/2.png filter=lfs diff=lfs merge=lfs -text
82
+ ipadapter/asset/2_kolors_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
83
+ ipadapter/asset/2_mj_cw_result.png filter=lfs diff=lfs merge=lfs -text
84
+ ipadapter/asset/2_sdxl_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
85
+ ipadapter/asset/3.png filter=lfs diff=lfs merge=lfs -text
86
+ ipadapter/asset/3_kolors_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
87
+ ipadapter/asset/3_mj_cw_result.png filter=lfs diff=lfs merge=lfs -text
88
+ ipadapter/asset/4.png filter=lfs diff=lfs merge=lfs -text
89
+ ipadapter/asset/4_kolors_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
90
+ ipadapter/asset/4_mj_cw_result.png filter=lfs diff=lfs merge=lfs -text
91
+ ipadapter/asset/4_sdxl_ip_result.jpg filter=lfs diff=lfs merge=lfs -text
92
+ ipadapter/asset/5.png filter=lfs diff=lfs merge=lfs -text
93
+ ipadapter/asset/5_mj_cw_result.png filter=lfs diff=lfs merge=lfs -text
94
+ ipadapter/asset/test_ip.jpg filter=lfs diff=lfs merge=lfs -text
95
+ ipadapter/asset/test_ip2.png filter=lfs diff=lfs merge=lfs -text
96
+ ipadapter_FaceID/assets/image1.png filter=lfs diff=lfs merge=lfs -text
97
+ ipadapter_FaceID/assets/image1_res.png filter=lfs diff=lfs merge=lfs -text
98
+ ipadapter_FaceID/assets/image2.png filter=lfs diff=lfs merge=lfs -text
99
+ ipadapter_FaceID/assets/image2_res.png filter=lfs diff=lfs merge=lfs -text
100
+ ipadapter_FaceID/assets/test_img1_Kolors.png filter=lfs diff=lfs merge=lfs -text
101
+ ipadapter_FaceID/assets/test_img1_SDXL.png filter=lfs diff=lfs merge=lfs -text
102
+ ipadapter_FaceID/assets/test_img1_org.png filter=lfs diff=lfs merge=lfs -text
103
+ ipadapter_FaceID/assets/test_img2_Kolors.png filter=lfs diff=lfs merge=lfs -text
104
+ ipadapter_FaceID/assets/test_img2_SDXL.png filter=lfs diff=lfs merge=lfs -text
105
+ ipadapter_FaceID/assets/test_img2_org.png filter=lfs diff=lfs merge=lfs -text
106
+ ipadapter_FaceID/assets/test_img3_Kolors.png filter=lfs diff=lfs merge=lfs -text
107
+ ipadapter_FaceID/assets/test_img3_SDXL.png filter=lfs diff=lfs merge=lfs -text
108
+ ipadapter_FaceID/assets/test_img3_org.png filter=lfs diff=lfs merge=lfs -text
109
+ ipadapter_FaceID/assets/test_img4_Kolors.png filter=lfs diff=lfs merge=lfs -text
110
+ ipadapter_FaceID/assets/test_img4_SDXL.png filter=lfs diff=lfs merge=lfs -text
111
+ ipadapter_FaceID/assets/test_img4_org.png filter=lfs diff=lfs merge=lfs -text
112
+ scripts/outputs/sample_inpainting_4.jpg filter=lfs diff=lfs merge=lfs -text
113
+ scripts/outputs/sample_ip_test_ip.jpg filter=lfs diff=lfs merge=lfs -text
114
+ scripts/outputs/sample_test.jpg filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Run Python script
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v2
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: '3.9'
20
+
21
+ - name: Install Gradio
22
+ run: python -m pip install gradio
23
+
24
+ - name: Log in to Hugging Face
25
+ run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
+
27
+ - name: Deploy to Spaces
28
+ run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ weights
2
+ *.egg-info
3
+ __pycache__
4
+ *.pyc
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
MODEL_LICENSE ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 中文版
2
+ 模型许可协议
3
+ 模型发布日期:2024/7/6
4
+
5
+ 通过点击同意或使用、复制、修改、分发、表演或展示模型作品的任何部分或元素,您将被视为已承认并接受本协议的内容,本协议立即生效。
6
+
7
+ 1.定义。
8
+ a. “协议”指本协议中所规定的使用、复制、分发、修改、表演和展示模型作品或其任何部分或元素的条款和条件。
9
+ b. “材料”是指根据本协议提供的专有的模型和文档(及其任何部分)的统称。
10
+ c. “模型”指大型语言模型、图像/视频/音频/3D 生成模型、多模态大型语言模型及其软件和算法,包括训练后的模型权重、参数(包括优化器状态)、机器学习模型代码、推理支持代码、训练支持代码、微调支持代码以及我们公开提供的前述其他元素。
11
+ d. “输出”是指通过操作或以其他方式使用模型或模型衍生品而产生的模型或模型衍生品的信息和/或内容输出。
12
+ e. “模型衍生品”包括:(i)对模型或任何模型衍生物的修改;(ii)基于模型的任何模型衍生物的作品;或(iii)通过将模型或模型的任何模型衍生物的权重、参数、操作或输出的模式转移到该模型而创建的任何其他机器学习模型,以使该模型的性能类似于模型或模型衍生物。为清楚起见,输出本身不被视为模型衍生物。
13
+ f. “模型作品”包括:(i)材料;(ii)模型衍生品;及(iii)其所有衍生作品。
14
+ g. “许可人”或“我们”指作品所有者或作品所有者授权的授予许可的实体,包括可能对模型和/或分发模型拥有权利的个人或实体。
15
+ h.“被许可人”、“您”或“您的”是指行使本协议授予的权利和/或为任何目的和在任何使用领域使用模型作品的自然人或法人实体。
16
+ i.“第三方”是指不受我们或您共同控制的个人或法人实体。
17
+
18
+ 2. 许可内容。
19
+ a.我们授予您非排他性的、全球性的、不可转让的、免版税的许可(在我们的知识产权或我们拥有的体现在材料中或利用材料的其他权利的范围内),允许您仅根据本协议的条款使用、复制、分发、创作衍生作品(包括模型衍生品)和对材料进行修改,并且您不得违反(或鼓励、或允许任何其他人违反)本协议的任何条款。
20
+ b.在遵守本协议的前提下,您可以分发或向第三方提供模型作品,您须满足以下条件:
21
+ (i)您必须向所有该模型作品或使用该作品的产品或服务的任何第三方接收者提供模型作品的来源和本协议的副本;
22
+ (ii)您必须在任何修改过的文档上附加明显的声明,说明您更改了这些文档;
23
+ (iii)您可以在您的修改中添加您自己的版权声明,并且,在您对该作品的使用、复制、修改、分发、表演和展示符合本协议的条款和条件的前提下,您可以为您的修改或任何此类模型衍生品的使用、复制或分发提供额外或不同的许可条款和条件。
24
+ c. 附加商业条款:若您或其关联方提供的所有产品或服务的月活跃用户数在前一个自然月未超过3亿月活跃用户数,则您向许可方进行登记,将被视为获得相应的商业许可;若您或其关联方提供的所有产品或服务的月活跃用户数在前一个自然月超过3亿月活跃用户数,则您必须向许可人申请许可,许可人可自行决定向您授予许可。除非许可人另行明确授予您该等权利,否则您无权行使本协议项下的任何权利。
25
+
26
+ 3.使用限制。
27
+ a. 您对本模型作品的使用必须遵守适用法律法规(包括贸易合规法律法规),并遵守《服务协议》(https://kolors.kuaishou.com/agreement)。您必须将本第 3(a) 和 3(b) 条中提及的使用限制作为可执行条款纳入任何规范本模型作品使用和/或分发的协议(例如许可协议、使用条款等),并且您必须向您分发的后续用户发出通知,告知其本模型作品受本第 3(a) 和 3(b) 条中的使用限制约束。
28
+ b. 您不得使用本模型作品或本模型作品的任何输出或成果来改进任何其他模型(本模型或其模型衍生品除外)。
29
+
30
+ 4.知识产权。
31
+ a. 我们保留材料的所有权及其相关知识产权。在遵守本协议条款和条件的前提下,对于您制作的材料的任何衍生作品和修改,您是且将是此类衍生作品和修改的所有者。
32
+ b. 本协议不授予任何商标、商号、服务标记或产品名称的标识许可,除非出于描述和分发本模型作品的合理和惯常用途。
33
+ c. 如果您对我们或任何个人或实体提起诉讼或其他程序(包括诉讼中的交叉索赔或反索赔),声称材料或任何输出或任何上述内容的任何部分侵犯您拥有或可许可的任何知识产权或其他权利,则根据本协议授予您的所有许可应于提起此类诉讼或其他程序之日起终止。
34
+
35
+ 5. 免责声明和责任限制。
36
+ a. 本模型作品及其任何输出和结果按“原样”提供,不作任何明示或暗示的保证,包括适销性、非侵权性或适用于特定用途的保证。我们不对材料及其任何输出的安全性或稳定性作任何保证,也不承担任何责任。
37
+ b. 在任何情况下,我们均不对您承担任何损害赔偿责任,包括但不限于因您使用或无法使用材料或其任何输出而造成的任何直接、间接、特殊或后果性损害赔偿责任,无论该损害赔偿责任是如何造成的。
38
+ c. 对于因您使用或分发模型的衍生物而引起的或与之相关的任何第三方索赔,您应提供辩护,赔偿,并使我方免受损害。
39
+
40
+ 6. 存续和终止。
41
+ a. 本协议期限自您接受本协议或访问材料之日起开始,并将持续完全有效,直至根据本协议条款和条件终止。
42
+ b. 如果您违反本协议的任何条款或条件,我们可终止本协议。本协议终止后,您必须立即删除并停止使用本模型作品。第 4(a)、4(c)、5和 7 条在本协议终止后仍然有效。
43
+
44
+ 7. 适用法律和管辖权。
45
+ a. 本协议及由本协议引起的或与本协议有关的任何争议均受中华人民共和国大陆地区(仅为本协议目的,不包括香港、澳门和台湾)法律管辖,并排除冲突法的适用,且《联合国国际货物销售合同公约》不适用于本协议。
46
+ b. 因本协议引起或与本协议有关的任何争议,由许可人住所地人民法院管辖。
47
+
48
+ 请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 [email protected] 与我们联系。
49
+
50
+
51
+ 英文版
52
+
53
+ MODEL LICENSE AGREEMENT
54
+ Release Date: 2024/7/6
55
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model Works, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
56
+ 1. DEFINITIONS.
57
+ a. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of the Model Works or any portion or element thereof set forth herein.
58
+ b. “Materials” shall mean, collectively, Our proprietary Model and Documentation (and any portion thereof) as made available by Us under this Agreement.
59
+ c. “Model” shall mean the large language models, image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us .
60
+ d. “Output” shall mean the information and/or content output of Model or a Model Derivative that results from operating or otherwise using Model or a Model Derivative.
61
+ e. “Model Derivatives” shall mean all: (i) modifications to the Model or any Model Derivative; (ii) works based on the Model or any Model Derivative; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of the Model or any Model Derivative, to that model in order to cause that model to perform similarly to the Model or a Model Derivative, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs or a Model Derivative for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
62
+ f. “Model Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
63
+ g. “Licensor” , “We” or “Us” shall mean the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
64
+ h. “Licensee”, “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Model Works for any purpose and in any field of use.
65
+ i. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
66
+
67
+ 2. LICENSE CONTENT.
68
+ a. We grant You a non-exclusive, worldwide, non-transferable and royalty-free limited license under the intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
69
+ b. You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Model Works, provided that You meet all of the following conditions:
70
+ (i) You must provide all such Third Party recipients of the Model Works or products or services using them the source of the Model and a copy of this Agreement;
71
+ (ii) You must cause any modified documents to carry prominent notices stating that You changed the documents;
72
+ (iii) You may add Your own copyright statement to Your modifications and, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement.
73
+ c. additional commercial terms: If, the monthly active users of all products or services made available by or for You, or Your affiliates, does not exceed 300 million monthly active users in the preceding calendar month, Your registration with the Licensor will be deemed to have obtained the corresponding business license; If, the monthly active users of all products or services made available by or for You, or Your affiliates, is greater than 300 million monthly active users in the preceding calendar month, You must request a license from Licensor, which the Licensor may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until We otherwise expressly grant You such rights.
74
+
75
+ 3. LICENSE RESTRICTIONS.
76
+ a. Your use of the Model Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Service Agreement. You must include the use restrictions referenced in these Sections 3(a) and 3(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Model Works and You must provide notice to subsequent users to whom You distribute that Model Works are subject to the use restrictions in these Sections 3(a) and 3(b).
77
+ b. You must not use the Model Works or any Output or results of the Model Works to improve any other large model (other than Model or Model Derivatives thereof).
78
+ 4. INTELLECTUAL PROPERTY.
79
+ a. We retain ownership of all intellectual property rights in and to the Model and derivatives. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by You, You are and will be the owner of such derivative works and modifications.
80
+ b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
81
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed.
82
+ 5. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
83
+ a. THE MODEL WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
84
+ b. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
85
+ c. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to Your use or distribution of the Materials.
86
+
87
+ 6. SURVIVAL AND TERMINATION.
88
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
89
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Model Works. Sections 4(a), 4(c), 5 and 7 shall survive the termination of this Agreement.
90
+ 7. GOVERNING LAW AND JURISDICTION.
91
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China (for the purpose of this agreement only, excluding Hong Kong, Macau, and Taiwan), without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
92
+ b. Any disputes arising from or related to this Agreement shall be under the jurisdiction of the People's Court where the Licensor is located.
93
+
94
+ Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at [email protected].
README.md CHANGED
@@ -1,12 +1,385 @@
1
  ---
2
- title: Gradio
3
- emoji: 🦀
4
- colorFrom: green
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.32.1
8
- app_file: app.py
9
- pinned: false
10
  ---
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: gradio
3
+ app_file: scripts/sampleui.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.32.1
 
 
6
  ---
7
+ <p align="left">
8
+ English&nbsp; | &nbsp;<a href="README_CN.md">中文</a>&nbsp;
9
+ </p>
10
+ <br><br>
11
 
12
+ <p align="center">
13
+ <img src="imgs/logo.png" width="400"/>
14
+ </p>
15
+ <br>
16
+
17
+
18
+ <div align="center">
19
+ <a href='https://huggingface.co/Kwai-Kolors/Kolors'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-HF-yellow'></a> &ensp;
20
+ <a href="https://github.com/Kwai-Kolors/Kolors"><img src="https://img.shields.io/static/v1?label=Kolors Code&message=Github&color=blue&logo=github-pages"></a> &ensp;
21
+ <a href="https://kwai-kolors.github.io/"><img src="https://img.shields.io/static/v1?label=Team%20Page&message=Page&color=green"></a> &ensp;
22
+
23
+ <a href='https://huggingface.co/spaces/Kwai-Kolors/Kolors '><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HF Space-HF-yellow'></a> &ensp;
24
+ <a href="https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf"><img src="https://img.shields.io/static/v1?label=Tech Report&message=Arxiv:Kolors&color=red&logo=arxiv"></a> &ensp;
25
+ <a href="https://kolors.kuaishou.com/"><img src="https://img.shields.io/static/v1?label=Official Website&message=Page&color=green"></a> &ensp;
26
+ </div>
27
+
28
+
29
+
30
+ # Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis
31
+ <figure>
32
+ <img src="imgs/head_final3.png">
33
+ </figure>
34
+ <br><br>
35
+
36
+ ## Contents
37
+
38
+ - [🎉 News](#News)
39
+ - [📑 Open-source Plan](#open-source-plan)
40
+ - [📖 Introduction](#Introduction)
41
+ - [📊 Evaluation 🥇🥇🔥🔥](#Evaluation)
42
+ - [🎥 Visualization](#Visualization)
43
+ - [🛠️ Usage](#Usage)
44
+ - [📜 License & Citation & Acknowledgments](#License)
45
+ <br><br>
46
+
47
+
48
+ ## <a name="News"></a>🎉 News
49
+ * 2024.11.13 🔥 [Kolors-Portrait-with-Flux](https://huggingface.co/spaces/Kwai-Kolors/Kolors-Portrait-with-Flux) and [Kolors-Character-With-Flux](https://huggingface.co/spaces/Kwai-Kolors/Kolors-Character-With-Flux), which preserve subject identity, are available on HuggingFace Space for free trials! We hope you enjoy them!
50
+ * 2024.09.01 🔥 Kolors-Virtual-Try-On, a virtual try-on demo based on Kolors is released! Enjoy trying on [Kolors-Virtual-Try-On](https://huggingface.co/spaces/Kwai-Kolors/Kolors-Virtual-Try-On), [WeChat post](https://mp.weixin.qq.com/s/Wk_Eq7OAywlrPqNC6zWZJQ).
51
+
52
+ * 2024.08.06 🔥 Pose ControlNet is released! Please check [ControlNet(Pose)](./controlnet/) for more details.
53
+
54
+ * 2024.08.01 🔥 The Kolors-Dreambooth-LoRA training and inference code is released! Please check [Dreambooth-LoRA](./dreambooth/) for more details.
55
+
56
+ * 2024.07.31 🔥 The Kolors-IP-Adapter-FaceID-Plus weights and inference code are released! Please check [IP-Adapter-FaceID-Plus](./ipadapter_FaceID/) for more details.
57
+
58
+ * 2024.07.26 🔥 ControlNet and Inpainting Model are released! Please check [ControlNet(Canny, Depth)](./controlnet/) and [Inpainting Model](./inpainting/) for more details.
59
+
60
+
61
+ * 2024.07.17 🔥 The Kolors-IP-Adapter-Plus weights and inference code are released! Please check [IP-Adapter-Plus](./ipadapter/) for more details.
62
+
63
+ * 2024.07.12 🤗 Kolors is now available in **Diffusers**! Please check [kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) or the [example](#using-with-diffusers) below for details! Thanks to the Diffusers team for their technical support.
64
+ * 2024.07.10 🤖 Kolors supports [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors).
65
+ * 2024.07.09 💥 Kolors supports [ComfyUI](https://github.com/comfyanonymous/ComfyUI#manual-install-windows-linux). Thanks to [@kijai](https://github.com/kijai/ComfyUI-KwaiKolorsWrapper) for his great work.
66
+ * 2024.07.06 🔥🔥🔥 We release **Kolors**, a large text-to-image model trained on billions of text-image pairs. This model is bilingual in both Chinese and English, and supports a context length of 256 tokens. For more technical details, please refer to [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
67
+ * 2024.07.03 📊 Kolors won second place on the [FlagEval Multimodal Text-to-Image Leaderboard](https://flageval.baai.ac.cn/#/leaderboard/multimodal?kind=t2i), excelling particularly in the Chinese and English subjective quality assessments, where Kolors took first place.
68
+ * 2024.07.02 🎉 Congratulations! Our paper on controllable video generation, [DragAnything: Motion Control for Anything using Entity Representation](https://arxiv.org/abs/2403.07420), has been accepted by ECCV 2024.
69
+ * 2024.02.08 🎉 Congratulations! Our paper on generative model evaluation, [Learning Multi-dimensional Human Preference for Text-to-Image Generation](https://wangbohan97.github.io/MPS/), has been accepted by CVPR 2024.
70
+ <br><br>
71
+
72
+ ## <a name="open-source-plan"></a>📑 Open-source Plan
73
+
74
+ - Kolors (Text-to-Image Model)
75
+ - [x] Inference
76
+ - [x] Checkpoints
77
+ - [x] IP-Adapter
78
+ - [x] ControlNet (Canny, Depth)
79
+ - [x] Inpainting
80
+ - [x] IP-Adapter-FaceID
81
+ - [x] LoRA
82
+ - [x] ControlNet (Pose)
83
+ - [x] ComfyUI
84
+ - [x] Gradio
85
+ - [x] Diffusers
86
+ <br><br>
87
+
88
+
89
+ ## <a name="Introduction"></a>📖 Introduction
90
+
91
+ Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by the Kuaishou Kolors team. Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this <a href="https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf">technical report</a>.
92
+ <br><br>
93
+
94
+ ## <a name="Evaluation"></a>📊 Evaluation
95
+ We have collected a comprehensive text-to-image evaluation dataset named KolorsPrompts to compare Kolors with other state-of-the-art open models and closed-source models. KolorsPrompts includes over 1,000 prompts across 14 categories and 12 evaluation dimensions. The evaluation process incorporates both human and machine assessments. In relevant benchmark evaluations, Kolors demonstrated highly competitive performance, achieving industry-leading standards.
96
+
97
+ <br><br>
98
+
99
+ ### Human Assessment
100
+
101
+ For the human evaluation, we invited 50 imagery experts to conduct comparative evaluations of the results generated by different models. The experts rated the generated images based on three criteria: visual appeal, text faithfulness, and overall satisfaction. In the evaluation, Kolors achieved the highest overall satisfaction score and significantly led in visual appeal compared to other models.
102
+
103
+ | Model | Average Overall Satisfaction | Average Visual Appeal | Average Text Faithfulness |
104
+ | :--------------: | :--------: | :--------: | :--------: |
105
+ | Adobe-Firefly | 3.03 | 3.46 | 3.84 |
106
+ | Stable Diffusion 3 | 3.26 | 3.50 | 4.20 |
107
+ | DALL-E 3 | 3.32 | 3.54 | 4.22 |
108
+ | Midjourney-v5 | 3.32 | 3.68 | 4.02 |
109
+ | Playground-v2.5 | 3.37 | 3.73 | 4.04 |
110
+ | Midjourney-v6 | 3.58 | 3.92 | 4.18 |
111
+ | **Kolors** | **3.59** | **3.99** | **4.17** |
112
+
113
+ ------
114
+
115
+ <div style="color: gray; font-size: small;">
116
+
117
+ **All model results are tested with the April 2024 product versions**
118
+
119
+ </div>
120
+ <br>
121
+
122
+ ### Machine Assessment
123
+ We used [MPS](https://arxiv.org/abs/2405.14705) (Multi-dimensional Human Preference Score) on KolorsPrompts as the evaluation metric for machine assessment. Kolors achieved the highest MPS score, which is consistent with the results of the human evaluations.
124
+
125
+ <div style="text-align:center">
126
+
127
+ | Models | Overall MPS |
128
+ |:-------------------:|:-------------:|
129
+ | Adobe-Firefly | 8.5 |
130
+ | Stable Diffusion 3 | 8.9 |
131
+ | DALL-E 3 | 9.0 |
132
+ | Midjourney-v5 | 9.4 |
133
+ | Playground-v2.5 | 9.8 |
134
+ | Midjourney-v6 | 10.2 |
135
+ | **Kolors** | **10.3** |
136
+ </div>
137
+
138
+
139
+ <br>
140
+
141
+ For more experimental results and details, please refer to our [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
142
+
143
+ <br><br>
144
+
145
+
146
+ ## <a name="Visualization"></a>🎥 Visualization
147
+
148
+ * **High-quality Portrait**
149
+ <div style="display: flex; justify-content: space-between;">
150
+ <img src="imgs/zl8.png" />
151
+ </div>
152
+ <br>
153
+
154
+ * **Chinese Elements Generation**
155
+ <div style="display: flex; justify-content: space-between;">
156
+ <img src="imgs/cn_all.png"/>
157
+ </div>
158
+ <br>
159
+
160
+ * **Complex Semantic Understanding**
161
+ <div style="display: flex; justify-content: space-between;">
162
+ <img src="imgs/fz_all.png"/>
163
+ </div>
164
+ <br>
165
+
166
+ * **Text Rendering**
167
+ <div style="display: flex; justify-content: space-between;">
168
+ <img src="imgs/wz_all.png" />
169
+ </div>
170
+ <br>
171
+ </div>
172
+
173
+ The visualized case prompts mentioned above can be accessed [here](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/prompt_vis.txt).
174
+ <br><br>
175
+
176
+ ## <a name="Usage"></a>🛠️ Usage
177
+
178
+ ### Requirements
179
+
180
+ * Python 3.8 or later
181
+ * PyTorch 1.13.1 or later
182
+ * Transformers 4.26.1 or later
183
+ * Recommended: CUDA 11.7 or later
184
+ <br>
185
+
186
+ 1. Repository Cloning and Dependency Installation
187
+
188
+ ```bash
189
+ apt-get install git-lfs
190
+ git clone https://github.com/Kwai-Kolors/Kolors
191
+ cd Kolors
192
+ conda create --name kolors python=3.8
193
+ conda activate kolors
194
+ pip install -r requirements.txt
195
+ python3 setup.py install
196
+ ```
197
+ 2. Weights download([link](https://huggingface.co/Kwai-Kolors/Kolors)):
198
+ ```bash
199
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors --local-dir weights/Kolors
200
+ ```
201
+ or
202
+ ```bash
203
+ git lfs clone https://huggingface.co/Kwai-Kolors/Kolors weights/Kolors
204
+ ```
205
+ 3. Inference:
206
+ ```bash
207
+ python3 scripts/sample.py "一张瓢虫的照片,微距,变焦,高质量,电影,拿着一个牌子,写着“可图”"
208
+ # The image will be saved to "scripts/outputs/sample_text.jpg"
209
+ ```
210
+ 4. Web demo:
211
+ ```bash
212
+ python3 scripts/sampleui.py
213
+ ```
214
+
215
+ ### Using with Diffusers
216
+ Make sure you upgrade to the latest version(0.30.0.dev0) of diffusers:
217
+ ```
218
+ git clone https://github.com/huggingface/diffusers
219
+ cd diffusers
220
+ python3 setup.py install
221
+ ```
222
+ **Notes:**
223
+ - The pipeline uses the `EulerDiscreteScheduler` by default. We recommend using this scheduler with `guidance_scale=5.0` and `num_inference_steps=50`.
224
+ - The pipeline also supports the `EDMDPMSolverMultistepScheduler`. `guidance_scale=5.0` and `num_inference_steps=25` are good defaults for this scheduler.
225
+ - In addition to Text-to-Image with `KolorsPipeline`, Image-to-Image is supported via `KolorsImg2ImgPipeline`.
226
+
227
+ And then you can run:
228
+ ```python
229
+ import torch
230
+ from diffusers import KolorsPipeline
231
+ pipe = KolorsPipeline.from_pretrained(
232
+ "Kwai-Kolors/Kolors-diffusers",
233
+ torch_dtype=torch.float16,
234
+ variant="fp16"
235
+ ).to("cuda")
236
+ prompt = '一张瓢虫的照片,微距,变焦,高质量,电影,拿着一个牌子,写着"可图"'
237
+ image = pipe(
238
+ prompt=prompt,
239
+ negative_prompt="",
240
+ guidance_scale=5.0,
241
+ num_inference_steps=50,
242
+ generator=torch.Generator(pipe.device).manual_seed(66),
243
+ ).images[0]
244
+ image.show()
245
+ ```
246
+
247
+ ### IP-Adapter-Plus
248
+
249
+ We provide IP-Adapter-Plus weights and inference code, detailed in the [ipadapter](./ipadapter/README.md).
250
+
251
+ ```bash
252
+ # Weights download
253
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-IP-Adapter-Plus --local-dir weights/Kolors-IP-Adapter-Plus
254
+ ```
255
+
256
+ ```bash
257
+ # Inference:
258
+ python3 ipadapter/sample_ipadapter_plus.py ./ipadapter/asset/test_ip.jpg "穿着黑色T恤衫,上面中文绿色大字写着“可图”"
259
+
260
+ python3 ipadapter/sample_ipadapter_plus.py ./ipadapter/asset/test_ip2.png "一只可爱的小狗在奔跑"
261
+
262
+ # The image will be saved to "scripts/outputs/"
263
+ ```
264
+
265
+ ### ControlNet
266
+
267
+ We provide three ControlNet weights and inference code, detailed in the [controlnet](./controlnet/README.md).
268
+
269
+ ```bash
270
+ # Weights download
271
+
272
+ # Canny - ControlNet
273
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Canny --local-dir weights/Kolors-ControlNet-Canny
274
+
275
+ # Depth - ControlNet
276
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Depth --local-dir weights/Kolors-ControlNet-Depth
277
+
278
+ # Pose - ControlNet
279
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Pose --local-dir weights/Kolors-ControlNet-Pose
280
+ ```
281
+
282
+ If you intend to utilize the depth estimation network, please make sure to download its corresponding model weights.
283
+ ```
284
+ huggingface-cli download lllyasviel/Annotators ./dpt_hybrid-midas-501f0c75.pt --local-dir ./controlnet/annotator/ckpts
285
+ ```
286
+
287
+ Thanks to [DWPose](https://github.com/IDEA-Research/DWPose/tree/onnx?tab=readme-ov-file), you can utilize the pose estimation network. Please download the Pose model dw-ll_ucoco_384.onnx ([baidu](https://pan.baidu.com/s/1nuBjw-KKSxD_BkpmwXUJiw?pwd=28d7), [google](https://drive.google.com/file/d/12L8E2oAgZy4VACGSK9RaZBZrfgx7VTA2/view?usp=sharing)) and Det model yolox_l.onnx ([baidu](https://pan.baidu.com/s/1fpfIVpv5ypo4c1bUlzkMYQ?pwd=mjdn), [google](https://drive.google.com/file/d/1w9pXC8tT0p9ndMN-CArp1__b2GbzewWI/view?usp=sharing)). Then please put them into `controlnet/annotator/ckpts/`.
288
+
289
+
290
+ ```bash
291
+ # Inference:
292
+
293
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_1.png 一个漂亮的女孩,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Canny
294
+
295
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_2.png 新海诚风格,丰富的色彩,穿着绿色衬衫的女人站在田野里,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质 Depth
296
+
297
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_3.png 一位穿着紫色泡泡袖连衣裙、戴着皇冠和白色蕾丝手套的女孩双手托脸,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Pose
298
+
299
+ # The image will be saved to "controlnet/outputs/"
300
+ ```
301
+
302
+
303
+ ### Inpainting
304
+
305
+ We provide Inpainting weights and inference code, detailed in the [inpainting](./inpainting/README.md).
306
+
307
+ ```bash
308
+ # Weights download
309
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-Inpainting --local-dir weights/Kolors-Inpainting
310
+ ```
311
+
312
+ ```bash
313
+ # Inference:
314
+ python3 inpainting/sample_inpainting.py ./inpainting/asset/3.png ./inpainting/asset/3_mask.png 穿着美少女战士的衣服,一件类似于水手服风格的衣服,包括一个白色紧身上衣,前胸搭配一个大大的红色蝴蝶结。衣服的领子部分呈蓝色,并且有白色条纹。她还穿着一条蓝色百褶裙,超高清,辛烷渲染,高级质感,32k,高分辨率,最好的质量,超级细节,景深
315
+
316
+ python3 inpainting/sample_inpainting.py ./inpainting/asset/4.png ./inpainting/asset/4_mask.png 穿着钢铁侠的衣服,高科技盔甲,主要颜色为红色和金色,并且有一些银色装饰。胸前有一个亮起的圆形反应堆装置,充满了未来科技感。超清晰,高质量,超逼真,高分辨率,最好的质量,超级细节,景深
317
+
318
+ # The image will be saved to "scripts/outputs/"
319
+ ```
320
+
321
+ ### IP-Adapter-FaceID-Plus
322
+
323
+ We provide IP-Adapter-FaceID-Plus weights and inference code, detailed in the [ipadapter_FaceID](./ipadapter_FaceID/README.md).
324
+
325
+ ```bash
326
+ # Weights download
327
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus --local-dir weights/Kolors-IP-Adapter-FaceID-Plus
328
+ ```
329
+
330
+ ```bash
331
+ # Inference:
332
+ python ipadapter_FaceID/sample_ipadapter_faceid_plus.py ./ipadapter_FaceID/assets/image1.png "穿着晚礼服,在星光下的晚宴场景中,烛光闪闪,整个场景洋溢着浪漫而奢华的氛围"
333
+
334
+ python ipadapter_FaceID/sample_ipadapter_faceid_plus.py ./ipadapter_FaceID/assets/image2.png "西部牛仔,牛仔帽,荒野大镖客,背景是西部小镇,仙人掌,,日落余晖, 暖色调, 使用XT4胶片拍摄, 噪点, 晕影, 柯达胶卷,复古"
335
+
336
+ # The image will be saved to "scripts/outputs/"
337
+ ```
338
+
339
+ ### Dreambooth-LoRA
340
+
341
+ We provide LoRA training and inference code, detailed in the [Dreambooth-LoRA](./dreambooth/README.md).
342
+
343
+ ```bash
344
+ # Training:
345
+ sh train.sh
346
+ ```
347
+
348
+ ```bash
349
+ # Inference:
350
+ python infer_dreambooth.py "ktxl狗在草地上跑"
351
+ ```
352
+
353
+ <br><br>
354
+
355
+ ## <a name="License"></a>📜 License & Citation & Acknowledgments
356
+
357
+ ### License
358
+
359
+ Kolors weights are fully open for academic research. If you intend to use the Kolors model or its derivatives for commercial purposes under the licensing terms and conditions, please send the [questionnaire](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/可图KOLORS模型商业授权申请书.docx) to [email protected] to register with the licensor. If the monthly active users of all products or services made available by or for Licensee does not exceed 300 million monthly active users in the preceding calendar month, Your registration with the Licensor will be deemed to have obtained the corresponding business license; if the monthly active users of all products or services made available by or for Licensee is greater than 300 million monthly active users in the preceding calendar month, You must request a license from Licensor, which the Licensor may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until We otherwise expressly grant You such rights.
360
+
361
+
362
+ We open-source Kolors to promote the development of large text-to-image models in collaboration with the open-source community. The code of this project is open-sourced under the Apache-2.0 license. We sincerely urge all developers and users to strictly adhere to the [open-source license](MODEL_LICENSE), avoiding the use of the open-source model, code, and its derivatives for any purposes that may harm the country and society or for any services not evaluated and registered for safety. Note that despite our best efforts to ensure the compliance, accuracy, and safety of the data during training, due to the diversity and combinability of generated content and the probabilistic randomness affecting the model, we cannot guarantee the accuracy and safety of the output content, and the model is susceptible to being misled. This project does not assume any legal responsibility for any data security issues, public opinion risks, or risks and liabilities arising from the model being misled, abused, misused, or improperly utilized due to the use of the open-source model and code.
363
+
364
+ ### Citation
365
+ If you find our work helpful, please cite it!
366
+
367
+ ```
368
+ @article{kolors,
369
+ title={Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis},
370
+ author={Kolors Team},
371
+ journal={arXiv preprint},
372
+ year={2024}
373
+ }
374
+ ```
375
+
376
+ ### Acknowledgments
377
+ - Thanks to [Diffusers](https://github.com/huggingface/diffusers) for providing the codebase.
378
+ - Thanks to [ChatGLM3](https://github.com/THUDM/ChatGLM3) for providing the powerful Chinese language model.
379
+ <br>
380
+
381
+ ### Contact Us
382
+
383
+ If you want to leave a message for our R&D team and product team, feel free to join our [WeChat group](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/wechat.png). You can also contact us via email ([email protected]).
384
+
385
+ [![Star History Chart](https://api.star-history.com/svg?repos=Kwai-Kolors/Kolors&type=Date)](https://star-history.com/#Kwai-Kolors/Kolors&Date)
README_CN.md ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="left">
2
+ 中文&nbsp; | &nbsp;<a href="README.md">English</a>&nbsp;
3
+ </p>
4
+ <!-- <br><br> -->
5
+
6
+ <p align="center">
7
+ <img src="imgs/logo.png" width="400"/>
8
+ <p>
9
+ <br>
10
+
11
+ <!-- <div align="center">
12
+
13
+ <a href='https://kwai-kolors.github.io/'><img src='https://img.shields.io/badge/Team-Page-green'></a> <a href=''><img src='https://img.shields.io/badge/Technique-Report-red'></a> [![Teampage](https://img.shields.io/badge/Website-Page-blue)](https://kolors.kuaishou.com/)
14
+
15
+ </div> -->
16
+
17
+ <div align="center">
18
+ <a href='https://huggingface.co/Kwai-Kolors/Kolors'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-HF-yellow'></a> &ensp;
19
+ <a href="https://github.com/Kwai-Kolors/Kolors"><img src="https://img.shields.io/static/v1?label=Kolors Code&message=Github&color=blue&logo=github-pages"></a> &ensp;
20
+ <a href="https://kwai-kolors.github.io/"><img src="https://img.shields.io/static/v1?label=Team%20Page&message=Page&color=green"></a> &ensp;
21
+
22
+ <a href='https://huggingface.co/spaces/Kwai-Kolors/Kolors '><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HF Space-HF-yellow'></a> &ensp;
23
+ <a href="https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf"><img src="https://img.shields.io/static/v1?label=Tech Report&message=Arxiv:Kolors&color=red&logo=arxiv"></a> &ensp;
24
+ <a href="https://kolors.kuaishou.com/"><img src="https://img.shields.io/static/v1?label=Official Website&message=Page&color=green"></a> &ensp;
25
+ </div>
26
+
27
+
28
+
29
+
30
+ </p>
31
+
32
+ # Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis
33
+ <figure>
34
+ <img src="imgs/head_final3.png">
35
+ </figure>
36
+ <br><br>
37
+
38
+ ## 目录
39
+
40
+ - [🎉 新闻](#新闻)
41
+ - [📑 开源计划](#开源计划)
42
+ - [📖 模型介绍](#模型介绍)
43
+ - [📊 评测表现 🥇🥇🔥🔥](#评测表现)
44
+ - [🎥 可视化](#可视化)
45
+ - [🛠️ 快速使用](#快速使用)
46
+ - [📜 协议、引用、致谢](#协议引用)
47
+ <br><br>
48
+
49
+ ## <a name="新闻"></a>🎉 新闻
50
+ * 2024.09.01 🔥 Kolors-Virtual-Try-On 可图虚拟试衣项目体验demo已发布!欢迎体验 [Kolors-Virtual-Try-On](https://huggingface.co/spaces/Kwai-Kolors/Kolors-Virtual-Try-On), [可图公众号](https://mp.weixin.qq.com/s/Wk_Eq7OAywlrPqNC6zWZJQ)。
51
+
52
+ * 2024.08.06 🔥 Pose ControlNet 已发布! 请查看 [ControlNet(Pose)](./controlnet/) 获取详细信息。
53
+
54
+ * 2024.08.01 🔥 Kolors-Dreambooth-LoRA 的训练和推理代码已发布!请查看 [Dreambooth-LoRA](./dreambooth/) 获取详细信息。
55
+
56
+ * 2024.07.31 🔥 Kolors-IP-Adapter-FaceID-Plus 的权重和推理代码已发布!请查看 [IP-Adapter-FaceID-Plus](./ipadapter_FaceID/) 获取详细信息。
57
+
58
+ * 2024.07.26 🔥 Kolors发布了ControlNet和Inpainting Model! 请查看 [ControlNet(Canny, Depth)](./controlnet/) 和[Inpainting Model](./inpainting/) 获取详细信息。
59
+
60
+ * 2024.07.17 🔥 Kolors-IP-Adapter-Plus 的权重和推理代码已发布!请查看 [IP-Adapter-Plus](./ipadapter/) 获取详细信息。
61
+
62
+ * 2024.07.12 🤗 Kolors 已支持 **Diffusers**! 使用方式可参考 [kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers)或[下面的例子](#using-with-diffusers) ! 感谢 Diffusers 官方提供的技术支持。
63
+ * 2024.07.10 🤖 Kolors 支持了 [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors).
64
+ * 2024.07.09 💥 Kolors 支持了 [ComfyUI](https://github.com/comfyanonymous/ComfyUI#manual-install-windows-linux),感谢 [@kijai](https://github.com/kijai/ComfyUI-KwaiKolorsWrapper) 的工作。
65
+ * 2024.07.06 🔥🔥🔥 我们开源了基于隐空间扩散的文生图大模型 **Kolors** ,该模型基于数十亿图文对进行训练,支持256的上下文token数,支持中英双语,技术细节参考[技术报告](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf)。
66
+ * 2024.07.03 📊 Kolors 在智源研究院 [FlagEval 多模态文生图](https://flageval.baai.ac.cn/#/leaderboard/multimodal?kind=t2i)评测中取得第二名,其中中文主观质量、英文主观质量两个单项排名第一。
67
+ * 2024.07.02 🎉 祝贺,可图项目组提出的可控视频生成方法 [DragAnything: Motion Control for Anything using Entity Representation](https://arxiv.org/abs/2403.07420) 被 ECCV 2024 接收。
68
+ * 2024.02.08 🎉 祝贺,可图项目组提出的生成模型评估方法 [Learning Multi-dimensional Human Preference for Text-to-Image Generation](https://wangbohan97.github.io/MPS/) 被 CVPR 2024 接收。
69
+ <br><br>
70
+
71
+ ## <a name="开源计划"></a>📑 开源计划
72
+
73
+ - Kolors (Text-to-Image Model)
74
+ - [x] Inference
75
+ - [x] Checkpoints
76
+ - [x] IP-Adapter
77
+ - [x] ControlNet (Canny, Depth)
78
+ - [x] Inpainting
79
+ - [x] IP-Adapter-FaceID
80
+ - [x] LoRA
81
+ - [x] ControlNet (Pose)
82
+ - [x] ComfyUI
83
+ - [x] Gradio
84
+ - [x] Diffusers
85
+ <br><br>
86
+
87
+ ## <a name="模型介绍"></a>📖 模型介绍
88
+ 可图大模型是由快手可图团队开发的基于潜在扩散的大规模文本到图像生成模型。Kolors 在数十亿图文对下进行训练,在视觉质量、复杂语义理解、文字生成(中英文字符)等方面,相比于开源/闭源模型,都展示出了巨大的优势。同时,Kolors 支持中英双语,在中文特色内容理解方面更具竞争力。更多的实验结果和细节请查看我们的<a href="https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf">技术报告</a>。
89
+ <br><br>
90
+
91
+ ## <a name="评测表现"></a>📊 评测表现
92
+ 为了全面比较 Kolors 与其他模型的生成能力,我们构建了包含人工评估、机器评估的全面评测内容。
93
+ 在相关基准评测中,Kolors 具有非常有竞争力的表现,达到业界领先水平。我们构建了一个包含14种垂类,12个挑战项,总数量为一千多个 prompt 的文生图评估集 KolorsPrompts。在 KolorsPrompts 上,我们收集了 Kolors 与市面上常见的 SOTA 级别的开源/闭源系统的文生图结果,并进行了人工评测和机器评测。
94
+ <br><br>
95
+
96
+ ### 人工评测
97
+
98
+ 我们邀请了50个具有图像领域知识的专业评估人员对不同模型的生成结果进行对比评估,为生成图像打分,衡量维度为:画面质量、图文相关性、整体满意度三个方面。
99
+ Kolors 在整体满意度方面处于最优水平,其中画面质量显著领先其他模型。
100
+ <div style="text-align: center;">
101
+
102
+ | 模型 | 整体满意度平均分 | 画面质量平均分 | 图文相关性平均分 |
103
+ | :--------------: | :--------: | :--------: | :--------: |
104
+ | Adobe-Firefly | 3.03 | 3.46 | 3.84 |
105
+ | Stable Diffusion 3 | 3.26 | 3.50 | 4.20 |
106
+ | DALL-E 3 | 3.32 | 3.54 | 4.22 |
107
+ | Midjourney-v5 | 3.32 | 3.68 | 4.02 |
108
+ | Playground-v2.5 | 3.37 | 3.73 | 4.04 |
109
+ | Midjourney-v6 | 3.58 | 3.92 | 4.18 |
110
+ | **Kolors** | **3.59** | **3.99** | **4.17** |
111
+
112
+ </div>
113
+
114
+
115
+ <div style="color: gray; font-size: small;">
116
+
117
+ **所有模型结果取自 2024.04 的产品版本**
118
+
119
+ </div>
120
+ <br>
121
+
122
+ ### 机器评测
123
+ 我们采用 [MPS](https://arxiv.org/abs/2405.14705) (Multi-dimensional Human Preference Score) 来评估上述模型。
124
+ 我们以 KolorsPrompts 作为基础评估数据集,计算多个模型的 MPS 指标。Kolors 实现了最高的MPS 指标,这与人工评估的指标一致。
125
+
126
+ <div style="text-align:center">
127
+
128
+ | 模型 | MPS综合得分 |
129
+ |-------------------|-------------|
130
+ | Adobe-Firefly | 8.5 |
131
+ | Stable Diffusion 3 | 8.9 |
132
+ | DALL-E 3 | 9.0 |
133
+ | Midjourney-v5 | 9.4 |
134
+ | Playground-v2.5 | 9.8 |
135
+ | Midjourney-v6 | 10.2 |
136
+ | **Kolors** | **10.3** |
137
+ </div>
138
+
139
+
140
+ <br>
141
+
142
+ 更多的实验结果和细节请查看我们的技术报告。点击[技术报告](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf)。
143
+ <br><br>
144
+
145
+ ## <a name="可视化"></a>🎥 可视化
146
+
147
+ * **高质量人像**
148
+ <div style="display: flex; justify-content: space-between;">
149
+ <img src="imgs/zl8.png" />
150
+ </div>
151
+ <br>
152
+
153
+ * **中国元素**
154
+ <div style="display: flex; justify-content: space-between;">
155
+ <img src="imgs/cn_all.png"/>
156
+ </div>
157
+ <br>
158
+
159
+ * **复杂语义理解**
160
+ <div style="display: flex; justify-content: space-between;">
161
+ <img src="imgs/fz_all.png"/>
162
+ </div>
163
+ <br>
164
+
165
+ * **文字绘制**
166
+ <div style="display: flex; justify-content: space-between;">
167
+ <img src="imgs/wz_all.png" />
168
+ </div>
169
+ <br>
170
+ </div>
171
+
172
+ 上述可视化 case,可以点击[可视化prompts](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/prompt_vis.txt) 获取
173
+ <br><br>
174
+
175
+ ## <a name="快速使用"></a>🛠️ 快速使用
176
+
177
+ ### 要求
178
+
179
+ * python 3.8及以上版本
180
+ * pytorch 1.13.1及以上版本
181
+ * transformers 4.26.1及以上版本
182
+ * 建议使用CUDA 11.7及以上
183
+ <br>
184
+
185
+ 1、仓库克隆及依赖安装
186
+ ```bash
187
+ apt-get install git-lfs
188
+ git clone https://github.com/Kwai-Kolors/Kolors
189
+ cd Kolors
190
+ conda create --name kolors python=3.8
191
+ conda activate kolors
192
+ pip install -r requirements.txt
193
+ python3 setup.py install
194
+ ```
195
+ 2、模型权重下载([链接](https://huggingface.co/Kwai-Kolors/Kolors)):
196
+ ```bash
197
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors --local-dir weights/Kolors
198
+ ```
199
+ 或者
200
+ ```bash
201
+ git lfs clone https://huggingface.co/Kwai-Kolors/Kolors weights/Kolors
202
+ ```
203
+ 3、模型推理:
204
+ ```bash
205
+ python3 scripts/sample.py "一张瓢虫的照片,微距,变焦,高质量,电影,拿着一个牌子,写着“可图”"
206
+ # The image will be saved to "scripts/outputs/sample_text.jpg"
207
+ ```
208
+ 4、 Web demo:
209
+ ```bash
210
+ python3 scripts/sampleui.py
211
+ ```
212
+
213
+ ### 在 Diffusers 中使用
214
+ 确保您安装了最新版本的 `diffusers`(0.30.0.dev0):
215
+ ```
216
+ git clone https://github.com/huggingface/diffusers
217
+ cd diffusers
218
+ python3 setup.py install
219
+ ```
220
+ **注意:**
221
+ - KolorsPipeline 默认使用`EulerDiscreteScheduler` 作为噪声调度器。我们推荐使用该调度器时搭配 `guidance_scale=5.0` 及 `num_inference_steps=50`。
222
+ - KolorsPipeline 同时支持 `EDMDPMSolverMultistepScheduler`。在使用该噪声调度器时,推荐使用参数 `guidance_scale=5.0`及`num_inference_steps=25`。
223
+ - 除了文生图能力,`KolorsImg2ImgPipeline` 同时也支持图文生图功能。
224
+
225
+ 运行以下指令进行图像生成:
226
+ ```python
227
+ import torch
228
+ from diffusers import KolorsPipeline
229
+ pipe = KolorsPipeline.from_pretrained(
230
+ "Kwai-Kolors/Kolors-diffusers",
231
+ torch_dtype=torch.float16,
232
+ variant="fp16"
233
+ ).to("cuda")
234
+ prompt = '一张瓢虫的照片,微距,变焦,高质量,电影,拿着一个牌子,写着"可图"'
235
+ image = pipe(
236
+ prompt=prompt,
237
+ negative_prompt="",
238
+ guidance_scale=5.0,
239
+ num_inference_steps=50,
240
+ generator=torch.Generator(pipe.device).manual_seed(66),
241
+ ).images[0]
242
+ image.show()
243
+ ```
244
+ ### IP-Adapter-Plus
245
+
246
+ 我们提供了 IP-Adapter-Plus 的参数和代码, 详细信息见 [ipadapter](./ipadapter/README.md).
247
+
248
+ ```bash
249
+ # Weights download
250
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-IP-Adapter-Plus --local-dir weights/Kolors-IP-Adapter-Plus
251
+ ```
252
+
253
+ ```bash
254
+ # Inference:
255
+ python3 ipadapter/sample_ipadapter_plus.py ./ipadapter/asset/test_ip.jpg "穿着黑色T恤衫,上面中文绿色大字写着“可图”"
256
+
257
+ python3 ipadapter/sample_ipadapter_plus.py ./ipadapter/asset/test_ip2.png "一只可爱的小狗在奔跑"
258
+
259
+ # The image will be saved to "scripts/outputs/"
260
+ ```
261
+
262
+
263
+ ### ControlNet
264
+
265
+ 我们提供了三个类型的ControlNet参数和代码,详细信息见[controlnet](./controlnet/README.md)。
266
+
267
+ ```bash
268
+ # Weights download
269
+
270
+ # Canny - ControlNet
271
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Canny --local-dir weights/Kolors-ControlNet-Canny
272
+
273
+ # Depth - ControlNet
274
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Depth --local-dir weights/Kolors-ControlNet-Depth
275
+
276
+ # Pose - ControlNet
277
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Pose --local-dir weights/Kolors-ControlNet-Pose
278
+ ```
279
+ 如果你打算使用深度估计网络,请确保下载其相应的模型权重。
280
+ ```bash
281
+ huggingface-cli download lllyasviel/Annotators ./dpt_hybrid-midas-501f0c75.pt --local-dir ./controlnet/annotator/ckpts
282
+ ```
283
+
284
+ 感谢[DWPose](https://github.com/IDEA-Research/DWPose/tree/onnx?tab=readme-ov-file),你可以使用姿态预测网络。 请下载姿态模型 dw-ll_ucoco_384.onnx ([baidu](https://pan.baidu.com/s/1nuBjw-KKSxD_BkpmwXUJiw?pwd=28d7), [google](https://drive.google.com/file/d/12L8E2oAgZy4VACGSK9RaZBZrfgx7VTA2/view?usp=sharing)) 和检测模型 yolox_l.onnx ([baidu](https://pan.baidu.com/s/1fpfIVpv5ypo4c1bUlzkMYQ?pwd=mjdn), [google](https://drive.google.com/file/d/1w9pXC8tT0p9ndMN-CArp1__b2GbzewWI/view?usp=sharing))。然后请将它们放入 `controlnet/annotator/ckpts/`。
285
+
286
+
287
+ ```bash
288
+ # Inference:
289
+
290
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_1.png 一个漂亮的女孩,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Canny
291
+
292
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_2.png 新海诚风格,丰富的色彩,穿着绿色衬衫的女人站在田野里,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质 Depth
293
+
294
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_3.png 一位穿着紫色泡泡袖连衣裙、戴着皇冠和白色蕾丝手套的女孩双手托脸,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Pose
295
+
296
+ # The image will be saved to "controlnet/outputs/"
297
+ ```
298
+
299
+
300
+ ### Inpainting
301
+
302
+ 我们提供了 Inpainting 的参数和代码,详细信息见 [inpainting](./inpainting/README.md)。
303
+
304
+ ```bash
305
+ # Weights download
306
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-Inpainting --local-dir weights/Kolors-Inpainting
307
+ ```
308
+
309
+ ```bash
310
+ # Inference:
311
+ python3 inpainting/sample_inpainting.py ./inpainting/asset/3.png ./inpainting/asset/3_mask.png 穿着美少女战士的衣服,一件类似于水手服风格的衣服,包括一个白色紧身上衣,前胸搭配一个大大的红色蝴蝶结。衣服的领子部分呈蓝色,并且有白色条纹。她还穿着一条蓝色百褶裙,超高清,辛烷渲染,高级质感,32k,高分辨率,最好的质量,超级细节,景深
312
+
313
+ python3 inpainting/sample_inpainting.py ./inpainting/asset/4.png ./inpainting/asset/4_mask.png 穿着钢铁侠的衣服,高科技盔甲,主要颜色为红色和金色,并且有一些银色装饰。胸前有一个亮起的圆形反应堆装置,充满了未来科技感。超清晰,高质量,超逼真,高分辨率,最好的质量,超级细节,景深
314
+
315
+ # The image will be saved to "scripts/outputs/"
316
+ ```
317
+
318
+ ### IP-Adapter-FaceID-Plus
319
+
320
+ 我们提供了 IP-Adapter-FaceID-Plus 的参数和代码,详细信息见 [ipadapter_FaceID](./ipadapter_FaceID/README.md)。
321
+
322
+ ```bash
323
+ # Weights download
324
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus --local-dir weights/Kolors-IP-Adapter-FaceID-Plus
325
+ ```
326
+
327
+ ```bash
328
+ # Inference:
329
+ python ipadapter_FaceID/sample_ipadapter_faceid_plus.py ./ipadapter_FaceID/assets/image1.png "穿着晚礼服,在星光下的晚宴场景中,烛光闪闪,整个场景洋溢着浪漫而奢华的氛围"
330
+
331
+ python ipadapter_FaceID/sample_ipadapter_faceid_plus.py ./ipadapter_FaceID/assets/image2.png "西部牛仔,牛仔帽,荒野大镖客,背景是西部小镇,仙人掌,日落余晖, 暖色调, 使用XT4胶片拍摄, 噪点, 晕影, 柯达胶卷,复古"
332
+
333
+ # The image will be saved to "scripts/outputs/"
334
+ ```
335
+
336
+ ### Dreambooth-LoRA
337
+
338
+ 我们提供了 Dreambooth-LoRA 的训练和推理代码,详细信息见 [Dreambooth-LoRA](./dreambooth/README.md)。
339
+
340
+ ```bash
341
+ # Training:
342
+ sh train.sh
343
+ ```
344
+
345
+ ```bash
346
+ # Inference:
347
+ python infer_dreambooth.py "ktxl狗在草地上跑"
348
+ ```
349
+
350
+ <br><br>
351
+
352
+ ## <a name="协议引用"></a>📜协议、引用、致谢
353
+
354
+
355
+ ### 协议
356
+ **Kolors**(可图)权重对学术研究完全开放,若您期望基于本模型协议的许可条件与限制,将可图KOLORS模型或其衍生品用作商业目的,请您将[问卷](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/可图KOLORS模型商业授权申请书.docx)发送至邮箱kwai-kolors@kuaishou.com,以向许可方登记。若您提供的所有产品或服务的月活跃用户数在前一个自然月未超过3亿月活跃用户数,则您向许可方进行登记,将被视为获得相应的商业许可;若您提供的所有产品或服务的月活跃用户数在前一个自然月超过3亿月活跃用户数,则您必须向许可人申请许可,许可人可自行决定向您授予许可。
357
+
358
+ 本开源模型旨在与开源社区共同推进文生图大模型技术的发展。本项目代码依照 Apache-2.0 协议开源,模型权重需要遵循本《模型许可协议》,我们恳请所有开发者和用户严格遵守[开源协议](MODEL_LICENSE),避免将开源模型、代码及其衍生物用于任何可能对国家和社会造成危害的用途,或用于任何未经安全评估和备案的服务。需要注意,尽管模型在训练中我们尽力确保数据的合规性、准确性和安全性,但由于视觉生成模型存在生成多样性和可组合性等特点,以及生成模型受概率随机性因素的影响,模型无法保证输出内容的准确性和安全性,且模型易被误导。本项目不对因使用开源模型和代码而导致的任何数据安全问题、舆情风险或因模型被误导、滥用、传播、不当利用而产生的风险和责任承担任何法律责任。
359
+
360
+ <br>
361
+
362
+ ### 引用
363
+ 如果你觉得我们的工作对你有帮助,欢迎引用!
364
+
365
+ ```
366
+ @article{kolors,
367
+ title={Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis},
368
+ author={Kolors Team},
369
+ journal={arXiv preprint},
370
+ year={2024}
371
+ }
372
+ ```
373
+ <br>
374
+
375
+ ### 致谢
376
+ - 感谢 [Diffusers](https://github.com/huggingface/diffusers) 提供的codebase
377
+ - 感谢 [ChatGLM3](https://github.com/THUDM/ChatGLM3) 提供的强大中文语言模型
378
+ <br>
379
+
380
+ ### 联系我们
381
+
382
+ 如果你想给我们的研发团队和产品团队留言,欢迎加入我们的[微信群](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/wechat.png)。当然也可以通过邮件(kwai-kolors@kuaishou.com)联系我们。
383
+
384
+
385
+ ## Star History
386
+
387
+ [![Star History Chart](https://api.star-history.com/svg?repos=Kwai-Kolors/Kolors&type=Date)](https://star-history.com/#Kwai-Kolors/Kolors&Date)
build/lib/kolors/__init__.py ADDED
File without changes
build/lib/kolors/models/__init__.py ADDED
File without changes
build/lib/kolors/models/configuration_chatglm.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class ChatGLMConfig(PretrainedConfig):
    """Configuration object for models registered under ``model_type = "chatglm"``.

    Each constructor argument is stored on the instance under its own name.
    ``padded_vocab_size`` is stored twice: once as ``padded_vocab_size`` and
    once as ``vocab_size``. Any additional keyword arguments are forwarded
    untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "chatglm"

    def __init__(
        self,
        num_layers=28,
        padded_vocab_size=65024,
        hidden_size=4096,
        ffn_hidden_size=13696,
        kv_channels=128,
        num_attention_heads=32,
        seq_length=2048,
        hidden_dropout=0.0,
        classifier_dropout=None,
        attention_dropout=0.0,
        layernorm_epsilon=1e-5,
        rmsnorm=True,
        apply_residual_connection_post_layernorm=False,
        post_layer_norm=True,
        add_bias_linear=False,
        add_qkv_bias=False,
        bias_dropout_fusion=True,
        multi_query_attention=False,
        multi_query_group_num=1,
        apply_query_key_layer_scaling=True,
        attention_softmax_in_fp32=True,
        fp32_residual_connection=False,
        quantization_bit=0,
        pre_seq_len=None,
        prefix_projection=False,
        **kwargs
    ):
        # Ordered mapping of attribute name -> value. The insertion order is the
        # order in which the attributes are created on the instance.
        settings = {
            "num_layers": num_layers,
            # `vocab_size` mirrors the padded vocabulary size.
            "vocab_size": padded_vocab_size,
            "padded_vocab_size": padded_vocab_size,
            "hidden_size": hidden_size,
            "ffn_hidden_size": ffn_hidden_size,
            "kv_channels": kv_channels,
            "num_attention_heads": num_attention_heads,
            "seq_length": seq_length,
            "hidden_dropout": hidden_dropout,
            "classifier_dropout": classifier_dropout,
            "attention_dropout": attention_dropout,
            "layernorm_epsilon": layernorm_epsilon,
            "rmsnorm": rmsnorm,
            "apply_residual_connection_post_layernorm": apply_residual_connection_post_layernorm,
            "post_layer_norm": post_layer_norm,
            "add_bias_linear": add_bias_linear,
            "add_qkv_bias": add_qkv_bias,
            "bias_dropout_fusion": bias_dropout_fusion,
            "multi_query_attention": multi_query_attention,
            "multi_query_group_num": multi_query_group_num,
            "apply_query_key_layer_scaling": apply_query_key_layer_scaling,
            "attention_softmax_in_fp32": attention_softmax_in_fp32,
            "fp32_residual_connection": fp32_residual_connection,
            "quantization_bit": quantization_bit,
            "pre_seq_len": pre_seq_len,
            "prefix_projection": prefix_projection,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)

        # The base-class initialiser runs last, after every model field is set.
        super().__init__(**kwargs)
build/lib/kolors/models/controlnet.py ADDED
@@ -0,0 +1,887 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ from torch import nn
19
+ from torch.nn import functional as F
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
23
+ from diffusers.utils import BaseOutput, logging
24
+ from diffusers.models.attention_processor import (
25
+ ADDED_KV_ATTENTION_PROCESSORS,
26
+ CROSS_ATTENTION_PROCESSORS,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+
34
+ try:
35
+ from diffusers.unets.unet_2d_blocks import (
36
+ CrossAttnDownBlock2D,
37
+ DownBlock2D,
38
+ UNetMidBlock2D,
39
+ UNetMidBlock2DCrossAttn,
40
+ get_down_block,
41
+ )
42
+ from diffusers.unets.unet_2d_condition import UNet2DConditionModel
43
+ except:
44
+ from diffusers.models.unets.unet_2d_blocks import (
45
+ CrossAttnDownBlock2D,
46
+ DownBlock2D,
47
+ UNetMidBlock2D,
48
+ UNetMidBlock2DCrossAttn,
49
+ get_down_block,
50
+ )
51
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
52
+
53
+
54
+
55
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
56
+
57
+
58
@dataclass
class ControlNetOutput(BaseOutput):
    """
    The output of [`ControlNetModel`].

    Args:
        down_block_res_samples (`tuple[torch.Tensor, ...]`):
            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
            be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be
            used to condition the original UNet's downsampling activations.
        mid_block_res_sample (`torch.Tensor`):
            The activation of the middle block (the lowest sample resolution). The tensor should be of shape
            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
            Output can be used to condition the original UNet's middle block activation.
    """

    # Variable-length tuple: one entry per residual produced by the down path.
    down_block_res_samples: Tuple[torch.Tensor, ...]
    mid_block_res_sample: torch.Tensor
76
+
77
+
78
class ControlNetConditioningEmbedding(nn.Module):
    """
    Small convolutional encoder for the image-space conditioning input, following the conditioning network
    described in https://arxiv.org/abs/2302.05543.

    Layout, as built here:

    * `conv_in`: a 3x3 convolution from `conditioning_channels` to `block_out_channels[0]`.
    * `blocks`: for every consecutive pair of widths in `block_out_channels`, a 3x3 convolution that keeps the
      width followed by a 3x3 stride-2 convolution that moves to the next width.
    * `conv_out`: a 3x3 convolution to `conditioning_embedding_channels`, passed through `zero_module`.

    SiLU is applied after `conv_in` and after every layer in `blocks`; `conv_out` has no activation.
    """

    def __init__(
        self,
        conditioning_embedding_channels: int,
        conditioning_channels: int = 3,
        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
    ):
        super().__init__()

        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)

        # Layers are created in the same order as they are stored, two per width transition.
        stages = []
        for width_in, width_out in zip(block_out_channels[:-1], block_out_channels[1:]):
            stages.append(nn.Conv2d(width_in, width_in, kernel_size=3, padding=1))
            stages.append(nn.Conv2d(width_in, width_out, kernel_size=3, padding=1, stride=2))
        self.blocks = nn.ModuleList(stages)

        projection = nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
        self.conv_out = zero_module(projection)

    def forward(self, conditioning):
        """Encode `conditioning` and return the output of `conv_out`."""
        hidden = F.silu(self.conv_in(conditioning))

        for conv in self.blocks:
            hidden = F.silu(conv(hidden))

        return self.conv_out(hidden)
121
+
122
+
123
+ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
124
+ """
125
+ A ControlNet model.
126
+
127
+ Args:
128
+ in_channels (`int`, defaults to 4):
129
+ The number of channels in the input sample.
130
+ flip_sin_to_cos (`bool`, defaults to `True`):
131
+ Whether to flip the sin to cos in the time embedding.
132
+ freq_shift (`int`, defaults to 0):
133
+ The frequency shift to apply to the time embedding.
134
+ down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
135
+ The tuple of downsample blocks to use.
136
+ only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
137
+ block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
138
+ The tuple of output channels for each block.
139
+ layers_per_block (`int`, defaults to 2):
140
+ The number of layers per block.
141
+ downsample_padding (`int`, defaults to 1):
142
+ The padding to use for the downsampling convolution.
143
+ mid_block_scale_factor (`float`, defaults to 1):
144
+ The scale factor to use for the mid block.
145
+ act_fn (`str`, defaults to "silu"):
146
+ The activation function to use.
147
+ norm_num_groups (`int`, *optional*, defaults to 32):
148
+ The number of groups to use for the normalization. If None, normalization and activation layers is skipped
149
+ in post-processing.
150
+ norm_eps (`float`, defaults to 1e-5):
151
+ The epsilon to use for the normalization.
152
+ cross_attention_dim (`int`, defaults to 1280):
153
+ The dimension of the cross attention features.
154
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
155
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
156
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
157
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
158
+ encoder_hid_dim (`int`, *optional*, defaults to None):
159
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
160
+ dimension to `cross_attention_dim`.
161
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
162
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
163
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
164
+ attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
165
+ The dimension of the attention heads.
166
+ use_linear_projection (`bool`, defaults to `False`):
167
+ class_embed_type (`str`, *optional*, defaults to `None`):
168
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
169
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
170
+ addition_embed_type (`str`, *optional*, defaults to `None`):
171
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
172
+ "text". "text" will use the `TextTimeEmbedding` layer.
173
+ num_class_embeds (`int`, *optional*, defaults to 0):
174
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
175
+ class conditioning with `class_embed_type` equal to `None`.
176
+ upcast_attention (`bool`, defaults to `False`):
177
+ resnet_time_scale_shift (`str`, defaults to `"default"`):
178
+ Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
179
+ projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
180
+ The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
181
+ `class_embed_type="projection"`.
182
+ controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
183
+ The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
184
+ conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
185
+ The tuple of output channel for each block in the `conditioning_embedding` layer.
186
+ global_pool_conditions (`bool`, defaults to `False`):
187
+ TODO(Patrick) - unused parameter.
188
+ addition_embed_type_num_heads (`int`, defaults to 64):
189
+ The number of heads to use for the `TextTimeEmbedding` layer.
190
+ """
191
+
192
+ _supports_gradient_checkpointing = True
193
+
194
@register_to_config
def __init__(
    self,
    in_channels: int = 4,
    conditioning_channels: int = 3,
    flip_sin_to_cos: bool = True,
    freq_shift: int = 0,
    down_block_types: Tuple[str, ...] = (
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "DownBlock2D",
    ),
    mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
    only_cross_attention: Union[bool, Tuple[bool]] = False,
    block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
    layers_per_block: int = 2,
    downsample_padding: int = 1,
    mid_block_scale_factor: float = 1,
    act_fn: str = "silu",
    norm_num_groups: Optional[int] = 32,
    norm_eps: float = 1e-5,
    cross_attention_dim: int = 1280,
    transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
    encoder_hid_dim: Optional[int] = None,
    encoder_hid_dim_type: Optional[str] = None,
    attention_head_dim: Union[int, Tuple[int, ...]] = 8,
    num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
    use_linear_projection: bool = False,
    class_embed_type: Optional[str] = None,
    addition_embed_type: Optional[str] = None,
    addition_time_embed_dim: Optional[int] = None,
    num_class_embeds: Optional[int] = None,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    projection_class_embeddings_input_dim: Optional[int] = None,
    controlnet_conditioning_channel_order: str = "rgb",
    conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
    global_pool_conditions: bool = False,
    addition_embed_type_num_heads: int = 64,
):
    """
    Build the ControlNet layers: input convolution, time / class / additional embeddings, the conditioning
    embedding, the down blocks with one 1x1 `zero_module` convolution per residual, and the mid block with its
    own 1x1 `zero_module` convolution. Parameter semantics are described in the class docstring.

    Raises:
        ValueError: If `block_out_channels`, a non-bool `only_cross_attention`, or a non-int
            `num_attention_heads` does not have the same length as `down_block_types`; if `encoder_hid_dim_type`
            is set without `encoder_hid_dim` or has an unsupported value; if `class_embed_type` is
            `"projection"` without `projection_class_embeddings_input_dim`; or if `addition_embed_type` or
            `mid_block_type` has an unsupported value.
    """
    super().__init__()

    # If `num_attention_heads` is not defined (which is the case for most models)
    # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
    # The reason for this behavior is to correct for incorrectly named variables that were introduced
    # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
    # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
    # which is why we correct for the naming here.
    num_attention_heads = num_attention_heads or attention_head_dim

    # Check inputs
    if len(block_out_channels) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
        )

    # A single int is broadcast to one entry per down block.
    if isinstance(transformer_layers_per_block, int):
        transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

    # input
    conv_in_kernel = 3
    conv_in_padding = (conv_in_kernel - 1) // 2
    self.conv_in = nn.Conv2d(
        in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
    )

    # time
    time_embed_dim = block_out_channels[0] * 4
    self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
    timestep_input_dim = block_out_channels[0]
    self.time_embedding = TimestepEmbedding(
        timestep_input_dim,
        time_embed_dim,
        act_fn=act_fn,
    )

    if encoder_hid_dim_type is None and encoder_hid_dim is not None:
        encoder_hid_dim_type = "text_proj"
        self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
        logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")

    if encoder_hid_dim is None and encoder_hid_dim_type is not None:
        raise ValueError(
            f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
        )

    if encoder_hid_dim_type == "text_proj":
        self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
    elif encoder_hid_dim_type == "text_image_proj":
        # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
        # case when `encoder_hid_dim_type == "text_image_proj"` (Kandinsky 2.1)
        self.encoder_hid_proj = TextImageProjection(
            text_embed_dim=encoder_hid_dim,
            image_embed_dim=cross_attention_dim,
            cross_attention_dim=cross_attention_dim,
        )

    elif encoder_hid_dim_type is not None:
        raise ValueError(
            f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
        )
    else:
        self.encoder_hid_proj = None

    # class embedding
    if class_embed_type is None and num_class_embeds is not None:
        self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
    elif class_embed_type == "timestep":
        self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
    elif class_embed_type == "identity":
        self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
    elif class_embed_type == "projection":
        if projection_class_embeddings_input_dim is None:
            raise ValueError(
                "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
            )
        # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
        # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
        # 2. it projects from an arbitrary input dimension.
        #
        # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
        # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
        # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
        self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
    else:
        self.class_embedding = None

    if addition_embed_type == "text":
        if encoder_hid_dim is not None:
            text_time_embedding_from_dim = encoder_hid_dim
        else:
            text_time_embedding_from_dim = cross_attention_dim

        self.add_embedding = TextTimeEmbedding(
            text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
        )
    elif addition_embed_type == "text_image":
        # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
        # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
        self.add_embedding = TextImageTimeEmbedding(
            text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
        )
    elif addition_embed_type == "text_time":
        self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)

    elif addition_embed_type is not None:
        # The message lists every value accepted by the branches above, including 'text_time'.
        raise ValueError(
            f"addition_embed_type: {addition_embed_type} must be None, 'text', 'text_image' or 'text_time'."
        )

    # control net conditioning embedding
    self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
        conditioning_embedding_channels=block_out_channels[0],
        block_out_channels=conditioning_embedding_out_channels,
        conditioning_channels=conditioning_channels,
    )

    self.down_blocks = nn.ModuleList([])
    self.controlnet_down_blocks = nn.ModuleList([])

    # Scalar settings are broadcast to one entry per down block.
    if isinstance(only_cross_attention, bool):
        only_cross_attention = [only_cross_attention] * len(down_block_types)

    if isinstance(attention_head_dim, int):
        attention_head_dim = (attention_head_dim,) * len(down_block_types)

    if isinstance(num_attention_heads, int):
        num_attention_heads = (num_attention_heads,) * len(down_block_types)

    # down
    output_channel = block_out_channels[0]

    # 1x1 convolution registered before any down block is added.
    controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
    controlnet_block = zero_module(controlnet_block)
    self.controlnet_down_blocks.append(controlnet_block)

    for i, down_block_type in enumerate(down_block_types):
        input_channel = output_channel
        output_channel = block_out_channels[i]
        is_final_block = i == len(block_out_channels) - 1

        down_block = get_down_block(
            down_block_type,
            num_layers=layers_per_block,
            transformer_layers_per_block=transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=time_embed_dim,
            add_downsample=not is_final_block,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads[i],
            attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            downsample_padding=downsample_padding,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
        )
        self.down_blocks.append(down_block)

        # One 1x1 convolution per layer of the block.
        for _ in range(layers_per_block):
            controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
            controlnet_block = zero_module(controlnet_block)
            self.controlnet_down_blocks.append(controlnet_block)

        # Plus one more for every block that was built with `add_downsample=True`.
        if not is_final_block:
            controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
            controlnet_block = zero_module(controlnet_block)
            self.controlnet_down_blocks.append(controlnet_block)

    # mid
    mid_block_channel = block_out_channels[-1]

    controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
    controlnet_block = zero_module(controlnet_block)
    self.controlnet_mid_block = controlnet_block

    if mid_block_type == "UNetMidBlock2DCrossAttn":
        self.mid_block = UNetMidBlock2DCrossAttn(
            transformer_layers_per_block=transformer_layers_per_block[-1],
            in_channels=mid_block_channel,
            temb_channels=time_embed_dim,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_time_scale_shift=resnet_time_scale_shift,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads[-1],
            resnet_groups=norm_num_groups,
            use_linear_projection=use_linear_projection,
            upcast_attention=upcast_attention,
        )
    elif mid_block_type == "UNetMidBlock2D":
        self.mid_block = UNetMidBlock2D(
            in_channels=block_out_channels[-1],
            temb_channels=time_embed_dim,
            num_layers=0,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_groups=norm_num_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
            add_attention=False,
        )
    else:
        raise ValueError(f"unknown mid_block_type : {mid_block_type}")
455
+
456
@classmethod
def from_unet(
    cls,
    unet: UNet2DConditionModel,
    controlnet_conditioning_channel_order: str = "rgb",
    conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
    load_weights_from_unet: bool = True,
    conditioning_channels: int = 3,
):
    r"""
    Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].

    Parameters:
        unet (`UNet2DConditionModel`):
            The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
            where applicable.
        controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
            Forwarded unchanged to the [`ControlNetModel`] constructor.
        conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
            Forwarded unchanged to the [`ControlNetModel`] constructor.
        load_weights_from_unet (`bool`, defaults to `True`):
            When `True`, the state dicts of `conv_in`, `time_proj`, `time_embedding`, `class_embedding` (if the
            ControlNet has one), `add_embedding` (if the ControlNet has one), `down_blocks` and `mid_block` are
            copied from `unet` into the new model.
        conditioning_channels (`int`, defaults to `3`):
            Forwarded unchanged to the [`ControlNetModel`] constructor.

    Returns:
        The newly constructed [`ControlNetModel`].
    """
    unet_config = unet.config

    def optional_setting(key, default=None):
        # These keys are looked up defensively: a UNet config that lacks them falls back to `default`.
        return getattr(unet_config, key) if key in unet_config else default

    controlnet = cls(
        encoder_hid_dim=optional_setting("encoder_hid_dim"),
        encoder_hid_dim_type=optional_setting("encoder_hid_dim_type"),
        addition_embed_type=optional_setting("addition_embed_type"),
        addition_time_embed_dim=optional_setting("addition_time_embed_dim"),
        transformer_layers_per_block=optional_setting("transformer_layers_per_block", 1),
        in_channels=unet_config.in_channels,
        flip_sin_to_cos=unet_config.flip_sin_to_cos,
        freq_shift=unet_config.freq_shift,
        down_block_types=unet_config.down_block_types,
        only_cross_attention=unet_config.only_cross_attention,
        block_out_channels=unet_config.block_out_channels,
        layers_per_block=unet_config.layers_per_block,
        downsample_padding=unet_config.downsample_padding,
        mid_block_scale_factor=unet_config.mid_block_scale_factor,
        act_fn=unet_config.act_fn,
        norm_num_groups=unet_config.norm_num_groups,
        norm_eps=unet_config.norm_eps,
        cross_attention_dim=unet_config.cross_attention_dim,
        attention_head_dim=unet_config.attention_head_dim,
        num_attention_heads=unet_config.num_attention_heads,
        use_linear_projection=unet_config.use_linear_projection,
        class_embed_type=unet_config.class_embed_type,
        num_class_embeds=unet_config.num_class_embeds,
        upcast_attention=unet_config.upcast_attention,
        resnet_time_scale_shift=unet_config.resnet_time_scale_shift,
        projection_class_embeddings_input_dim=unet_config.projection_class_embeddings_input_dim,
        mid_block_type=unet_config.mid_block_type,
        controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
        conditioning_embedding_out_channels=conditioning_embedding_out_channels,
        conditioning_channels=conditioning_channels,
    )

    if not load_weights_from_unet:
        return controlnet

    controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
    controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
    controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())

    # Explicit None check (same test `forward` uses): module truthiness is not a reliable
    # "is present" signal because container modules define `__len__`.
    if controlnet.class_embedding is not None:
        controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())

    if hasattr(controlnet, "add_embedding"):
        controlnet.add_embedding.load_state_dict(unet.add_embedding.state_dict())

    controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
    controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())

    return controlnet
531
+
532
@property
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
def attn_processors(self) -> Dict[str, AttentionProcessor]:
    r"""
    Returns:
        `dict` of attention processors: maps `"<dotted.module.path>.processor"` to the object returned by
        `get_processor()` for every sub-module that exposes that method.
    """
    collected = {}

    # Explicit stack instead of recursion; pushing children in reverse keeps the pre-order
    # (parent first, then children in registration order) traversal.
    pending = list(reversed(list(self.named_children())))
    while pending:
        path, node = pending.pop()
        if hasattr(node, "get_processor"):
            collected[f"{path}.processor"] = node.get_processor()

        children = [(f"{path}.{child_name}", child) for child_name, child in node.named_children()]
        pending.extend(reversed(children))

    return collected
556
+
557
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
    r"""
    Sets the attention processor to use to compute attention.

    Parameters:
        processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
            The instantiated processor class or a dictionary of processor classes that will be set as the processor
            for **all** `Attention` layers.

            If `processor` is a dict, the key needs to define the path to the corresponding cross attention
            processor. This is strongly recommended when setting trainable attention processors.

    Raises:
        ValueError: If a dict is passed whose size differs from the number of attention layers.
        KeyError: If a dict is passed that lacks the `"<path>.processor"` key of some layer.

    The dict passed by the caller is only read, never modified.
    """
    count = len(self.attn_processors.keys())
    per_layer = isinstance(processor, dict)

    if per_layer and len(processor) != count:
        raise ValueError(
            f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
            f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
        )

    def fn_recursive_attn_processor(name: str, module: torch.nn.Module):
        if hasattr(module, "set_processor"):
            # Look the entry up instead of popping it so the caller's mapping stays intact.
            module.set_processor(processor[f"{name}.processor"] if per_layer else processor)

        for sub_name, child in module.named_children():
            fn_recursive_attn_processor(f"{name}.{sub_name}", child)

    for name, module in self.named_children():
        fn_recursive_attn_processor(name, module)
591
+
592
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
def set_default_attn_processor(self):
    """
    Replace every attention processor with the default one for its family.

    If all current processors belong to `ADDED_KV_ATTENTION_PROCESSORS`, `AttnAddedKVProcessor` is installed; if all
    belong to `CROSS_ATTENTION_PROCESSORS`, `AttnProcessor` is installed; otherwise a `ValueError` is raised.
    """

    def every_processor_in(family):
        return all(proc.__class__ in family for proc in self.attn_processors.values())

    if every_processor_in(ADDED_KV_ATTENTION_PROCESSORS):
        default_processor = AttnAddedKVProcessor()
    elif every_processor_in(CROSS_ATTENTION_PROCESSORS):
        default_processor = AttnProcessor()
    else:
        raise ValueError(
            f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
        )

    self.set_attn_processor(default_processor)
607
+
608
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
    r"""
    Enable sliced attention computation.

    When this option is enabled, the attention module splits the input tensor in slices to compute attention in
    several steps. This is useful for saving some memory in exchange for a small decrease in speed.

    Args:
        slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
            When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
            `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
            provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
            must be a multiple of `slice_size`.

    Raises:
        ValueError: If a list is given whose length differs from the number of sliceable layers, or if any
            requested size exceeds the corresponding layer's `sliceable_head_dim`.
    """

    def iter_sliceable(module: torch.nn.Module):
        # Pre-order walk: the module itself first, then its descendants.
        if hasattr(module, "set_attention_slice"):
            yield module
        for child in module.children():
            yield from iter_sliceable(child)

    sliceable_layers = [layer for top in self.children() for layer in iter_sliceable(top)]
    head_dims = [layer.sliceable_head_dim for layer in sliceable_layers]
    layer_count = len(head_dims)

    if slice_size == "auto":
        # half the attention head size is usually a good trade-off between speed and memory
        slice_size = [dim // 2 for dim in head_dims]
    elif slice_size == "max":
        # make smallest slice possible
        slice_size = layer_count * [1]

    # A scalar applies to every layer.
    if not isinstance(slice_size, list):
        slice_size = layer_count * [slice_size]

    if len(slice_size) != layer_count:
        raise ValueError(
            f"You have provided {len(slice_size)}, but {self.config} has {layer_count} different"
            f" attention layers. Make sure to match `len(slice_size)` to be {layer_count}."
        )

    for size, dim in zip(slice_size, head_dims):
        if size is not None and size > dim:
            raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

    # Hand each sliceable layer its size, in the same traversal order used to collect them.
    for layer, size in zip(sliceable_layers, slice_size):
        layer.set_attention_slice(size)
673
+
674
def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
    """Set `module.gradient_checkpointing` to `value` when `module` is a `CrossAttnDownBlock2D` or `DownBlock2D`."""
    checkpointable_blocks = (CrossAttnDownBlock2D, DownBlock2D)
    if not isinstance(module, checkpointable_blocks):
        return
    module.gradient_checkpointing = value
677
+
678
def forward(
    self,
    sample: torch.Tensor,
    timestep: Union[torch.Tensor, float, int],
    encoder_hidden_states: torch.Tensor,
    controlnet_cond: torch.Tensor,
    conditioning_scale: float = 1.0,
    class_labels: Optional[torch.Tensor] = None,
    timestep_cond: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guess_mode: bool = False,
    return_dict: bool = True,
) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
    """
    The [`ControlNetModel`] forward method.

    Args:
        sample (`torch.Tensor`):
            The noisy input tensor.
        timestep (`Union[torch.Tensor, float, int]`):
            The number of timesteps to denoise an input.
        encoder_hidden_states (`torch.Tensor`):
            The encoder hidden states.
        controlnet_cond (`torch.Tensor`):
            The conditional input tensor. Dimension 1 is treated as the channel axis (it is reversed when the
            configured channel order is `"bgr"`); presumably `(batch_size, channels, height, width)` - confirm
            against the conditioning embedding.
        conditioning_scale (`float`, defaults to `1.0`):
            The scale factor for ControlNet outputs.
        class_labels (`torch.Tensor`, *optional*, defaults to `None`):
            Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
        timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
            Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
            timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
            embeddings.
        attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
            is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
            negative values to the attention scores corresponding to "discard" tokens.
        added_cond_kwargs (`dict`):
            Additional conditions for the Stable Diffusion XL UNet. Must contain `text_embeds` and `time_ids`
            when the config's `addition_embed_type` is `"text_time"`.
        cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
            A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
        guess_mode (`bool`, defaults to `False`):
            In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
            you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
        return_dict (`bool`, defaults to `True`):
            Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.

    Returns:
        [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
            If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple
            `(down_block_res_samples, mid_block_res_sample)` is returned.

    Raises:
        ValueError: On an unknown channel order, on missing `class_labels` when a class embedding exists, or on
            missing `text_embeds` / `time_ids` for the `"text_time"` additional embedding.
    """
    # check channel order ("rgb" needs no work, "bgr" reverses the channel axis)
    channel_order = self.config.controlnet_conditioning_channel_order
    if channel_order == "bgr":
        controlnet_cond = torch.flip(controlnet_cond, dims=[1])
    elif channel_order != "rgb":
        raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")

    # prepare attention_mask: turn the keep(1)/discard(0) mask into an additive bias
    if attention_mask is not None:
        attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
        attention_mask = attention_mask.unsqueeze(1)

    if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
        encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)

    # 1. time
    timesteps = timestep
    if not torch.is_tensor(timesteps):
        # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
        is_mps = sample.device.type == "mps"
        if isinstance(timestep, float):
            dtype = torch.float32 if is_mps else torch.float64
        else:
            dtype = torch.int32 if is_mps else torch.int64
        timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
    elif len(timesteps.shape) == 0:
        timesteps = timesteps[None].to(sample.device)

    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
    timesteps = timesteps.expand(sample.shape[0])

    # The projection output is cast to the sample dtype because the embedding layer may run in
    # reduced precision.
    t_emb = self.time_proj(timesteps).to(dtype=sample.dtype)

    emb = self.time_embedding(t_emb, timestep_cond)
    aug_emb = None

    if self.class_embedding is not None:
        if class_labels is None:
            raise ValueError("class_labels should be provided when num_class_embeds > 0")

        if self.config.class_embed_type == "timestep":
            class_labels = self.time_proj(class_labels)

        class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
        emb = emb + class_emb

    addition_embed_type = self.config.addition_embed_type
    if addition_embed_type == "text":
        aug_emb = self.add_embedding(encoder_hidden_states)
    elif addition_embed_type == "text_time":
        # Treat an omitted dict as empty so the caller gets the descriptive ValueError below
        # rather than a TypeError from `in None`.
        cond_kwargs = added_cond_kwargs if added_cond_kwargs is not None else {}
        if "text_embeds" not in cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
            )
        text_embeds = cond_kwargs.get("text_embeds")
        if "time_ids" not in cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
            )
        time_ids = cond_kwargs.get("time_ids")
        time_embeds = self.add_time_proj(time_ids.flatten())
        time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))

        add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
        add_embeds = add_embeds.to(emb.dtype)
        aug_emb = self.add_embedding(add_embeds)

    if aug_emb is not None:
        emb = emb + aug_emb

    # 2. pre-process: the embedded conditioning is added onto the projected sample
    sample = self.conv_in(sample)
    controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
    sample = sample + controlnet_cond

    # 3. down
    down_block_res_samples = (sample,)
    for downsample_block in self.down_blocks:
        if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
            sample, res_samples = downsample_block(
                hidden_states=sample,
                temb=emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
            )
        else:
            sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

        down_block_res_samples += res_samples

    # 4. mid
    if self.mid_block is not None:
        if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
            )
        else:
            sample = self.mid_block(sample, emb)

    # 5. Control net blocks: one ControlNet block per collected residual
    down_block_res_samples = tuple(
        controlnet_block(res_sample)
        for res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks)
    )
    mid_block_res_sample = self.controlnet_mid_block(sample)

    # 6. scaling
    if guess_mode and not self.config.global_pool_conditions:
        scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
        scales = scales * conditioning_scale
        down_block_res_samples = [res * scale for res, scale in zip(down_block_res_samples, scales)]
        mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
    else:
        down_block_res_samples = [res * conditioning_scale for res in down_block_res_samples]
        mid_block_res_sample = mid_block_res_sample * conditioning_scale

    if self.config.global_pool_conditions:
        down_block_res_samples = [torch.mean(res, dim=(2, 3), keepdim=True) for res in down_block_res_samples]
        mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)

    if not return_dict:
        return (down_block_res_samples, mid_block_res_sample)

    return ControlNetOutput(
        down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
    )
882
+
883
+
884
def zero_module(module):
    """Fill every parameter of `module` with zeros in place and return the same module."""
    for _, parameter in module.named_parameters():
        nn.init.zeros_(parameter)
    return module
build/lib/kolors/models/ipa_faceid_plus/__init__.py ADDED
File without changes
build/lib/kolors/models/ipa_faceid_plus/attention_processor.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
class AttnProcessor2_0(torch.nn.Module):
    r"""
    Attention processor built on `torch.nn.functional.scaled_dot_product_attention` (PyTorch 2.0+).

    `hidden_size` and `cross_attention_dim` are accepted by the constructor but not used by this processor.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """
        Run attention for the layer `attn`.

        `hidden_states` may be 3-D or 4-D; a 4-D input is flattened over its last two dimensions for the
        attention computation and restored to its original shape afterwards. When `encoder_hidden_states` is
        `None` the layer attends to `hidden_states` itself.
        """
        skip_connection = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        context_shape = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        batch_size, sequence_length, _ = context_shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        head_dim = key.shape[-1] // attn.heads

        def split_heads(tensor):
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            return tensor.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        query = split_heads(query)
        key = split_heads(key)
        value = split_heads(value)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        attended = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        attended = attended.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        attended = attended.to(query.dtype)

        # to_out[0] is the linear projection, to_out[1] the dropout
        projection, dropout = attn.to_out[0], attn.to_out[1]
        hidden_states = dropout(projection(attended))

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + skip_connection

        return hidden_states / attn.rescale_output_factor
92
+
93
class IPAttnProcessor2_0(torch.nn.Module):
    r"""
    Attention processor for IP-Adapter for PyTorch 2.0.

    The last `num_tokens` positions of `encoder_hidden_states` are treated as image-prompt tokens. They are
    attended to through this processor's own `to_k_ip` / `to_v_ip` projections, and the result is added, weighted
    by `scale`, to the regular attention output.

    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`):
            The number of channels in the `encoder_hidden_states`.
        scale (`float`, defaults to 1.0):
            the weight scale of image prompt.
        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
            The context length of the image features.
    """

    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("IPAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.num_tokens = num_tokens

        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """
        Run attention for the layer `attn`, adding the image-prompt branch when a context is supplied.

        When `encoder_hidden_states` is `None` there are no image-prompt tokens to split off, so the call
        degrades to plain attention over `hidden_states` and the image-prompt branch is skipped.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            # NOTE(review): sequence_length is read before the image tokens are split off below, so the
            # mask is prepared for the combined text+image length - confirm this is intended.
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
            # No context means no image-prompt tokens; previously this path left the name unbound
            # and crashed further down.
            ip_hidden_states = None
        else:
            # get encoder_hidden_states, ip_hidden_states
            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
            encoder_hidden_states, ip_hidden_states = (
                encoder_hidden_states[:, :end_pos, :],
                encoder_hidden_states[:, end_pos:, :],
            )
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        if ip_hidden_states is not None:
            # for ip-adapter: same queries, separate key/value projections, no mask
            ip_key = self.to_k_ip(ip_hidden_states)
            ip_value = self.to_v_ip(ip_hidden_states)

            ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
            ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

            # the output of sdp = (batch, num_heads, seq_len, head_dim)
            ip_hidden_states = F.scaled_dot_product_attention(
                query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
            )

            ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
            ip_hidden_states = ip_hidden_states.to(query.dtype)

            hidden_states = hidden_states + self.scale * ip_hidden_states

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
build/lib/kolors/models/ipa_faceid_plus/ipa_faceid_plus.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
def reshape_tensor(x, heads):
    """
    Split the last dimension of `x` into attention heads.

    Args:
        x: tensor of shape `(bs, length, width)`; `width` must be divisible by `heads`.
        heads: number of attention heads.

    Returns:
        A 4-D tensor of shape `(bs, heads, length, width // heads)`. The batch and head axes are kept
        separate (they are not merged into a single `bs * heads` axis).
    """
    bs, length, _ = x.shape
    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
    x = x.view(bs, length, heads, -1)
    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
    x = x.transpose(1, 2)
    # Shape is unchanged by this reshape: the result stays (bs, n_heads, length, dim_per_head).
    x = x.reshape(bs, heads, length, -1)
    return x
14
+
15
def FeedForward(dim, mult=4):
    """
    Build a feed-forward stack: LayerNorm -> bias-free Linear(dim, dim * mult) -> GELU -> bias-free
    Linear(dim * mult, dim), returned as an `nn.Sequential`.
    """
    hidden_dim = int(dim * mult)
    stages = [
        nn.LayerNorm(dim),
        nn.Linear(dim, hidden_dim, bias=False),
        nn.GELU(),
        nn.Linear(hidden_dim, dim, bias=False),
    ]
    return nn.Sequential(*stages)
23
+
24
class PerceiverAttention(nn.Module):
    """
    Cross-attention in which the latents supply the queries, and the image features concatenated with the
    latents supply the keys and values.
    """

    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        projected_dim = dim_head * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, projected_dim, bias=False)
        self.to_kv = nn.Linear(dim, projected_dim * 2, bias=False)
        self.to_out = nn.Linear(projected_dim, dim, bias=False)

    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent (torch.Tensor): latent features
                shape (b, n2, D)

        Returns:
            torch.Tensor: updated latents, with the same batch and token dimensions as `latents`,
            after the output projection.
        """
        x = self.norm1(x)
        latents = self.norm2(latents)

        batch, num_latents, _ = latents.shape

        # Keys and values come from one fused projection over [features ; latents].
        keys, values = self.to_kv(torch.cat((x, latents), dim=-2)).chunk(2, dim=-1)

        queries = reshape_tensor(self.to_q(latents), self.heads)
        keys = reshape_tensor(keys, self.heads)
        values = reshape_tensor(values, self.heads)

        # attention: scaling q and k separately is more stable with f16 than dividing afterwards
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        scores = (queries * scale) @ (keys * scale).transpose(-2, -1)
        # softmax is computed in float32, then cast back to the score dtype
        probs = torch.softmax(scores.float(), dim=-1).type(scores.dtype)
        attended = probs @ values

        # (b, heads, n2, dim_head) -> (b, n2, heads * dim_head)
        attended = attended.permute(0, 2, 1, 3).reshape(batch, num_latents, -1)

        return self.to_out(attended)
69
+
70
class FacePerceiverResampler(torch.nn.Module):
    """
    Stack of `depth` (PerceiverAttention, FeedForward) pairs that refines a set of latent tokens against
    projected input features, followed by an output projection and LayerNorm.
    """

    def __init__(
        self,
        *,
        dim=768,
        depth=4,
        dim_head=64,
        heads=16,
        embedding_dim=1280,
        output_dim=768,
        ff_mult=4,
    ):
        super().__init__()

        self.proj_in = torch.nn.Linear(embedding_dim, dim)
        self.proj_out = torch.nn.Linear(dim, output_dim)
        self.norm_out = torch.nn.LayerNorm(output_dim)

        # One [attention, feed-forward] pair per depth level.
        stages = [
            torch.nn.ModuleList(
                [
                    PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                    FeedForward(dim=dim, mult=ff_mult),
                ]
            )
            for _ in range(depth)
        ]
        self.layers = torch.nn.ModuleList(stages)

    def forward(self, latents, x):
        """
        Project `x` to the working width, then update `latents` through every stage with residual
        connections around both the attention and the feed-forward step, and return the projected,
        normalized latents.
        """
        features = self.proj_in(x)
        tokens = latents
        for attention, feed_forward in self.layers:
            tokens = attention(features, tokens) + tokens
            tokens = feed_forward(tokens) + tokens
        return self.norm_out(self.proj_out(tokens))
105
+
106
class ProjPlusModel(torch.nn.Module):
    """
    Projects an identity embedding into `num_tokens` tokens of width `cross_attention_dim` and refines them
    against CLIP embeddings with a `FacePerceiverResampler`.
    """

    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
        super().__init__()

        self.cross_attention_dim = cross_attention_dim
        self.num_tokens = num_tokens

        widened_dim = id_embeddings_dim * 2
        self.proj = torch.nn.Sequential(
            torch.nn.Linear(id_embeddings_dim, widened_dim),
            torch.nn.GELU(),
            torch.nn.Linear(widened_dim, cross_attention_dim * num_tokens),
        )
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

        # The head count is chosen so that heads * dim_head (64) matches cross_attention_dim.
        self.perceiver_resampler = FacePerceiverResampler(
            dim=cross_attention_dim,
            depth=4,
            dim_head=64,
            heads=cross_attention_dim // 64,
            embedding_dim=clip_embeddings_dim,
            output_dim=cross_attention_dim,
            ff_mult=4,
        )

    def forward(self, id_embeds, clip_embeds, shortcut = True, scale = 1.0):
        """
        Return the resampled tokens. With `shortcut=True` (the default) the result is the normalized
        identity tokens plus `scale` times the resampler output; otherwise the resampler output alone.
        """
        id_tokens = self.proj(id_embeds).reshape(-1, self.num_tokens, self.cross_attention_dim)
        id_tokens = self.norm(id_tokens)

        resampled = self.perceiver_resampler(id_tokens, clip_embeds)
        if not shortcut:
            return resampled
        return id_tokens + scale * resampled
build/lib/kolors/models/modeling_chatglm.py ADDED
@@ -0,0 +1,1298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ PyTorch ChatGLM model. """
2
+
3
+ import math
4
+ import copy
5
+ import warnings
6
+ import re
7
+ import sys
8
+
9
+ import torch
10
+ import torch.utils.checkpoint
11
+ import torch.nn.functional as F
12
+ from torch import nn
13
+ from torch.nn import CrossEntropyLoss, LayerNorm
14
+ from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
15
+ from torch.nn.utils import skip_init
16
+ from typing import Optional, Tuple, Union, List, Callable, Dict, Any
17
+ from copy import deepcopy
18
+
19
+ from transformers.modeling_outputs import (
20
+ BaseModelOutputWithPast,
21
+ CausalLMOutputWithPast,
22
+ SequenceClassifierOutputWithPast,
23
+ )
24
+ from transformers.modeling_utils import PreTrainedModel
25
+ from transformers.utils import logging
26
+ from transformers.generation.logits_process import LogitsProcessor
27
+ from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
28
+
29
try:
    from .configuration_chatglm import ChatGLMConfig
except ImportError:
    # The relative import fails when this file is not loaded as part of a
    # package; fall back to a plain top-level import in that case. Only
    # ImportError is caught so that unrelated failures are not hidden.
    from configuration_chatglm import ChatGLMConfig
33
+
34
+
35
# flags required to enable jit fusion kernels

# The private torch._C JIT switches below are applied on every platform except
# macOS (sys.platform == 'darwin'), where they are skipped.
if sys.platform != 'darwin':
    torch._C._jit_set_profiling_mode(False)
    torch._C._jit_set_profiling_executor(False)
    torch._C._jit_override_can_fuse_on_cpu(True)
    torch._C._jit_override_can_fuse_on_gpu(True)

# Module-level logger obtained from transformers' logging utilities.
logger = logging.get_logger(__name__)

# Identifier strings; not referenced elsewhere in the visible part of this file.
_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
_CONFIG_FOR_DOC = "ChatGLM6BConfig"

CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "THUDM/chatglm3-6b-base",
    # See all ChatGLM models at https://huggingface.co/models?filter=chatglm
]
52
+
53
+
54
def default_init(cls, *args, **kwargs):
    """Instantiate ``cls`` normally, forwarding every positional and keyword argument."""
    instance = cls(*args, **kwargs)
    return instance
56
+
57
+
58
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Logits processor that overwrites score tensors containing NaN or Inf."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores``; if any entry is NaN/Inf, reset the tensor in place first.

        On reset every entry becomes 0 and index 5 along the last dimension is
        set to ``5e4``.
        """
        contains_invalid = torch.isnan(scores).any() or torch.isinf(scores).any()
        if not contains_invalid:
            return scores
        scores.zero_()
        scores[..., 5] = 5e4
        return scores
64
+
65
+
66
class PrefixEncoder(torch.nn.Module):
    """
    Encode prefix token ids into flattened key/value features.

    Input shape: (batch-size, prefix-length)
    Output shape: (batch-size, prefix-length, kv_size), where
    kv_size = num_layers * kv_channels * multi_query_group_num * 2
    """

    def __init__(self, config: ChatGLMConfig):
        super().__init__()
        self.prefix_projection = config.prefix_projection
        kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
        # The embedding table has the same shape with or without the projection.
        self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
        if self.prefix_projection:
            # Use a two-layer MLP to encode the prefix
            projection_layers = [
                torch.nn.Linear(kv_size, config.hidden_size),
                torch.nn.Tanh(),
                torch.nn.Linear(config.hidden_size, kv_size),
            ]
            self.trans = torch.nn.Sequential(*projection_layers)

    def forward(self, prefix: torch.Tensor):
        """Embed ``prefix`` and, when projection is enabled, pass it through the MLP."""
        embedded = self.embedding(prefix)
        if not self.prefix_projection:
            return embedded
        return self.trans(embedded)
96
+
97
+
98
def split_tensor_along_last_dim(
        tensor: torch.Tensor,
        num_partitions: int,
        contiguous_split_chunks: bool = False,
) -> Tuple[torch.Tensor, ...]:
    """Split a tensor into equal chunks along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of equally sized partitions to split the tensor into.
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A tuple of ``num_partitions`` tensors.

    Raises:
        ValueError: if ``num_partitions`` is not positive, or the size of the last
            dimension is not divisible by it (``torch.split`` would otherwise
            silently return a different number of chunks).
    """
    if num_partitions <= 0:
        raise ValueError(f"num_partitions must be positive, got {num_partitions}")
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    full_size = tensor.size()[last_dim]
    if full_size % num_partitions != 0:
        raise ValueError(
            f"last dimension of size {full_size} is not divisible by num_partitions={num_partitions}"
        )
    last_dim_size = full_size // num_partitions
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list
124
+
125
+
126
class RotaryEmbedding(nn.Module):
    """Build the cos/sin lookup table used for rotary position embeddings."""

    def __init__(self, dim, original_impl=False, device=None, dtype=None):
        super().__init__()
        exponents = torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim
        # Stored as a buffer; forward() reads its dtype and device.
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))
        self.dim = dim
        self.original_impl = original_impl

    def forward_impl(
            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
    ):
        """Enhanced Transformer with Rotary Position Embedding.

        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.

        Returns a tensor of shape ``(seq_len, n_elem // 2, 2)`` whose last axis
        holds ``(cos, sin)`` of ``position * theta_i``.
        """
        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
        exponents = torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem
        theta = 1.0 / (base ** exponents)

        # Position indexes `[0, 1, ..., seq_len - 1]`
        positions = torch.arange(seq_len, dtype=torch.float, device=device)

        # Outer product of position index and $\theta_i$
        angles = torch.outer(positions, theta).float()

        cache = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)

        # this is to mimic the behaviour of complex32, else we will get different results
        if dtype == torch.bfloat16:
            return cache.bfloat16()
        if dtype in (torch.float16, torch.int8):
            return cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        """Return the table for ``max_seq_len`` positions (``offset`` is not used)."""
        reference = self.inv_freq
        return self.forward_impl(
            max_seq_len, self.dim, dtype=reference.dtype, device=reference.device
        )
163
+
164
+
165
@torch.jit.script
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Rotate the leading ``rot_dim`` features of ``x`` using the cos/sin cache.

    Features beyond ``rot_dim`` (twice the second-to-last size of
    ``rope_cache``) are passed through unchanged and concatenated back.
    """
    # x: [sq, b, np, hn]
    seq_len = x.size(0)
    num_heads = x.size(2)
    rot_dim = rope_cache.shape[-2] * 2
    rotated_part = x[..., :rot_dim]
    untouched_part = x[..., rot_dim:]
    # truncate to support variable sizes
    rope_cache = rope_cache[:seq_len]
    pairs = rotated_part.reshape(seq_len, -1, num_heads, rot_dim // 2, 2)
    rope_cache = rope_cache.view(seq_len, -1, 1, pairs.size(3), 2)
    cos_part = rope_cache[..., 0]
    sin_part = rope_cache[..., 1]
    first = pairs[..., 0]
    second = pairs[..., 1]
    mixed = torch.stack(
        [
            first * cos_part - second * sin_part,
            second * cos_part + first * sin_part,
        ],
        -1,
    )
    return torch.cat((mixed.flatten(3), untouched_part), dim=-1)
184
+
185
+
186
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale."""

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        # The scale is allocated uninitialised (torch.empty); nothing here fills it.
        uninitialised_scale = torch.empty(normalized_shape, device=device, dtype=dtype)
        self.weight = torch.nn.Parameter(uninitialised_scale)
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor):
        """Scale by ``rsqrt(mean(x^2) + eps)`` and ``weight``; result keeps the input dtype."""
        original_dtype = hidden_states.dtype
        # The mean of squares is computed in float32 regardless of the input dtype.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.eps)
        normalised = hidden_states * inverse_rms
        return (self.weight * normalised).to(original_dtype)
198
+
199
+
200
class CoreAttention(torch.nn.Module):
    """Attention over already-projected query/key/value tensors.

    Two code paths exist. With PyTorch major version >= 2 the computation is
    delegated to ``torch.nn.functional.scaled_dot_product_attention``; that path
    does not reference ``norm_factor``, ``coeff`` or ``attention_dropout``.
    Otherwise scores, softmax, dropout and the weighted sum are computed
    explicitly with ``baddbmm``/``bmm``.
    """

    def __init__(self, config: ChatGLMConfig, layer_number):
        super(CoreAttention, self).__init__()

        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        # Layer scaling forces the softmax to run in fp32.
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)

        projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        # norm_factor divides the raw scores; with layer scaling it is additionally
        # multiplied by the layer number, and `coeff` multiplies the scores back
        # before the softmax (legacy path only).
        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        """Compute the attention context.

        Per the inline shape comments, inputs are laid out as [s, b, np, hn] and
        the result as [sq, b, hp]. ``attention_mask`` may be None; in the legacy
        path True entries are the positions filled with ``-inf``.
        """
        # The version is re-parsed from torch.__version__ on every call.
        pytorch_major_version = int(torch.__version__.split('.')[0])
        if pytorch_major_version >= 2:
            # [s, b, np, hn] -> [b, np, s, hn]
            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
                # No explicit mask and equal query/key lengths: use the built-in causal mask.
                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
                                                                                 is_causal=True)
            else:
                if attention_mask is not None:
                    # The mask is inverted before being handed to scaled_dot_product_attention.
                    attention_mask = ~attention_mask
                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
                                                                                 attention_mask)
            # [b, np, sq, hn] -> [sq, b, np, hn] -> [sq, b, hp]
            context_layer = context_layer.permute(2, 0, 1, 3)
            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
            context_layer = context_layer.reshape(*new_context_layer_shape)
        else:
            # Raw attention scores

            # [b, np, sq, sk]
            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))

            # [sq, b, np, hn] -> [sq, b * np, hn]
            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
            # [sk, b, np, hn] -> [sk, b * np, hn]
            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)

            # preallocating input tensor: [b * np, sq, sk]
            # (its contents are ignored because baddbmm is called with beta=0.0)
            matmul_input_buffer = torch.empty(
                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
                device=query_layer.device
            )

            # Raw attention scores. [b * np, sq, sk]
            matmul_result = torch.baddbmm(
                matmul_input_buffer,
                query_layer.transpose(0, 1),  # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
                beta=0.0,
                alpha=(1.0 / self.norm_factor),
            )

            # change view to [b, np, sq, sk]
            attention_scores = matmul_result.view(*output_size)

            # ===========================
            # Attention probs and dropout
            # ===========================

            # attention scores and attention mask [b, np, sq, sk]
            if self.attention_softmax_in_fp32:
                attention_scores = attention_scores.float()
            if self.coeff is not None:
                attention_scores = attention_scores * self.coeff
            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
                # Build a default causal mask: True above the diagonal.
                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
                                            device=attention_scores.device, dtype=torch.bool)
                attention_mask.tril_()
                attention_mask = ~attention_mask
            if attention_mask is not None:
                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
            attention_probs = F.softmax(attention_scores, dim=-1)
            attention_probs = attention_probs.type_as(value_layer)

            # This is actually dropping out entire tokens to attend to, which might
            # seem a bit unusual, but is taken from the original Transformer paper.
            attention_probs = self.attention_dropout(attention_probs)
            # =========================
            # Context layer. [sq, b, hp]
            # =========================

            # value_layer -> context layer.
            # [sk, b, np, hn] --> [b, np, sq, hn]

            # context layer shape: [b, np, sq, hn]
            output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
            # change view [sk, b * np, hn]
            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
            # change view [b * np, sq, sk]
            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
            # matmul: [b * np, sq, hn]
            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
            # change view [b, np, sq, hn]
            context_layer = context_layer.view(*output_size)
            # [b, np, sq, hn] --> [sq, b, np, hn]
            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
            # [sq, b, np, hn] --> [sq, b, hp]
            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
            context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer
316
+
317
+
318
class SelfAttention(torch.nn.Module):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.

    A single linear layer produces query, key and value together. With
    ``multi_query_attention`` enabled, keys and values use
    ``multi_query_group_num`` heads, which are expanded to the full number of
    attention heads before the core attention is run.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(SelfAttention, self).__init__()
        self.layer_number = max(1, layer_number)

        self.projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        self.multi_query_attention = config.multi_query_attention
        # Default: query, key and value each have projection_size features.
        self.qkv_hidden_size = 3 * self.projection_size
        if self.multi_query_attention:
            # Multi-query: full-width query plus key/value for the groups only.
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
            )
        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
                                         bias=config.add_bias_linear or config.add_qkv_bias,
                                         device=device, **_config_to_kwargs(config)
                                         )

        self.core_attention = CoreAttention(config, self.layer_number)

        # Output.
        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
                               device=device, **_config_to_kwargs(config)
                               )

    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
        """Return an uninitialised [max_seq_len, batch, heads, head_dim] tensor.

        ``heads`` is the multi-query group count when multi-query attention is
        enabled, otherwise the full attention head count. Not called from the
        visible part of this file.
        """
        if self.multi_query_attention:
            num_attention_heads = self.num_multi_query_groups_per_partition
        else:
            num_attention_heads = self.num_attention_heads_per_partition
        return torch.empty(
            inference_max_sequence_len,
            batch_size,
            num_attention_heads,
            self.hidden_size_per_attention_head,
            dtype=dtype,
            device=device,
        )

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
    ):
        """Run self-attention and return ``(output, kv_cache)``.

        ``kv_cache`` is an optional ``(key, value)`` pair that is concatenated in
        front of the new keys/values along dim 0. The returned cache is the
        concatenated pair when ``use_cache`` is truthy, otherwise None.
        """
        # hidden_states: [sq, b, h]

        # =====================
        # Query, Key, and Value
        # =====================
        # (No key/value memory is pre-allocated here; the cache grows by
        # concatenation further below.)

        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
        mixed_x_layer = self.query_key_value(hidden_states)

        if self.multi_query_attention:
            # Split the fused projection into a full-width query and group-width key/value.
            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                [
                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            key_layer = key_layer.view(
                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.view(
                value_layer.size()[:-1]
                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
        else:
            new_tensor_shape = mixed_x_layer.size()[:-1] + \
                               (self.num_attention_heads_per_partition,
                                3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        # adjust key and value for inference
        if kv_cache is not None:
            cache_k, cache_v = kv_cache
            key_layer = torch.cat((cache_k, key_layer), dim=0)
            value_layer = torch.cat((cache_v, value_layer), dim=0)
        # The cache is captured before the multi-query expansion below, so it
        # holds the group-width keys/values.
        if use_cache:
            kv_cache = (key_layer, value_layer)
        else:
            kv_cache = None

        if self.multi_query_attention:
            # Repeat every key/value group so that each attention head has its own copy.
            key_layer = key_layer.unsqueeze(-2)
            key_layer = key_layer.expand(
                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
            )
            key_layer = key_layer.contiguous().view(
                key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.unsqueeze(-2)
            value_layer = value_layer.expand(
                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
            )
            value_layer = value_layer.contiguous().view(
                value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )

        # ==================================
        # core attention computation
        # ==================================

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # =================
        # Output. [sq, b, h]
        # =================

        output = self.dense(context_layer)

        return output, kv_cache
455
+
456
+
457
+ def _config_to_kwargs(args):
458
+ common_kwargs = {
459
+ "dtype": args.torch_dtype,
460
+ }
461
+ return common_kwargs
462
+
463
+
464
class MLP(torch.nn.Module):
    """Gated feed-forward block.

    The input is projected to ``2 * ffn_hidden_size`` features, the two halves
    are combined as ``silu(first) * second`` (SwiGLU, see
    https://arxiv.org/pdf/2002.05202.pdf), and the result is projected back to
    ``hidden_size``.
    """

    def __init__(self, config: ChatGLMConfig, device=None):
        super(MLP, self).__init__()

        self.add_bias = config.add_bias_linear
        # Both linear layers share bias, device and dtype settings.
        shared_kwargs = dict(bias=self.add_bias, device=device, **_config_to_kwargs(config))

        # Up-projection; the output width is doubled because swiglu consumes two halves.
        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.ffn_hidden_size * 2, **shared_kwargs)

        def swiglu(x):
            halves = torch.chunk(x, 2, dim=-1)
            return F.silu(halves[0]) * halves[1]

        self.activation_func = swiglu

        # Down-projection back to the hidden size.
        self.dense_4h_to_h = nn.Linear(config.ffn_hidden_size, config.hidden_size, **shared_kwargs)

    def forward(self, hidden_states):
        """Apply up-projection, gated activation and down-projection in sequence."""
        expanded = self.dense_h_to_4h(hidden_states)
        gated = self.activation_func(expanded)
        return self.dense_4h_to_h(gated)
508
+
509
+
510
class GLMBlock(torch.nn.Module):
    """One transformer layer: norm -> self-attention -> residual -> norm -> MLP -> residual.

    Takes hidden states of size [s, b, h] and returns an output of the same size
    together with the layer's key/value cache.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(GLMBlock, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
        self.fp32_residual_connection = config.fp32_residual_connection

        # RMSNorm or standard LayerNorm, as selected by the config.
        norm_cls = RMSNorm if config.rmsnorm else LayerNorm
        norm_kwargs = dict(eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype)

        # Norm applied to the block input.
        self.input_layernorm = norm_cls(config.hidden_size, **norm_kwargs)

        # Self attention.
        self.self_attention = SelfAttention(config, layer_number, device=device)
        self.hidden_dropout = config.hidden_dropout

        # Norm applied to the attention output (after the first residual).
        self.post_attention_layernorm = norm_cls(config.hidden_size, **norm_kwargs)

        # MLP
        self.mlp = MLP(config, device=device)

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
    ):
        """Run the layer and return ``(output, kv_cache)``.

        When ``apply_residual_connection_post_layernorm`` is set, each residual
        branch starts from the normalised tensor instead of the un-normalised one.
        """
        # hidden_states: [s, b, h]
        residual_after_norm = self.apply_residual_connection_post_layernorm
        dropout = torch.nn.functional.dropout

        # --- attention sub-layer ---
        normed_input = self.input_layernorm(hidden_states)
        attention_output, kv_cache = self.self_attention(
            normed_input,
            attention_mask,
            rotary_pos_emb,
            kv_cache=kv_cache,
            use_cache=use_cache,
        )
        first_residual = normed_input if residual_after_norm else hidden_states
        attention_sum = first_residual + dropout(attention_output, p=self.hidden_dropout, training=self.training)

        # --- MLP sub-layer ---
        normed_attention = self.post_attention_layernorm(attention_sum)
        mlp_output = self.mlp(normed_attention)
        second_residual = normed_attention if residual_after_norm else attention_sum
        output = second_residual + dropout(mlp_output, p=self.hidden_dropout, training=self.training)

        return output, kv_cache
582
+
583
+
584
class GLMTransformer(torch.nn.Module):
    """Stack of ``config.num_layers`` GLMBlock layers with an optional final norm."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(GLMTransformer, self).__init__()

        self.fp32_residual_connection = config.fp32_residual_connection
        self.post_layer_norm = config.post_layer_norm

        # Number of layers.
        self.num_layers = config.num_layers

        # Transformer layers.
        def build_layer(layer_number):
            return GLMBlock(config, layer_number, device=device)

        # Layer numbers passed to the blocks are 1-based.
        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_layer_norm:
            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
            # Final layer norm before output.
            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                 dtype=config.torch_dtype)

        self.gradient_checkpointing = False

    def _get_layer(self, layer_number):
        """Return the layer at 0-based index ``layer_number``."""
        return self.layers[layer_number]

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
            use_cache: Optional[bool] = True,
            output_hidden_states: Optional[bool] = False,
    ):
        """Run all layers.

        Returns ``(hidden_states, presents, all_hidden_states, all_self_attentions)``:
        ``presents`` is a tuple with one key/value cache per layer when caching is
        active and None otherwise; ``all_hidden_states`` is a tuple of the input
        to every layer plus the last layer's output when ``output_hidden_states``
        is truthy, otherwise None; ``all_self_attentions`` is always None.
        """
        if not kv_caches:
            kv_caches = [None for _ in range(self.num_layers)]
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        # Decide on `presents` only after `use_cache` may have been overridden
        # above, so a disabled cache always yields None rather than an empty tuple.
        presents = () if use_cache else None

        all_self_attentions = None
        all_hidden_states = () if output_hidden_states else None
        for index in range(self.num_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer = self._get_layer(index)
            if self.gradient_checkpointing and self.training:
                # Arguments are passed positionally in the order of GLMBlock.forward.
                layer_ret = torch.utils.checkpoint.checkpoint(
                    layer,
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_caches[index],
                    use_cache
                )
            else:
                layer_ret = layer(
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_cache=kv_caches[index],
                    use_cache=use_cache
                )
            hidden_states, kv_cache = layer_ret
            if use_cache:
                presents = presents + (kv_cache,)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Final layer norm.
        if self.post_layer_norm:
            hidden_states = self.final_layernorm(hidden_states)

        return hidden_states, presents, all_hidden_states, all_self_attentions
664
+
665
+
666
class ChatGLMPreTrainedModel(PreTrainedModel):
    """
    Abstract base for the ChatGLM models: holds the class-level settings read by
    ``PreTrainedModel`` plus helpers that build attention masks and position ids.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLMConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]

    def _init_weights(self, module: nn.Module):
        """Intentionally does nothing; no weights are initialised here."""
        return

    def get_masks(self, input_ids, past_key_values, padding_mask=None):
        """Build a boolean mask of shape [batch, 1, seq, past + seq].

        True marks positions whose float mask value fell below 0.5, i.e. positions
        removed by the causal (lower-triangular) pattern or by ``padding_mask``.
        """
        batch_size, seq_length = input_ids.shape
        device = input_ids.device

        # Lower-triangular float mask over the new tokens.
        visible = torch.ones(batch_size, seq_length, seq_length, device=device)
        visible.tril_()

        # The cached length is read from dim 0 of the first layer's first cache tensor.
        past_length = past_key_values[0][0].shape[0] if past_key_values else 0
        if past_length:
            past_visible = torch.ones(batch_size, seq_length, past_length, device=device)
            visible = torch.cat((past_visible, visible), dim=-1)

        if padding_mask is not None:
            visible = visible * padding_mask.unsqueeze(1)
            if not past_length:
                # Adds 1 to rows where padding_mask is 0 (subtracting mask - 1).
                visible -= padding_mask.unsqueeze(-1) - 1

        blocked = (visible < 0.5).bool()
        return blocked.unsqueeze(1)

    def get_position_ids(self, input_ids, device):
        """Return ``0 .. seq_length - 1`` repeated for every batch row."""
        batch_size, seq_length = input_ids.shape
        single_row = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0)
        return single_row.repeat(batch_size, 1)

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the ``gradient_checkpointing`` flag on ``GLMTransformer`` modules only."""
        if not isinstance(module, GLMTransformer):
            return
        module.gradient_checkpointing = value
708
+
709
+
710
class Embedding(torch.nn.Module):
    """Word-embedding lookup that returns sequence-first activations."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(Embedding, self).__init__()

        self.hidden_size = config.hidden_size
        # Word embeddings (parallel).
        embedding_kwargs = dict(dtype=config.torch_dtype, device=device)
        self.word_embeddings = nn.Embedding(config.padded_vocab_size, self.hidden_size, **embedding_kwargs)
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids):
        """Look up embeddings and swap the first two axes ([b s h] --> [s b h])."""
        looked_up = self.word_embeddings(input_ids)
        # Swapping the axes here avoids explicit transposes later.
        sequence_first = looked_up.transpose(0, 1).contiguous()
        # With fp32 residual connections enabled the activations are upcast to float.
        if self.fp32_residual_connection:
            return sequence_first.float()
        return sequence_first
736
+
737
+
738
class ChatGLMModel(ChatGLMPreTrainedModel):
    """Token embedding, rotary position table, GLM transformer stack and output projection.

    When ``config.pre_seq_len`` is set, all parameters created up to that point are
    frozen and a ``PrefixEncoder`` (created after the freeze) provides per-layer
    key/value prefixes.
    """

    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
        super().__init__(config)
        # ``skip_init`` builds sub-modules without running their initialisers.
        init_method = skip_init if empty_init else default_init
        init_kwargs = {} if device is None else {"device": device}

        self.embedding = init_method(Embedding, config, **init_kwargs)
        self.num_layers = config.num_layers
        self.multi_query_group_num = config.multi_query_group_num
        self.kv_channels = config.kv_channels

        # Rotary positional embeddings: fall back to the per-head width when
        # ``kv_channels`` is not configured.
        self.seq_length = config.seq_length
        if config.kv_channels is None:
            rotary_dim = config.hidden_size // config.num_attention_heads
        else:
            rotary_dim = config.kv_channels

        self.rotary_pos_emb = RotaryEmbedding(
            rotary_dim // 2,
            original_impl=config.original_rope,
            device=device,
            dtype=config.torch_dtype,
        )
        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
        self.output_layer = init_method(
            nn.Linear,
            config.hidden_size,
            config.padded_vocab_size,
            bias=False,
            dtype=config.torch_dtype,
            **init_kwargs,
        )

        self.pre_seq_len = config.pre_seq_len
        self.prefix_projection = config.prefix_projection
        if self.pre_seq_len is not None:
            # Freeze everything registered so far; the prefix encoder below is
            # added afterwards and is therefore not touched by this loop.
            for weight in self.parameters():
                weight.requires_grad = False
            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
            self.prefix_encoder = PrefixEncoder(config)
            self.dropout = torch.nn.Dropout(0.1)

    def get_input_embeddings(self):
        """Return the word-embedding module held by ``self.embedding``."""
        return self.embedding.word_embeddings

    def get_prompt(self, batch_size, device, dtype=torch.half):
        """Build the prefix key/value tensors from the prefix encoder.

        The encoder output is viewed as
        ``(batch, pre_seq_len, num_layers * 2, multi_query_group_num, kv_channels)``,
        passed through dropout, permuted with ``[2, 1, 0, 3, 4]`` and split into
        chunks of two along the first axis.
        """
        token_grid = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
        prefix = self.prefix_encoder(token_grid).type(dtype)
        prefix = prefix.view(
            batch_size,
            self.pre_seq_len,
            self.num_layers * 2,
            self.multi_query_group_num,
            self.kv_channels,
        )
        # seq_len, b, nh, hidden_size
        prefix = self.dropout(prefix)
        return prefix.permute([2, 1, 0, 3, 4]).split(2)

    def forward(
            self,
            input_ids,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.BoolTensor] = None,
            full_attention_mask: Optional[torch.BoolTensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        """Embed the tokens, build mask and rotary table, and run the encoder.

        ``input_ids`` must be two-dimensional (its shape is unpacked into batch
        size and sequence length). Unset flags fall back to the model config.
        Returns a ``BaseModelOutputWithPast`` or, when ``return_dict`` is false,
        a tuple of the non-``None`` encoder outputs.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        batch_size, seq_length = input_ids.shape

        if inputs_embeds is None:
            inputs_embeds = self.embedding(input_ids)

        if self.pre_seq_len is not None:
            if past_key_values is None:
                past_key_values = self.get_prompt(
                    batch_size=batch_size, device=input_ids.device, dtype=inputs_embeds.dtype
                )
            if attention_mask is not None:
                # The prefix positions are always attended to.
                prefix_mask = attention_mask.new_ones((batch_size, self.pre_seq_len))
                attention_mask = torch.cat([prefix_mask, attention_mask], dim=-1)

        if full_attention_mask is None:
            has_padding = attention_mask is not None and not attention_mask.all()
            if has_padding or (past_key_values and seq_length != 1):
                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)

        # Rotary positional embeddings
        rope_table = self.rotary_pos_emb(self.seq_length)
        if position_ids is None:
            rope_table = rope_table[None, :seq_length]
        else:
            rope_table = rope_table[position_ids]
        rope_table = rope_table.transpose(0, 1).contiguous()

        # Run encoder.
        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
            inputs_embeds,
            full_attention_mask,
            rotary_pos_emb=rope_table,
            kv_caches=past_key_values,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
        )

        if return_dict:
            return BaseModelOutputWithPast(
                last_hidden_state=hidden_states,
                past_key_values=presents,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
            )
        candidates = [hidden_states, presents, all_hidden_states, all_self_attentions]
        return tuple(item for item in candidates if item is not None)

    def quantize(self, weight_bit_width: int):
        """Quantize the encoder with the package-local ``quantization`` helper and return ``self``."""
        from .quantization import quantize
        quantize(self.encoder, weight_bit_width)
        return self
854
+
855
+
856
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
    """ChatGLM transformer with language-model logits plus chat/streaming helpers."""

    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
        super().__init__(config)

        self.max_sequence_length = config.max_length
        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
        self.config = config
        self.quantized = False

        if self.config.quantization_bit:
            self.quantize(self.config.quantization_bit, empty_init=True)

    def _update_model_kwargs_for_generation(
            self,
            outputs: ModelOutput,
            model_kwargs: Dict[str, Any],
            is_encoder_decoder: bool = False,
            standardize_cache_format: bool = False,
    ) -> Dict[str, Any]:
        """Carry cache, attention mask and position ids forward by one generated token."""
        # update past_key_values
        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
            outputs, standardize_cache_format=standardize_cache_format
        )

        # update attention mask: the newly generated token is always visible
        if "attention_mask" in model_kwargs:
            attention_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = torch.cat(
                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
            )

        # update position ids: append (last position + 1)
        if "position_ids" in model_kwargs:
            position_ids = model_kwargs["position_ids"]
            new_position_id = position_ids[..., -1:].clone()
            new_position_id += 1
            model_kwargs["position_ids"] = torch.cat(
                [position_ids, new_position_id], dim=-1
            )

        model_kwargs["is_first_forward"] = False
        return model_kwargs

    def prepare_inputs_for_generation(
            self,
            input_ids: torch.LongTensor,
            past_key_values: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            is_first_forward: bool = True,
            **kwargs
    ) -> dict:
        """Assemble the keyword arguments for one forward call during generation.

        After the first step, and only when a cache is present, just the last
        token and its position id are fed to the model.
        """
        # only last token for input_ids if past is not None
        if position_ids is None:
            position_ids = self.get_position_ids(input_ids, device=input_ids.device)
        if not is_first_forward and past_key_values is not None:
            position_ids = position_ids[..., -1:]
            input_ids = input_ids[:, -1:]
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "return_last_logit": True,
            "use_cache": use_cache
        }

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
    ):
        """Run the transformer and project hidden states to vocabulary logits.

        When ``labels`` is given, a shifted cross-entropy loss (ignoring label
        ``-100``) is computed in float32 and cast back to the hidden-state dtype.
        ``return_last_logit`` restricts the projection to the final position.
        ``output_attentions`` is accepted but not forwarded to the transformer.
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        if return_last_logit:
            hidden_states = hidden_states[-1:]
        lm_logits = self.transformer.output_layer(hidden_states)
        # Swap the first two axes back before returning logits.
        lm_logits = lm_logits.transpose(0, 1).contiguous()

        loss = None
        if labels is not None:
            lm_logits = lm_logits.to(torch.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
            past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        """
        return tuple(
            (
                layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
                layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
            )
            for layer_past in past
        )

    def process_response(self, output, history):
        """Split decoded text into assistant turns and append them to a copy of ``history``.

        Each ``<|assistant|>``-separated segment is read as ``metadata\\ncontent``.
        A segment without a newline is treated as plain content with empty
        metadata (previously this raised ``ValueError`` on unpacking).

        Returns:
            ``(content, history)`` where ``content`` is the last segment's text,
            or a dict describing a tool call when metadata is present.
        """
        content = ""
        history = deepcopy(history)
        for response in output.split("<|assistant|>"):
            metadata, separator, content = response.partition("\n")
            if not separator:
                # No metadata line at all: the whole segment is the reply text.
                metadata, content = "", response
            if not metadata.strip():
                content = content.strip()
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                content = content.replace("[[训练时间]]", "2023年")
            else:
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                if history[0]["role"] == "system" and "tools" in history[0]:
                    # Drop the first and last lines around the call expression.
                    content = "\n".join(content.split("\n")[1:-1])

                    def tool_call(**kwargs):
                        return kwargs

                    # SECURITY NOTE(review): this evaluates model-generated text as
                    # Python. `tool_call` above is the name the expression is expected
                    # to invoke; anything else in the text is executed as well.
                    # Consider a restricted parser before using with untrusted prompts.
                    parameters = eval(content)
                    content = {"name": metadata.strip(), "parameters": parameters}
                else:
                    content = {"name": metadata.strip(), "content": content}
        return content, history

    @torch.inference_mode()
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
             max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
             **kwargs):
        """Generate one complete reply to ``query`` and return ``(response, history)``.

        Generation stops at the tokenizer's EOS id or at the ``<|user|>`` /
        ``<|observation|>`` command tokens. The user turn is appended to the
        ``history`` list that was passed in.
        """
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        inputs = tokenizer.build_chat_input(query, history=history, role=role)
        inputs = inputs.to(self.device)
        eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
                        tokenizer.get_command("<|observation|>")]
        outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
        # Keep only the newly generated ids, minus the final (stop) token.
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
        response = tokenizer.decode(outputs)
        history.append({"role": role, "content": query})
        response, history = self.process_response(response, history)
        return response, history

    @torch.inference_mode()
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
                    past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
                    logits_processor=None, return_past_key_values=False, **kwargs):
        """Stream a reply to ``query``, yielding after each generated token.

        Yields ``(response, history)`` or, with ``return_past_key_values``,
        ``(response, history, past_key_values)``. Partial outputs whose decoded
        text is empty or ends in the Unicode replacement character are skipped.
        """
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
                        tokenizer.get_command("<|observation|>")]
        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        if past_key_values is None:
            inputs = tokenizer.build_chat_input(query, history=history, role=role)
        else:
            # The cache already covers earlier turns, so only the new query is encoded.
            inputs = tokenizer.build_chat_input(query, role=role)
        inputs = inputs.to(self.device)
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[0]
            if self.transformer.pre_seq_len is not None:
                past_length -= self.transformer.pre_seq_len
            # Shift positions and extend the mask to account for the cached tokens.
            inputs.position_ids += past_length
            attention_mask = inputs.attention_mask
            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
            inputs['attention_mask'] = attention_mask
        history.append({"role": role, "content": query})
        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
                                            eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
                                            **gen_kwargs):
            if return_past_key_values:
                outputs, past_key_values = outputs
            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
            response = tokenizer.decode(outputs)
            if response and response[-1] != "�":
                response, new_history = self.process_response(response, history)
                if return_past_key_values:
                    yield response, new_history, past_key_values
                else:
                    yield response, new_history

    @torch.inference_mode()
    def stream_generate(
            self,
            input_ids,
            generation_config: Optional[GenerationConfig] = None,
            logits_processor: Optional[LogitsProcessorList] = None,
            stopping_criteria: Optional[StoppingCriteriaList] = None,
            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
            return_past_key_values=False,
            **kwargs,
    ):
        """Token-by-token generation loop that yields the growing ``input_ids``.

        Yields ``input_ids`` (or ``(input_ids, past_key_values)`` when
        ``return_past_key_values`` is true) after every step, and stops once
        every sequence has produced an EOS id or the stopping criteria fire.
        """
        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]

        if generation_config is None:
            generation_config = self.generation_config
        # Work on a copy so per-call overrides do not leak into the model's config.
        generation_config = copy.deepcopy(generation_config)
        model_kwargs = generation_config.update(**kwargs)
        model_kwargs["use_cache"] = generation_config.use_cache
        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id

        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None

        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        if has_default_max_length and generation_config.max_new_tokens is None:
            warnings.warn(
                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
                " recommend using `max_new_tokens` to control the maximum length of the generation.",
                UserWarning,
            )
        elif generation_config.max_new_tokens is not None:
            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
            if not has_default_max_length:
                # `logger.warning` takes %-format arguments, not a warning category;
                # passing `UserWarning` here used to break message formatting.
                logger.warning(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                )

        if input_ids_seq_length >= generation_config.max_length:
            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
            logger.warning(
                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing `max_new_tokens`."
            )

        # 2. Set generation parameters if not already defined
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

        logits_processor = self._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_seq_length,
            encoder_input_ids=input_ids,
            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
            logits_processor=logits_processor,
        )

        stopping_criteria = self._get_stopping_criteria(
            generation_config=generation_config, stopping_criteria=stopping_criteria
        )
        logits_warper = self._get_logits_warper(generation_config)

        # One flag per sequence: 1 while still generating, 0 once an EOS id was produced.
        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
        scores = None
        while True:
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            # forward pass to get next token
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
            )

            next_token_logits = outputs.logits[:, -1, :]

            # pre-process distribution
            next_token_scores = logits_processor(input_ids, next_token_logits)
            next_token_scores = logits_warper(input_ids, next_token_scores)

            # sample
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            if generation_config.do_sample:
                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
            else:
                next_tokens = torch.argmax(probs, dim=-1)
            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )
            if eos_token_id_tensor is not None:
                # A sequence stays unfinished only if its new token differs from every EOS id.
                unfinished_sequences = unfinished_sequences.mul(
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )
            if return_past_key_values:
                yield input_ids, outputs.past_key_values
            else:
                yield input_ids
            # stop when each sentence is finished, or if we exceed the maximum length
            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                break

    def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
        """Quantize the transformer encoder to ``bits`` bits, once.

        Returns ``None`` when ``bits == 0`` (nothing is done); otherwise returns
        ``self``, including when the model was already quantized.
        """
        if bits == 0:
            return

        from .quantization import quantize

        if self.quantized:
            logger.info("Already quantized.")
            return self

        self.quantized = True

        self.config.quantization_bit = bits

        self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
                                            **kwargs)
        return self
1213
+
1214
+
1215
class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
    """ChatGLM transformer with a linear classification head on the last position."""

    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)

        self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
        # Dropout is optional and only created when the config asks for it.
        self.dropout = nn.Dropout(config.classifier_dropout) if config.classifier_dropout is not None else None
        self.config = config

        # NOTE(review): `quantize` is not defined on this class in the visible
        # code; presumably it is expected from a base class - confirm.
        if self.config.quantization_bit:
            self.quantize(self.config.quantization_bit, empty_init=True)

    def forward(
            self,
            input_ids: Optional[torch.LongTensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            full_attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
        """Classify from the final element of the transformer's first output.

        When ``labels`` is given, the loss follows ``config.problem_type``; if
        that is unset it is inferred from ``num_labels`` and the label dtype and
        stored back on the config.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            full_attention_mask=full_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pool by taking the last entry along the leading axis.
        pooled = transformer_outputs[0][-1]
        if self.dropout is not None:
            pooled = self.dropout(pooled)
        logits = self.classifier_head(pooled)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze().float(), labels.squeeze())
                else:
                    loss = criterion(logits.float(), labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels).float(), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits.float(), labels.view(-1, self.num_labels))

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output
build/lib/kolors/models/tokenization_chatglm.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import List, Optional, Union, Dict
5
+ from sentencepiece import SentencePieceProcessor
6
+ from transformers import PreTrainedTokenizer
7
+ from transformers.utils import logging, PaddingStrategy
8
+ from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
9
+
10
+
11
class SPTokenizer:
    """SentencePiece model wrapper extended with ChatGLM's extra special tokens.

    The extra tokens (mask markers, ``sop``/``eop`` and the four role markers)
    are assigned consecutive ids directly after the SentencePiece vocabulary.
    """

    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        # The unknown-token id is reused as the padding id.
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            # Each special token takes the next free id; n_words grows with it.
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1
        # Alternation regex matching any role marker literally.
        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])

    def tokenize(self, s: str, encode_special_tokens=False):
        """Split ``s`` into pieces.

        With ``encode_special_tokens`` set, role markers found in ``s`` are kept
        as single tokens and only the text between them goes through
        SentencePiece; otherwise the whole string does.
        """
        if not encode_special_tokens:
            return self.sp_model.EncodeAsPieces(s)

        last_index = 0
        pieces = []
        for match in re.finditer(self.role_special_token_expression, s):
            if last_index < match.start():
                pieces.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
            pieces.append(s[match.start():match.end()])
            last_index = match.end()
        if last_index < len(s):
            pieces.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
        return pieces

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        """Encode ``s`` to ids, optionally wrapping with the BOS and/or EOS id."""
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode ids to text, emitting the extra special tokens verbatim.

        Runs of ordinary ids are decoded together by SentencePiece; each special
        id flushes the pending run and is replaced by its token string.
        """
        text, buffer = "", []
        for token in t:
            if token in self.index_special_tokens:
                if buffer:
                    text += self.sp_model.decode(buffer)
                    buffer = []
                text += self.index_special_tokens[token]
            else:
                buffer.append(token)
        if buffer:
            text += self.sp_model.decode(buffer)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        """Join SentencePiece pieces back into a string."""
        return self.sp_model.DecodePieces(tokens)

    def convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab.

        Returns ``""`` for BOS/EOS/pad ids, negative ids, and ids that lie
        beyond the SentencePiece vocabulary without being one of the extra
        special tokens (previously such ids were passed on to ``IdToPiece``).
        """
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        if index in (self.eos_id, self.bos_id, self.pad_id) or index < 0:
            return ""
        if index >= self.sp_model.vocab_size():
            # Out-of-vocabulary id, e.g. from a model whose output layer is
            # wider than the tokenizer vocabulary.
            return ""
        return self.sp_model.IdToPiece(index)
89
+
90
+
91
class ChatGLMTokenizer(PreTrainedTokenizer):
    """`PreTrainedTokenizer` front-end for ChatGLM built on `SPTokenizer`.

    Only left padding is implemented (see `_pad`). Model inputs consist of
    `input_ids`, `attention_mask` and `position_ids`.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
                 **kwargs):
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        # Command names that map onto ids of the underlying SentencePiece model.
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<pad>": self.tokenizer.pad_id
        }
        self.encode_special_tokens = encode_special_tokens
        # The attributes above are set before the base initialiser runs.
        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                         encode_special_tokens=encode_special_tokens,
                         **kwargs)

    def get_command(self, token):
        """Return the id of a command token such as `<eos>`, `[gMASK]` or `<|user|>`."""
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return "<unk>"

    @property
    def pad_token(self) -> str:
        # Same string as `unk_token`: the pad id is the SentencePiece unk id.
        return "<unk>"

    @property
    def pad_token_id(self):
        return self.get_command("<pad>")

    @property
    def eos_token(self) -> str:
        return "</s>"

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary. If it is not an existing
                directory, it is used as the output file path itself.
            filename_prefix (`str`, *optional*):
                Accepted for interface compatibility; not used by this implementation.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        # The SentencePiece model file is copied byte for byte.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        """Return the ids of `[gMASK]` and `sop`, which start every model input."""
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message):
        """Encode one chat turn as: role marker id, `metadata` plus newline, then `message`."""
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        """Encode `history` followed by `query` and a trailing `<|assistant|>` marker.

        For a system turn that carries a `tools` entry, the tools are appended to
        its content as indented JSON. Returns the result of `batch_encode_plus`
        with PyTorch tensors.
        """
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the prefix tokens
        (and, for pairs, a trailing `<eos>`). The resulting layout is:

        - single sequence: `[gMASK] sop X`
        - pair of sequences: `[gMASK] sop A B <eos>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
            padding_side: Optional[str] = None,
    ) -> dict:
        """
        Pad encoded inputs on the left, up to a predefined length or the max length in the batch.

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                Accepted for interface compatibility; an attention mask is always present in the result.
            padding_side:
                (optional) Side to pad on, as passed by newer `transformers` releases. Falls back to
                `self.padding_side` when omitted. Only `"left"` is supported.
        """
        # Only left padding is implemented; reject anything else early.
        effective_side = padding_side if padding_side is not None else self.padding_side
        assert effective_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round max_length up to the next multiple when requested.
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        # Position ids default to 0..seq_length-1.
        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            # Prepend padding: mask 0, position 0, and the pad token id.
            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
build/lib/kolors/models/unet_2d_condition.py ADDED
@@ -0,0 +1,1318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
23
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
24
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
25
+ from diffusers.models.activations import get_activation
26
+ from diffusers.models.attention_processor import (
27
+ ADDED_KV_ATTENTION_PROCESSORS,
28
+ CROSS_ATTENTION_PROCESSORS,
29
+ Attention,
30
+ AttentionProcessor,
31
+ AttnAddedKVProcessor,
32
+ AttnProcessor,
33
+ )
34
+ from diffusers.models.embeddings import (
35
+ GaussianFourierProjection,
36
+ GLIGENTextBoundingboxProjection,
37
+ ImageHintTimeEmbedding,
38
+ ImageProjection,
39
+ ImageTimeEmbedding,
40
+ TextImageProjection,
41
+ TextImageTimeEmbedding,
42
+ TextTimeEmbedding,
43
+ TimestepEmbedding,
44
+ Timesteps,
45
+ )
46
+ from diffusers.models.modeling_utils import ModelMixin
47
+
48
# The block factory helpers are imported from whichever module path the installed
# diffusers package provides: first `diffusers.models.unet_2d_blocks`, then
# `diffusers.models.unets.unet_2d_blocks`.
try:
    from diffusers.models.unet_2d_blocks import (
        get_down_block,
        get_mid_block,
        get_up_block,
    )
except ImportError:
    # Only a failed import should trigger the fallback; a bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and mask unrelated errors.
    from diffusers.models.unets.unet_2d_blocks import (
        get_down_block,
        get_mid_block,
        get_up_block,
    )
60
+
61
+
62
+
63
# Module-level logger, created through the `logging` helper imported from `diffusers.utils`.
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
64
+
65
+
66
@dataclass
class UNet2DConditionOutput(BaseOutput):
    """
    The output of [`UNet2DConditionModel`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
            The field is declared with a default of `None`.
    """

    # Single output field; defaults to `None` so the container can be created without a tensor.
    sample: torch.Tensor = None
77
+
78
+
79
+ class UNet2DConditionModel(
80
+ ModelMixin, ConfigMixin, FromOriginalModelMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin
81
+ ):
82
+ r"""
83
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
84
+ shaped output.
85
+
86
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
87
+ for all models (such as downloading or saving).
88
+
89
+ Parameters:
90
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
91
+ Height and width of input/output sample.
92
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
93
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
94
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
95
+ flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
96
+ Whether to flip the sin to cos in the time embedding.
97
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
98
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
99
+ The tuple of downsample blocks to use.
100
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
101
+ Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
102
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
103
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
104
+ The tuple of upsample blocks to use.
105
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
106
+ Whether to include self-attention in the basic transformer blocks, see
107
+ [`~models.attention.BasicTransformerBlock`].
108
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
109
+ The tuple of output channels for each block.
110
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
111
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
112
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
113
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
114
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
115
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
116
+ If `None`, normalization and activation layers are skipped in post-processing.
117
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
118
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
119
+ The dimension of the cross attention features.
120
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
121
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
122
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
123
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
124
+ reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
125
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
126
+ blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
127
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
128
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
129
+ encoder_hid_dim (`int`, *optional*, defaults to None):
130
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
131
+ dimension to `cross_attention_dim`.
132
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
133
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
134
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
135
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
136
+ num_attention_heads (`int`, *optional*):
137
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
138
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
139
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
140
+ class_embed_type (`str`, *optional*, defaults to `None`):
141
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
142
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
143
+ addition_embed_type (`str`, *optional*, defaults to `None`):
144
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
145
+ "text". "text" will use the `TextTimeEmbedding` layer.
146
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
147
+ Dimension for the timestep embeddings.
148
+ num_class_embeds (`int`, *optional*, defaults to `None`):
149
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
150
+ class conditioning with `class_embed_type` equal to `None`.
151
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
152
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
153
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
154
+ An optional override for the dimension of the projected time embedding.
155
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
156
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
157
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
158
+ timestep_post_act (`str`, *optional*, defaults to `None`):
159
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
160
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
161
+ The dimension of `cond_proj` layer in the timestep embedding.
162
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
163
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
164
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
165
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
166
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
167
+ embeddings with the class embeddings.
168
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
169
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
170
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
171
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
172
+ otherwise.
173
+ """
174
+
175
+ _supports_gradient_checkpointing = True
176
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"]
177
+
178
@register_to_config
def __init__(
    self,
    sample_size: Optional[int] = None,
    in_channels: int = 4,
    out_channels: int = 4,
    center_input_sample: bool = False,
    flip_sin_to_cos: bool = True,
    freq_shift: int = 0,
    down_block_types: Tuple[str] = (
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "DownBlock2D",
    ),
    mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
    up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
    only_cross_attention: Union[bool, Tuple[bool]] = False,
    block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
    layers_per_block: Union[int, Tuple[int]] = 2,
    downsample_padding: int = 1,
    mid_block_scale_factor: float = 1,
    dropout: float = 0.0,
    act_fn: str = "silu",
    norm_num_groups: Optional[int] = 32,
    norm_eps: float = 1e-5,
    cross_attention_dim: Union[int, Tuple[int]] = 1280,
    transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
    reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
    encoder_hid_dim: Optional[int] = None,
    encoder_hid_dim_type: Optional[str] = None,
    attention_head_dim: Union[int, Tuple[int]] = 8,
    num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
    dual_cross_attention: bool = False,
    use_linear_projection: bool = False,
    class_embed_type: Optional[str] = None,
    addition_embed_type: Optional[str] = None,
    addition_time_embed_dim: Optional[int] = None,
    num_class_embeds: Optional[int] = None,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    resnet_skip_time_act: bool = False,
    resnet_out_scale_factor: float = 1.0,
    time_embedding_type: str = "positional",
    time_embedding_dim: Optional[int] = None,
    time_embedding_act_fn: Optional[str] = None,
    timestep_post_act: Optional[str] = None,
    time_cond_proj_dim: Optional[int] = None,
    conv_in_kernel: int = 3,
    conv_out_kernel: int = 3,
    projection_class_embeddings_input_dim: Optional[int] = None,
    attention_type: str = "default",
    class_embeddings_concat: bool = False,
    mid_block_only_cross_attention: Optional[bool] = None,
    cross_attention_norm: Optional[str] = None,
    addition_embed_type_num_heads: int = 64,
):
    """
    Build the UNet from its configuration.

    The constructor validates the block configuration, then creates, in this order: the input
    convolution, the timestep projection and embedding, the optional encoder-hidden-state
    projection, the optional class and additional embeddings, the down blocks, the mid block,
    the up blocks, and the output normalization/activation/convolution. The individual
    arguments are described in the class docstring.

    Raises:
        ValueError: If `num_attention_heads` is passed (any non-`None` value is rejected below),
            or if one of the helper validators/setters called here rejects the configuration.
    """
    super().__init__()

    self.sample_size = sample_size

    # Any explicit `num_attention_heads` is rejected, so the fallback two statements below
    # always resolves to `attention_head_dim`.
    if num_attention_heads is not None:
        raise ValueError(
            "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
        )

    # If `num_attention_heads` is not defined (which is the case for most models)
    # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
    # The reason for this behavior is to correct for incorrectly named variables that were introduced
    # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
    # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
    # which is why we correct for the naming here.
    num_attention_heads = num_attention_heads or attention_head_dim

    # Check inputs
    # (raises ValueError when per-block settings do not match the number of down blocks)
    self._check_config(
        down_block_types=down_block_types,
        up_block_types=up_block_types,
        only_cross_attention=only_cross_attention,
        block_out_channels=block_out_channels,
        layers_per_block=layers_per_block,
        cross_attention_dim=cross_attention_dim,
        transformer_layers_per_block=transformer_layers_per_block,
        reverse_transformer_layers_per_block=reverse_transformer_layers_per_block,
        attention_head_dim=attention_head_dim,
        num_attention_heads=num_attention_heads,
    )

    # input
    # Padding of (kernel - 1) // 2 is derived from the kernel size.
    conv_in_padding = (conv_in_kernel - 1) // 2
    self.conv_in = nn.Conv2d(
        in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
    )

    # time
    # `_set_time_proj` assigns `self.time_proj` and returns the embedding widths used below.
    time_embed_dim, timestep_input_dim = self._set_time_proj(
        time_embedding_type,
        block_out_channels=block_out_channels,
        flip_sin_to_cos=flip_sin_to_cos,
        freq_shift=freq_shift,
        time_embedding_dim=time_embedding_dim,
    )

    self.time_embedding = TimestepEmbedding(
        timestep_input_dim,
        time_embed_dim,
        act_fn=act_fn,
        post_act_fn=timestep_post_act,
        cond_proj_dim=time_cond_proj_dim,
    )

    # Assigns `self.encoder_hid_proj` (possibly `None`).
    self._set_encoder_hid_proj(
        encoder_hid_dim_type,
        cross_attention_dim=cross_attention_dim,
        encoder_hid_dim=encoder_hid_dim,
    )

    # class embedding
    # Assigns `self.class_embedding` (possibly `None`).
    self._set_class_embedding(
        class_embed_type,
        act_fn=act_fn,
        num_class_embeds=num_class_embeds,
        projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
        time_embed_dim=time_embed_dim,
        timestep_input_dim=timestep_input_dim,
    )

    self._set_add_embedding(
        addition_embed_type,
        addition_embed_type_num_heads=addition_embed_type_num_heads,
        addition_time_embed_dim=addition_time_embed_dim,
        cross_attention_dim=cross_attention_dim,
        encoder_hid_dim=encoder_hid_dim,
        flip_sin_to_cos=flip_sin_to_cos,
        freq_shift=freq_shift,
        projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
        time_embed_dim=time_embed_dim,
    )

    if time_embedding_act_fn is None:
        self.time_embed_act = None
    else:
        self.time_embed_act = get_activation(time_embedding_act_fn)

    self.down_blocks = nn.ModuleList([])
    self.up_blocks = nn.ModuleList([])

    # Broadcast scalar settings to one entry per down block. A boolean
    # `only_cross_attention` also supplies the mid-block default when none was given.
    if isinstance(only_cross_attention, bool):
        if mid_block_only_cross_attention is None:
            mid_block_only_cross_attention = only_cross_attention

        only_cross_attention = [only_cross_attention] * len(down_block_types)

    if mid_block_only_cross_attention is None:
        mid_block_only_cross_attention = False

    if isinstance(num_attention_heads, int):
        num_attention_heads = (num_attention_heads,) * len(down_block_types)

    if isinstance(attention_head_dim, int):
        attention_head_dim = (attention_head_dim,) * len(down_block_types)

    if isinstance(cross_attention_dim, int):
        cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

    if isinstance(layers_per_block, int):
        layers_per_block = [layers_per_block] * len(down_block_types)

    if isinstance(transformer_layers_per_block, int):
        transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

    if class_embeddings_concat:
        # The time embeddings are concatenated with the class embeddings. The dimension of the
        # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
        # regular time embeddings
        blocks_time_embed_dim = time_embed_dim * 2
    else:
        blocks_time_embed_dim = time_embed_dim

    # down
    output_channel = block_out_channels[0]
    for i, down_block_type in enumerate(down_block_types):
        # Each block consumes the previous block's output channels.
        input_channel = output_channel
        output_channel = block_out_channels[i]
        # The last down block is created without a downsampler (`add_downsample=not is_final_block`).
        is_final_block = i == len(block_out_channels) - 1

        down_block = get_down_block(
            down_block_type,
            num_layers=layers_per_block[i],
            transformer_layers_per_block=transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=blocks_time_embed_dim,
            add_downsample=not is_final_block,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim[i],
            num_attention_heads=num_attention_heads[i],
            downsample_padding=downsample_padding,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
            resnet_skip_time_act=resnet_skip_time_act,
            resnet_out_scale_factor=resnet_out_scale_factor,
            cross_attention_norm=cross_attention_norm,
            attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            dropout=dropout,
        )
        self.down_blocks.append(down_block)

    # mid
    # The mid block uses the last entry of every per-block setting.
    self.mid_block = get_mid_block(
        mid_block_type,
        temb_channels=blocks_time_embed_dim,
        in_channels=block_out_channels[-1],
        resnet_eps=norm_eps,
        resnet_act_fn=act_fn,
        resnet_groups=norm_num_groups,
        output_scale_factor=mid_block_scale_factor,
        transformer_layers_per_block=transformer_layers_per_block[-1],
        num_attention_heads=num_attention_heads[-1],
        cross_attention_dim=cross_attention_dim[-1],
        dual_cross_attention=dual_cross_attention,
        use_linear_projection=use_linear_projection,
        mid_block_only_cross_attention=mid_block_only_cross_attention,
        upcast_attention=upcast_attention,
        resnet_time_scale_shift=resnet_time_scale_shift,
        attention_type=attention_type,
        resnet_skip_time_act=resnet_skip_time_act,
        cross_attention_norm=cross_attention_norm,
        attention_head_dim=attention_head_dim[-1],
        dropout=dropout,
    )

    # count how many layers upsample the images
    self.num_upsamplers = 0

    # up
    # Per-block settings are walked in reverse order for the up path; an explicit
    # `reverse_transformer_layers_per_block` is used as given instead of being reversed.
    reversed_block_out_channels = list(reversed(block_out_channels))
    reversed_num_attention_heads = list(reversed(num_attention_heads))
    reversed_layers_per_block = list(reversed(layers_per_block))
    reversed_cross_attention_dim = list(reversed(cross_attention_dim))
    reversed_transformer_layers_per_block = (
        list(reversed(transformer_layers_per_block))
        if reverse_transformer_layers_per_block is None
        else reverse_transformer_layers_per_block
    )
    only_cross_attention = list(reversed(only_cross_attention))

    output_channel = reversed_block_out_channels[0]
    for i, up_block_type in enumerate(up_block_types):
        is_final_block = i == len(block_out_channels) - 1

        prev_output_channel = output_channel
        output_channel = reversed_block_out_channels[i]
        # Clamp the index so the final up block reuses the last reversed channel entry.
        input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

        # add upsample block for all BUT final layer
        if not is_final_block:
            add_upsample = True
            self.num_upsamplers += 1
        else:
            add_upsample = False

        up_block = get_up_block(
            up_block_type,
            # Up blocks get one more layer than the mirrored down block.
            num_layers=reversed_layers_per_block[i] + 1,
            transformer_layers_per_block=reversed_transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            prev_output_channel=prev_output_channel,
            temb_channels=blocks_time_embed_dim,
            add_upsample=add_upsample,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resolution_idx=i,
            resnet_groups=norm_num_groups,
            cross_attention_dim=reversed_cross_attention_dim[i],
            num_attention_heads=reversed_num_attention_heads[i],
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
            resnet_skip_time_act=resnet_skip_time_act,
            resnet_out_scale_factor=resnet_out_scale_factor,
            cross_attention_norm=cross_attention_norm,
            # NOTE(review): `attention_head_dim` is indexed in forward order here, unlike the
            # other per-block settings, which are reversed above - confirm this is intended.
            attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            dropout=dropout,
        )
        self.up_blocks.append(up_block)
        # NOTE(review): redundant with the assignment at the top of the loop body.
        prev_output_channel = output_channel

    # out
    # Output normalization and activation exist only when a group count is configured.
    if norm_num_groups is not None:
        self.conv_norm_out = nn.GroupNorm(
            num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
        )

        self.conv_act = get_activation(act_fn)

    else:
        self.conv_norm_out = None
        self.conv_act = None

    conv_out_padding = (conv_out_kernel - 1) // 2
    self.conv_out = nn.Conv2d(
        block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
    )

    self._set_pos_net_if_use_gligen(attention_type=attention_type, cross_attention_dim=cross_attention_dim)
494
+
495
+ def _check_config(
496
+ self,
497
+ down_block_types: Tuple[str],
498
+ up_block_types: Tuple[str],
499
+ only_cross_attention: Union[bool, Tuple[bool]],
500
+ block_out_channels: Tuple[int],
501
+ layers_per_block: Union[int, Tuple[int]],
502
+ cross_attention_dim: Union[int, Tuple[int]],
503
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
504
+ reverse_transformer_layers_per_block: bool,
505
+ attention_head_dim: int,
506
+ num_attention_heads: Optional[Union[int, Tuple[int]]],
507
+ ):
508
+ if len(down_block_types) != len(up_block_types):
509
+ raise ValueError(
510
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
511
+ )
512
+
513
+ if len(block_out_channels) != len(down_block_types):
514
+ raise ValueError(
515
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
516
+ )
517
+
518
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
519
+ raise ValueError(
520
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
521
+ )
522
+
523
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
524
+ raise ValueError(
525
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
526
+ )
527
+
528
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
529
+ raise ValueError(
530
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
531
+ )
532
+
533
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
534
+ raise ValueError(
535
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
536
+ )
537
+
538
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
539
+ raise ValueError(
540
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
541
+ )
542
+ if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
543
+ for layer_number_per_block in transformer_layers_per_block:
544
+ if isinstance(layer_number_per_block, list):
545
+ raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
546
+
547
+ def _set_time_proj(
548
+ self,
549
+ time_embedding_type: str,
550
+ block_out_channels: int,
551
+ flip_sin_to_cos: bool,
552
+ freq_shift: float,
553
+ time_embedding_dim: int,
554
+ ) -> Tuple[int, int]:
555
+ if time_embedding_type == "fourier":
556
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
557
+ if time_embed_dim % 2 != 0:
558
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
559
+ self.time_proj = GaussianFourierProjection(
560
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
561
+ )
562
+ timestep_input_dim = time_embed_dim
563
+ elif time_embedding_type == "positional":
564
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
565
+
566
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
567
+ timestep_input_dim = block_out_channels[0]
568
+ else:
569
+ raise ValueError(
570
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
571
+ )
572
+
573
+ return time_embed_dim, timestep_input_dim
574
+
575
+ def _set_encoder_hid_proj(
576
+ self,
577
+ encoder_hid_dim_type: Optional[str],
578
+ cross_attention_dim: Union[int, Tuple[int]],
579
+ encoder_hid_dim: Optional[int],
580
+ ):
581
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
582
+ encoder_hid_dim_type = "text_proj"
583
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
584
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
585
+
586
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
587
+ raise ValueError(
588
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
589
+ )
590
+
591
+ if encoder_hid_dim_type == "text_proj":
592
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
593
+ elif encoder_hid_dim_type == "text_image_proj":
594
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
595
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
596
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
597
+ self.encoder_hid_proj = TextImageProjection(
598
+ text_embed_dim=encoder_hid_dim,
599
+ image_embed_dim=cross_attention_dim,
600
+ cross_attention_dim=cross_attention_dim,
601
+ )
602
+ elif encoder_hid_dim_type == "image_proj":
603
+ # Kandinsky 2.2
604
+ self.encoder_hid_proj = ImageProjection(
605
+ image_embed_dim=encoder_hid_dim,
606
+ cross_attention_dim=cross_attention_dim,
607
+ )
608
+ elif encoder_hid_dim_type is not None:
609
+ raise ValueError(
610
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
611
+ )
612
+ else:
613
+ self.encoder_hid_proj = None
614
+
615
+ def _set_class_embedding(
616
+ self,
617
+ class_embed_type: Optional[str],
618
+ act_fn: str,
619
+ num_class_embeds: Optional[int],
620
+ projection_class_embeddings_input_dim: Optional[int],
621
+ time_embed_dim: int,
622
+ timestep_input_dim: int,
623
+ ):
624
+ if class_embed_type is None and num_class_embeds is not None:
625
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
626
+ elif class_embed_type == "timestep":
627
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
628
+ elif class_embed_type == "identity":
629
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
630
+ elif class_embed_type == "projection":
631
+ if projection_class_embeddings_input_dim is None:
632
+ raise ValueError(
633
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
634
+ )
635
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
636
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
637
+ # 2. it projects from an arbitrary input dimension.
638
+ #
639
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
640
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
641
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
642
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
643
+ elif class_embed_type == "simple_projection":
644
+ if projection_class_embeddings_input_dim is None:
645
+ raise ValueError(
646
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
647
+ )
648
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
649
+ else:
650
+ self.class_embedding = None
651
+
652
+ def _set_add_embedding(
653
+ self,
654
+ addition_embed_type: str,
655
+ addition_embed_type_num_heads: int,
656
+ addition_time_embed_dim: Optional[int],
657
+ flip_sin_to_cos: bool,
658
+ freq_shift: float,
659
+ cross_attention_dim: Optional[int],
660
+ encoder_hid_dim: Optional[int],
661
+ projection_class_embeddings_input_dim: Optional[int],
662
+ time_embed_dim: int,
663
+ ):
664
+ if addition_embed_type == "text":
665
+ if encoder_hid_dim is not None:
666
+ text_time_embedding_from_dim = encoder_hid_dim
667
+ else:
668
+ text_time_embedding_from_dim = cross_attention_dim
669
+
670
+ self.add_embedding = TextTimeEmbedding(
671
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
672
+ )
673
+ elif addition_embed_type == "text_image":
674
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
675
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
676
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
677
+ self.add_embedding = TextImageTimeEmbedding(
678
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
679
+ )
680
+ elif addition_embed_type == "text_time":
681
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
682
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
683
+ elif addition_embed_type == "image":
684
+ # Kandinsky 2.2
685
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
686
+ elif addition_embed_type == "image_hint":
687
+ # Kandinsky 2.2 ControlNet
688
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
689
+ elif addition_embed_type is not None:
690
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
691
+
692
+ def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int):
693
+ if attention_type in ["gated", "gated-text-image"]:
694
+ positive_len = 768
695
+ if isinstance(cross_attention_dim, int):
696
+ positive_len = cross_attention_dim
697
+ elif isinstance(cross_attention_dim, (list, tuple)):
698
+ positive_len = cross_attention_dim[0]
699
+
700
+ feature_type = "text-only" if attention_type == "gated" else "text-image"
701
+ self.position_net = GLIGENTextBoundingboxProjection(
702
+ positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
703
+ )
704
+
705
@property
def attn_processors(self) -> Dict[str, AttentionProcessor]:
    r"""
    Returns:
        `dict` of attention processors: A dictionary containing all attention processors used in the model,
        indexed by weight name (the dotted module path followed by `.processor`).
    """
    collected = {}

    def _walk(prefix: str, module: torch.nn.Module) -> None:
        # Record this module's processor first, then descend, so keys appear in pre-order.
        if hasattr(module, "get_processor"):
            collected[f"{prefix}.processor"] = module.get_processor()

        for child_name, child in module.named_children():
            _walk(f"{prefix}.{child_name}", child)

    for top_name, top_module in self.named_children():
        _walk(top_name, top_module)

    return collected
728
+
729
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
    r"""
    Sets the attention processor to use to compute attention.

    Parameters:
        processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
            The instantiated processor class or a dictionary of processor classes that will be set as the processor
            for **all** `Attention` layers.

            If `processor` is a dict, the key needs to define the path to the corresponding cross attention
            processor. This is strongly recommended when setting trainable attention processors. The dict
            passed by the caller is left untouched.

    Raises:
        ValueError: If a dict is passed whose length differs from the number of attention layers.
        KeyError: If a dict is passed that lacks the `"<module path>.processor"` key of some layer.
    """
    count = len(self.attn_processors.keys())

    if isinstance(processor, dict):
        if len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )
        # Entries are popped while walking the module tree. Work on a shallow copy so the caller's dict
        # (e.g. `self.original_attn_processors`) is not emptied as a side effect.
        processor = dict(processor)

    def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
        if hasattr(module, "set_processor"):
            if not isinstance(processor, dict):
                module.set_processor(processor)
            else:
                module.set_processor(processor.pop(f"{name}.processor"))

        for sub_name, child in module.named_children():
            fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

    for name, module in self.named_children():
        fn_recursive_attn_processor(name, module, processor)
762
+
763
def set_default_attn_processor(self):
    """
    Replace every custom attention processor with the matching default implementation.

    `AttnAddedKVProcessor` is chosen when all current processors belong to `ADDED_KV_ATTENTION_PROCESSORS`,
    `AttnProcessor` when they all belong to `CROSS_ATTENTION_PROCESSORS`.

    Raises:
        ValueError: If the current processors fit neither group.
    """
    active = list(self.attn_processors.values())

    if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in active):
        default_processor = AttnAddedKVProcessor()
    elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in active):
        default_processor = AttnProcessor()
    else:
        raise ValueError(
            f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
        )

    self.set_attn_processor(default_processor)
777
+
778
def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"):
    r"""
    Enable sliced attention computation.

    With slicing enabled the attention module processes its input in several steps, trading a small amount of
    speed for lower memory use.

    Args:
        slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
            When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
            `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
            provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
            must be a multiple of `slice_size`. A list supplies one value per sliceable layer.

    Raises:
        ValueError: If a list of the wrong length is given, or a slice size exceeds its layer's head dim.
    """
    head_dims = []

    def _gather_head_dims(module: torch.nn.Module):
        # Pre-order walk; the same order is used again below when the sizes are handed out.
        if hasattr(module, "set_attention_slice"):
            head_dims.append(module.sliceable_head_dim)

        for sub_module in module.children():
            _gather_head_dims(sub_module)

    for top_module in self.children():
        _gather_head_dims(top_module)

    layer_count = len(head_dims)

    if slice_size == "auto":
        # half the attention head size is usually a good trade-off between speed and memory
        slice_size = [head_dim // 2 for head_dim in head_dims]
    elif slice_size == "max":
        # make smallest slice possible
        slice_size = layer_count * [1]

    if not isinstance(slice_size, list):
        slice_size = layer_count * [slice_size]

    if len(slice_size) != len(head_dims):
        raise ValueError(
            f"You have provided {len(slice_size)}, but {self.config} has {len(head_dims)} different"
            f" attention layers. Make sure to match `len(slice_size)` to be {len(head_dims)}."
        )

    for size, dim in zip(slice_size, head_dims):
        if size is not None and size > dim:
            raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

    # Sizes are consumed from the end of a reversed copy, i.e. in their original order,
    # and the caller's list is never modified.
    remaining = list(reversed(slice_size))

    def _apply_slice(module: torch.nn.Module, remaining: List[int]):
        if hasattr(module, "set_attention_slice"):
            module.set_attention_slice(remaining.pop())

        for sub_module in module.children():
            _apply_slice(sub_module, remaining)

    for top_module in self.children():
        _apply_slice(top_module, remaining)
842
+
843
+ def _set_gradient_checkpointing(self, module, value=False):
844
+ if hasattr(module, "gradient_checkpointing"):
845
+ module.gradient_checkpointing = value
846
+
847
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
    r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.

    The four factors are stored as attributes `s1`, `s2`, `b1` and `b2` on every block in `self.up_blocks`.
    The suffixes after the scaling factors represent the stage blocks where they are being applied.

    Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
    are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

    Args:
        s1 (`float`):
            Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
            mitigate the "oversmoothing effect" in the enhanced denoising process.
        s2 (`float`):
            Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
            mitigate the "oversmoothing effect" in the enhanced denoising process.
        b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
        b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
    """
    factors = (("s1", s1), ("s2", s2), ("b1", b1), ("b2", b2))
    for block in self.up_blocks:
        for attr_name, factor in factors:
            setattr(block, attr_name, factor)
870
+
871
def disable_freeu(self):
    """Disables the FreeU mechanism by resetting any `s1`/`s2`/`b1`/`b2` attribute on the up blocks to `None`."""
    for block in self.up_blocks:
        for attr_name in ("s1", "s2", "b1", "b2"):
            # Only attributes that already exist are reset; missing ones are not created.
            if hasattr(block, attr_name):
                setattr(block, attr_name, None)
878
+
879
def fuse_qkv_projections(self):
    """
    Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
    are fused. For cross-attention modules, key and value projection matrices are fused.

    The processors active before fusing are kept in `self.original_attn_processors`.

    <Tip warning={true}>

    This API is 🧪 experimental.

    </Tip>

    Raises:
        ValueError: If any current processor's class name contains "Added" (added KV projections).
    """
    self.original_attn_processors = None

    uses_added_kv = any(
        "Added" in str(attn_processor.__class__.__name__) for attn_processor in self.attn_processors.values()
    )
    if uses_added_kv:
        raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

    self.original_attn_processors = self.attn_processors

    for sub_module in self.modules():
        if not isinstance(sub_module, Attention):
            continue
        sub_module.fuse_projections(fuse=True)
901
+
902
def unfuse_qkv_projections(self):
    """Disables the fused QKV projection if enabled.

    Restores the processors saved in `self.original_attn_processors`. The call is a no-op when that
    attribute is missing or `None`.

    <Tip warning={true}>

    This API is 🧪 experimental.

    </Tip>

    """
    # `getattr` with a default keeps this a no-op instead of an AttributeError when the attribute
    # has not been set on this instance.
    saved_processors = getattr(self, "original_attn_processors", None)
    if saved_processors is None:
        return

    # Pass a shallow copy: `set_attn_processor` may consume the mapping it receives, and the saved
    # processors must survive so that a repeated call still works.
    if isinstance(saved_processors, dict):
        saved_processors = dict(saved_processors)
    self.set_attn_processor(saved_processors)
914
+
915
def get_time_embed(
    self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
) -> Optional[torch.Tensor]:
    """
    Project `timestep` with `self.time_proj` for every item of the batch.

    Args:
        sample (`torch.Tensor`): Supplies the batch size (`shape[0]`), the device and the output dtype.
        timestep (`torch.Tensor` or `float` or `int`): A Python scalar, a 0-dim tensor, or a tensor that can be
            expanded to the batch size.

    Returns:
        `torch.Tensor`: The output of `self.time_proj`, cast to `sample.dtype`.
    """
    if torch.is_tensor(timestep):
        steps = timestep
        if steps.ndim == 0:
            # Promote a scalar tensor to one dimension and move it to the sample's device.
            steps = steps[None].to(sample.device)
    else:
        # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
        # The narrower 32-bit dtypes are selected when the sample lives on an "mps" device.
        on_mps = sample.device.type == "mps"
        if isinstance(timestep, float):
            step_dtype = torch.float32 if on_mps else torch.float64
        else:
            step_dtype = torch.int32 if on_mps else torch.int64
        steps = torch.tensor([timestep], dtype=step_dtype, device=sample.device)

    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
    steps = steps.expand(sample.shape[0])

    # `Timesteps` does not contain any weights and will always return f32 tensors
    # but time_embedding might actually be running in fp16. so we need to cast here.
    return self.time_proj(steps).to(dtype=sample.dtype)
940
+
941
def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    """
    Compute the class embedding for `class_labels`.

    Args:
        sample (`torch.Tensor`): Only its dtype is used, as the target dtype for casts.
        class_labels (`torch.Tensor`, *optional*): Required whenever `self.class_embedding` is set.

    Returns:
        `torch.Tensor` or `None`: `None` when the model has no class embedding, otherwise the embedding cast to
        `sample.dtype`.

    Raises:
        ValueError: If the model has a class embedding but `class_labels` is `None`.
    """
    if self.class_embedding is None:
        return None

    if class_labels is None:
        raise ValueError("class_labels should be provided when num_class_embeds > 0")

    if self.config.class_embed_type == "timestep":
        # `Timesteps` does not contain any weights and will always return f32 tensors,
        # so the projected labels are cast to the sample dtype before embedding.
        class_labels = self.time_proj(class_labels).to(dtype=sample.dtype)

    return self.class_embedding(class_labels).to(dtype=sample.dtype)
956
+
957
def get_aug_embed(
    self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
) -> Optional[torch.Tensor]:
    """
    Compute the additional conditioning embedding selected by `self.config.addition_embed_type`.

    Args:
        emb (`torch.Tensor`): Current time embedding; only its dtype is used (for the `"text_time"` type).
        encoder_hidden_states (`torch.Tensor`):
            Input for the `"text"` type and fallback text embedding for the `"text_image"` type.
        added_cond_kwargs (`dict`):
            Extra inputs. Required keys per type: `image_embeds` for `"text_image"` and `"image"`;
            `text_embeds` and `time_ids` for `"text_time"`; `image_embeds` and `hint` for `"image_hint"`.

    Returns:
        The output of `self.add_embedding`, or `None` when the configured type matches none of the above.

    Raises:
        ValueError: If a key required by the configured type is missing from `added_cond_kwargs`.
    """
    embed_type = self.config.addition_embed_type

    if embed_type == "text":
        return self.add_embedding(encoder_hidden_states)

    if embed_type == "text_image":
        # Kandinsky 2.1 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
            )

        image_part = added_cond_kwargs.get("image_embeds")
        text_part = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
        return self.add_embedding(text_part, image_part)

    if embed_type == "text_time":
        # SDXL - style
        if "text_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
            )
        text_part = added_cond_kwargs.get("text_embeds")
        if "time_ids" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
            )
        time_ids = added_cond_kwargs.get("time_ids")

        # Project every id, then regroup the projections into one row per text embedding.
        time_part = self.add_time_proj(time_ids.flatten()).reshape((text_part.shape[0], -1))
        combined = torch.concat([text_part, time_part], dim=-1).to(emb.dtype)
        return self.add_embedding(combined)

    if embed_type == "image":
        # Kandinsky 2.2 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
            )
        return self.add_embedding(added_cond_kwargs.get("image_embeds"))

    if embed_type == "image_hint":
        # Kandinsky 2.2 - style
        if not ("image_embeds" in added_cond_kwargs and "hint" in added_cond_kwargs):
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
            )
        image_part = added_cond_kwargs.get("image_embeds")
        hint = added_cond_kwargs.get("hint")
        return self.add_embedding(image_part, hint)

    return None
1008
+
1009
def process_encoder_hidden_states(
    self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
) -> torch.Tensor:
    """
    Apply `self.encoder_hid_proj` according to `self.config.encoder_hid_dim_type`.

    Args:
        encoder_hidden_states (`torch.Tensor`): Encoder hidden states to project.
        added_cond_kwargs (`dict`):
            Must contain `image_embeds` for the `"text_image_proj"`, `"image_proj"` and `"ip_image_proj"` types.

    Returns:
        The projected hidden states. For `"ip_image_proj"` a tuple `(encoder_hidden_states, image_embeds)` is
        returned. The input is returned unchanged when no projection is configured or the type is not handled.

    Raises:
        ValueError: If `image_embeds` is required by the configured type but missing.
    """
    hid_proj = self.encoder_hid_proj
    if hid_proj is None:
        return encoder_hidden_states

    proj_type = self.config.encoder_hid_dim_type

    if proj_type == "text_proj":
        return hid_proj(encoder_hidden_states)

    if proj_type == "text_image_proj":
        # Kandinsky 2.1 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
            )

        image_embeds = added_cond_kwargs.get("image_embeds")
        return hid_proj(encoder_hidden_states, image_embeds)

    if proj_type == "image_proj":
        # Kandinsky 2.2 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
            )
        return hid_proj(added_cond_kwargs.get("image_embeds"))

    if proj_type == "ip_image_proj":
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
            )

        # An optional text projection is applied first when the model carries one.
        if getattr(self, 'text_encoder_hid_proj', None) is not None:
            encoder_hidden_states = self.text_encoder_hid_proj(encoder_hidden_states)

        image_embeds = hid_proj(added_cond_kwargs.get("image_embeds"))
        return (encoder_hidden_states, image_embeds)

    return encoder_hidden_states
1044
+
1045
def forward(
    self,
    sample: torch.Tensor,
    timestep: Union[torch.Tensor, float, int],
    encoder_hidden_states: torch.Tensor,
    class_labels: Optional[torch.Tensor] = None,
    timestep_cond: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
    mid_block_additional_residual: Optional[torch.Tensor] = None,
    down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
    encoder_attention_mask: Optional[torch.Tensor] = None,
    return_dict: bool = True,
) -> Union[UNet2DConditionOutput, Tuple]:
    r"""
    The [`UNet2DConditionModel`] forward method.

    The pass runs in the order: mask-to-bias conversion, time/class/additional embeddings, encoder hidden state
    projection, `conv_in`, down blocks, mid block, up blocks, and the output convolution.

    Args:
        sample (`torch.Tensor`):
            The noisy input tensor with the following shape `(batch, channel, height, width)`.
        timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
        encoder_hidden_states (`torch.Tensor`):
            The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
        class_labels (`torch.Tensor`, *optional*, defaults to `None`):
            Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings
            (or concatenated to them when `config.class_embeddings_concat` is set).
        timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
            Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
            through the `self.time_embedding` layer to obtain the timestep embeddings.
        attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
            is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
            negative values to the attention scores corresponding to "discard" tokens.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            The `scale` entry is removed (from a copy) and used as the LoRA scale; a `gligen` entry is replaced
            by the output of `self.position_net`.
        added_cond_kwargs: (`dict`, *optional*):
            A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
            are passed along to the UNet blocks.
        down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
            A tuple of tensors that if specified are added to the residuals of down unet blocks.
        mid_block_additional_residual: (`torch.Tensor`, *optional*):
            A tensor that if specified is added to the residual of the middle unet block.
        down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
            additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
        encoder_attention_mask (`torch.Tensor`):
            A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
            `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
            which adds large negative values to the attention scores corresponding to "discard" tokens.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
            tuple.

    Returns:
        [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
            If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
            otherwise a `tuple` is returned where the first element is the sample tensor.
    """
    # By default samples have to be AT least a multiple of the overall upsampling factor.
    # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
    # However, the upsampling interpolation output size can be forced to fit any upsampling size
    # on the fly if necessary.
    default_overall_up_factor = 2**self.num_upsamplers

    # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
    forward_upsample_size = False
    upsample_size = None

    # Only the last two dimensions of `sample` (documented above as height and width) are checked.
    for dim in sample.shape[-2:]:
        if dim % default_overall_up_factor != 0:
            # Forward upsample size to force interpolation output size.
            forward_upsample_size = True
            break

    # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
    # expects mask of shape:
    #   [batch, key_tokens]
    # adds singleton query_tokens dimension:
    #   [batch,                    1, key_tokens]
    # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
    #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
    #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
    if attention_mask is not None:
        # assume that mask is expressed as:
        #   (1 = keep,      0 = discard)
        # convert mask into a bias that can be added to attention scores:
        #       (keep = +0,     discard = -10000.0)
        attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
        attention_mask = attention_mask.unsqueeze(1)

    # convert encoder_attention_mask to a bias the same way we do for attention_mask
    if encoder_attention_mask is not None:
        encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
        encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

    # 0. center input if necessary
    if self.config.center_input_sample:
        sample = 2 * sample - 1.0

    # 1. time
    t_emb = self.get_time_embed(sample=sample, timestep=timestep)
    emb = self.time_embedding(t_emb, timestep_cond)
    # Placeholder only; `aug_emb` is reassigned by `get_aug_embed` below.
    aug_emb = None

    class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
    if class_emb is not None:
        if self.config.class_embeddings_concat:
            emb = torch.cat([emb, class_emb], dim=-1)
        else:
            emb = emb + class_emb

    # NOTE(review): `added_cond_kwargs` defaults to None and is passed through as-is; the helpers
    # run `in` checks on it for some embed types — confirm callers always pass a dict in those cases.
    aug_emb = self.get_aug_embed(
        emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
    )
    if self.config.addition_embed_type == "image_hint":
        # For this type the embedding layer returns a pair; the hint is concatenated to the
        # sample along dim 1 before `conv_in`.
        aug_emb, hint = aug_emb
        sample = torch.cat([sample, hint], dim=1)

    emb = emb + aug_emb if aug_emb is not None else emb

    if self.time_embed_act is not None:
        emb = self.time_embed_act(emb)

    encoder_hidden_states = self.process_encoder_hidden_states(
        encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
    )

    # 2. pre-process
    sample = self.conv_in(sample)

    # 2.5 GLIGEN position net
    if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
        # Copy first so the caller's dict keeps its original "gligen" arguments.
        cross_attention_kwargs = cross_attention_kwargs.copy()
        gligen_args = cross_attention_kwargs.pop("gligen")
        cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}

    # 3. down
    # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
    # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
    if cross_attention_kwargs is not None:
        cross_attention_kwargs = cross_attention_kwargs.copy()
        lora_scale = cross_attention_kwargs.pop("scale", 1.0)
    else:
        lora_scale = 1.0

    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)

    # ControlNet mode requires BOTH the mid-block and the down-block residuals.
    is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
    # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
    is_adapter = down_intrablock_additional_residuals is not None
    # maintain backward compatibility for legacy usage, where
    #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
    #       but can only use one or the other
    if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
        deprecate(
            "T2I should not use down_block_additional_residuals",
            "1.3.0",
            "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
                   and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
                   for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
            standard_warn=False,
        )
        down_intrablock_additional_residuals = down_block_additional_residuals
        is_adapter = True

    # Skip connections collected on the way down; the `conv_in` output is the first entry.
    down_block_res_samples = (sample,)
    for downsample_block in self.down_blocks:
        if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
            # For t2i-adapter CrossAttnDownBlock2D
            additional_residuals = {}
            # NOTE(review): `.pop(0)` consumes the caller's sequence in place and needs a list,
            # although the parameter is annotated as a tuple — confirm what callers pass.
            if is_adapter and len(down_intrablock_additional_residuals) > 0:
                additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)

            sample, res_samples = downsample_block(
                hidden_states=sample,
                temb=emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
                **additional_residuals,
            )
        else:
            sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
            # Blocks without cross attention get the adapter residual added to their output (in place).
            if is_adapter and len(down_intrablock_additional_residuals) > 0:
                sample += down_intrablock_additional_residuals.pop(0)

        down_block_res_samples += res_samples

    if is_controlnet:
        new_down_block_res_samples = ()

        # `zip` stops at the shorter sequence, so residuals are paired positionally with the skips.
        for down_block_res_sample, down_block_additional_residual in zip(
            down_block_res_samples, down_block_additional_residuals
        ):
            down_block_res_sample = down_block_res_sample + down_block_additional_residual
            new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

        down_block_res_samples = new_down_block_res_samples

    # 4. mid
    if self.mid_block is not None:
        if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
            )
        else:
            sample = self.mid_block(sample, emb)

        # To support T2I-Adapter-XL
        # A leftover adapter residual is applied here only when its shape equals the mid-block output's.
        if (
            is_adapter
            and len(down_intrablock_additional_residuals) > 0
            and sample.shape == down_intrablock_additional_residuals[0].shape
        ):
            sample += down_intrablock_additional_residuals.pop(0)

    if is_controlnet:
        sample = sample + mid_block_additional_residual

    # 5. up
    for i, upsample_block in enumerate(self.up_blocks):
        is_final_block = i == len(self.up_blocks) - 1

        # Take as many skip connections off the end as this block has resnets.
        res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
        down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

        # if we have not reached the final block and need to forward the
        # upsample size, we do it here
        if not is_final_block and forward_upsample_size:
            upsample_size = down_block_res_samples[-1].shape[2:]

        if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
            sample = upsample_block(
                hidden_states=sample,
                temb=emb,
                res_hidden_states_tuple=res_samples,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                upsample_size=upsample_size,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
            )
        else:
            sample = upsample_block(
                hidden_states=sample,
                temb=emb,
                res_hidden_states_tuple=res_samples,
                upsample_size=upsample_size,
            )

    # 6. post-process
    # The normalisation and activation are skipped together when `conv_norm_out` is falsy.
    if self.conv_norm_out:
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
    sample = self.conv_out(sample)

    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)

    if not return_dict:
        return (sample,)

    return UNet2DConditionOutput(sample=sample)
build/lib/kolors/pipelines/__init__.py ADDED
File without changes
build/lib/kolors/pipelines/pipeline_controlnet_xl_kolors_img2img.py ADDED
@@ -0,0 +1,1365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import PIL.Image
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from transformers import (
24
+ CLIPImageProcessor,
25
+ CLIPTextModel,
26
+ CLIPTextModelWithProjection,
27
+ CLIPTokenizer,
28
+ CLIPVisionModelWithProjection,
29
+ )
30
+
31
+ from diffusers.utils.import_utils import is_invisible_watermark_available
32
+
33
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
34
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
35
+ from diffusers.loaders import (
36
+ FromSingleFileMixin,
37
+ IPAdapterMixin,
38
+ StableDiffusionXLLoraLoaderMixin,
39
+ TextualInversionLoaderMixin,
40
+ )
41
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
42
+ from diffusers.models.attention_processor import (
43
+ AttnProcessor2_0,
44
+ XFormersAttnProcessor,
45
+ )
46
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
47
+ from diffusers.schedulers import KarrasDiffusionSchedulers
48
+ from diffusers.utils import (
49
+ USE_PEFT_BACKEND,
50
+ deprecate,
51
+ logging,
52
+ replace_example_docstring,
53
+ scale_lora_layers,
54
+ unscale_lora_layers,
55
+ )
56
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
57
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
58
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
59
+ from diffusers.pipelines.controlnet import MultiControlNetModel
60
+
61
+ from ..models.controlnet import ControlNetModel
62
+
63
+
64
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
65
+
66
+
67
+
68
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
69
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Pull latents out of an encoder output object.

    Args:
        encoder_output:
            Object exposing either a `latent_dist` attribute (with `sample(generator)` / `mode()`) or a
            `latents` attribute.
        generator (`torch.Generator`, *optional*):
            Forwarded to `latent_dist.sample` when `sample_mode == "sample"`.
        sample_mode (`str`, defaults to `"sample"`):
            `"sample"` draws from `latent_dist`; `"argmax"` returns `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or `encoder_output.latents`.

    Raises:
        AttributeError: If none of the supported access paths applies.
    """
    has_distribution = hasattr(encoder_output, "latent_dist")

    if has_distribution and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)

    if has_distribution and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()

    # Any other `sample_mode` (or no distribution at all) falls back to precomputed latents.
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
80
+
81
+
82
+ class StableDiffusionXLControlNetImg2ImgPipeline(
83
+ DiffusionPipeline,
84
+ StableDiffusionMixin,
85
+ TextualInversionLoaderMixin,
86
+ StableDiffusionXLLoraLoaderMixin,
87
+ FromSingleFileMixin,
88
+ IPAdapterMixin,
89
+ ):
90
+ r"""
91
+ Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance.
92
+
93
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
94
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
95
+
96
+ The pipeline also inherits the following loading methods:
97
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
98
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
99
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
100
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
101
+
102
+ Args:
103
+ vae ([`AutoencoderKL`]):
104
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
105
+ text_encoder ([`CLIPTextModel`]):
106
+ Frozen text-encoder. Stable Diffusion uses the text portion of
107
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
108
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
109
+ tokenizer (`CLIPTokenizer`):
110
+ Tokenizer of class
111
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
112
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
113
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
114
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
115
+ as a list, the outputs from each ControlNet are added together to create one combined additional
116
+ conditioning.
117
+ scheduler ([`SchedulerMixin`]):
118
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
119
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
120
+ requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
121
+ Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
122
+ config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
123
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
124
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
125
+ `stabilityai/stable-diffusion-xl-base-1-0`.
126
+ add_watermarker (`bool`, *optional*):
127
+ Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
128
+ watermark output images. If not defined, it will default to True if the package is installed, otherwise no
129
+ watermarker will be used.
130
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
131
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
132
+ """
133
+
134
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
135
+ _optional_components = [
136
+ "tokenizer",
137
+ "text_encoder",
138
+ "feature_extractor",
139
+ "image_encoder",
140
+ ]
141
+ _callback_tensor_inputs = [
142
+ "latents",
143
+ "prompt_embeds",
144
+ "negative_prompt_embeds",
145
+ "add_text_embeds",
146
+ "add_time_ids",
147
+ "negative_pooled_prompt_embeds",
148
+ "add_neg_time_ids",
149
+ ]
150
+
151
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    requires_aesthetics_score: bool = False,
    force_zeros_for_empty_prompt: bool = True,
    feature_extractor: CLIPImageProcessor = None,
    image_encoder: CLIPVisionModelWithProjection = None,
):
    """
    Register the pipeline components, build the image processors and store the config flags.

    A `list`/`tuple` of ControlNets is wrapped in a single `MultiControlNetModel` before registration.
    """
    super().__init__()

    # Normalise a plain sequence of ControlNets into one multi-ControlNet module.
    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "unet": unet,
        "controlnet": controlnet,
        "scheduler": scheduler,
        "feature_extractor": feature_extractor,
        "image_encoder": image_encoder,
    }
    self.register_modules(**components)

    # Scale factor derived from the number of VAE blocks in the registered config.
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)

    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    # Control images are converted to RGB but not normalized.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )

    # No watermarker is attached by this pipeline.
    self.watermark = None

    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
189
+
190
+
191
def encode_prompt(
    self,
    prompt,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt=None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`, *optional*):
            torch device; defaults to `self._execution_device`. Tokenized inputs are moved to this device.
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. If not provided, text embeddings will be generated from `prompt`.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they are generated from
            `negative_prompt` (or zeroed when `force_zeros_for_empty_prompt` is set and no negative prompt
            is given).
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Required whenever `prompt_embeds` is passed.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings.
        lora_scale (`float`, *optional*):
            A lora scale that is stored on the pipeline as `self._lora_scale` when the pipeline is a
            `StableDiffusionXLLoraLoaderMixin`.

    Returns:
        `tuple`: `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
        negative_pooled_prompt_embeds)`. The negative entries keep whatever value was passed in (possibly
        `None`) when `do_classifier_free_guidance` is `False`.

    Raises:
        TypeError: If `negative_prompt` and `prompt` have different types.
        ValueError: If a list `negative_prompt` does not match the prompt batch size.
    """
    device = device or self._execution_device

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    # (the file imports `StableDiffusionXLLoraLoaderMixin`; the previously referenced `LoraLoaderMixin`
    # is not imported and raised NameError as soon as a `lora_scale` was supplied)
    if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
        self._lora_scale = lora_scale

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer]
    text_encoders = [self.text_encoder]

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, tokenizer)

            # Move the tokenized batch to the resolved device instead of a hard-coded 'cuda'.
            text_inputs = tokenizer(
                prompt,
                padding="max_length",
                max_length=256,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            output = text_encoder(
                input_ids=text_inputs["input_ids"],
                attention_mask=text_inputs["attention_mask"],
                position_ids=text_inputs["position_ids"],
                output_hidden_states=True,
            )
            # Penultimate hidden state, permuted so the batch axis comes first
            # (assumes the encoder emits sequence-first tensors - TODO confirm).
            prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
            # Last position of the final hidden state is used as the pooled embedding.
            pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
            bs_embed, seq_len, _ = prompt_embeds.shape
            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
            prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

            prompt_embeds_list.append(prompt_embeds)

        # Only one text encoder is configured, so there is nothing to concatenate.
        prompt_embeds = prompt_embeds_list[0]

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        negative_prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer)

            # Pad the negative prompt to the same sequence length as the positive embeddings.
            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            output = text_encoder(
                input_ids=uncond_input["input_ids"],
                attention_mask=uncond_input["attention_mask"],
                position_ids=uncond_input["position_ids"],
                output_hidden_states=True,
            )
            negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
            negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()

            if do_classifier_free_guidance:
                # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
                seq_len = negative_prompt_embeds.shape[1]

                negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)

                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
                negative_prompt_embeds = negative_prompt_embeds.view(
                    batch_size * num_images_per_prompt, seq_len, -1
                )

            negative_prompt_embeds_list.append(negative_prompt_embeds)

        negative_prompt_embeds = negative_prompt_embeds_list[0]

    # Repeat the pooled embeddings once per requested image.
    bs_embed = pooled_prompt_embeds.shape[0]
    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
364
+
365
+
366
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
367
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
    """
    Encode an image with `self.image_encoder` and build a matching unconditional counterpart.

    Args:
        image: A `torch.Tensor`, or any input accepted by `self.feature_extractor`.
        device: Device the pixel tensor is moved to before encoding.
        num_images_per_prompt (`int`): Repeat count applied along dim 0 of the outputs.
        output_hidden_states (`bool`, *optional*):
            When truthy, return the penultimate hidden states instead of `image_embeds`.

    Returns:
        `tuple`: `(conditional, unconditional)`. With hidden states, the unconditional part is the encoding
        of an all-zero image; otherwise it is a zero tensor shaped like the conditional embeddings.
    """
    encoder = self.image_encoder
    encoder_dtype = next(encoder.parameters()).dtype

    # Non-tensor inputs go through the feature extractor first.
    if isinstance(image, torch.Tensor):
        pixels = image
    else:
        pixels = self.feature_extractor(image, return_tensors="pt").pixel_values
    pixels = pixels.to(device=device, dtype=encoder_dtype)

    if not output_hidden_states:
        embeds = encoder(pixels).image_embeds
        embeds = embeds.repeat_interleave(num_images_per_prompt, dim=0)
        return embeds, torch.zeros_like(embeds)

    cond_states = encoder(pixels, output_hidden_states=True).hidden_states[-2]
    cond_states = cond_states.repeat_interleave(num_images_per_prompt, dim=0)

    blank = torch.zeros_like(pixels)
    uncond_states = encoder(blank, output_hidden_states=True).hidden_states[-2]
    uncond_states = uncond_states.repeat_interleave(num_images_per_prompt, dim=0)
    return cond_states, uncond_states
390
+
391
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
392
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
    """
    Build the per-adapter image embeddings passed to the UNet.

    Args:
        ip_adapter_image: One image or a list with one image per IP Adapter; used only when
            `ip_adapter_image_embeds` is `None`.
        ip_adapter_image_embeds: Optional list of precomputed embeddings. Under classifier-free guidance
            each entry is split in two along dim 0 into (negative, positive).
        device: Device the resulting tensors are moved to.
        num_images_per_prompt (`int`): Number of times each embedding is repeated along dim 0.
        do_classifier_free_guidance (`bool`): Whether negative embeddings are prepended.

    Returns:
        `list` of tensors, one per adapter; with guidance each is `[negative, positive]` concatenated on dim 0.

    Raises:
        ValueError: If the number of images differs from the number of IP Adapter projection layers.
    """
    positive_parts = []
    negative_parts = []

    if ip_adapter_image_embeds is not None:
        # Precomputed embeddings: split off the negative half when guidance is on.
        for packed in ip_adapter_image_embeds:
            if do_classifier_free_guidance:
                negative_half, packed = packed.chunk(2)
                negative_parts.append(negative_half)
            positive_parts.append(packed)
    else:
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        projection_layers = self.unet.encoder_hid_proj.image_projection_layers
        if len(ip_adapter_image) != len(projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
            )

        for adapter_image, projection in zip(ip_adapter_image, projection_layers):
            # Anything other than a plain ImageProjection layer gets hidden states.
            needs_hidden_states = not isinstance(projection, ImageProjection)
            encoded, encoded_negative = self.encode_image(adapter_image, device, 1, needs_hidden_states)

            positive_parts.append(encoded[None, :])
            if do_classifier_free_guidance:
                negative_parts.append(encoded_negative[None, :])

    prepared = []
    for index, positive in enumerate(positive_parts):
        tiled = torch.cat([positive] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            tiled_negative = torch.cat([negative_parts[index]] * num_images_per_prompt, dim=0)
            tiled = torch.cat([tiled_negative, tiled], dim=0)
        prepared.append(tiled.to(device=device))

    return prepared
436
+
437
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
438
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Collect the optional keyword arguments accepted by `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are forwarded only when
    the signature lists them. `eta` corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502)
    and should be between [0, 1].

    Returns:
        `dict` containing `"eta"` and/or `"generator"` for the parameters the scheduler accepts.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())
    optional_arguments = (("eta", eta), ("generator", generator))
    return {name: value for name, value in optional_arguments if name in step_parameters}
454
+
455
def check_inputs(
    self,
    prompt,
    image,
    strength,
    num_inference_steps,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
    ip_adapter_image=None,
    ip_adapter_image_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
    callback_on_step_end_tensor_inputs=None,
):
    """
    Validate the call arguments of the pipeline and raise on the first inconsistency found.

    Checks, in order: `strength` range, `num_inference_steps`, `callback_steps`, callback tensor names,
    prompt / embedding combinations, the control `image` (via `self.check_image`),
    `controlnet_conditioning_scale`, the control guidance windows and the IP-Adapter inputs.

    Raises:
        ValueError: For out-of-range or mutually inconsistent values.
        TypeError: For a non-list `image` with multiple ControlNets, or a non-float
            `controlnet_conditioning_scale` with a single ControlNet.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` cannot be None.")
    elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
        raise ValueError(
            f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
            f" {type(num_inference_steps)}."
        )

    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    if prompt_embeds is not None and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # A torch.compile'd ControlNet is wrapped in an OptimizedModule; look through `_orig_mod` in that case.
    is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
        self.controlnet, torch._dynamo.eval_frame.OptimizedModule
    )
    is_single_controlnet = isinstance(self.controlnet, ControlNetModel) or (
        is_compiled and isinstance(self.controlnet._orig_mod, ControlNetModel)
    )
    is_multi_controlnet = isinstance(self.controlnet, MultiControlNetModel) or (
        is_compiled and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
    )

    # Check `image`
    if is_single_controlnet:
        self.check_image(image, prompt, prompt_embeds)
    elif is_multi_controlnet:
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)
    else:
        assert False

    # Check `controlnet_conditioning_scale`
    if is_single_controlnet:
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif is_multi_controlnet:
        if isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
            # The length check used to sit in an `elif` repeating the `isinstance(..., list)` test above,
            # so it could never run; it is now evaluated for every flat list of scales.
            if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                    " the same length as the number of controlnets"
                )
    else:
        assert False

    # Scalars are promoted to one-element lists so the window checks below are uniform.
    if not isinstance(control_guidance_start, (tuple, list)):
        control_guidance_start = [control_guidance_start]

    if not isinstance(control_guidance_end, (tuple, list)):
        control_guidance_end = [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")

    if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
        raise ValueError(
            "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
        )

    if ip_adapter_image_embeds is not None:
        if not isinstance(ip_adapter_image_embeds, list):
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
            )
        elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
            )
641
+
642
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
643
def check_image(self, image, prompt, prompt_embeds):
    """
    Validate the type and the batch size of a ControlNet conditioning image.

    Args:
        image (`PIL.Image.Image`, `torch.Tensor`, `np.ndarray` or a non-empty `list` of one of those):
            The conditioning image(s). For lists, only the first element's type is inspected.
        prompt (`str` or `List[str]`, *optional*):
            The prompt(s); used to derive the prompt batch size.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-computed prompt embeddings; `prompt_embeds.shape[0]` is used as the prompt batch size when
            `prompt` is neither a `str` nor a `list`.

    Raises:
        TypeError: If `image` is not one of the accepted types (an empty list is rejected here as well).
        ValueError: If the image batch size is not 1 and either differs from the prompt batch size or the
            prompt batch size cannot be determined.
    """
    # Guard the `image[0]` look-ups below: an empty list would otherwise raise an opaque IndexError.
    image_is_nonempty_list = isinstance(image, list) and len(image) > 0

    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_tensor = isinstance(image, torch.Tensor)
    image_is_np = isinstance(image, np.ndarray)
    image_is_pil_list = image_is_nonempty_list and isinstance(image[0], PIL.Image.Image)
    image_is_tensor_list = image_is_nonempty_list and isinstance(image[0], torch.Tensor)
    image_is_np_list = image_is_nonempty_list and isinstance(image[0], np.ndarray)

    if not any(
        (
            image_is_pil,
            image_is_tensor,
            image_is_np,
            image_is_pil_list,
            image_is_tensor_list,
            image_is_np_list,
        )
    ):
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
        )

    image_batch_size = 1 if image_is_pil else len(image)

    # `None` means "unknown"; previously the name was simply left unbound in that case.
    prompt_batch_size = None
    if prompt is not None and isinstance(prompt, str):
        prompt_batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]

    # A single image is always acceptable, so the prompt batch size only matters otherwise.
    if image_batch_size != 1:
        if prompt_batch_size is None:
            raise ValueError(
                "Cannot validate the image batch size: provide `prompt` as a `str` or `list`, or provide `prompt_embeds`."
            )
        if image_batch_size != prompt_batch_size:
            raise ValueError(
                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
            )
679
+
680
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.prepare_image
681
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    device,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """
    Preprocess a ControlNet conditioning image and expand it along the batch dimension.

    The image is run through `self.control_image_processor.preprocess`, repeated along dim 0, moved to
    `device`/`dtype`, and concatenated with itself when classifier-free guidance is active and `guess_mode`
    is off.

    Args:
        image: Input accepted by `self.control_image_processor.preprocess`.
        width: Target width forwarded to the processor.
        height: Target height forwarded to the processor.
        batch_size: Repeat count used when the preprocessed image has a batch size of 1.
        num_images_per_prompt: Repeat count used when the preprocessed image batch size is not 1.
        device: Device the result is moved to.
        dtype: Dtype the result is cast to.
        do_classifier_free_guidance: Whether classifier-free guidance is enabled.
        guess_mode: When `True`, the image is not duplicated for classifier-free guidance.

    Returns:
        `torch.Tensor`: The prepared conditioning image batch.
    """
    control = self.control_image_processor.preprocess(image, height=height, width=width)
    control = control.to(dtype=torch.float32)

    # A lone image is broadcast `batch_size` times; otherwise the image batch is assumed to
    # already match the prompt batch and is only repeated per generated image.
    repeats = batch_size if control.shape[0] == 1 else num_images_per_prompt
    control = control.repeat_interleave(repeats, dim=0)
    control = control.to(device=device, dtype=dtype)

    if guess_mode or not do_classifier_free_guidance:
        return control

    # One copy each for the unconditional and the conditional half of the batch.
    return torch.cat([control, control])
710
+
711
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
712
def get_timesteps(self, num_inference_steps, strength, device):
    """
    Select the tail of the scheduler's timesteps that corresponds to `strength`.

    Args:
        num_inference_steps: Number of steps the scheduler was configured with.
        strength: Fraction of the schedule to run; `int(num_inference_steps * strength)` steps are kept,
            capped at `num_inference_steps`.
        device: Unused by this method.

    Returns:
        `tuple`: The remaining timesteps (a slice of `self.scheduler.timesteps`) and the number of
        inference steps that remain.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    # The slice start is scaled by the scheduler's `order` attribute.
    begin_index = first_step * self.scheduler.order
    remaining_timesteps = self.scheduler.timesteps[begin_index:]

    # Not every scheduler exposes `set_begin_index`, hence the attribute check.
    if hasattr(self.scheduler, "set_begin_index"):
        self.scheduler.set_begin_index(begin_index)

    return remaining_timesteps, num_inference_steps - first_step
722
+
723
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
724
def prepare_latents(
    self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
):
    """
    Build the initial latents for image-to-image generation from `image`.

    If `image` already has 4 channels (`image.shape[1] == 4`) it is used as latents directly; otherwise it is
    encoded with `self.vae` and multiplied by `self.vae.config.scaling_factor`. The latents are then expanded
    to the effective batch size and, when `add_noise` is set, noised with `self.scheduler.add_noise`.

    Args:
        image: The input image. It must support `.to(device=..., dtype=...)` and `.shape`.
        timestep: Timestep(s) forwarded to `self.scheduler.add_noise`.
        batch_size: Prompt batch size; multiplied by `num_images_per_prompt` to get the effective batch size.
        num_images_per_prompt: Number of images generated per prompt.
        dtype: Dtype of the returned latents.
        device: Device of the returned latents.
        generator: A generator or a list with one generator per sample of the effective batch.
        add_noise: Whether to add scheduler noise to the latents.

    Returns:
        `torch.Tensor`: The prepared latents.

    Raises:
        ValueError: If `image` has an unsupported type, if the generator list length differs from the
            effective batch size, or if the image batch cannot be evenly duplicated to that batch size.
    """
    if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
        raise ValueError(
            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
        )

    # Release cached CUDA memory when a final offload hook is installed.
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        torch.cuda.empty_cache()

    image = image.to(device=device, dtype=dtype)

    batch_size = batch_size * num_images_per_prompt

    if image.shape[1] == 4:
        # Four channels: treated as already-encoded latents, so the VAE is skipped.
        init_latents = image
    else:
        # make sure the VAE is in float32 mode, as it overflows in float16
        if self.vae.config.force_upcast:
            image = image.float()
            self.vae.to(dtype=torch.float32)

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if isinstance(generator, list):
            # Each generator needs its own image slice. Duplicate the image batch first when it is
            # smaller than the effective batch; otherwise `image[i : i + 1]` would be empty for the
            # trailing indices and those generators would never be applied.
            if image.shape[0] < batch_size:
                if batch_size % image.shape[0] != 0:
                    raise ValueError(
                        f"Cannot duplicate `image` of batch size {image.shape[0]} to {batch_size} text prompts."
                    )
                image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)

            init_latents = torch.cat(
                [
                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                    for i in range(batch_size)
                ],
                dim=0,
            )
        else:
            init_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        if self.vae.config.force_upcast:
            self.vae.to(dtype)

        init_latents = init_latents.to(dtype)
        init_latents = self.vae.config.scaling_factor * init_latents

    if batch_size > init_latents.shape[0]:
        if batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        # expand init_latents for batch_size
        additional_image_per_prompt = batch_size // init_latents.shape[0]
        init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)

    if add_noise:
        noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)

    return init_latents
791
+
792
+
793
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
794
def prepare_latents_t2i(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """
    Create (or reuse) the initial noise latents for generation that does not start from an image.

    Args:
        batch_size: Number of latent samples.
        num_channels_latents: Number of latent channels.
        height: Image height in pixels; integer-divided by `self.vae_scale_factor`.
        width: Image width in pixels; integer-divided by `self.vae_scale_factor`.
        dtype: Dtype of freshly sampled latents.
        device: Device of the returned latents.
        generator: A generator or a list of `batch_size` generators.
        latents: Optional pre-generated latents; when given they are only moved to `device`.

    Returns:
        `torch.Tensor`: Latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    latent_shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        latents = latents.to(device)
    else:
        latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
810
+
811
+
812
+
813
def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
    """
    Build the additional time-id tensor and check it against the UNet's add-embedding input width.

    Args:
        original_size: Sequence concatenated first into the time ids.
        crops_coords_top_left: Sequence concatenated second into the time ids.
        target_size: Sequence concatenated last into the time ids.
        dtype: Dtype of the returned tensor.

    Returns:
        `torch.Tensor`: A tensor with a single row that holds all time ids.

    Raises:
        ValueError: If the computed embedding width differs from
            `self.unet.add_embedding.linear_1.in_features`.
    """
    time_id_values = list(original_size + crops_coords_top_left + target_size)

    # NOTE(review): 4096 is hard-coded; presumably the pooled text-embedding width - confirm.
    per_id_dim = self.unet.config.addition_time_embed_dim
    passed_add_embed_dim = per_id_dim * len(time_id_values) + 4096
    expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

    if passed_add_embed_dim != expected_add_embed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    return torch.tensor([time_id_values], dtype=dtype)
828
+
829
+
830
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
831
def upcast_vae(self):
    """
    Cast `self.vae` to float32, keeping selected sub-modules in the original dtype when possible.

    When the decoder's first mid-block attention uses `AttnProcessor2_0` or `XFormersAttnProcessor`,
    `post_quant_conv`, `decoder.conv_in` and `decoder.mid_block` are cast back to the dtype the VAE had
    before the upcast.
    """
    original_dtype = self.vae.dtype
    self.vae.to(dtype=torch.float32)

    attn_processor = self.vae.decoder.mid_block.attentions[0].processor
    if not isinstance(attn_processor, (AttnProcessor2_0, XFormersAttnProcessor)):
        return

    # if xformers or torch_2_0 is used attention block does not need
    # to be in float32 which can save lots of memory
    for module in (self.vae.post_quant_conv, self.vae.decoder.conv_in, self.vae.decoder.mid_block):
        module.to(original_dtype)
847
+
848
@property
def guidance_scale(self):
    """Return the guidance scale stored on the pipeline as `_guidance_scale`."""
    scale = self._guidance_scale
    return scale
851
+
852
@property
def clip_skip(self):
    """Return the `clip_skip` value stored on the pipeline as `_clip_skip`."""
    skip = self._clip_skip
    return skip
855
+
856
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
857
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
858
+ # corresponds to doing no classifier free guidance.
859
@property
def do_classifier_free_guidance(self):
    """Return whether classifier-free guidance is active, i.e. `_guidance_scale` is greater than 1."""
    scale = self._guidance_scale
    return scale > 1
862
+
863
@property
def cross_attention_kwargs(self):
    """Return the cross-attention kwargs stored on the pipeline as `_cross_attention_kwargs`."""
    attention_kwargs = self._cross_attention_kwargs
    return attention_kwargs
866
+
867
@property
def num_timesteps(self):
    """Return the number of timesteps stored on the pipeline as `_num_timesteps`."""
    count = self._num_timesteps
    return count
870
+
871
@torch.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    image: PipelineImageInput = None,
    control_image: PipelineImageInput = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 0.8,
    num_inference_steps: int = 50,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    guess_mode: bool = False,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.Tensor] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    pooled_prompt_embeds: Optional[torch.Tensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
    control_guidance_start: Union[float, List[float]] = 0.0,
    control_guidance_end: Union[float, List[float]] = 1.0,
    original_size: Tuple[int, int] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Tuple[int, int] = None,
    clip_skip: Optional[int] = None,
    callback_on_step_end: Optional[
        Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
    ] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    **kwargs,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass
            `prompt_embeds` instead.
        image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
                `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
            The initial image will be used as the starting point for the image generation process. Can also accept
            image latents as `image`, if passing latents directly, it will not be encoded again.
        control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
                `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
            The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
            the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also
            be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
            and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in
            init, images must be passed as a list such that each element of the list can be correctly batched for
            input to a single controlnet.
        height (`int`, *optional*, defaults to the size of control_image):
            The height in pixels of the generated image. Anything below 512 pixels won't work well for
            [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
            and checkpoints that are not specifically fine-tuned on low resolutions.
        width (`int`, *optional*, defaults to the size of control_image):
            The width in pixels of the generated image. Anything below 512 pixels won't work well for
            [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
            and checkpoints that are not specifically fine-tuned on low resolutions.
        strength (`float`, *optional*, defaults to 0.8):
            Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
            starting point and more noise is added the higher the `strength`. The number of denoising steps depends
            on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
            process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
            essentially ignores `image`.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 5.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        guess_mode (`bool`, *optional*, defaults to `False`):
            Forwarded to the ControlNet. When enabled together with classifier-free guidance, the ControlNet is
            run on the conditional batch only and zeros are used as residuals for the unconditional batch.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        ip_adapter_image (`PipelineImageInput`, *optional*):
            Optional image input to work with IP Adapters.
        ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
            Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
            IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
            contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
            provided, embeddings are computed from the `ip_adapter_image` input argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"`
            the denoised latents are returned without VAE decoding.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.8):
            The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
            to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
            corresponding scale as a list.
        control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
            The percentage of total steps at which the controlnet starts applying.
        control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
            The percentage of total steps at which the controlnet stops applying.
        original_size (`Tuple[int]`, *optional*):
            If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
            Defaults to the spatial size of the prepared control image if not specified. Part of SDXL's
            micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
            `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
            `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        target_size (`Tuple[int]`, *optional*):
            For most cases, `target_size` should be set to the desired height and width of the generated image. If
            not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
            section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.
        callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
            A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
            each denoising step during the inference with the following arguments: `callback_on_step_end(self:
            DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
            list of all tensors as specified by `callback_on_step_end_tensor_inputs`. If the callback returns a
            dict, the entries `latents`, `prompt_embeds`, `add_text_embeds` and `add_time_ids` replace the
            corresponding tensors for the following steps.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`
        containing the output images.
    """

    # Legacy callback arguments are still accepted through **kwargs but are deprecated.
    callback = kwargs.pop("callback", None)
    callback_steps = kwargs.pop("callback_steps", None)

    if callback is not None:
        deprecate(
            "callback",
            "1.0.0",
            "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
        )
    if callback_steps is not None:
        deprecate(
            "callback_steps",
            "1.0.0",
            "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
        )

    if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
        callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

    controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

    # align format for control guidance
    if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
        control_guidance_start = len(control_guidance_end) * [control_guidance_start]
    elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
        control_guidance_end = len(control_guidance_start) * [control_guidance_end]
    elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
        mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
        control_guidance_start, control_guidance_end = (
            mult * [control_guidance_start],
            mult * [control_guidance_end],
        )

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        control_image,
        strength,
        num_inference_steps,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
        ip_adapter_image,
        ip_adapter_image_embeds,
        controlnet_conditioning_scale,
        control_guidance_start,
        control_guidance_end,
        callback_on_step_end_tensor_inputs,
    )

    # A legacy `callback` without `callback_steps` used to fail on `i % None`; call it every step instead.
    legacy_callback_interval = 1 if callback_steps is None else callback_steps

    self._guidance_scale = guidance_scale
    self._clip_skip = clip_skip
    self._cross_attention_kwargs = cross_attention_kwargs

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
        controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)

    # 3.1. Encode input prompt
    text_encoder_lora_scale = (
        self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
    )
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        self.do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 3.2 Encode ip_adapter_image
    use_ip_adapter = ip_adapter_image is not None or ip_adapter_image_embeds is not None
    if use_ip_adapter:
        image_embeds = self.prepare_ip_adapter_image_embeds(
            ip_adapter_image,
            ip_adapter_image_embeds,
            device,
            batch_size * num_images_per_prompt,
            self.do_classifier_free_guidance,
        )

    # 4. Prepare image and controlnet_conditioning_image
    image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)

    if isinstance(controlnet, ControlNetModel):
        control_image = self.prepare_control_image(
            image=control_image,
            width=width,
            height=height,
            batch_size=batch_size * num_images_per_prompt,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
            dtype=controlnet.dtype,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            guess_mode=guess_mode,
        )
        height, width = control_image.shape[-2:]
    elif isinstance(controlnet, MultiControlNetModel):
        control_images = []

        for control_image_ in control_image:
            control_image_ = self.prepare_control_image(
                image=control_image_,
                width=width,
                height=height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
                dtype=controlnet.dtype,
                do_classifier_free_guidance=self.do_classifier_free_guidance,
                guess_mode=guess_mode,
            )

            control_images.append(control_image_)

        control_image = control_images
        height, width = control_image[0].shape[-2:]
    else:
        # An explicit error instead of `assert False`, which is stripped when Python runs with -O.
        raise TypeError(
            f"`controlnet` must be a `ControlNetModel` or a `MultiControlNetModel` but is {type(controlnet)}"
        )

    # 5. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    self._num_timesteps = len(timesteps)

    # 6. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    if latents is None:
        if strength >= 1.0:
            # Full strength: start from freshly sampled noise instead of noised image latents.
            latents = self.prepare_latents_t2i(
                batch_size * num_images_per_prompt,
                num_channels_latents,
                height,
                width,
                prompt_embeds.dtype,
                device,
                generator,
                latents,
            )
        else:
            latents = self.prepare_latents(
                image,
                latent_timestep,
                batch_size,
                num_images_per_prompt,
                prompt_embeds.dtype,
                device,
                generator,
                True,
            )

    # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7.1 Create tensor stating which controlnets to keep
    controlnet_keep = []
    for i in range(len(timesteps)):
        keeps = [
            1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
            for s, e in zip(control_guidance_start, control_guidance_end)
        ]
        controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)

    # 7.2 Prepare added time ids & embeddings
    if isinstance(control_image, list):
        original_size = original_size or control_image[0].shape[-2:]
    else:
        original_size = original_size or control_image.shape[-2:]
    target_size = target_size or (height, width)

    add_text_embeds = pooled_prompt_embeds
    add_time_ids = self._get_add_time_ids(
        original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
    )

    if self.do_classifier_free_guidance:
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

    # 8. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}

            # controlnet(s) inference
            if guess_mode and self.do_classifier_free_guidance:
                # Infer ControlNet only for the conditional batch.
                control_model_input = latents
                control_model_input = self.scheduler.scale_model_input(control_model_input, t)
                controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
                controlnet_added_cond_kwargs = {
                    "text_embeds": add_text_embeds.chunk(2)[1],
                    "time_ids": add_time_ids.chunk(2)[1],
                }
            else:
                control_model_input = latent_model_input
                controlnet_prompt_embeds = prompt_embeds
                controlnet_added_cond_kwargs = added_cond_kwargs

            if isinstance(controlnet_keep[i], list):
                cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
            else:
                controlnet_cond_scale = controlnet_conditioning_scale
                if isinstance(controlnet_cond_scale, list):
                    controlnet_cond_scale = controlnet_cond_scale[0]
                cond_scale = controlnet_cond_scale * controlnet_keep[i]

            down_block_res_samples, mid_block_res_sample = self.controlnet(
                control_model_input,
                t,
                encoder_hidden_states=controlnet_prompt_embeds,
                controlnet_cond=control_image,
                conditioning_scale=cond_scale,
                guess_mode=guess_mode,
                added_cond_kwargs=controlnet_added_cond_kwargs,
                return_dict=False,
            )

            if guess_mode and self.do_classifier_free_guidance:
                # Inferred ControlNet only for the conditional batch.
                # To apply the output of ControlNet to both the unconditional and conditional batches,
                # add 0 to the unconditional batch to keep it unchanged.
                down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

            if use_ip_adapter:
                added_cond_kwargs["image_embeds"] = image_embeds

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=self.cross_attention_kwargs,
                down_block_additional_residuals=down_block_res_samples,
                mid_block_additional_residual=mid_block_res_sample,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # Invoke the step-end callback; it was previously accepted and documented but never called.
            if callback_on_step_end is not None:
                callback_kwargs = {}
                # A plain loop (not a comprehension) so that `locals()` refers to this function's scope.
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                # Tolerate callbacks that return nothing; otherwise let them override loop tensors.
                if callback_outputs is not None:
                    latents = callback_outputs.get("latents", latents)
                    prompt_embeds = callback_outputs.get("prompt_embeds", prompt_embeds)
                    add_text_embeds = callback_outputs.get("add_text_embeds", add_text_embeds)
                    add_time_ids = callback_outputs.get("add_time_ids", add_time_ids)

            # update the progress bar and call the legacy callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % legacy_callback_interval == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

    # If we do sequential model offloading, let's offload unet and controlnet
    # manually for max memory savings
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.unet.to("cpu")
        self.controlnet.to("cpu")
        torch.cuda.empty_cache()

    if output_type != "latent":
        # make sure the VAE is in float32 mode, as it overflows in float16
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

        if needs_upcasting:
            self.upcast_vae()
            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

        latents = latents / self.vae.config.scaling_factor
        image = self.vae.decode(latents, return_dict=False)[0]

        # cast back to fp16 if needed
        if needs_upcasting:
            self.vae.to(dtype=torch.float16)

        image = self.image_processor.postprocess(image, output_type=output_type)
    else:
        # Latent output skips decoding and post-processing but now shares the common exit path,
        # so model hooks are freed and `return_dict` is honoured here as well.
        image = latents

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256.py ADDED
@@ -0,0 +1,841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import sys
15
+ import os
16
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
17
+ from kolors.models.modeling_chatglm import ChatGLMModel
18
+ from kolors.models.tokenization_chatglm import ChatGLMTokenizer
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
21
+ import torch
22
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
23
+ from transformers import XLMRobertaModel, ChineseCLIPTextModel
24
+
25
+ from diffusers.image_processor import VaeImageProcessor
26
+ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
27
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
28
+ from diffusers.models.attention_processor import (
29
+ AttnProcessor2_0,
30
+ LoRAAttnProcessor2_0,
31
+ LoRAXFormersAttnProcessor,
32
+ XFormersAttnProcessor,
33
+ )
34
+ from diffusers.schedulers import KarrasDiffusionSchedulers
35
+ from diffusers.utils import (
36
+ is_accelerate_available,
37
+ is_accelerate_version,
38
+ logging,
39
+ replace_example_docstring,
40
+ )
41
+ try:
42
+ from diffusers.utils import randn_tensor
43
+ except:
44
+ from diffusers.utils.torch_utils import randn_tensor
45
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
46
+ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
47
+
48
+
49
+
50
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
51
+
52
+ EXAMPLE_DOC_STRING = """
53
+ Examples:
54
+ ```py
55
+ >>> import torch
56
+ >>> from diffusers import StableDiffusionXLPipeline
57
+
58
+ >>> pipe = StableDiffusionXLPipeline.from_pretrained(
59
+ ... "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16
60
+ ... )
61
+ >>> pipe = pipe.to("cuda")
62
+
63
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
64
+ >>> image = pipe(prompt).images[0]
65
+ ```
66
+ """
67
+
68
+
69
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
70
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Blend the guided noise prediction with a std-matched copy of itself.

    The guided prediction `noise_cfg` is rescaled so that its per-sample standard deviation (taken over every
    dimension except the first) equals that of `noise_pred_text`; the result is then linearly mixed with the
    unscaled `noise_cfg` using `guidance_rescale` as the weight of the rescaled term. Follows Section 3.4 of
    [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg: guided noise prediction.
        noise_pred_text: text-conditioned noise prediction that supplies the reference statistics.
        guidance_rescale: mixing weight; `0.0` leaves `noise_cfg` unchanged.

    Returns:
        The mixed noise prediction, same shape as `noise_cfg`.
    """
    text_axes = list(range(1, noise_pred_text.ndim))
    cfg_axes = list(range(1, noise_cfg.ndim))
    sigma_text = noise_pred_text.std(dim=text_axes, keepdim=True)
    sigma_cfg = noise_cfg.std(dim=cfg_axes, keepdim=True)

    # Matching the text branch's statistics counteracts over-exposure from guidance.
    rescaled = noise_cfg * (sigma_text / sigma_cfg)

    # Keep a share of the raw guided prediction so images do not turn "plain looking".
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
82
+
83
+
84
+ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
85
+ r"""
86
+ Pipeline for text-to-image generation using Stable Diffusion XL.
87
+
88
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
89
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
90
+
91
+ In addition the pipeline inherits the following loading methods:
92
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
93
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
94
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
95
+
96
+ as well as the following saving methods:
97
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
98
+
99
+ Args:
100
+ vae ([`AutoencoderKL`]):
101
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
102
+ text_encoder ([`ChatGLMModel`]):
103
+ Frozen text-encoder. This pipeline encodes prompts with a ChatGLM model
104
+ (`kolors.models.modeling_chatglm.ChatGLMModel`) rather than the CLIP text
105
+ encoders used by the stock Stable Diffusion XL pipeline.
106
+
107
+ tokenizer (`ChatGLMTokenizer`):
108
+ Tokenizer of class
109
+ `kolors.models.tokenization_chatglm.ChatGLMTokenizer`.
110
+
111
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
112
+ scheduler ([`SchedulerMixin`]):
113
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
114
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
115
+ """
116
+
117
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: ChatGLMModel,
    tokenizer: ChatGLMTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    force_zeros_for_empty_prompt: bool = True,
):
    r"""
    Register the pipeline components and derive the image-size helpers.

    Args:
        vae: autoencoder used to decode latents into images.
        text_encoder: ChatGLM model that encodes the prompt.
        tokenizer: ChatGLM tokenizer paired with `text_encoder`.
        unet: conditional U-Net that denoises the latents.
        scheduler: scheduler used together with `unet`.
        force_zeros_for_empty_prompt: stored in the pipeline config; `encode_prompt` reads it to decide
            whether a missing negative prompt becomes all-zero embeddings.
    """
    super().__init__()

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "unet": unet,
        "scheduler": scheduler,
    }
    self.register_modules(**components)
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)

    # The scale factor is 2 ** (number of VAE block_out_channels entries - 1).
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.default_sample_size = self.unet.config.sample_size
139
+
140
+ # self.watermark = StableDiffusionXLWatermarker()
141
+
142
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
143
def enable_vae_slicing(self):
    r"""
    Turn on sliced VAE decoding by delegating to `self.vae.enable_slicing()`.

    With slicing enabled the VAE splits its input tensor into slices and decodes them in several steps, which
    lowers peak memory and permits larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
151
+
152
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
153
def disable_vae_slicing(self):
    r"""
    Turn off sliced VAE decoding by delegating to `self.vae.disable_slicing()`.

    After a previous `enable_vae_slicing` call this restores decoding in a single step.
    """
    vae = self.vae
    vae.disable_slicing()
159
+
160
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
161
def enable_vae_tiling(self):
    r"""
    Turn on tiled VAE processing by delegating to `self.vae.enable_tiling()`.

    With tiling enabled the VAE splits its input tensor into tiles and encodes/decodes them in several steps,
    which saves a large amount of memory and allows larger images to be processed.
    """
    vae = self.vae
    vae.enable_tiling()
169
+
170
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
171
def disable_vae_tiling(self):
    r"""
    Turn off tiled VAE processing by delegating to `self.vae.disable_tiling()`.

    After a previous `enable_vae_tiling` call this restores decoding in a single step.
    """
    vae = self.vae
    vae.disable_tiling()
177
+
178
def enable_sequential_cpu_offload(self, gpu_id=0):
    r"""
    Offload the unet, text encoder and VAE to CPU with accelerate's `cpu_offload`.

    Each of the three models is handed to `accelerate.cpu_offload` with `cuda:{gpu_id}` as the execution
    device. Compared with `enable_model_cpu_offload` this trades speed for larger memory savings.

    Args:
        gpu_id: index of the CUDA device the offloaded models execute on.

    Raises:
        ImportError: if accelerate is not installed or is older than 0.14.0.
    """
    if not (is_accelerate_available() and is_accelerate_version(">=", "0.14.0")):
        raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
    from accelerate import cpu_offload

    execution_device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        # Start from a CPU-resident pipeline and release cached GPU memory,
        # otherwise the savings would not be visible.
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()

    for module in (self.unet, self.text_encoder, self.vae):
        cpu_offload(module, execution_device)
199
+
200
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload whole models to CPU with accelerate's `cpu_offload_with_hook`.

    The text encoder, unet and VAE are chained in that order: each call receives the previous model's hook as
    `prev_module_hook`. The hook of the last model is kept on `self.final_offload_hook` so it can be offloaded
    manually. Memory savings are lower than with `enable_sequential_cpu_offload`, but performance is better.

    Args:
        gpu_id: index of the CUDA device the models execute on.

    Raises:
        ImportError: if accelerate is not installed or is older than 0.17.0.dev0.
    """
    if not (is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")):
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
    from accelerate import cpu_offload_with_hook

    execution_device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        # Start from a CPU-resident pipeline and release cached GPU memory,
        # otherwise the savings would not be visible.
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()

    previous_hook = None
    for module in (self.text_encoder, self.unet, self.vae):
        _, previous_hook = cpu_offload_with_hook(module, execution_device, prev_module_hook=previous_hook)

    # The last model in the chain is offloaded manually via this hook.
    self.final_offload_hook = previous_hook
229
+
230
@property
def _execution_device(self):
    r"""
    Device on which the pipeline's models are executed.

    If the unet carries no accelerate hook (`_hf_hook`), this is simply `self.device`. Otherwise the unet's
    sub-modules are scanned and the first hook exposing a non-`None` `execution_device` determines the
    result; `self.device` is the fallback when no such hook is found.
    """
    if not hasattr(self.unet, "_hf_hook"):
        return self.device

    for submodule in self.unet.modules():
        hook = getattr(submodule, "_hf_hook", None)
        hooked_device = getattr(hook, "execution_device", None)
        if hooked_device is not None:
            return torch.device(hooked_device)

    return self.device
248
+
249
def encode_prompt(
    self,
    prompt,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt=None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Tokenized inputs are moved to `device` (falling back to `self._execution_device`) before they are fed to
    the text encoder, so the method works on whichever device the pipeline actually runs on.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`, *optional*):
            torch device the tokenized inputs are moved to; defaults to `self._execution_device`
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If it is `None` and
            `self.config.force_zeros_for_empty_prompt` is set, all-zero negative embeddings are used;
            otherwise an empty string per batch entry is encoded.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. If not provided, text embeddings will be generated from `prompt`.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they will be generated from
            `negative_prompt`.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. If `prompt_embeds` is not provided they are generated from
            `prompt`.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. If `negative_prompt_embeds` is not provided they
            are generated from `negative_prompt`.
        lora_scale (`float`, *optional*):
            A lora scale that is stored on `self._lora_scale` when the pipeline is a `LoraLoaderMixin`.

    Returns:
        `tuple`: `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
        negative_pooled_prompt_embeds)`.

    Raises:
        TypeError: if `negative_prompt` and `prompt` are of different types.
        ValueError: if a list `negative_prompt` does not match the batch size of `prompt`.
    """
    device = device or self._execution_device

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer]
    text_encoders = [self.text_encoder]

    if prompt_embeds is None:
        prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, tokenizer)

            # Move the tokenized batch to the resolved device instead of a hard-coded "cuda",
            # so CPU / MPS / non-default GPU placements work.
            text_inputs = tokenizer(
                prompt,
                padding="max_length",
                max_length=256,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            output = text_encoder(
                input_ids=text_inputs["input_ids"],
                attention_mask=text_inputs["attention_mask"],
                position_ids=text_inputs["position_ids"],
                output_hidden_states=True,
            )
            # NOTE(review): the hidden states look sequence-first, hence the permute to
            # batch-first -- confirm against ChatGLMModel.
            prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
            # Last position of the final hidden state serves as the pooled embedding.
            pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()  # [batch_size, 4096]
            bs_embed, seq_len, _ = prompt_embeds.shape
            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
            prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

            prompt_embeds_list.append(prompt_embeds)

        prompt_embeds = prompt_embeds_list[0]

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        negative_prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer)

            # Pad the negative prompt to the same sequence length as the positive one.
            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            output = text_encoder(
                input_ids=uncond_input["input_ids"],
                attention_mask=uncond_input["attention_mask"],
                position_ids=uncond_input["position_ids"],
                output_hidden_states=True,
            )
            negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
            negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()  # [batch_size, 4096]

            if do_classifier_free_guidance:
                # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
                seq_len = negative_prompt_embeds.shape[1]

                negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)

                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
                negative_prompt_embeds = negative_prompt_embeds.view(
                    batch_size * num_images_per_prompt, seq_len, -1
                )

            negative_prompt_embeds_list.append(negative_prompt_embeds)

        negative_prompt_embeds = negative_prompt_embeds_list[0]

    # Duplicate the pooled embeddings once per requested image.
    bs_embed = pooled_prompt_embeds.shape[0]
    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
422
+
423
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
424
def prepare_extra_step_kwargs(self, generator, eta):
    r"""
    Collect the optional keyword arguments accepted by `self.scheduler.step`.

    Schedulers do not share one `step` signature, so the signature is inspected and only supported
    arguments are forwarded. `eta` (η in the DDIM paper, https://arxiv.org/abs/2010.02502, expected in
    [0, 1]) is passed only when `step` has an `eta` parameter; `generator` only when it has a `generator`
    parameter.

    Args:
        generator: random generator(s) to forward to the scheduler.
        eta: DDIM η value to forward to the scheduler.

    Returns:
        `dict` holding zero, one or both of the keys `"eta"` and `"generator"`.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())

    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
440
+
441
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
):
    r"""
    Validate the user-facing arguments of the pipeline call.

    Checks, in order: `height`/`width` divisible by 8; `callback_steps` a positive integer; exactly one of
    `prompt` / `prompt_embeds` given and `prompt` a `str` or `list`; `negative_prompt` and
    `negative_prompt_embeds` not both given; matching shapes when both embedding tensors are given; and
    that each embedding tensor comes with its pooled counterpart.

    Raises:
        ValueError: on the first check that fails.
    """
    has_prompt = prompt is not None
    has_prompt_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if has_prompt and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_prompt_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )

    if has_prompt_embeds and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if has_negative_embeds and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )
499
+
500
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
501
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    r"""
    Produce the initial latents for the denoising loop.

    When `latents` is not supplied, noise of shape
    `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)` is drawn with
    `randn_tensor`; supplied latents are only moved to `device`. Either way the result is multiplied by the
    scheduler's `init_noise_sigma`.

    Raises:
        ValueError: if `generator` is a list whose length differs from `batch_size`.
    """
    scale = self.vae_scale_factor
    latent_shape = (batch_size, num_channels_latents, height // scale, width // scale)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        latents = latents.to(device)
    else:
        latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
517
+
518
def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
    r"""
    Build the additional time-id tensor passed to the unet.

    The three tuples are concatenated into one list. Before the tensor is created, the embedding width
    implied by the unet config (`addition_time_embed_dim` per id plus 4096 for the pooled text embedding)
    is compared with the input width of `unet.add_embedding.linear_1`.

    Returns:
        `torch.Tensor` of shape `(1, len(ids))` with the requested `dtype`.

    Raises:
        ValueError: if the implied and expected embedding widths differ.
    """
    time_ids = list(original_size + crops_coords_top_left + target_size)

    # 4096 is the width of the pooled prompt embedding (see `encode_prompt`).
    per_id_dim = self.unet.config.addition_time_embed_dim
    passed_add_embed_dim = per_id_dim * len(time_ids) + 4096
    expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

    if passed_add_embed_dim != expected_add_embed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    return torch.tensor([time_ids], dtype=dtype)
533
+
534
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
535
def upcast_vae(self):
    r"""
    Cast the VAE to float32, keeping selected sub-modules in the previous dtype when possible.

    If the first attention processor of the decoder's mid block is a torch-2.0 or xformers processor, the
    attention block does not need float32, so `post_quant_conv`, `decoder.conv_in` and `decoder.mid_block`
    are moved back to the dtype the VAE had before the call to save memory.
    """
    previous_dtype = self.vae.dtype
    self.vae.to(dtype=torch.float32)

    mid_block_processor = self.vae.decoder.mid_block.attentions[0].processor
    memory_efficient_processors = (
        AttnProcessor2_0,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_0,
    )
    if not isinstance(mid_block_processor, memory_efficient_processors):
        return

    for submodule in (self.vae.post_quant_conv, self.vae.decoder.conv_in, self.vae.decoder.mid_block):
        submodule.to(previous_dtype)
553
+
554
+ @torch.no_grad()
555
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
556
+ def __call__(
557
+ self,
558
+ prompt: Union[str, List[str]] = None,
559
+ height: Optional[int] = None,
560
+ width: Optional[int] = None,
561
+ num_inference_steps: int = 50,
562
+ denoising_end: Optional[float] = None,
563
+ guidance_scale: float = 5.0,
564
+ negative_prompt: Optional[Union[str, List[str]]] = None,
565
+ num_images_per_prompt: Optional[int] = 1,
566
+ eta: float = 0.0,
567
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
568
+ latents: Optional[torch.FloatTensor] = None,
569
+ prompt_embeds: Optional[torch.FloatTensor] = None,
570
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
571
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
572
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
573
+ output_type: Optional[str] = "pil",
574
+ return_dict: bool = True,
575
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
576
+ callback_steps: int = 1,
577
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
578
+ guidance_rescale: float = 0.0,
579
+ original_size: Optional[Tuple[int, int]] = None,
580
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
581
+ target_size: Optional[Tuple[int, int]] = None,
582
+ use_dynamic_threshold: Optional[bool] = False,
583
+ ):
584
+ r"""
585
+ Function invoked when calling the pipeline for generation.
586
+
587
+ Args:
588
+ prompt (`str` or `List[str]`, *optional*):
589
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
590
+ instead.
591
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
592
+ The height in pixels of the generated image.
593
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
594
+ The width in pixels of the generated image.
595
+ num_inference_steps (`int`, *optional*, defaults to 50):
596
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
597
+ expense of slower inference.
598
+ denoising_end (`float`, *optional*):
599
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
600
+ completed before it is intentionally prematurely terminated. For instance, if denoising_end is set to
601
+ 0.7 and `num_inference_steps` is fixed at 50, the process will execute only 35 (i.e., 0.7 * 50)
602
+ Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
603
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
604
+ guidance_scale (`float`, *optional*, defaults to 7.5):
605
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
606
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
607
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
608
+ negative_prompt (`str` or `List[str]`, *optional*):
609
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
610
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
611
+ less than `1`).
612
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
613
+ The number of images to generate per prompt.
614
+ eta (`float`, *optional*, defaults to 0.0):
615
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
616
+ [`schedulers.DDIMScheduler`], will be ignored for others.
617
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
618
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
619
+ to make generation deterministic.
620
+ latents (`torch.FloatTensor`, *optional*):
621
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
622
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
623
+ tensor will ge generated by sampling using the supplied random `generator`.
624
+ prompt_embeds (`torch.FloatTensor`, *optional*):
625
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
626
+ provided, text embeddings will be generated from `prompt` input argument.
627
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
628
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
629
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
630
+ argument.
631
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
632
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
633
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
634
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
635
+ output_type (`str`, *optional*, defaults to `"pil"`):
636
+ The output format of the generate image. Choose between
637
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
638
+ return_dict (`bool`, *optional*, defaults to `True`):
639
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
640
+ callback (`Callable`, *optional*):
641
+ A function that will be called every `callback_steps` steps during inference. The function will be
642
+ callback_steps (`int`, *optional*, defaults to 1):
643
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
644
+ called at every step.
645
+ cross_attention_kwargs (`dict`, *optional*):
646
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
647
+ `self.processor` in
648
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
649
+ guidance_rescale (`float`, *optional*, defaults to 0.7):
650
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
651
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_rescale` is defined as `φ` in equation 16. of
652
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
653
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
654
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
655
+ TODO
656
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
657
+ TODO
658
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
659
+ TODO
660
+
661
+ Examples:
662
+
663
+ Returns:
664
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
665
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
666
+ `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
667
+ element is a list of `bool`s denoting whether the corresponding generated image likely represents
668
+ "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
669
+ """
670
+ # 0. Default height and width to unet
671
+ height = height or self.default_sample_size * self.vae_scale_factor
672
+ width = width or self.default_sample_size * self.vae_scale_factor
673
+
674
+ original_size = original_size or (height, width)
675
+ target_size = target_size or (height, width)
676
+
677
+ # 1. Check inputs. Raise error if not correct
678
+ self.check_inputs(
679
+ prompt,
680
+ height,
681
+ width,
682
+ callback_steps,
683
+ negative_prompt,
684
+ prompt_embeds,
685
+ negative_prompt_embeds,
686
+ pooled_prompt_embeds,
687
+ negative_pooled_prompt_embeds,
688
+ )
689
+
690
+ # 2. Define call parameters
691
+ if prompt is not None and isinstance(prompt, str):
692
+ batch_size = 1
693
+ elif prompt is not None and isinstance(prompt, list):
694
+ batch_size = len(prompt)
695
+ else:
696
+ batch_size = prompt_embeds.shape[0]
697
+
698
+ device = self._execution_device
699
+
700
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
701
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
702
+ # corresponds to doing no classifier free guidance.
703
+ do_classifier_free_guidance = guidance_scale > 1.0
704
+
705
+ # 3. Encode input prompt
706
+ text_encoder_lora_scale = (
707
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
708
+ )
709
+ (
710
+ prompt_embeds,
711
+ negative_prompt_embeds,
712
+ pooled_prompt_embeds,
713
+ negative_pooled_prompt_embeds,
714
+ ) = self.encode_prompt(
715
+ prompt,
716
+ device,
717
+ num_images_per_prompt,
718
+ do_classifier_free_guidance,
719
+ negative_prompt,
720
+ prompt_embeds=prompt_embeds,
721
+ negative_prompt_embeds=negative_prompt_embeds,
722
+ pooled_prompt_embeds=pooled_prompt_embeds,
723
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
724
+ lora_scale=text_encoder_lora_scale,
725
+ )
726
+
727
+ # 4. Prepare timesteps
728
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
729
+
730
+ timesteps = self.scheduler.timesteps
731
+
732
+ # 5. Prepare latent variables
733
+ num_channels_latents = self.unet.config.in_channels
734
+ latents = self.prepare_latents(
735
+ batch_size * num_images_per_prompt,
736
+ num_channels_latents,
737
+ height,
738
+ width,
739
+ prompt_embeds.dtype,
740
+ device,
741
+ generator,
742
+ latents,
743
+ )
744
+
745
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
746
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
747
+
748
+ # 7. Prepare added time ids & embeddings
749
+ add_text_embeds = pooled_prompt_embeds
750
+ add_time_ids = self._get_add_time_ids(
751
+ original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
752
+ )
753
+
754
+ if do_classifier_free_guidance:
755
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
756
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
757
+ add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
758
+
759
+ prompt_embeds = prompt_embeds.to(device)
760
+ add_text_embeds = add_text_embeds.to(device)
761
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
762
+
763
+ # 8. Denoising loop
764
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
765
+
766
+ # 7.1 Apply denoising_end
767
+ if denoising_end is not None:
768
+ num_inference_steps = int(round(denoising_end * num_inference_steps))
769
+ timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps]
770
+
771
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
772
+ for i, t in enumerate(timesteps):
773
+ # expand the latents if we are doing classifier free guidance
774
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
775
+
776
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
777
+
778
+ # predict the noise residual
779
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
780
+ noise_pred = self.unet(
781
+ latent_model_input,
782
+ t,
783
+ encoder_hidden_states=prompt_embeds,
784
+ cross_attention_kwargs=cross_attention_kwargs,
785
+ added_cond_kwargs=added_cond_kwargs,
786
+ return_dict=False,
787
+ )[0]
788
+
789
+ # perform guidance
790
+ if do_classifier_free_guidance:
791
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
792
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
793
+ if use_dynamic_threshold:
794
+ DynamicThresh = DynThresh(maxSteps=num_inference_steps, experiment_mode=0)
795
+ noise_pred = DynamicThresh.dynthresh(noise_pred_text,
796
+ noise_pred_uncond,
797
+ guidance_scale,
798
+ None)
799
+
800
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
801
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
802
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
803
+
804
+ # compute the previous noisy sample x_t -> x_t-1
805
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
806
+
807
+ # call the callback, if provided
808
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
809
+ progress_bar.update()
810
+ if callback is not None and i % callback_steps == 0:
811
+ callback(i, t, latents)
812
+
813
+ # make sure the VAE is in float32 mode, as it overflows in float16
814
+ # torch.cuda.empty_cache()
815
+ if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
816
+ self.upcast_vae()
817
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
818
+
819
+
820
+ if not output_type == "latent":
821
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
822
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
823
+ else:
824
+ image = latents
825
+ return StableDiffusionXLPipelineOutput(images=image)
826
+
827
+ # image = self.watermark.apply_watermark(image)
828
+ image = self.image_processor.postprocess(image, output_type=output_type)
829
+
830
+ # Offload last model to CPU
831
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
832
+ self.final_offload_hook.offload()
833
+
834
+ if not return_dict:
835
+ return (image,)
836
+
837
+ return StableDiffusionXLPipelineOutput(images=image)
838
+
839
+
840
+ if __name__ == "__main__":
841
+ pass
build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256_inpainting.py ADDED
@@ -0,0 +1,1790 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import (
22
+ CLIPImageProcessor,
23
+ CLIPTextModel,
24
+ CLIPTextModelWithProjection,
25
+ CLIPTokenizer,
26
+ CLIPVisionModelWithProjection,
27
+ )
28
+
29
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
30
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
31
+ from diffusers.loaders import (
32
+ FromSingleFileMixin,
33
+ IPAdapterMixin,
34
+ StableDiffusionXLLoraLoaderMixin,
35
+ TextualInversionLoaderMixin,
36
+ )
37
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
38
+ from diffusers.models.attention_processor import (
39
+ AttnProcessor2_0,
40
+ LoRAAttnProcessor2_0,
41
+ LoRAXFormersAttnProcessor,
42
+ XFormersAttnProcessor,
43
+ )
44
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
45
+ from diffusers.schedulers import KarrasDiffusionSchedulers
46
+ from diffusers.utils import (
47
+ USE_PEFT_BACKEND,
48
+ deprecate,
49
+ is_invisible_watermark_available,
50
+ is_torch_xla_available,
51
+ logging,
52
+ replace_example_docstring,
53
+ scale_lora_layers,
54
+ unscale_lora_layers,
55
+ )
56
+ from diffusers.utils.torch_utils import randn_tensor
57
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
58
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
59
+
60
+
61
+ if is_invisible_watermark_available():
62
+ from .watermark import StableDiffusionXLWatermarker
63
+
64
+ if is_torch_xla_available():
65
+ import torch_xla.core.xla_model as xm
66
+
67
+ XLA_AVAILABLE = True
68
+ else:
69
+ XLA_AVAILABLE = False
70
+
71
+
72
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
73
+
74
+
75
+ EXAMPLE_DOC_STRING = """
76
+ Examples:
77
+ ```py
78
+ >>> import torch
79
+ >>> from diffusers import StableDiffusionXLInpaintPipeline
80
+ >>> from diffusers.utils import load_image
81
+
82
+ >>> pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
83
+ ... "stabilityai/stable-diffusion-xl-base-1.0",
84
+ ... torch_dtype=torch.float16,
85
+ ... variant="fp16",
86
+ ... use_safetensors=True,
87
+ ... )
88
+ >>> pipe.to("cuda")
89
+
90
+ >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
91
+ >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
92
+
93
+ >>> init_image = load_image(img_url).convert("RGB")
94
+ >>> mask_image = load_image(mask_url).convert("RGB")
95
+
96
+ >>> prompt = "A majestic tiger sitting on a bench"
97
+ >>> image = pipe(
98
+ ... prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80
99
+ ... ).images[0]
100
+ ```
101
+ """
102
+
103
+
104
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
105
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4

    Args:
        noise_cfg: The guided noise prediction to rescale.
        noise_pred_text: The text-conditioned noise prediction whose per-sample standard deviation is the target.
        guidance_rescale: Blend factor between the rescaled prediction (`1.0`) and the untouched `noise_cfg` (`0.0`).

    Returns:
        The blend `guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg`.
    """
    # Standard deviations are taken over every dimension except the leading one.
    text_dims = list(range(1, noise_pred_text.ndim))
    cfg_dims = list(range(1, noise_cfg.ndim))
    std_text = noise_pred_text.std(dim=text_dims, keepdim=True)
    std_cfg = noise_cfg.std(dim=cfg_dims, keepdim=True)

    # Bring the guided prediction to the spread of the text prediction (fixes overexposure).
    rescaled = noise_cfg * (std_text / std_cfg)

    # Mix with the original guided prediction to avoid "plain looking" images.
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
117
+
118
+
119
def mask_pil_to_torch(mask, height, width):
    """
    Convert a mask (or list of masks) given as PIL images or numpy arrays into a single `torch.Tensor`.

    Args:
        mask: A `PIL.Image.Image`, an `np.ndarray`, or a list of either.
        height: Target height used when resizing PIL masks.
        width: Target width used when resizing PIL masks.

    Returns:
        The tensor produced by `torch.from_numpy` on the stacked masks. PIL masks are resized, converted to
        single-channel ("L") and scaled by `1 / 255`; numpy masks are stacked as given.
    """
    # A lone mask is handled as a one-element batch.
    masks = [mask] if isinstance(mask, (PIL.Image.Image, np.ndarray)) else mask

    if isinstance(masks, list) and isinstance(masks[0], PIL.Image.Image):
        resized = [m.resize((width, height), resample=PIL.Image.LANCZOS) for m in masks]
        # Each mask gains two leading axes before being concatenated along the first one.
        stacked = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in resized], axis=0)
        masks = stacked.astype(np.float32) / 255.0
    elif isinstance(masks, list) and isinstance(masks[0], np.ndarray):
        masks = np.concatenate([m[None, None, :] for m in masks], axis=0)

    return torch.from_numpy(masks)
133
+
134
+
135
def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. Both inputs are converted to
    ``torch.Tensor``; the ``mask`` is binarized at ``0.5`` and the ``image`` is cast to ``torch.float32``.

    Deprecated: a deprecation notice is issued through ``deprecate`` on every call; use
    ``VaeImageProcessor.preprocess`` instead.

    Args:
        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. Non-tensor inputs are
            rescaled with ``x / 127.5 - 1.0``; tensor inputs are only cast to ``torch.float32`` (their value range
            is not checked).
        mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. A ``torch.Tensor`` mask is
            binarized in place.
        height (int): Height that PIL inputs are resized to.
        width (int): Width that PIL inputs are resized to.
        return_image (bool): When ``True`` the processed image is returned as a third element.

    Raises:
        ValueError: ``image`` or ``mask`` is ``None``, or (tensor ``image`` path) the mask has values outside
            ``[0, 1]``.
        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not.
        AssertionError: (tensor ``image`` path) ``image`` and ``mask`` are not both 4-dimensional after
            batching, or their batch sizes differ.

    Returns:
        tuple[torch.Tensor]: ``(mask, masked_image)``, or ``(mask, masked_image, image)`` when ``return_image``
        is set. ``masked_image`` is ``None`` when the image has 4 channels.
    """

    # NOTE(review): `deprecate` may raise once the installed diffusers version reaches the removal version
    # given here ("0.30.0") - confirm against the pinned diffusers release.
    deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
    deprecate(
        "prepare_mask_and_masked_image",
        "0.30.0",
        deprecation_message,
    )
    if image is None:
        raise ValueError("`image` input cannot be undefined.")

    if mask is None:
        raise ValueError("`mask_image` input cannot be undefined.")

    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            mask = mask_pil_to_torch(mask, height, width)

        if image.ndim == 3:
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
        # NOTE: the spatial-dimension match between image and mask, and the [-1, 1] range of the image,
        # are not validated in this version.

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize mask (in place)
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1

        # Image as float32
        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)}) is not")
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            # resize all images w.r.t. the passed height and width
            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        # channels-last -> channels-first, then rescale with x / 127.5 - 1.0
        image = image.transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0

        mask = mask_pil_to_torch(mask, height, width)
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1

    if image.shape[1] == 4:
        # images are in latent space and thus can't
        # be masked set masked_image to None
        # we assume that the checkpoint is not an inpainting
        # checkpoint. TODO(Yiyi) - need to clean this up later
        masked_image = None
    else:
        # keep only the pixels outside the (binarized) mask
        masked_image = image * (mask < 0.5)

    # n.b. ensure backwards compatibility as old function does not return image
    if return_image:
        return mask, masked_image, image

    return mask, masked_image
251
+
252
+
253
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
254
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing either a `latent_dist` or a `latents` attribute.
        generator: Passed to `latent_dist.sample` when `sample_mode` is `"sample"`.
        sample_mode: `"sample"` draws from `latent_dist`; `"argmax"` returns `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or `encoder_output.latents`, in that order of preference.

    Raises:
        AttributeError: None of the branches above applies.
    """
    has_distribution = hasattr(encoder_output, "latent_dist")

    if has_distribution and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_distribution and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
265
+
266
+
267
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
268
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.

    Raises:
        ValueError: Both `timesteps` and `sigmas` are given, or the scheduler's `set_timesteps` does not accept
            the custom argument that was passed.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # Plain case: no custom schedule, the step count is passed through unchanged.
    if timesteps is None and sigmas is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedule: the scheduler must expose the matching keyword on `set_timesteps`.
    accepted_params = set(inspect.signature(scheduler.set_timesteps).parameters.keys())
    if timesteps is not None:
        if "timesteps" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    # With a custom schedule the step count is whatever the scheduler ended up with.
    schedule = scheduler.timesteps
    return schedule, len(schedule)
325
+
326
+
327
+ class StableDiffusionXLInpaintPipeline(
328
+ DiffusionPipeline,
329
+ StableDiffusionMixin,
330
+ TextualInversionLoaderMixin,
331
+ StableDiffusionXLLoraLoaderMixin,
332
+ FromSingleFileMixin,
333
+ IPAdapterMixin,
334
+ ):
335
+ r"""
336
+ Pipeline for text-to-image generation using Stable Diffusion XL.
337
+
338
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
339
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
340
+
341
+ The pipeline also inherits the following loading methods:
342
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
343
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
344
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
345
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
346
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
347
+
348
+ Args:
349
+ vae ([`AutoencoderKL`]):
350
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
351
+ text_encoder ([`CLIPTextModel`]):
352
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
353
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
354
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
355
+ text_encoder_2 ([` CLIPTextModelWithProjection`]):
356
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
357
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
358
+ specifically the
359
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
360
+ variant.
361
+ tokenizer (`CLIPTokenizer`):
362
+ Tokenizer of class
363
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
364
+ tokenizer_2 (`CLIPTokenizer`):
365
+ Second Tokenizer of class
366
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
367
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
368
+ scheduler ([`SchedulerMixin`]):
369
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
370
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
371
+ requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
372
+ Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
373
+ of `stabilityai/stable-diffusion-xl-refiner-1-0`.
374
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
375
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
376
+ `stabilityai/stable-diffusion-xl-base-1-0`.
377
+ add_watermarker (`bool`, *optional*):
378
+ Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
379
+ watermark output images. If not defined, it will default to True if the package is installed, otherwise no
380
+ watermarker will be used.
381
+ """
382
+
383
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
384
+
385
+ _optional_components = [
386
+ "tokenizer",
387
+ "tokenizer_2",
388
+ "text_encoder",
389
+ "text_encoder_2",
390
+ "image_encoder",
391
+ "feature_extractor",
392
+ ]
393
+ _callback_tensor_inputs = [
394
+ "latents",
395
+ "prompt_embeds",
396
+ "negative_prompt_embeds",
397
+ "add_text_embeds",
398
+ "add_time_ids",
399
+ "negative_pooled_prompt_embeds",
400
+ "add_neg_time_ids",
401
+ "mask",
402
+ "masked_image_latents",
403
+ ]
404
+
405
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    tokenizer_2: CLIPTokenizer = None,
    text_encoder_2: CLIPTextModelWithProjection = None,
    image_encoder: CLIPVisionModelWithProjection = None,
    feature_extractor: CLIPImageProcessor = None,
    requires_aesthetics_score: bool = False,
    force_zeros_for_empty_prompt: bool = True,
    add_watermarker: Optional[bool] = None,
):
    """
    Register the pipeline components and build the image/mask processors.

    The component arguments are registered via `register_modules`; `force_zeros_for_empty_prompt` and
    `requires_aesthetics_score` are stored in the pipeline config. When `add_watermarker` is `None` the
    watermarker is enabled only if `is_invisible_watermark_available()` reports the package as available.
    """
    super().__init__()

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        image_encoder=image_encoder,
        feature_extractor=feature_extractor,
        scheduler=scheduler,
    )
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)

    # Scale factor is derived from the number of VAE blocks: 2 ** (num_blocks - 1).
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)

    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    # Masks are not normalized; they are converted to grayscale and binarized.
    self.mask_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor,
        do_normalize=False,
        do_binarize=True,
        do_convert_grayscale=True,
    )

    # An explicit argument wins; otherwise fall back to package availability.
    if add_watermarker is None:
        add_watermarker = is_invisible_watermark_available()

    self.watermark = StableDiffusionXLWatermarker() if add_watermarker else None
447
+
448
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
449
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
    """Encode an image with `self.image_encoder` and build the matching unconditional output.

    Non-tensor inputs are first converted with `self.feature_extractor`.

    Returns:
        A `(conditional, unconditional)` pair, each repeated `num_images_per_prompt` times
        along dim 0. With `output_hidden_states` truthy the pair holds the penultimate hidden
        states of the image and of an all-zero image; otherwise it holds `image_embeds` and a
        zero tensor of the same shape.
    """
    encoder = self.image_encoder
    encoder_dtype = next(encoder.parameters()).dtype

    if not isinstance(image, torch.Tensor):
        image = self.feature_extractor(image, return_tensors="pt").pixel_values
    image = image.to(device=device, dtype=encoder_dtype)

    if not output_hidden_states:
        pooled = encoder(image).image_embeds
        pooled = pooled.repeat_interleave(num_images_per_prompt, dim=0)
        return pooled, torch.zeros_like(pooled)

    def _penultimate_states(pixels):
        # Second-to-last hidden layer, duplicated once per generated image.
        states = encoder(pixels, output_hidden_states=True).hidden_states[-2]
        return states.repeat_interleave(num_images_per_prompt, dim=0)

    cond_states = _penultimate_states(image)
    uncond_states = _penultimate_states(torch.zeros_like(image))
    return cond_states, uncond_states
472
+
473
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
474
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
    """Build one embedding tensor per IP-Adapter, from raw images or precomputed embeddings.

    When `ip_adapter_image_embeds` is given, each entry is tiled `num_images_per_prompt` times
    along dim 0; under classifier-free guidance the entry is first split in two halves
    (negative, positive), each half is tiled, and the halves are concatenated again.
    Otherwise each image is encoded via `self.encode_image`, one image per projection layer.

    Raises:
        ValueError: if the number of images differs from the number of projection layers.
    """
    prepared = []

    if ip_adapter_image_embeds is not None:
        for embeds in ip_adapter_image_embeds:
            if do_classifier_free_guidance:
                negative, positive = embeds.chunk(2)
                positive = positive.repeat(num_images_per_prompt, *([1] * len(positive.shape[1:])))
                negative = negative.repeat(num_images_per_prompt, *([1] * len(negative.shape[1:])))
                embeds = torch.cat([negative, positive])
            else:
                embeds = embeds.repeat(num_images_per_prompt, *([1] * len(embeds.shape[1:])))
            prepared.append(embeds)
        return prepared

    if not isinstance(ip_adapter_image, list):
        ip_adapter_image = [ip_adapter_image]

    projection_layers = self.unet.encoder_hid_proj.image_projection_layers
    if len(ip_adapter_image) != len(projection_layers):
        raise ValueError(
            f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(projection_layers)} IP Adapters."
        )

    for adapter_image, projection_layer in zip(ip_adapter_image, projection_layers):
        # Plain `ImageProjection` layers take pooled embeds; every other layer type gets hidden states.
        wants_hidden_states = not isinstance(projection_layer, ImageProjection)
        positive, negative = self.encode_image(adapter_image, device, 1, wants_hidden_states)
        positive = torch.stack([positive] * num_images_per_prompt, dim=0)
        negative = torch.stack([negative] * num_images_per_prompt, dim=0)

        if do_classifier_free_guidance:
            positive = torch.cat([negative, positive])
            positive = positive.to(device)

        prepared.append(positive)

    return prepared
524
+
525
def encode_prompt(
    self,
    prompt,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt=None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device; defaults to `self._execution_device`. Tokenizer outputs are moved to this device
            (previously hard-coded to `'cuda'`).
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. When given they are returned unchanged (not repeated per image).
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. When given they are returned unchanged.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Returned unchanged as the third output when `prompt_embeds`
            is supplied.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Returned unchanged as the fourth output when
            `negative_prompt_embeds` is supplied or guidance is disabled.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.

    Returns:
        `(prompt_embeds, negative_prompt_embeds, text_proj, negative_text_proj)`. The negative entries are
        `None` when guidance is disabled and no negative embeddings were supplied.

    Raises:
        TypeError: if `negative_prompt` and `prompt` have different types.
        ValueError: if a list `negative_prompt` does not match the prompt batch size.
    """
    device = device or self._execution_device

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # This pipeline uses a single tokenizer / text encoder pair.
    tokenizer = self.tokenizer
    text_encoder = self.text_encoder

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            prompt = self.maybe_convert_prompt(prompt, tokenizer)

        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=256,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        output = text_encoder(
            input_ids=text_inputs["input_ids"],
            attention_mask=text_inputs["attention_mask"],
            position_ids=text_inputs["position_ids"],
            output_hidden_states=True,
        )
        # The permute below moves dim 1 of the hidden states to the front
        # (presumably sequence-first encoder output — confirm against the text encoder).
        prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
        # Last position of the final layer serves as the pooled text projection.
        text_proj = output.hidden_states[-1][-1, :, :].clone()

        bs_embed, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
        text_proj = text_proj.repeat(1, num_images_per_prompt).view(bs_embed * num_images_per_prompt, -1)
    else:
        # Caller-supplied embeddings are passed through as given, pooled tensor included.
        text_proj = pooled_prompt_embeds

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if not do_classifier_free_guidance or negative_prompt_embeds is not None:
        # Nothing to compute: forward whatever pooled negative tensor the caller gave (possibly None).
        negative_text_proj = negative_pooled_prompt_embeds
    elif zero_out_negative_prompt:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        # Zero the pooled projection that is actually returned; `pooled_prompt_embeds` may be None.
        negative_text_proj = torch.zeros_like(text_proj) if text_proj is not None else None
    else:
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer)

        max_length = prompt_embeds.shape[1]
        uncond_input = tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        output = text_encoder(
            input_ids=uncond_input["input_ids"],
            attention_mask=uncond_input["attention_mask"],
            position_ids=uncond_input["position_ids"],
            output_hidden_states=True,
        )
        negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
        negative_text_proj = output.hidden_states[-1][-1, :, :].clone()

        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]
        negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)
        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
        negative_text_proj = negative_text_proj.repeat(1, num_images_per_prompt).view(
            batch_size * num_images_per_prompt, -1
        )

    return prompt_embeds, negative_prompt_embeds, text_proj, negative_text_proj
694
+
695
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
696
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Schedulers differ in their `step` signature, so `eta` and `generator` are only
    forwarded when the signature names them. `eta` (η) is the DDIM parameter from
    https://arxiv.org/abs/2010.02502 and should be between [0, 1].

    Returns:
        dict with an `"eta"` and/or `"generator"` entry, or an empty dict.
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
712
+
713
def check_inputs(
    self,
    prompt,
    prompt_2,
    image,
    mask_image,
    height,
    width,
    strength,
    callback_steps,
    output_type,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    ip_adapter_image=None,
    ip_adapter_image_embeds=None,
    callback_on_step_end_tensor_inputs=None,
    padding_mask_crop=None,
):
    """Validate the call arguments; returns `None` when everything is consistent.

    Raises:
        ValueError: on the first violated constraint (strength range, size divisibility,
            callback settings, mutually exclusive prompt/embedding arguments, mismatched
            embedding shapes, mask-crop requirements, IP-Adapter argument rules).
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if callback_steps is not None:
        if not isinstance(callback_steps, int) or callback_steps <= 0:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

    if callback_on_step_end_tensor_inputs is not None:
        unknown_inputs = [k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]
        if unknown_inputs:
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {unknown_inputs}"
            )

    # Prompt text and precomputed prompt embeddings are mutually exclusive, and one is required.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt_2 is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    if prompt_2 is not None and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    if negative_prompt_2 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    both_embeds_given = prompt_embeds is not None and negative_prompt_embeds is not None
    if both_embeds_given and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )

    # Mask cropping is only accepted with PIL inputs and PIL output.
    if padding_mask_crop is not None:
        if not isinstance(image, PIL.Image.Image):
            raise ValueError(
                f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
            )
        if not isinstance(mask_image, PIL.Image.Image):
            raise ValueError(
                f"The mask image should be a PIL image when inpainting mask crop, but is of type"
                f" {type(mask_image)}."
            )
        if output_type != "pil":
            raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")

    if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
        raise ValueError(
            "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
        )

    if ip_adapter_image_embeds is not None:
        if not isinstance(ip_adapter_image_embeds, list):
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
            )
        first_ndim = ip_adapter_image_embeds[0].ndim
        if first_ndim not in (3, 4):
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {first_ndim}D"
            )
816
+
817
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    add_noise=True,
    return_noise=False,
    return_image_latents=False,
):
    """Create the initial latents for denoising, optionally returning the noise and image latents.

    Args:
        batch_size, num_channels_latents, height, width: define the latent shape
            `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)`.
        dtype, device: dtype/device of the created tensors.
        generator: a generator or a list of `batch_size` generators.
        latents: optional caller-supplied latents; with `add_noise=True` they are treated as the
            noise and scaled by `scheduler.init_noise_sigma`.
        image: optional input image. A tensor with 4 channels is used directly as latents;
            otherwise it is encoded with the VAE when image latents are needed.
        timestep: noise timestep, required together with `image` when `is_strength_max` is False.
        is_strength_max: when True the initial latents are pure (scaled) noise.
        add_noise: when False the image latents themselves become the initial latents.
        return_noise, return_image_latents: append the noise / image latents to the result.

    Returns:
        Tuple `(latents[, noise][, image_latents])`.

    Raises:
        ValueError: on a generator-list length mismatch, when image/timestep are missing while
            `is_strength_max` is False, or when image latents are required but no usable
            `image` was provided.
    """
    shape = (
        batch_size,
        num_channels_latents,
        int(height) // self.vae_scale_factor,
        int(width) // self.vae_scale_factor,
    )
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # `image` defaults to None, so it must be checked before its shape is inspected.
    image_latents = None
    if image is not None and image.shape[1] == 4:
        image_latents = image.to(device=device, dtype=dtype)
        image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
    elif return_image_latents or (latents is None and not is_strength_max):
        if image is None:
            raise ValueError("`image` has to be provided when `return_image_latents` is True.")
        image = image.to(device=device, dtype=dtype)
        image_latents = self._encode_vae_image(image=image, generator=generator)
        image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

    if latents is None and add_noise:
        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        if is_strength_max:
            # pure noise: scale the initial latents by the scheduler's init sigma
            latents = noise * self.scheduler.init_noise_sigma
        else:
            # strength < 1: start from image latents noised to `timestep`
            latents = self.scheduler.add_noise(image_latents, noise, timestep)
    elif add_noise:
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma
    else:
        if image_latents is None:
            raise ValueError(
                "`add_noise=False` requires image latents, but no latent-space or encodable `image` was prepared."
            )
        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = image_latents.to(device)

    outputs = (latents,)

    if return_noise:
        outputs += (noise,)

    if return_image_latents:
        outputs += (image_latents,)

    return outputs
882
+
883
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode `image` with the VAE and return latents multiplied by `vae.config.scaling_factor`.

    When `vae.config.force_upcast` is set, the image and the VAE are switched to float32 for
    the encode and the VAE is afterwards cast to the image's original dtype. With a list of
    generators each sample is encoded separately with its own generator.
    """
    input_dtype = image.dtype
    upcast = self.vae.config.force_upcast

    if upcast:
        image = image.float()
        self.vae.to(dtype=torch.float32)

    if isinstance(generator, list):
        per_sample = []
        for idx in range(image.shape[0]):
            encoded = self.vae.encode(image[idx : idx + 1])
            per_sample.append(retrieve_latents(encoded, generator=generator[idx]))
        image_latents = torch.cat(per_sample, dim=0)
    else:
        image_latents = retrieve_latents(self.vae.encode(image), generator=generator)

    if upcast:
        self.vae.to(input_dtype)

    image_latents = image_latents.to(input_dtype)
    return self.vae.config.scaling_factor * image_latents
905
+
906
def prepare_mask_latents(
    self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
):
    """Resize the mask to latent resolution and prepare the masked-image latents.

    The mask is interpolated to `(height // vae_scale_factor, width // vae_scale_factor)`,
    tiled up to `batch_size`, and doubled under classifier-free guidance. A `masked_image`
    with 4 channels is used as latents directly; any other `masked_image` is encoded with
    the VAE. The latents get the same tiling/doubling.

    Returns:
        `(mask, masked_image_latents)`; the second entry is `None` when `masked_image` is `None`.

    Raises:
        ValueError: if `batch_size` is not a multiple of the number of masks / images.
    """
    # Resize before the dtype cast to avoid breaking with cpu_offload and half precision.
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = torch.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.to(device=device, dtype=dtype)

    # Duplicate for each generation per prompt, using an mps friendly method.
    if mask.shape[0] < batch_size:
        if batch_size % mask.shape[0] != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)

    if do_classifier_free_guidance:
        mask = torch.cat([mask] * 2)

    if masked_image is None:
        return mask, None

    if masked_image.shape[1] == 4:
        # Already in latent space.
        masked_image_latents = masked_image
    else:
        masked_image = masked_image.to(device=device, dtype=dtype)
        masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    if masked_image_latents.shape[0] < batch_size:
        if batch_size % masked_image_latents.shape[0] != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)

    if do_classifier_free_guidance:
        masked_image_latents = torch.cat([masked_image_latents] * 2)

    # Align device/dtype to prevent errors when concatenating with the latent model input.
    masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)

    return mask, masked_image_latents
958
+
959
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
960
def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
    """Select the scheduler timesteps to run and report how many steps remain.

    Without `denoising_start`, the first `num_inference_steps - int(num_inference_steps * strength)`
    steps (clamped to be non-negative) are skipped. With `denoising_start`, `strength` is
    ignored and only timesteps below the derived cutoff are kept.

    Returns:
        `(timesteps, num_inference_steps)` for the remaining schedule.
    """
    scheduler = self.scheduler

    if denoising_start is None:
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        remaining = scheduler.timesteps[t_start * scheduler.order :]
        return remaining, num_inference_steps - t_start

    # Strength is irrelevant when a start point is requested directly.
    timesteps = scheduler.timesteps[0:]
    train_steps = scheduler.config.num_train_timesteps
    discrete_timestep_cutoff = int(round(train_steps - (denoising_start * train_steps)))

    kept_steps = (timesteps < discrete_timestep_cutoff).sum().item()
    if scheduler.order == 2 and kept_steps % 2 == 0:
        # A 2nd order scheduler duplicates every timestep except the highest one, so an even
        # count would cut in the middle of a denoising step (between 1st and 2nd derivative).
        # Adding 1 makes the process end after the 2nd derivative step.
        kept_steps = kept_steps + 1

    # because t_n+1 >= t_n, we slice the timesteps starting from the end
    return timesteps[-kept_steps:], kept_steps
995
+
996
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
997
+ def _get_add_time_ids(
998
+ self,
999
+ original_size,
1000
+ crops_coords_top_left,
1001
+ target_size,
1002
+ aesthetic_score,
1003
+ negative_aesthetic_score,
1004
+ negative_original_size,
1005
+ negative_crops_coords_top_left,
1006
+ negative_target_size,
1007
+ dtype,
1008
+ text_encoder_projection_dim=None,
1009
+ ):
1010
+ if self.config.requires_aesthetics_score:
1011
+ add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
1012
+ add_neg_time_ids = list(
1013
+ negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
1014
+ )
1015
+ else:
1016
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
1017
+ add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
1018
+
1019
+ passed_add_embed_dim = (
1020
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + 4096
1021
+ )
1022
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
1023
+
1024
+ if (
1025
+ expected_add_embed_dim > passed_add_embed_dim
1026
+ and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
1027
+ ):
1028
+ raise ValueError(
1029
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
1030
+ )
1031
+ elif (
1032
+ expected_add_embed_dim < passed_add_embed_dim
1033
+ and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
1034
+ ):
1035
+ raise ValueError(
1036
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
1037
+ )
1038
+ elif expected_add_embed_dim != passed_add_embed_dim:
1039
+ raise ValueError(
1040
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
1041
+ )
1042
+
1043
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
1044
+ add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
1045
+
1046
+ return add_time_ids, add_neg_time_ids
1047
+
1048
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
1049
def upcast_vae(self):
    """Cast the VAE to float32, keeping parts of the decoder in the previous dtype when possible.

    If the first mid-block attention of the decoder uses a torch-2.0 or xformers attention
    processor, `post_quant_conv`, `decoder.conv_in` and `decoder.mid_block` are cast back to
    the dtype the VAE had before the upcast, which can save a lot of memory.
    """
    previous_dtype = self.vae.dtype
    self.vae.to(dtype=torch.float32)

    attn_processor = self.vae.decoder.mid_block.attentions[0].processor
    memory_efficient_processors = (
        AttnProcessor2_0,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_0,
    )
    if not isinstance(attn_processor, memory_efficient_processors):
        return

    # With xformers or torch 2.0 attention these blocks do not need to be in float32.
    self.vae.post_quant_conv.to(previous_dtype)
    self.vae.decoder.conv_in.to(previous_dtype)
    self.vae.decoder.mid_block.to(previous_dtype)
1067
+
1068
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
1069
def get_guidance_scale_embedding(
    self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    """
    Build sinusoidal embeddings of the guidance scale, see
    https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`torch.Tensor`):
            1-D tensor of guidance scales; it is multiplied by 1000 before embedding.
        embedding_dim (`int`, *optional*, defaults to 512):
            Dimension of the embeddings to generate.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            Data type of the generated embeddings.

    Returns:
        `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`; the sine half
        comes first, then the cosine half, plus one zero column when `embedding_dim` is odd.
    """
    assert len(w.shape) == 1
    scaled_w = w * 1000.0

    half_dim = embedding_dim // 2
    log_step = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    frequencies = torch.exp(torch.arange(half_dim, dtype=dtype) * -log_step)
    angles = scaled_w.to(dtype)[:, None] * frequencies[None, :]

    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad
        emb = torch.nn.functional.pad(emb, (0, 1))

    assert emb.shape == (w.shape[0], embedding_dim)
    return emb
1098
+
1099
@property
def guidance_scale(self):
    """Return `self._guidance_scale` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._guidance_scale
1102
+
1103
@property
def guidance_rescale(self):
    """Return `self._guidance_rescale` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._guidance_rescale
1106
+
1107
@property
def clip_skip(self):
    """Return `self._clip_skip` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._clip_skip
1110
+
1111
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    """Return True when `_guidance_scale > 1` and the UNet config has no `time_cond_proj_dim`."""
    return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
1117
+
1118
@property
def cross_attention_kwargs(self):
    """Return `self._cross_attention_kwargs` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._cross_attention_kwargs
1121
+
1122
@property
def denoising_end(self):
    """Return `self._denoising_end` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._denoising_end
1125
+
1126
@property
def denoising_start(self):
    """Return `self._denoising_start` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._denoising_start
1129
+
1130
@property
def num_timesteps(self):
    """Return `self._num_timesteps` (assigned outside this property; presumably by `__call__` — confirm)."""
    return self._num_timesteps
1133
+
1134
@property
def interrupt(self):
    """Return `self._interrupt` (assigned outside this property; presumably a flag to stop denoising — confirm)."""
    return self._interrupt
1137
+
1138
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    image: PipelineImageInput = None,
    mask_image: PipelineImageInput = None,
    masked_image_latents: torch.Tensor = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    padding_mask_crop: Optional[int] = None,
    strength: float = 0.9999,
    num_inference_steps: int = 50,
    timesteps: List[int] = None,
    sigmas: List[float] = None,
    denoising_start: Optional[float] = None,
    denoising_end: Optional[float] = None,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    negative_prompt_2: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.Tensor] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    pooled_prompt_embeds: Optional[torch.Tensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    original_size: Tuple[int, int] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Tuple[int, int] = None,
    negative_original_size: Optional[Tuple[int, int]] = None,
    negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
    negative_target_size: Optional[Tuple[int, int]] = None,
    aesthetic_score: float = 6.0,
    negative_aesthetic_score: float = 2.5,
    clip_skip: Optional[int] = None,
    callback_on_step_end: Optional[
        Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
    ] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    **kwargs,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        prompt_2 (`str` or `List[str]`, *optional*):
            Accepted for signature compatibility with the SDXL pipelines. In this method it is only passed to
            `check_inputs`; it is not forwarded to `encode_prompt`.
        image (`PIL.Image.Image`):
            `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
            be masked out with `mask_image` and repainted according to `prompt`.
        mask_image (`PIL.Image.Image`):
            `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
            repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
            to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
            instead of 3, so the expected shape would be `(B, H, W, 1)`.
        masked_image_latents (`torch.Tensor`, *optional*):
            When provided, it is used as the masked image handed to `prepare_mask_latents` instead of the value
            computed here as `init_image * (mask < 0.5)`.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The height in pixels of the generated image. This is set to 1024 by default for the best results.
            Anything below 512 pixels won't work well for
            [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
            and checkpoints that are not specifically fine-tuned on low resolutions.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The width in pixels of the generated image. This is set to 1024 by default for the best results.
            Anything below 512 pixels won't work well for
            [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
            and checkpoints that are not specifically fine-tuned on low resolutions.
        padding_mask_crop (`int`, *optional*, defaults to `None`):
            The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
            image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
            with the same aspect ratio as the image that contains all of the masked area, and then expand that
            area based on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded
            area before resizing to the original image size for inpainting. This is useful when the masked area is
            small while the image is large and contains information irrelevant for inpainting, such as background.
        strength (`float`, *optional*, defaults to 0.9999):
            Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
            between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
            `strength`. The number of denoising steps depends on the amount of noise initially added. When
            `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
            iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
            portion of the reference `image`. Note that in the case of `denoising_start` being declared as an
            integer, the value of `strength` will be ignored.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
            in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
            passed will be used. Must be in descending order.
        sigmas (`List[float]`, *optional*):
            Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
            their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
            will be used.
        denoising_start (`float`, *optional*):
            When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
            bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
            it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
            strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
            is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        denoising_end (`float`, *optional*):
            When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
            completed before it is intentionally prematurely terminated. As a result, the returned sample will
            still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
            denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
            final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
            forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            not greater than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            Accepted for signature compatibility with the SDXL pipelines. In this method it is only passed to
            `check_inputs`; it is not forwarded to `encode_prompt`.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
        ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
            Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
            IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
            contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
            provided, embeddings are computed from the `ip_adapter_image` input argument.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"` the
            latents are returned without VAE decoding.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
            `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
            explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
            `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
            `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            For most cases, `target_size` should be set to the desired height and width of the generated image. If
            not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
            section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            To negatively condition the generation process based on a specific image resolution. Part of SDXL's
            micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
            information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
        negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
            micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
            information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
        negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            To negatively condition the generation process based on a target image resolution. It should be the
            same as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section
            2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
            information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
        aesthetic_score (`float`, *optional*, defaults to 6.0):
            Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
            Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
            Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
            simulate an aesthetic score of the generated image by influencing the negative text condition.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings. In this method the
            value is stored on the pipeline (see the `clip_skip` property) but is not passed to `encode_prompt`.
        callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
            A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
            each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
            DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
            list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.
        **kwargs:
            May carry the deprecated `callback` and `callback_steps` arguments. If `callback` is given without
            `callback_steps`, it is invoked on every progress update.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        `tuple`. When returning a tuple, the first element is a list with the generated images.
    """

    # Legacy callback arguments are still accepted through **kwargs but are deprecated.
    callback = kwargs.pop("callback", None)
    callback_steps = kwargs.pop("callback_steps", None)

    if callback is not None:
        deprecate(
            "callback",
            "1.0.0",
            "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
        )
    if callback_steps is not None:
        deprecate(
            "callback_steps",
            "1.0.0",
            "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
        )

    # Fall back to an interval of 1 when `callback_steps` was not supplied; otherwise the modulo in the
    # denoising loop would raise a TypeError on `None` as soon as a legacy `callback` is passed alone.
    legacy_callback_interval = 1 if callback_steps is None else callback_steps

    # Callback objects declare which tensors they want; that overrides the argument.
    if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
        callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # 1. Check inputs
    self.check_inputs(
        prompt,
        prompt_2,
        image,
        mask_image,
        height,
        width,
        strength,
        callback_steps,
        output_type,
        negative_prompt,
        negative_prompt_2,
        prompt_embeds,
        negative_prompt_embeds,
        ip_adapter_image,
        ip_adapter_image_embeds,
        callback_on_step_end_tensor_inputs,
        padding_mask_crop,
    )

    # Stash the call configuration on the pipeline; the read-only properties expose these values.
    self._guidance_scale = guidance_scale
    self._guidance_rescale = guidance_rescale
    self._clip_skip = clip_skip
    self._cross_attention_kwargs = cross_attention_kwargs
    self._denoising_end = denoising_end
    self._denoising_start = denoising_start
    self._interrupt = False

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
    )

    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt=prompt,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=self.do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. set timesteps
    def denoising_value_valid(dnv):
        # Only floats strictly inside (0, 1) are treated as denoising fractions.
        return isinstance(dnv, float) and 0 < dnv < 1

    timesteps, num_inference_steps = retrieve_timesteps(
        self.scheduler, num_inference_steps, device, timesteps, sigmas
    )
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps,
        strength,
        device,
        denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
    )
    # check that number of inference steps is not < 1 - as this doesn't make sense
    if num_inference_steps < 1:
        raise ValueError(
            f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
            f" steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
        )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    # 5. Preprocess mask and image
    if padding_mask_crop is not None:
        crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
        resize_mode = "fill"
    else:
        crops_coords = None
        resize_mode = "default"

    # Keep the untouched input around; it is needed for the overlay step when cropping is enabled.
    original_image = image
    init_image = self.image_processor.preprocess(
        image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
    )
    init_image = init_image.to(dtype=torch.float32)

    mask = self.mask_processor.preprocess(
        mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
    )

    if masked_image_latents is not None:
        masked_image = masked_image_latents
    elif init_image.shape[1] == 4:
        # if images are in latent space, we can't mask it
        masked_image = None
    else:
        masked_image = init_image * (mask < 0.5)

    # 6. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    # A 4-channel UNet has no mask inputs, so the image latents are needed to re-blend the
    # unmasked region after every step of the denoising loop.
    return_image_latents = num_channels_unet == 4

    add_noise = self.denoising_start is None
    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        add_noise=add_noise,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 7. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        self.do_classifier_free_guidance,
    )

    # 8. Check that sizes of mask, masked image and latents match
    if num_channels_unet == 9:
        # default case for runwayml/stable-diffusion-inpainting
        num_channels_mask = mask.shape[1]
        num_channels_masked_image = masked_image_latents.shape[1]
        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                " `pipeline.unet` or your `mask_image` or `image` input."
            )
    elif num_channels_unet != 4:
        raise ValueError(
            f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
        )

    # 8.1 Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 9. Re-derive the pixel-space height and width from the latents actually in use
    height, width = latents.shape[-2:]
    height = height * self.vae_scale_factor
    width = width * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 10. Prepare added time ids & embeddings
    if negative_original_size is None:
        negative_original_size = original_size
    if negative_target_size is None:
        negative_target_size = target_size

    add_text_embeds = pooled_prompt_embeds
    if self.text_encoder_2 is None:
        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
    else:
        text_encoder_projection_dim = self.text_encoder_2.config.projection_dim

    add_time_ids, add_neg_time_ids = self._get_add_time_ids(
        original_size,
        crops_coords_top_left,
        target_size,
        aesthetic_score,
        negative_aesthetic_score,
        negative_original_size,
        negative_crops_coords_top_left,
        negative_target_size,
        dtype=prompt_embeds.dtype,
        text_encoder_projection_dim=text_encoder_projection_dim,
    )
    add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)

    if self.do_classifier_free_guidance:
        # Negative (unconditional) inputs go first so that `chunk(2)` in the loop yields (uncond, text).
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
        add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device)

    if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
        image_embeds = self.prepare_ip_adapter_image_embeds(
            ip_adapter_image,
            ip_adapter_image_embeds,
            device,
            batch_size * num_images_per_prompt,
            self.do_classifier_free_guidance,
        )

    # 11. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    if (
        self.denoising_end is not None
        and self.denoising_start is not None
        and denoising_value_valid(self.denoising_end)
        and denoising_value_valid(self.denoising_start)
        and self.denoising_start >= self.denoising_end
    ):
        raise ValueError(
            f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
            + f" {self.denoising_end} when using type float."
        )
    elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
        # Translate the fraction into a training-timestep cutoff and keep only the timesteps at or above it.
        discrete_timestep_cutoff = int(
            round(
                self.scheduler.config.num_train_timesteps
                - (self.denoising_end * self.scheduler.config.num_train_timesteps)
            )
        )
        num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
        timesteps = timesteps[:num_inference_steps]

    # 11.1 Optionally get Guidance Scale Embedding
    timestep_cond = None
    if self.unet.config.time_cond_proj_dim is not None:
        guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
        timestep_cond = self.get_guidance_scale_embedding(
            guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
        ).to(device=device, dtype=latents.dtype)

    self._num_timesteps = len(timesteps)
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # Once interrupted, the remaining timesteps are skipped without any further work.
            if self.interrupt:
                continue
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # concat latents, mask, masked_image_latents in the channel dimension
            if num_channels_unet == 9:
                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)

            # predict the noise residual
            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
            if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
                added_cond_kwargs["image_embeds"] = image_embeds
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=timestep_cond,
                cross_attention_kwargs=self.cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

            if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents_dtype = latents.dtype
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
            if latents.dtype != latents_dtype:
                if torch.backends.mps.is_available():
                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                    latents = latents.to(latents_dtype)

            if num_channels_unet == 4:
                # Re-blend: the unmasked region is replaced by the original image latents, re-noised
                # to the next timestep (left un-noised on the final step).
                init_latents_proper = image_latents
                if self.do_classifier_free_guidance:
                    init_mask, _ = mask.chunk(2)
                else:
                    init_mask = mask

                if i < len(timesteps) - 1:
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                latents = (1 - init_mask) * init_latents_proper + init_mask * latents

            if callback_on_step_end is not None:
                # NOTE: this must stay a plain loop; `locals()` has to resolve in this function's
                # scope, which a comprehension would not guarantee.
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                # The callback may hand back replacements for any of these tensors.
                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
                add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
                negative_pooled_prompt_embeds = callback_outputs.pop(
                    "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
                )
                add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
                add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
                mask = callback_outputs.pop("mask", mask)
                masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)

            # update the progress bar and call the legacy callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % legacy_callback_interval == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

            if XLA_AVAILABLE:
                xm.mark_step()

    if output_type != "latent":
        # make sure the VAE is in float32 mode, as it overflows in float16
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

        if needs_upcasting:
            self.upcast_vae()
            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
        elif latents.dtype != self.vae.dtype:
            if torch.backends.mps.is_available():
                # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                self.vae = self.vae.to(latents.dtype)

        # unscale/denormalize the latents
        # denormalize with the mean and std if available and not None
        has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
        has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
        if has_latents_mean and has_latents_std:
            latents_mean = (
                torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
            )
            latents_std = (
                torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
            )
            latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
        else:
            latents = latents / self.vae.config.scaling_factor

        image = self.vae.decode(latents, return_dict=False)[0]

        # cast back to fp16 if needed
        if needs_upcasting:
            self.vae.to(dtype=torch.float16)
    else:
        # NOTE(review): this branch returns an output object even when `return_dict` is False and exits
        # before `maybe_free_model_hooks()` runs below; left unchanged here - confirm whether intended.
        return StableDiffusionXLPipelineOutput(images=latents)

    # apply watermark if available
    if self.watermark is not None:
        image = self.watermark.apply_watermark(image)

    image = self.image_processor.postprocess(image, output_type=output_type)

    if padding_mask_crop is not None:
        # Paste each generated crop back onto the original, uncropped input image.
        image = [
            self.image_processor.apply_overlay(mask_image, original_image, generated, crops_coords)
            for generated in image
        ]

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256_ipadapter.py ADDED
@@ -0,0 +1,948 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import sys
15
+ import os
16
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
17
+ from kolors.models.modeling_chatglm import ChatGLMModel
18
+ from kolors.models.tokenization_chatglm import ChatGLMTokenizer
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
21
+ import torch
22
+ from transformers import (
23
+ CLIPImageProcessor,
24
+ CLIPTextModel,
25
+ CLIPTextModelWithProjection,
26
+ CLIPTokenizer,
27
+ CLIPVisionModelWithProjection,
28
+ )
29
+ from transformers import XLMRobertaModel, ChineseCLIPTextModel
30
+
31
+ from diffusers.image_processor import VaeImageProcessor,PipelineImageInput
32
+ from diffusers.loaders import (
33
+ FromSingleFileMixin,
34
+ IPAdapterMixin,
35
+ LoraLoaderMixin,
36
+ TextualInversionLoaderMixin
37
+ )
38
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel,ImageProjection
39
+ from diffusers.models.attention_processor import (
40
+ AttnProcessor2_0,
41
+ LoRAAttnProcessor2_0,
42
+ LoRAXFormersAttnProcessor,
43
+ XFormersAttnProcessor,
44
+ )
45
+ from diffusers.schedulers import KarrasDiffusionSchedulers
46
+ from diffusers.utils import (
47
+ is_accelerate_available,
48
+ is_accelerate_version,
49
+ logging,
50
+ replace_example_docstring,
51
+ )
52
+ try:
53
+ from diffusers.utils import randn_tensor
54
+ except:
55
+ from diffusers.utils.torch_utils import randn_tensor
56
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
57
+ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
58
+
59
+
60
+
61
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
62
+
63
+ EXAMPLE_DOC_STRING = """
64
+ Examples:
65
+ ```py
66
+ >>> import torch
67
+ >>> from diffusers import StableDiffusionXLPipeline
68
+
69
+ >>> pipe = StableDiffusionXLPipeline.from_pretrained(
70
+ ... "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16
71
+ ... )
72
+ >>> pipe = pipe.to("cuda")
73
+
74
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
75
+ >>> image = pipe(prompt).images[0]
76
+ ```
77
+ """
78
+
79
+
80
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
81
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Blend the guided noise prediction with a standard-deviation-matched copy of itself.

    Follows Section 3.4 of [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://arxiv.org/pdf/2305.08891.pdf): `noise_cfg` is scaled so that its per-sample standard
    deviation equals the one of `noise_pred_text`, then mixed linearly with the unscaled `noise_cfg`, where
    `guidance_rescale` is the weight given to the rescaled term.
    """

    def _per_sample_std(tensor):
        # reduce over every dimension except the leading one
        reduce_dims = list(range(1, tensor.ndim))
        return tensor.std(dim=reduce_dims, keepdim=True)

    text_std = _per_sample_std(noise_pred_text)
    cfg_std = _per_sample_std(noise_cfg)

    # std-matched guidance result (counters overexposure)
    rescaled = noise_cfg * (text_std / cfg_std)

    # keep part of the raw guidance result to avoid "plain looking" images
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
93
+
94
+
95
+ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, IPAdapterMixin,):
96
+ r"""
97
+ Pipeline for text-to-image generation using Stable Diffusion XL.
98
+
99
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
100
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
101
+
102
+ In addition the pipeline inherits the following loading methods:
103
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
104
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
105
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
106
+
107
+ as well as the following saving methods:
108
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
109
+
110
+ Args:
111
+ vae ([`AutoencoderKL`]):
112
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
113
+ text_encoder ([`CLIPTextModel`]):
114
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
115
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
116
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
117
+
118
+ tokenizer (`CLIPTokenizer`):
119
+ Tokenizer of class
120
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
121
+
122
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
123
+ scheduler ([`SchedulerMixin`]):
124
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
125
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
126
+ """
127
+
128
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: ChatGLMModel,
    tokenizer: ChatGLMTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    image_encoder: CLIPVisionModelWithProjection = None,
    feature_extractor: CLIPImageProcessor = None,
    force_zeros_for_empty_prompt: bool = True,
):
    """
    Register the pipeline components and derive the VAE-dependent helpers.

    `image_encoder` and `feature_extractor` may be left as `None`; they are registered as given.
    `force_zeros_for_empty_prompt` is stored in the pipeline config.
    """
    super().__init__()

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "unet": unet,
        "scheduler": scheduler,
        "image_encoder": image_encoder,
        "feature_extractor": feature_extractor,
    }
    self.register_modules(**components)
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)

    # the scale factor doubles for every VAE block after the first one
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.default_sample_size = self.unet.config.sample_size
154
+
155
+ # self.watermark = StableDiffusionXLWatermarker()
156
+
157
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
158
def enable_vae_slicing(self):
    r"""
    Switch on sliced VAE decoding by delegating to `self.vae.enable_slicing()`.

    With slicing enabled the VAE splits the input tensor in slices and decodes it in several steps, which saves
    memory and allows larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
166
+
167
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
168
def disable_vae_slicing(self):
    r"""
    Switch off sliced VAE decoding by delegating to `self.vae.disable_slicing()`.

    After a previous `enable_vae_slicing` call this returns to decoding in a single step.
    """
    vae = self.vae
    vae.disable_slicing()
174
+
175
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
176
def enable_vae_tiling(self):
    r"""
    Switch on tiled VAE processing by delegating to `self.vae.enable_tiling()`.

    With tiling enabled the VAE splits the input tensor into tiles and runs decoding and encoding in several
    steps, which saves a large amount of memory and allows larger images to be processed.
    """
    vae = self.vae
    vae.enable_tiling()
184
+
185
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
186
def disable_vae_tiling(self):
    r"""
    Switch off tiled VAE processing by delegating to `self.vae.disable_tiling()`.

    After a previous `enable_vae_tiling` call this returns to decoding in a single step.
    """
    vae = self.vae
    vae.disable_tiling()
192
+
193
def enable_sequential_cpu_offload(self, gpu_id=0):
    r"""
    Offload the pipeline models to CPU with accelerate's `cpu_offload`, significantly reducing memory usage.

    The unet, text_encoder, image_encoder (when one is registered) and vae are each handed to
    `accelerate.cpu_offload` targeting `cuda:{gpu_id}`. Offloading happens on a submodule basis. Memory savings
    are higher than with `enable_model_cpu_offload`, but performance is lower.

    Args:
        gpu_id (`int`, *optional*, defaults to 0):
            Index of the CUDA device the offloaded models execute on.

    Raises:
        ImportError: if accelerate is not installed or is older than v0.14.0.
    """
    if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
        from accelerate import cpu_offload
    else:
        raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")

    device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    # The image encoder is optional (defaults to None in `__init__`). When present it has to be offloaded as
    # well, otherwise it stays on CPU while `encode_image` moves its inputs to the execution device.
    for cpu_offloaded_model in [self.unet, self.text_encoder, self.image_encoder, self.vae]:
        if cpu_offloaded_model is not None:
            cpu_offload(cpu_offloaded_model, device)
214
+
215
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload the pipeline models to CPU with accelerate's `cpu_offload_with_hook`, reducing memory usage with a
    low impact on performance.

    Compared to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its
    `forward` method is called, and the model remains in GPU until the next model runs. Memory savings are lower
    than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of
    the `unet`.

    Models are chained in the order text_encoder, image_encoder, unet, vae; components that are `None` (the
    image encoder is optional) are skipped. The hook of the last model is stored in `self.final_offload_hook`.

    Args:
        gpu_id (`int`, *optional*, defaults to 0):
            Index of the CUDA device the models execute on.

    Raises:
        ImportError: if accelerate is not installed or is older than v0.17.0.
    """
    if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
        from accelerate import cpu_offload_with_hook
    else:
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

    device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    model_sequence = [self.text_encoder, self.image_encoder, self.unet, self.vae]

    hook = None
    for cpu_offloaded_model in model_sequence:
        # `image_encoder` defaults to None; passing None to accelerate would fail
        if cpu_offloaded_model is None:
            continue
        _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

    # We'll offload the last model manually.
    self.final_offload_hook = hook
244
+
245
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
    r"""
    Device on which the pipeline's models are executed.

    When the unet carries no accelerate hook (`_hf_hook`), this is simply `self.device`. Otherwise the first
    unet submodule whose hook exposes a non-`None` `execution_device` decides; if none does, `self.device` is
    returned.
    """
    if hasattr(self.unet, "_hf_hook"):
        for submodule in self.unet.modules():
            hook = getattr(submodule, "_hf_hook", None)
            target = getattr(hook, "execution_device", None)
            if target is not None:
                return torch.device(target)
    return self.device
263
+
264
def encode_prompt(
    self,
    prompt,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt=None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`, *optional*):
            torch device the tokenized inputs and the negative embeddings are moved to. Falls back to
            `self._execution_device` when not given.
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.

    Returns:
        A 4-tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
        negative_pooled_prompt_embeds)`. The negative entries are returned as passed in (possibly `None`) when
        `do_classifier_free_guidance` is `False`.

    Raises:
        TypeError: if `negative_prompt` and `prompt` are of different types.
        ValueError: if a list `negative_prompt` does not match the batch size of `prompt`.
    """
    device = device or self._execution_device

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer]
    text_encoders = [self.text_encoder]

    if prompt_embeds is None:
        prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, tokenizer)

            # move the tokenized inputs to the resolved device instead of a hard-coded "cuda"
            text_inputs = tokenizer(
                prompt,
                padding="max_length",
                max_length=256,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            output = text_encoder(
                input_ids=text_inputs["input_ids"],
                attention_mask=text_inputs["attention_mask"],
                position_ids=text_inputs["position_ids"],
                output_hidden_states=True,
            )
            # NOTE(review): the permute suggests hidden states arrive as (seq, batch, dim) -- confirm
            prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
            pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()  # [batch_size, 4096]
            bs_embed, seq_len, _ = prompt_embeds.shape
            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
            prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

            prompt_embeds_list.append(prompt_embeds)

        # only one text encoder is used, so there is nothing to concatenate
        prompt_embeds = prompt_embeds_list[0]

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        negative_prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer)

            # pad the negative prompt to the same sequence length as the positive embeddings
            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            output = text_encoder(
                input_ids=uncond_input["input_ids"],
                attention_mask=uncond_input["attention_mask"],
                position_ids=uncond_input["position_ids"],
                output_hidden_states=True,
            )
            negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
            negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()  # [batch_size, 4096]

            if do_classifier_free_guidance:
                # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
                seq_len = negative_prompt_embeds.shape[1]

                negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)

                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
                negative_prompt_embeds = negative_prompt_embeds.view(
                    batch_size * num_images_per_prompt, seq_len, -1
                )

            negative_prompt_embeds_list.append(negative_prompt_embeds)

        # only one text encoder is used, so there is nothing to concatenate
        negative_prompt_embeds = negative_prompt_embeds_list[0]

    # repeat the pooled embeddings once per requested image
    bs_embed = pooled_prompt_embeds.shape[0]
    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
437
+
438
+
439
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
440
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
    """
    Encode `image` with `self.image_encoder` and build the matching unconditional encoding.

    Non-tensor inputs are first converted to pixel values by `self.feature_extractor`. The image is moved to
    `device` and cast to the dtype of the image encoder's parameters.

    Returns a `(conditional, unconditional)` pair, each repeated `num_images_per_prompt` times along dim 0:
    with `output_hidden_states` truthy, the second-to-last hidden states for the image and for an all-zero
    image; otherwise the `image_embeds` output and a zero tensor of the same shape.
    """
    encoder_dtype = next(self.image_encoder.parameters()).dtype

    if not isinstance(image, torch.Tensor):
        image = self.feature_extractor(image, return_tensors="pt").pixel_values
    image = image.to(device=device, dtype=encoder_dtype)

    if not output_hidden_states:
        cond_embeds = self.image_encoder(image).image_embeds
        cond_embeds = cond_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        return cond_embeds, torch.zeros_like(cond_embeds)

    def _penultimate_states(pixels):
        states = self.image_encoder(pixels, output_hidden_states=True).hidden_states[-2]
        return states.repeat_interleave(num_images_per_prompt, dim=0)

    cond_states = _penultimate_states(image)
    uncond_states = _penultimate_states(torch.zeros_like(image))
    return cond_states, uncond_states
463
+
464
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
465
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
    """
    Build the per-adapter list of IP-Adapter image embeddings.

    When `ip_adapter_image_embeds` is given, each entry is used as is; under classifier-free guidance it is
    split in two halves along dim 0 (negative first, positive second). Otherwise every image in
    `ip_adapter_image` is encoded with `self.encode_image`, one image per projection layer of
    `self.unet.encoder_hid_proj`.

    Each resulting embedding is repeated `num_images_per_prompt` times along dim 0, prefixed with its repeated
    negative counterpart under classifier-free guidance, and moved to `device`.

    Raises:
        ValueError: if the number of images differs from the number of image projection layers.
    """
    positive_embeds = []
    negative_embeds = []

    if ip_adapter_image_embeds is not None:
        for packed_embeds in ip_adapter_image_embeds:
            if do_classifier_free_guidance:
                uncond_half, packed_embeds = packed_embeds.chunk(2)
                negative_embeds.append(uncond_half)
            positive_embeds.append(packed_embeds)
    else:
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
            )

        projection_layers = self.unet.encoder_hid_proj.image_projection_layers
        for adapter_image, projection_layer in zip(ip_adapter_image, projection_layers):
            # plain ImageProjection layers take pooled embeds, every other layer takes hidden states
            wants_hidden_states = not isinstance(projection_layer, ImageProjection)
            cond, uncond = self.encode_image(adapter_image, device, 1, wants_hidden_states)

            positive_embeds.append(cond[None, :])
            if do_classifier_free_guidance:
                negative_embeds.append(uncond[None, :])

    batched_embeds = []
    for index, cond in enumerate(positive_embeds):
        cond = torch.cat([cond] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            uncond = torch.cat([negative_embeds[index]] * num_images_per_prompt, dim=0)
            cond = torch.cat([uncond, cond], dim=0)
        batched_embeds.append(cond.to(device=device))

    return batched_embeds
509
+
510
+
511
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
512
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Collect the optional keyword arguments accepted by `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only forwarded when the
    signature names them. eta (η) corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and
    should be between [0, 1]; it is only used with the DDIMScheduler.
    """
    accepted_names = inspect.signature(self.scheduler.step).parameters
    candidates = {"eta": eta, "generator": generator}
    return {name: value for name, value in candidates.items() if name in accepted_names}
528
+
529
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
):
    """
    Validate the user-facing generation arguments.

    Raises:
        ValueError: if `height`/`width` are not divisible by 8, `callback_steps` is not a positive integer,
            `prompt` and `prompt_embeds` are both given or both missing, `prompt` is neither `str` nor `list`,
            `negative_prompt` and `negative_prompt_embeds` are both given, the two embedding tensors differ in
            shape, or embeddings are given without their pooled counterpart.
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    callback_steps_ok = isinstance(callback_steps, int) and callback_steps > 0
    if not callback_steps_ok:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    has_prompt = prompt is not None
    has_prompt_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    if has_prompt and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_prompt_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )

    if has_prompt_embeds and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if has_negative_embeds and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )
587
+
588
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
589
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """
    Return the initial latents, scaled by the scheduler's `init_noise_sigma`.

    Fresh noise of shape `(batch_size, num_channels_latents, height // vae_scale_factor,
    width // vae_scale_factor)` is drawn with `randn_tensor` when `latents` is `None`; otherwise the given
    `latents` are moved to `device`.

    Raises:
        ValueError: if `generator` is a list whose length differs from `batch_size`.
    """
    downscale = self.vae_scale_factor
    latent_shape = (batch_size, num_channels_latents, height // downscale, width // downscale)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        initial = latents.to(device)
    else:
        initial = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return initial * self.scheduler.init_noise_sigma
605
+
606
def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
    """
    Pack the size/crop conditioning values into a `(1, N)` tensor of the given `dtype`.

    The values are `original_size`, `crops_coords_top_left` and `target_size` concatenated in that order.
    Before building the tensor, the embedding width implied by the unet config (time-embed dim per value plus
    4096) is checked against the input width of `unet.add_embedding.linear_1`.

    Raises:
        ValueError: if the two widths disagree.
    """
    time_id_values = list(original_size + crops_coords_top_left + target_size)

    per_value_dim = self.unet.config.addition_time_embed_dim
    passed_add_embed_dim = per_value_dim * len(time_id_values) + 4096
    expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

    if passed_add_embed_dim != expected_add_embed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    return torch.tensor([time_id_values], dtype=dtype)
621
+
622
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
623
def upcast_vae(self):
    """
    Cast the VAE to float32, keeping selected decoder parts in the previous dtype when possible.

    If the first mid-block attention of the decoder uses a torch 2.0 or xformers attention processor, the
    attention block does not need to be in float32, so `post_quant_conv`, `decoder.conv_in` and
    `decoder.mid_block` are cast back to the dtype the VAE had before, which can save lots of memory.
    """
    previous_dtype = self.vae.dtype
    self.vae.to(dtype=torch.float32)

    attn_processor = self.vae.decoder.mid_block.attentions[0].processor
    memory_efficient_processors = (
        AttnProcessor2_0,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_0,
    )
    if not isinstance(attn_processor, memory_efficient_processors):
        return

    self.vae.post_quant_conv.to(previous_dtype)
    self.vae.decoder.conv_in.to(previous_dtype)
    self.vae.decoder.mid_block.to(previous_dtype)
641
+
642
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    denoising_end: Optional[float] = None,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    original_size: Optional[Tuple[int, int]] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Optional[Tuple[int, int]] = None,
    use_dynamic_threshold: Optional[bool] = False,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass
            `prompt_embeds` instead.
        height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        denoising_end (`float`, *optional*):
            When specified, the fraction (between 0.0 and 1.0) of the denoising process to run before stopping
            early. The number of steps becomes `int(round(denoising_end * num_inference_steps))` and the
            timestep schedule is truncated accordingly; e.g. 0.7 with 50 steps runs 35 steps. Intended for
            "Mixture of Denoisers" multi-pipeline setups, see [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        guidance_scale (`float`, *optional*, defaults to 5.0):
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Classifier-free guidance is enabled when
            `guidance_scale > 1`. Higher values push the image closer to the text `prompt`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. Forwarded to `encode_prompt` together
            with `negative_prompt_embeds`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a
            latents tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Concatenated in front of `pooled_prompt_embeds`
            when classifier-free guidance is enabled.
        ip_adapter_image (`PipelineImageInput`, *optional*):
            Reference image for the IP-Adapter. When this or `ip_adapter_image_embeds` is given, both are passed
            to `prepare_ip_adapter_image_embeds` and the result is supplied to the UNet as `image_embeds`.
        ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
            Pre-computed IP-Adapter image embeddings; see `ip_adapter_image`.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"`
            the denoised latents are returned without VAE decoding or post-processing.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function called as `callback(step, timestep, latents)` on steps that advance the progress bar
            and whose index is a multiple of `callback_steps`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
            Its `"scale"` entry, if present, is also forwarded to `encode_prompt` as `lora_scale`.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
            [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
            Guidance rescale factor should fix overexposure when using zero terminal SNR. Only applied when
            classifier-free guidance is enabled and the value is greater than 0.
        original_size (`Tuple[int]`, *optional*, defaults to `(height, width)`):
            Size values placed first in the `time_ids` conditioning tensor.
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            Crop offset values placed second in the `time_ids` conditioning tensor.
        target_size (`Tuple[int]`, *optional*, defaults to `(height, width)`):
            Size values placed last in the `time_ids` conditioning tensor.
        use_dynamic_threshold (`bool`, *optional*, defaults to `False`):
            When `True` and classifier-free guidance is enabled, the guided noise prediction is replaced by
            the output of `DynThresh(...).dynthresh(...)`.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        one-element `tuple` whose only element holds the generated images (or the latents when
        `output_type == "latent"`).
    """
    # 0. Default height and width to unet
    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Prepare added time ids & embeddings
    add_text_embeds = pooled_prompt_embeds
    add_time_ids = self._get_add_time_ids(
        original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
    )

    if do_classifier_free_guidance:
        # Unconditional half first, conditional half second; `chunk(2)` below relies on this order.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

    use_ip_adapter = ip_adapter_image is not None or ip_adapter_image_embeds is not None
    if use_ip_adapter:
        image_embeds = self.prepare_ip_adapter_image_embeds(
            ip_adapter_image,
            ip_adapter_image_embeds,
            device,
            batch_size * num_images_per_prompt,
            do_classifier_free_guidance,
        )

    # 8. Denoising loop
    # Computed before `denoising_end` shrinks `num_inference_steps`.
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    # 8.1 Apply denoising_end
    if denoising_end is not None:
        num_inference_steps = int(round(denoising_end * num_inference_steps))
        timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps]

    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
            if use_ip_adapter:
                added_cond_kwargs["image_embeds"] = image_embeds

            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                if use_dynamic_threshold:
                    DynamicThresh = DynThresh(maxSteps=num_inference_steps, experiment_mode=0)
                    noise_pred = DynamicThresh.dynthresh(
                        noise_pred_text,
                        noise_pred_uncond,
                        guidance_scale,
                        None,
                    )

            if do_classifier_free_guidance and guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # make sure the VAE is in float32 mode, as it overflows in float16
    if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
        self.upcast_vae()
        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

    if output_type != "latent":
        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)
    else:
        # Latent output skips decoding and post-processing but shares the offload/return path
        # below, so `return_dict=False` is honoured here as well.
        image = latents

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
945
+
946
+
947
+ if __name__ == "__main__":
948
+ pass
build/lib/kolors/pipelines/pipeline_stable_diffusion_xl_chatglm_256_ipadapter_FaceID.py ADDED
@@ -0,0 +1,951 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import sys
15
+ import os
16
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
17
+ from kolors.models.modeling_chatglm import ChatGLMModel
18
+ from kolors.models.tokenization_chatglm import ChatGLMTokenizer
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
21
+ import torch
22
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
23
+ from transformers import XLMRobertaModel, ChineseCLIPTextModel
24
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
25
+
26
+ from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
27
+ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
28
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
29
+ from diffusers.models.attention_processor import (
30
+ AttnProcessor2_0,
31
+ LoRAAttnProcessor2_0,
32
+ LoRAXFormersAttnProcessor,
33
+ XFormersAttnProcessor,
34
+ )
35
+
36
+ from diffusers.schedulers import KarrasDiffusionSchedulers
37
+ from diffusers.utils import (
38
+ is_accelerate_available,
39
+ is_accelerate_version,
40
+ logging,
41
+ replace_example_docstring,
42
+ )
43
+ try:
44
+ from diffusers.utils import randn_tensor
45
+ except:
46
+ from diffusers.utils.torch_utils import randn_tensor
47
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
48
+ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
49
+
50
+ from kolors.models.ipa_faceid_plus.ipa_faceid_plus import ProjPlusModel
51
+ from kolors.models.ipa_faceid_plus.attention_processor import IPAttnProcessor2_0 as IPAttnProcessor, AttnProcessor2_0 as AttnProcessor
52
+
53
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
54
+
55
+ EXAMPLE_DOC_STRING = """
56
+ Examples:
57
+ ```py
58
+ >>> import torch
59
+ >>> from diffusers import StableDiffusionXLPipeline
60
+
61
+ >>> pipe = StableDiffusionXLPipeline.from_pretrained(
62
+ ... "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16
63
+ ... )
64
+ >>> pipe = pipe.to("cuda")
65
+
66
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
67
+ >>> image = pipe(prompt).images[0]
68
+ ```
69
+ """
70
+
71
+
72
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
73
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Blend `noise_cfg` with a copy of itself rescaled to the per-sample std of `noise_pred_text`.

    Follows Section 3.4 of [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is the weight of the rescaled
    copy; `1 - guidance_rescale` is the weight of the untouched `noise_cfg`.
    """

    def _per_sample_std(tensor):
        # std over every dimension except the first, kept broadcastable
        return tensor.std(dim=list(range(1, tensor.ndim)), keepdim=True)

    # rescaling the guided prediction fixes overexposure
    std_ratio = _per_sample_std(noise_pred_text) / _per_sample_std(noise_cfg)
    rescaled = noise_cfg * std_ratio

    # mixing with the original guided prediction avoids "plain looking" images
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
85
+
86
+
87
+ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
88
+ r"""
89
+ Pipeline for text-to-image generation using Stable Diffusion XL.
90
+
91
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
92
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
93
+
94
+ In addition the pipeline inherits the following loading methods:
95
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
96
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
97
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
98
+
99
+ as well as the following saving methods:
100
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
101
+
102
+ Args:
103
+ vae ([`AutoencoderKL`]):
104
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
105
+ text_encoder ([`CLIPTextModel`]):
106
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
107
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
108
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
109
+
110
+ tokenizer (`CLIPTokenizer`):
111
+ Tokenizer of class
112
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
113
+
114
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
115
+ scheduler ([`SchedulerMixin`]):
116
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
117
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
118
+ """
119
+
120
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: ChatGLMModel,
    tokenizer: ChatGLMTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    force_zeros_for_empty_prompt: bool = True,
    face_clip_encoder: CLIPVisionModelWithProjection = None,
    face_clip_processor: CLIPImageProcessor = None,
):
    """Assemble the pipeline and build the FaceID-Plus image projection layer.

    Args:
        vae: Autoencoder registered as `vae`; its `config.block_out_channels` sets `vae_scale_factor`.
        text_encoder: ChatGLM model registered as `text_encoder`.
        tokenizer: ChatGLM tokenizer registered as `tokenizer`.
        unet: Denoising UNet registered as `unet`; its `config.sample_size` becomes `default_sample_size`.
        scheduler: Scheduler registered as `scheduler`.
        force_zeros_for_empty_prompt: Stored in the pipeline config.
        face_clip_encoder: CLIP vision model registered as `face_clip_encoder`. Required: its
            `config.hidden_size` sizes the projection layer.
        face_clip_processor: CLIP image processor registered as `face_clip_processor`.

    Raises:
        NotImplementedError: If `face_clip_encoder` is `None`.
    """
    super().__init__()

    # The FaceID-Plus projection layer cannot be built without the face CLIP encoder.
    # `NotImplemented` is a non-callable constant, so the proper exception class is raised here.
    if face_clip_encoder is None:
        raise NotImplementedError("face clip encoder is not provided...")

    #### image project with Q-former for FaceID-Plus
    # Kept as a plain attribute: it is not passed to `register_modules` below.
    self.image_proj_model = self.init_ip_adapter_proj_layer(
        clip_embeddings_dim=face_clip_encoder.config.hidden_size,
        num_tokens=6,
    )

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        face_clip_encoder=face_clip_encoder,
        face_clip_processor=face_clip_processor,
    )
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.default_sample_size = self.unet.config.sample_size
157
+
158
+ # self.watermark = StableDiffusionXLWatermarker()
159
+
160
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
161
def enable_vae_slicing(self):
    r"""
    Enable sliced VAE decoding by calling `self.vae.enable_slicing()`.

    When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
    steps. This is useful to save some memory and allow larger batch sizes.
    """
    self.vae.enable_slicing()
169
+
170
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
171
def disable_vae_slicing(self):
    r"""
    Disable sliced VAE decoding by calling `self.vae.disable_slicing()`.

    If `enable_vae_slicing` was previously invoked, this method will go back to computing decoding in one
    step.
    """
    self.vae.disable_slicing()
177
+
178
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
179
def enable_vae_tiling(self):
    r"""
    Enable tiled VAE decoding by calling `self.vae.enable_tiling()`.

    When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
    several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
    """
    self.vae.enable_tiling()
187
+
188
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
189
def disable_vae_tiling(self):
    r"""
    Disable tiled VAE decoding by calling `self.vae.disable_tiling()`.

    If `enable_vae_tiling` was previously invoked, this method will go back to computing decoding in one
    step.
    """
    self.vae.disable_tiling()
195
+
196
def enable_sequential_cpu_offload(self, gpu_id=0):
    r"""
    Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the unet,
    text_encoder, face_clip_encoder and vae are handed to accelerate's `cpu_offload` with `cuda:{gpu_id}` as the
    execution device. Note that offloading happens on a submodule basis. Memory savings are higher than with
    `enable_model_cpu_offload`, but performance is lower.

    Args:
        gpu_id (`int`, *optional*, defaults to 0):
            Index of the CUDA device the offloaded models execute on.

    Raises:
        ImportError: If `accelerate` is unavailable or older than v0.14.0.
    """
    if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
        from accelerate import cpu_offload
    else:
        raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")

    device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    # `face_clip_encoder` is included so this method covers the same models as
    # `enable_model_cpu_offload`; otherwise it would be left on the CPU without an offload hook.
    offload_targets = [self.unet, self.text_encoder, self.face_clip_encoder, self.vae]
    for cpu_offloaded_model in offload_targets:
        if cpu_offloaded_model is not None:
            cpu_offload(cpu_offloaded_model, device)
217
+
218
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload whole models to CPU with accelerate, trading a little speed for lower memory usage.

    Unlike `enable_sequential_cpu_offload`, each model is moved to the GPU as a unit when its `forward` runs
    and stays there until the next model in the chain runs. The chain is text_encoder, face_clip_encoder,
    unet, vae; the hook of the last model is kept in `self.final_offload_hook` so it can be offloaded
    manually.
    """
    if not (is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")):
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
    from accelerate import cpu_offload_with_hook

    target_device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    # Each hook is chained to the previous one via `prev_module_hook`.
    previous_hook = None
    for model in (self.text_encoder, self.face_clip_encoder, self.unet, self.vae):
        _, previous_hook = cpu_offload_with_hook(model, target_device, prev_module_hook=previous_hook)

    # We'll offload the last model manually.
    self.final_offload_hook = previous_hook
247
+
248
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
    r"""
    Device on which the pipeline's models execute.

    If the unet carries no `_hf_hook`, this is `self.device`. Otherwise the first unet submodule whose
    `_hf_hook.execution_device` is set determines the device, with `self.device` as the fallback when no
    submodule provides one.
    """
    if hasattr(self.unet, "_hf_hook"):
        for submodule in self.unet.modules():
            hook = getattr(submodule, "_hf_hook", None)
            execution_device = getattr(hook, "execution_device", None)
            if execution_device is not None:
                return torch.device(execution_device)
    return self.device
266
+
267
+ def encode_prompt(
268
+ self,
269
+ prompt,
270
+ device: Optional[torch.device] = None,
271
+ num_images_per_prompt: int = 1,
272
+ do_classifier_free_guidance: bool = True,
273
+ negative_prompt=None,
274
+ prompt_embeds: Optional[torch.FloatTensor] = None,
275
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
276
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
277
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
278
+ lora_scale: Optional[float] = None,
279
+ ):
280
+ r"""
281
+ Encodes the prompt into text encoder hidden states.
282
+
283
+ Args:
284
+ prompt (`str` or `List[str]`, *optional*):
285
+ prompt to be encoded
286
+ device: (`torch.device`):
287
+ torch device
288
+ num_images_per_prompt (`int`):
289
+ number of images that should be generated per prompt
290
+ do_classifier_free_guidance (`bool`):
291
+ whether to use classifier free guidance or not
292
+ negative_prompt (`str` or `List[str]`, *optional*):
293
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
294
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
295
+ less than `1`).
296
+ prompt_embeds (`torch.FloatTensor`, *optional*):
297
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
298
+ provided, text embeddings will be generated from `prompt` input argument.
299
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
300
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
301
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
302
+ argument.
303
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
304
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
305
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
306
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
307
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
308
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
309
+ input argument.
310
+ lora_scale (`float`, *optional*):
311
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
312
+ """
313
+ # from IPython import embed; embed(); exit()
314
+ device = device or self._execution_device
315
+
316
+ # set lora scale so that monkey patched LoRA
317
+ # function of text encoder can correctly access it
318
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
319
+ self._lora_scale = lora_scale
320
+
321
+ if prompt is not None and isinstance(prompt, str):
322
+ batch_size = 1
323
+ elif prompt is not None and isinstance(prompt, list):
324
+ batch_size = len(prompt)
325
+ else:
326
+ batch_size = prompt_embeds.shape[0]
327
+
328
+ # Define tokenizers and text encoders
329
+ tokenizers = [self.tokenizer]
330
+ text_encoders = [self.text_encoder]
331
+
332
+ if prompt_embeds is None:
333
+ # textual inversion: procecss multi-vector tokens if necessary
334
+ prompt_embeds_list = []
335
+ for tokenizer, text_encoder in zip(tokenizers, text_encoders):
336
+ if isinstance(self, TextualInversionLoaderMixin):
337
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
338
+
339
+ text_inputs = tokenizer(
340
+ prompt,
341
+ padding="max_length",
342
+ max_length=256,
343
+ truncation=True,
344
+ return_tensors="pt",
345
+ ).to('cuda')
346
+ output = text_encoder(
347
+ input_ids=text_inputs['input_ids'] ,
348
+ attention_mask=text_inputs['attention_mask'],
349
+ position_ids=text_inputs['position_ids'],
350
+ output_hidden_states=True)
351
+ prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
352
+ pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone() # [batch_size, 4096]
353
+ bs_embed, seq_len, _ = prompt_embeds.shape
354
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
355
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
356
+
357
+ prompt_embeds_list.append(prompt_embeds)
358
+
359
+ # prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
360
+ prompt_embeds = prompt_embeds_list[0]
361
+
362
+ # get unconditional embeddings for classifier free guidance
363
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
364
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
365
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
366
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
367
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
368
+ # negative_prompt = negative_prompt or ""
369
+ uncond_tokens: List[str]
370
+ if negative_prompt is None:
371
+ uncond_tokens = [""] * batch_size
372
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
373
+ raise TypeError(
374
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
375
+ f" {type(prompt)}."
376
+ )
377
+ elif isinstance(negative_prompt, str):
378
+ uncond_tokens = [negative_prompt]
379
+ elif batch_size != len(negative_prompt):
380
+ raise ValueError(
381
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
382
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
383
+ " the batch size of `prompt`."
384
+ )
385
+ else:
386
+ uncond_tokens = negative_prompt
387
+
388
+ negative_prompt_embeds_list = []
389
+ for tokenizer, text_encoder in zip(tokenizers, text_encoders):
390
+ # textual inversion: procecss multi-vector tokens if necessary
391
+ if isinstance(self, TextualInversionLoaderMixin):
392
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer)
393
+
394
+ max_length = prompt_embeds.shape[1]
395
+ uncond_input = tokenizer(
396
+ uncond_tokens,
397
+ padding="max_length",
398
+ max_length=max_length,
399
+ truncation=True,
400
+ return_tensors="pt",
401
+ ).to('cuda')
402
+ output = text_encoder(
403
+ input_ids=uncond_input['input_ids'] ,
404
+ attention_mask=uncond_input['attention_mask'],
405
+ position_ids=uncond_input['position_ids'],
406
+ output_hidden_states=True)
407
+ negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
408
+ negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone() # [batch_size, 4096]
409
+
410
+ if do_classifier_free_guidance:
411
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
412
+ seq_len = negative_prompt_embeds.shape[1]
413
+
414
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)
415
+
416
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
417
+ negative_prompt_embeds = negative_prompt_embeds.view(
418
+ batch_size * num_images_per_prompt, seq_len, -1
419
+ )
420
+
421
+ # For classifier free guidance, we need to do two forward passes.
422
+ # Here we concatenate the unconditional and text embeddings into a single batch
423
+ # to avoid doing two forward passes
424
+
425
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
426
+
427
+ # negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
428
+ negative_prompt_embeds = negative_prompt_embeds_list[0]
429
+
430
+ bs_embed = pooled_prompt_embeds.shape[0]
431
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
432
+ bs_embed * num_images_per_prompt, -1
433
+ )
434
+ if do_classifier_free_guidance:
435
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
436
+ bs_embed * num_images_per_prompt, -1
437
+ )
438
+
439
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
440
+
441
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
442
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that ``self.scheduler.step`` accepts.

    Schedulers do not share a single ``step`` signature, so the signature is
    inspected and only the arguments it actually declares are forwarded.

    Args:
        generator: Value forwarded as ``generator`` when ``step`` declares it.
        eta: DDIM η (https://arxiv.org/abs/2010.02502), expected to lie in
            [0, 1]. Forwarded as ``eta`` when ``step`` declares it; schedulers
            that do not declare it never receive it.

    Returns:
        dict: Subset of ``{"eta": eta, "generator": generator}`` matching the
        parameters of ``self.scheduler.step``.
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters
    optional_arguments = (("eta", eta), ("generator", generator))
    return {key: value for key, value in optional_arguments if key in step_parameters}
458
+
459
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
):
    """Validate the user-facing arguments of the pipeline call.

    Checks, in order: image size divisibility, ``callback_steps``, that exactly
    one of ``prompt`` / ``prompt_embeds`` is given (and ``prompt`` is a ``str``
    or ``list``), that ``negative_prompt`` and ``negative_prompt_embeds`` are
    not both given, that directly passed positive/negative embeddings share a
    shape, and that each embedding comes with its pooled counterpart.

    Returns:
        None. The method only validates.

    Raises:
        ValueError: On the first violated constraint.
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `None`, non-int values and non-positive ints are all rejected.
    if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    prompt_given = prompt is not None
    embeds_given = prompt_embeds is not None
    negative_embeds_given = negative_prompt_embeds is not None

    if prompt_given and embeds_given:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not prompt_given and not embeds_given:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt_given and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_embeds_given:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if embeds_given and negative_embeds_given and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )

    if embeds_given and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if negative_embeds_given and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )
517
+
518
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
519
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """Return the starting latents, scaled by the scheduler's initial noise sigma.

    Args:
        batch_size: Effective batch size of the latent tensor.
        num_channels_latents: Channel count of the latent tensor.
        height: Image height in pixels; divided by ``self.vae_scale_factor``.
        width: Image width in pixels; divided by ``self.vae_scale_factor``.
        dtype: Dtype used when fresh noise is sampled.
        device: Device for the sampled noise, or the target device of ``latents``.
        generator: Generator, or list of generators, passed to ``randn_tensor``.
        latents: Optional pre-made latents. They are only moved to ``device``;
            no dtype cast or shape check is applied here.

    Returns:
        The latents multiplied by ``self.scheduler.init_noise_sigma``.

    Raises:
        ValueError: If ``generator`` is a list whose length differs from ``batch_size``.
    """
    latent_shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )

    generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
    if generator_count_mismatch:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        initial_latents = latents.to(device)
    else:
        initial_latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return initial_latents * self.scheduler.init_noise_sigma
535
+
536
def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
    """Build the size/crop conditioning ids passed to the UNet as ``time_ids``.

    The three size tuples are concatenated in the order ``original_size``,
    ``crops_coords_top_left``, ``target_size``. Before the tensor is built, the
    embedding width implied by the UNet config is compared with the input
    width of ``self.unet.add_embedding.linear_1``.

    Args:
        original_size: Sequence concatenated first.
        crops_coords_top_left: Sequence concatenated second.
        target_size: Sequence concatenated last.
        dtype: Dtype of the returned tensor.

    Returns:
        torch.Tensor: Tensor with a single row holding the concatenated values.

    Raises:
        ValueError: If the computed embedding width differs from what the UNet expects.
    """
    id_values = list(original_size + crops_coords_top_left + target_size)

    # NOTE(review): 4096 is hard-coded; presumably the width of the pooled text
    # embedding (annotated as [batch_size, 4096] in `encode_prompt`) - confirm.
    computed_width = self.unet.config.addition_time_embed_dim * len(id_values) + 4096
    required_width = self.unet.add_embedding.linear_1.in_features

    if required_width != computed_width:
        raise ValueError(
            f"Model expects an added time embedding vector of length {required_width}, but a vector of {computed_width} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    return torch.tensor([id_values], dtype=dtype)
551
+
552
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
553
def upcast_vae(self):
    """Cast the VAE to float32, keeping some sub-modules in the previous dtype when possible.

    When the first mid-block attention of the decoder uses one of the torch 2.0
    or xformers attention processors, ``post_quant_conv``, ``decoder.conv_in``
    and ``decoder.mid_block`` are moved back to the dtype the VAE had before
    the upcast, which saves memory.
    """
    vae = self.vae
    previous_dtype = vae.dtype
    vae.to(dtype=torch.float32)

    memory_efficient_processors = (
        AttnProcessor2_0,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_0,
    )
    mid_attention_processor = vae.decoder.mid_block.attentions[0].processor
    if not isinstance(mid_attention_processor, memory_efficient_processors):
        return

    # if xformers or torch_2_0 is used, the attention block does not need
    # to be in float32, which can save lots of memory
    for module in (vae.post_quant_conv, vae.decoder.conv_in, vae.decoder.mid_block):
        module.to(previous_dtype)
571
+
572
+ #### set ip adapter module
573
def set_ip_adapter(self, device, num_tokens=6):
    """Replace the UNet attention processors with IP-Adapter aware ones.

    Processors whose name ends with ``attn1.processor`` get a plain
    ``AttnProcessor``; every other processor gets an ``IPAttnProcessor`` sized
    from the UNet block the processor name points at.

    Args:
        device: Device the new ``IPAttnProcessor`` modules are moved to.
        num_tokens: ``num_tokens`` passed to each ``IPAttnProcessor``.
    """
    unet = self.unet
    replacement_processors = {}

    for processor_name in unet.attn_processors.keys():
        if processor_name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        # The block index is the single character right after the prefix.
        if processor_name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif processor_name.startswith("up_blocks"):
            up_index = int(processor_name[len("up_blocks.")])
            hidden_size = list(reversed(unet.config.block_out_channels))[up_index]
        elif processor_name.startswith("down_blocks"):
            down_index = int(processor_name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[down_index]

        if cross_attention_dim is None:
            replacement_processors[processor_name] = AttnProcessor()
            continue

        ip_processor = IPAttnProcessor(
            hidden_size=hidden_size,
            cross_attention_dim=cross_attention_dim,
            scale=1.0,
            num_tokens=num_tokens,
        )
        replacement_processors[processor_name] = ip_processor.to(device, dtype=unet.dtype)

    unet.set_attn_processor(replacement_processors)
596
+
597
def init_ip_adapter_proj_layer(self, clip_embeddings_dim, num_tokens):
    """Create the projection model used for the fused face embeddings.

    Args:
        clip_embeddings_dim: Forwarded as ``clip_embeddings_dim``.
        num_tokens: Forwarded as ``num_tokens``.

    Returns:
        A new ``ProjPlusModel`` built with ``cross_attention_dim=4096`` and
        ``id_embeddings_dim=512`` (both fixed here).
    """
    projection_settings = {
        "cross_attention_dim": 4096,
        "id_embeddings_dim": 512,
        "clip_embeddings_dim": clip_embeddings_dim,
        "num_tokens": num_tokens,
    }
    return ProjPlusModel(**projection_settings)
605
+
606
+ #### load ip adapter model weight
607
def load_ip_adapter_faceid_plus(self, ip_faceid_model_path, device):
    """Load IP-Adapter FaceID weights into the projection model and the UNet processors.

    The checkpoint must contain the keys ``"image_proj"`` and
    ``"adapter_modules"``. ``self.image_proj_model`` has to exist already.

    Args:
        ip_faceid_model_path: Path handed to ``torch.load``.
        device: Device for the projection model and the new attention processors.
    """
    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # torch version / `weights_only` setting - only load trusted checkpoints.
    checkpoint = torch.load(ip_faceid_model_path, 'cpu')

    projection_model = self.image_proj_model
    projection_model.load_state_dict(checkpoint["image_proj"])
    projection_model.to(device, dtype=torch.float16)

    # Install fresh processors first, then fill them from the checkpoint.
    self.set_ip_adapter(device=device, num_tokens=6)
    adapter_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
    adapter_layers.load_state_dict(checkpoint["adapter_modules"])
615
+
616
+ #### get image embeddings ####
617
def get_clip_feat(self, face_crop_image, device):
    """Encode the face crop and return the second-to-last hidden state.

    Args:
        face_crop_image: Image input handed to ``self.face_clip_processor``.
        device: Device the processed pixel values are moved to (as float16).

    Returns:
        ``hidden_states[-2]`` of ``self.face_clip_encoder``, computed without
        gradient tracking.
    """
    processed = self.face_clip_processor(images=face_crop_image, return_tensors="pt")
    pixel_values = processed.pixel_values.to(device, dtype=torch.float16)

    with torch.no_grad():
        encoder_output = self.face_clip_encoder(pixel_values, output_hidden_states=True)

    return encoder_output.hidden_states[-2]
627
+
628
def get_fused_face_embedds(self, face_insightface_embeds, face_crop_image, num_images_per_prompt, device):
    """Project face identity and CLIP features into conditional and unconditional embeddings.

    Args:
        face_insightface_embeds: Identity embeddings, passed as the first
            argument of ``self.image_proj_model``.
        face_crop_image: Face crop handed to ``self.get_clip_feat``.
        num_images_per_prompt: How many times each embedding is tiled.
        device: Device for the CLIP features.

    Returns:
        tuple: ``(image_prompt_embeds, uncond_image_prompt_embeds)``. Both are
        reshaped to ``(batch * num_images_per_prompt, seq_len, -1)``. The
        unconditional one comes from projecting all-zero inputs.
    """
    with torch.inference_mode():
        clip_features = self.get_clip_feat(face_crop_image, device)
        clip_features = clip_features.to(device=device, dtype=torch.float16)

        conditional = self.image_proj_model(face_insightface_embeds, clip_features)
        unconditional = self.image_proj_model(
            torch.zeros_like(face_insightface_embeds),
            torch.zeros_like(clip_features),
        )
        batch, token_count, _ = conditional.shape

        def tile_per_prompt(embeds):
            # duplicate along the sequence axis, then fold the copies into the batch axis
            tiled = embeds.repeat(1, num_images_per_prompt, 1)
            return tiled.view(batch * num_images_per_prompt, token_count, -1)

        conditional = tile_per_prompt(conditional)
        unconditional = tile_per_prompt(unconditional)

    return (conditional, unconditional)
642
+
643
def set_face_fidelity_scale(self, scale):
    """Set ``scale`` on every ``IPAttnProcessor`` currently installed in the UNet.

    Args:
        scale: Value assigned to the ``scale`` attribute of each matching
            processor; other processor types are left untouched.
    """
    ip_processors = (
        processor
        for processor in self.unet.attn_processors.values()
        if isinstance(processor, IPAttnProcessor)
    )
    for processor in ip_processors:
        processor.scale = scale
647
+ ################################
648
+
649
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    denoising_end: Optional[float] = None,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    original_size: Optional[Tuple[int, int]] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Optional[Tuple[int, int]] = None,
    use_dynamic_threshold: Optional[bool] = False,
    face_crop_image: Optional[PipelineImageInput] = None,
    face_insightface_embeds: Optional[torch.FloatTensor] = None,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass
            `prompt_embeds` instead.
        height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        denoising_end (`float`, *optional*):
            When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
            completed before it is intentionally terminated. For instance, if `denoising_end` is set to 0.7 and
            `num_inference_steps` is fixed at 50, the process will execute only 35 (i.e., 0.7 * 50) steps. See
            [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        guidance_scale (`float`, *optional*, defaults to 5.0):
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Classifier-free guidance is enabled when
            `guidance_scale > 1`. Higher guidance scale encourages images that are closely linked to the text
            `prompt`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e.,
            when `guidance_scale` is not greater than `1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only forwarded
            to schedulers whose `step` accepts `eta`.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents to be used as inputs for image generation. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. If not provided, text embeddings will be generated from the `prompt`
            input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they will be generated from the
            `negative_prompt` input argument when guidance is enabled.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Required whenever `prompt_embeds` is passed.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Required whenever `negative_prompt_embeds` is passed.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format handed to `self.image_processor.postprocess`. With `"latent"` the raw latents
            are returned without VAE decoding.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return a `StableDiffusionXLPipelineOutput` instead of a plain tuple.
        callback (`Callable`, *optional*):
            A function called as `callback(step, timestep, latents)` during inference.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary passed to the UNet. Its optional `"scale"` entry is also used as the text
            encoder LoRA scale.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor `φ` from equation 16. of [Common Diffusion Noise Schedules and Sample Steps
            are Flawed](https://arxiv.org/pdf/2305.08891.pdf). Only applied when guidance is enabled and the
            value is greater than 0.
        original_size (`Tuple[int]`, *optional*, defaults to `(height, width)`):
            First component of the size conditioning ids passed to the UNet.
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            Second component of the size conditioning ids passed to the UNet.
        target_size (`Tuple[int]`, *optional*, defaults to `(height, width)`):
            Third component of the size conditioning ids passed to the UNet.
        use_dynamic_threshold (`bool`, *optional*, defaults to `False`):
            When guidance is enabled, replace the guided noise prediction with the result of
            `DynThresh.dynthresh`.
        face_crop_image (`PipelineImageInput`, *optional*):
            Face crop encoded by the CLIP image encoder. Only used together with `face_insightface_embeds`.
        face_insightface_embeds (`torch.FloatTensor`, *optional*):
            Face identity embeddings fed to the image projection model. Only used together with
            `face_crop_image`.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        one-element `tuple` holding the generated images. With `output_type="latent"` the output object is
        always returned.
    """
    # 0. Default height and width to unet
    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )

    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 3.1 Prepare fused face embeds and append them to the text tokens
    if face_crop_image is not None and face_insightface_embeds is not None:
        image_prompt_embeds, uncond_image_prompt_embeds = self.get_fused_face_embedds(
            face_insightface_embeds=face_insightface_embeds,
            face_crop_image=face_crop_image,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
        )

        prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
        # Without classifier-free guidance there may be no negative embeddings at
        # all; concatenating onto `None` would raise, so only extend them if present.
        if negative_prompt_embeds is not None:
            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)

    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Prepare added time ids & embeddings
    add_text_embeds = pooled_prompt_embeds
    add_time_ids = self._get_add_time_ids(
        original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
    )

    if do_classifier_free_guidance:
        # unconditional half first, conditional half second (matches `chunk(2)` below)
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

    # 8. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    # 8.1 Apply denoising_end
    if denoising_end is not None:
        num_inference_steps = int(round(denoising_end * num_inference_steps))
        timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps]

    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                if use_dynamic_threshold:
                    DynamicThresh = DynThresh(maxSteps=num_inference_steps, experiment_mode=0)
                    noise_pred = DynamicThresh.dynthresh(noise_pred_text,
                                                         noise_pred_uncond,
                                                         guidance_scale,
                                                         None)

            if do_classifier_free_guidance and guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # make sure the VAE is in float32 mode, as it overflows in float16
    if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
        self.upcast_vae()
        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

    if not output_type == "latent":
        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
    else:
        # Latent output skips decoding and post-processing and is always
        # wrapped in the output object, regardless of `return_dict`.
        image = latents
        return StableDiffusionXLPipelineOutput(images=image)

    image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
948
+
949
+
950
if __name__ == "__main__":
    # Running this module directly does nothing; it is meant to be imported.
    pass
controlnet/README.md ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+ ## <a name="Introduction"></a>📖 Introduction
5
+
6
+ We provide three ControlNet weights and inference code based on Kolors-Basemodel: Canny, Depth and Pose. You can find some example images below.
7
+
8
+
9
+ **1、ControlNet Demos**
10
+
11
+ <table >
12
+
13
+ <tr>
14
+ <td align="center">Condition Image </td>
15
+ <td align="center">Prompt </td>
16
+ <td align="center">Result Image </td>
17
+ </tr>
18
+
19
+ <tr>
20
+ <td align="center"><img src="outputs/Canny_dog_condition.jpg" width=400px/></td>
21
+ <td align="center"><font style="font-size:12px">全景,一只可爱的白色小狗坐在杯子里,看向镜头,动漫风格,3d渲染,辛烷值渲染。</p> Panorama of a cute white puppy sitting in a cup and looking towards the camera, anime style, 3d rendering, octane rendering. </font> </td>
22
+ <td align="center"><img src="outputs/Canny_dog.jpg" width=400px/></td>
23
+ </tr>
24
+
25
+ <tr>
26
+ <td align="center"><img src="outputs/Depth_woman_2_condition.jpg" width=400px/></td>
27
+ <td align="center"><font style="font-size:12px">新海诚风格,丰富的色彩,穿着绿色衬衫的女人站在田野里,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质。</p> Makoto Shinkai style, rich colors, a woman in a green shirt standing in the field, beautiful scenery, fresh and bright, mottled light and shadow, best quality, ultra-detailed, 8K quality. </font> </td>
28
+ <td align="center"><img src="outputs/Depth_woman_2.jpg" width=400px/></td>
29
+ </tr>
30
+
31
+ <tr>
32
+ <td align="center"><img src="outputs/Pose_woman_4_condition.jpg" width=400px/></td>
33
+ <td align="center"><font style="font-size:12px">一个穿着黑色运动外套、白色内搭,上面戴着项链的女子,站在街边,背景是红色建筑和绿树,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K。</p> A woman wearing a black sports jacket and a white top, adorned with a necklace, stands by the street, with a background of red buildings and green trees. high quality, ultra clear, colorful, ultra high resolution, best quality, 8k, HD, 4K. </font> </td>
34
+ <td align="center"><img src="outputs/Pose_woman_4.jpg" width=400px/></td>
35
+ </tr>
36
+
37
+
38
+
39
+ </table>
40
+
41
+
42
+
43
+ **2、ControlNet and IP-Adapter-Plus Demos**
44
+
45
+ We also provide joint inference code that combines Kolors-IP-Adapter-Plus with Kolors-ControlNet.
46
+
47
+ <table >
48
+ <tr>
49
+ <td align="center">Reference Image </td>
50
+ <td align="center">Condition Image </td>
51
+ <td align="center">Prompt </td>
52
+ <td align="center">Result Image </td>
53
+ </tr>
54
+
55
+ <tr>
56
+ <td align="center"><img src="../ipadapter/asset/2.png" width=400px/></td>
57
+ <td align="center"><img src="outputs/Depth_woman_2_condition.jpg" width=400px/></td>
58
+ <td align="center"><font style="font-size:12px">一个红色头发的女孩,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质。</p> A girl with red hair, beautiful scenery, fresh and bright, mottled light and shadow, best quality, ultra-detailed, 8K quality. </font> </td>
59
+ <td align="center"><img src="outputs/Depth_ipadapter_woman_2.jpg" width=400px/></td>
60
+ </tr>
61
+
62
+ <tr>
63
+ <td align="center"><img src="assets/woman_1.png" width=400px/></td>
64
+ <td align="center"><img src="outputs/Depth_1_condition.jpg" width=400px/></td>
65
+ <td align="center"><font style="font-size:12px">一个漂亮的女孩,最好的质量,超细节,8K画质。</p> A beautiful girl, best quality, super detail, 8K quality. </font> </td>
66
+ <td align="center"><img src="outputs/Depth_ipadapter_1.jpg" width=400px/></td>
67
+ </tr>
68
+
69
+ </table>
70
+
71
+ <br>
72
+
73
+
74
+
75
+
76
+ ## <a name="Evaluation"></a>📊 Evaluation
77
+ To evaluate the performance of the models, we compiled a test set of more than 200 images and text prompts. We invited several image experts to provide fair ratings for the results generated by the different models. The experts rated the generated images on four criteria: visual appeal, text faithfulness, conditional controllability, and overall satisfaction. Conditional controllability measures ControlNet's ability to preserve spatial structure, while the other criteria follow the evaluation standards of the BaseModel. The results are summarized in the tables below, where Kolors-ControlNet achieves better performance across the criteria.
78
+
79
+ **1、Canny**
80
+
81
+ | Model | Average Overall Satisfaction | Average Visual Appeal | Average Text Faithfulness | Average Conditional Controllability |
82
+ | :--------------: | :--------: | :--------: | :--------: | :--------: |
83
+ | SDXL-ControlNet-Canny | 3.14 | 3.63 | 4.37 | 2.84 |
84
+ | **Kolors-ControlNet-Canny** | **4.06** | **4.64** | **4.45** | **3.52** |
85
+
86
+
87
+
88
+ **2、Depth**
89
+
90
+ | Model | Average Overall Satisfaction | Average Visual Appeal | Average Text Faithfulness | Average Conditional Controllability |
91
+ | :--------------: | :--------: | :--------: | :--------: | :--------: |
92
+ | SDXL-ControlNet-Depth | 3.35 | 3.77 | 4.26 | 4.5 |
93
+ | **Kolors-ControlNet-Depth** | **4.12** | **4.12** | **4.62** | **4.6** |
94
+
95
+
96
+
97
+ **3、Pose**
98
+
99
+ | Model | Average Overall Satisfaction | Average Visual Appeal | Average Text Faithfulness | Average Conditional Controllability |
100
+ | :--------------: | :--------: | :--------: | :--------: | :--------: |
101
+ | SDXL-ControlNet-Pose | 1.70 | 2.78 | 4.05 | 1.98 |
102
+ | **Kolors-ControlNet-Pose** | **3.33** | **3.63** | **4.78** | **4.4** |
103
+
104
+
105
+ <font color=gray style="font-size:12px">*The [SDXL-ControlNet-Canny](https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0) and [SDXL-ControlNet-Depth](https://huggingface.co/diffusers/controlnet-depth-sdxl-1.0) load [DreamShaper-XL](https://civitai.com/models/112902?modelVersionId=351306) as backbone model.*</font>
106
+
107
+
108
+ <table >
109
+ <tr>
110
+ <td colspan="4" align="center">Compare Result</td>
111
+ </tr>
112
+
113
+ <tr>
114
+ <td align="center">Condition Image </td>
115
+ <td align="center">Prompt </td>
116
+ <td align="center">Kolors-ControlNet Result </td>
117
+ <td align="center">SDXL-ControlNet Result </td>
118
+ </tr>
119
+
120
+ <tr>
121
+ <td align="center"><img src="outputs/Canny_woman_1_condition.jpg" width=400px/></td>
122
+ <td align="center"><font style="font-size:12px">一个漂亮的女孩,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K。</p> A beautiful girl, high quality, ultra clear, colorful, ultra high resolution, best quality, 8k, HD, 4K. </font> </td>
123
+ <td align="center"><img src="outputs/Canny_woman_1.jpg" width=400px/></td>
124
+ <td align="center"><img src="outputs/Canny_woman_1_sdxl.jpg" width=400px/></td>
125
+ </tr>
126
+
127
+
128
+ <tr>
129
+ <td align="center"><img src="outputs/Depth_bird_condition.jpg" width=400px/></td>
130
+ <td align="center"><font style="font-size:12px">一只颜色鲜艳的小鸟,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K。</p> A colorful bird, high quality, ultra clear, colorful, ultra high resolution, best quality, 8k, HD, 4K. </font> </td>
131
+ <td align="center"><img src="outputs/Depth_bird.jpg" width=400px/></td>
132
+ <td align="center"><img src="outputs/Depth_bird_sdxl.jpg" width=400px/></td>
133
+ </tr>
134
+
135
+ <tr>
136
+ <td align="center"><img src="outputs/Pose_woman_3_condition.jpg" width=400px/></td>
137
+ <td align="center"><font style="font-size:12px">一位穿着紫色泡泡袖连衣裙、戴着皇冠和白色蕾丝手套的女孩双手托脸,高品质,超清晰,色彩鲜艳,超高分辨率 ,最佳品质,8k,高清,4K。</p> A girl wearing a purple puff-sleeve dress, with a crown and white lace gloves, is cupping her face with both hands. High quality, ultra-clear, vibrant colors, ultra-high resolution, best quality, 8k, HD, 4K. </font> </td>
138
+ <td align="center"><img src="outputs/Pose_woman_3.jpg" width=400px/></td>
139
+ <td align="center"><img src="outputs/Pose_woman_3_sdxl.jpg" width=400px/></td>
140
+ </tr>
141
+
142
+
143
+
144
+ </table>
145
+
146
+
147
+ ------
148
+
149
+
150
+ ## <a name="Usage"></a>🛠️ Usage
151
+
152
+ ### Requirements
153
+
154
+ The dependencies and installation are basically the same as the [Kolors-BaseModel](https://huggingface.co/Kwai-Kolors/Kolors).
155
+
156
+ <br>
157
+
158
+
159
+ ### Weights download
160
+ ```bash
161
+ # Canny - ControlNet
162
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Canny --local-dir weights/Kolors-ControlNet-Canny
163
+
164
+ # Depth - ControlNet
165
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Depth --local-dir weights/Kolors-ControlNet-Depth
166
+
167
+ # Pose - ControlNet
168
+ huggingface-cli download --resume-download Kwai-Kolors/Kolors-ControlNet-Pose --local-dir weights/Kolors-ControlNet-Pose
169
+ ```
170
+
171
+ If you intend to utilize the depth estimation network, please make sure to download its corresponding model weights.
172
+ ```bash
173
+ huggingface-cli download lllyasviel/Annotators ./dpt_hybrid-midas-501f0c75.pt --local-dir ./controlnet/annotator/ckpts
174
+ ```
175
+
176
+ Thanks to [DWPose](https://github.com/IDEA-Research/DWPose/tree/onnx?tab=readme-ov-file), you can utilize the pose estimation network. Please download the Pose model dw-ll_ucoco_384.onnx ([baidu](https://pan.baidu.com/s/1nuBjw-KKSxD_BkpmwXUJiw?pwd=28d7), [google](https://drive.google.com/file/d/12L8E2oAgZy4VACGSK9RaZBZrfgx7VTA2/view?usp=sharing)) and Det model yolox_l.onnx ([baidu](https://pan.baidu.com/s/1fpfIVpv5ypo4c1bUlzkMYQ?pwd=mjdn), [google](https://drive.google.com/file/d/1w9pXC8tT0p9ndMN-CArp1__b2GbzewWI/view?usp=sharing)). Then please put them into `controlnet/annotator/ckpts/`.
177
+
178
+
179
+ ### Inference
180
+
181
+
182
+ **a. Using canny ControlNet:**
183
+
184
+ ```bash
185
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_1.png 一个漂亮的女孩,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Canny
186
+
187
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/dog.png 全景,一只可爱的白色小狗坐在杯子里,看向镜头,动漫风格,3d渲染,辛烷值渲染 Canny
188
+
189
+ # The image will be saved to "controlnet/outputs/"
190
+ ```
191
+
192
+ **b. Using depth ControlNet:**
193
+
194
+ ```bash
195
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_2.png 新海诚风格,丰富的色彩,穿着绿色衬衫的女人站在田野里,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质 Depth
196
+
197
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/bird.png 一只颜色鲜艳的小鸟,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Depth
198
+
199
+ # The image will be saved to "controlnet/outputs/"
200
+ ```
201
+
202
+ **c. Using pose ControlNet:**
203
+
204
+ ```bash
205
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_3.png 一位穿着紫色泡泡袖连衣裙、戴着皇冠和白色蕾丝手套的女孩双手托脸,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Pose
206
+
207
+ python ./controlnet/sample_controlNet.py ./controlnet/assets/woman_4.png 一个穿着黑色运动外套、白色内搭,上面戴着项链的女子,站在街边,背景是红色建筑和绿树,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K Pose
208
+
209
+ # The image will be saved to "controlnet/outputs/"
210
+ ```
211
+
212
+
213
+ **d. Using depth ControlNet + IP-Adapter-Plus:**
214
+
215
+ If you intend to utilize the kolors-ip-adapter-plus, please make sure to download its corresponding model weights.
216
+
217
+ ```bash
218
+ python ./controlnet/sample_controlNet_ipadapter.py ./controlnet/assets/woman_2.png ./ipadapter/asset/2.png 一个红色头发的女孩,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质 Depth
219
+
220
+ python ./controlnet/sample_controlNet_ipadapter.py ./ipadapter/asset/1.png ./controlnet/assets/woman_1.png 一个漂亮的女孩,最好的质量,超细节,8K画质 Depth
221
+
222
+ # The image will be saved to "controlnet/outputs/"
223
+ ```
224
+
225
+ <br>
226
+
227
+
228
+ ### Acknowledgments
229
+ - Thanks to [ControlNet](https://github.com/lllyasviel/ControlNet) for providing the codebase.
230
+
231
+ <br>
232
+
233
+
controlnet/annotator/__init__.py ADDED
File without changes
controlnet/annotator/canny/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import cv2
2
+
3
+
4
class CannyDetector:
    """Callable wrapper that forwards to OpenCV's Canny edge detector."""

    def __call__(self, img, low_threshold, high_threshold):
        """Return ``cv2.Canny(img, low_threshold, high_threshold)``.

        Args:
            img: Image handed unchanged to ``cv2.Canny``.
            low_threshold: First threshold argument of ``cv2.Canny``.
            high_threshold: Second threshold argument of ``cv2.Canny``.

        Returns:
            Whatever ``cv2.Canny`` returns for these arguments (the edge map).
        """
        edge_map = cv2.Canny(img, low_threshold, high_threshold)
        return edge_map
controlnet/annotator/dwpose/__init__.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+ # 4th Edited by ControlNet (added face and correct hands)
6
+
7
+ import os
8
+ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
9
+
10
+ import torch
11
+ import numpy as np
12
+ from . import util
13
+ from .wholebody import Wholebody
14
+
15
def draw_pose(pose, H, W):
    """Draw the body and hand keypoints of ``pose`` on a black canvas.

    Args:
        pose: Mapping with the keys ``'bodies'``, ``'faces'`` and ``'hands'``;
            ``pose['bodies']`` must itself provide ``'candidate'`` and
            ``'subset'``.
        H: Canvas height.
        W: Canvas width.

    Returns:
        The canvas as returned by ``util.draw_handpose``. It starts out as a
        zero-filled ``uint8`` array of shape ``(H, W, 3)``.
    """
    body_info = pose['bodies']
    face_points = pose['faces']  # looked up, but face drawing is disabled below
    hand_points = pose['hands']
    body_candidate = body_info['candidate']
    body_subset = body_info['subset']

    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
    canvas = util.draw_bodypose(canvas, body_candidate, body_subset)
    canvas = util.draw_handpose(canvas, hand_points)

    # Face landmarks are not rendered; the call is kept for reference:
    # canvas = util.draw_facepose(canvas, face_points)

    return canvas
30
+
31
+
32
class DWposeDetector:
    """Pose annotator that wraps a ``Wholebody`` estimator."""

    def __init__(self):
        self.pose_estimation = Wholebody()

    def getres(self, oriImg):
        """Run the estimator and return its raw output with the image size.

        Args:
            oriImg: Image array with three dimensions, unpacked as
                ``(height, width, channels)``.

        Returns:
            dict with the keys ``'candidate'``, ``'subset'``, ``'width'`` and
            ``'height'`` (in that insertion order).
        """
        image = oriImg.copy()
        height, width, _ = image.shape
        with torch.no_grad():
            candidate, subset = self.pose_estimation(image)
        return {
            'candidate': candidate,
            'subset': subset,
            'width': width,
            'height': height,
        }

    def __call__(self, oriImg):
        """Estimate the pose of ``oriImg`` and render it.

        Args:
            oriImg: Image array with three dimensions, unpacked as
                ``(height, width, channels)``.

        Returns:
            tuple ``(out_res, canvas)`` where ``out_res`` is a dict with the
            keys ``'candidate'``, ``'subset'``, ``'width'`` and ``'height'``
            holding the post-processed arrays, and ``canvas`` is the result of
            ``draw_pose``.
        """
        image = oriImg.copy()
        height, width, _ = image.shape
        with torch.no_grad():
            raw_candidate, raw_subset = self.pose_estimation(image)

        subset = raw_subset.copy()
        candidate = raw_candidate.copy()
        nums, keys, locs = candidate.shape

        # Scale x by the image width and y by the image height.
        candidate[..., 0] /= float(width)
        candidate[..., 1] /= float(height)

        # The body keypoints are copied out *before* any masking happens.
        body = candidate[:, :18].copy().reshape(nums * 18, locs)

        # ``score`` is a view into ``subset``: confident entries (> 0.3) are
        # overwritten in place with the index ``18 * person + joint`` and all
        # others with -1.
        score = subset[:, :18]
        person_offset = 18 * np.arange(score.shape[0])[:, None]
        joint_offset = np.arange(score.shape[1])[None, :]
        score[...] = np.where(score > 0.3, person_offset + joint_offset, -1)

        # NOTE(review): the first 18 columns of ``subset`` now hold indices
        # rather than confidences, so this mask is only a confidence test for
        # the remaining columns -- confirm this is intended for the returned
        # ``candidate``.
        un_visible = subset < 0.3
        candidate[un_visible] = -1

        foot = candidate[:, 18:24]  # extracted but not used further
        faces = candidate[:, 24:92]
        hands = np.vstack([candidate[:, 92:113], candidate[:, 113:]])

        pose = dict(
            bodies=dict(candidate=body, subset=score),
            hands=hands,
            faces=faces,
        )

        out_res = {
            'candidate': candidate,
            'subset': subset,
            'width': width,
            'height': height,
        }
        return out_res, draw_pose(pose, height, width)
controlnet/annotator/dwpose/onnxdet.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
+ import onnxruntime
5
+
6
def nms(boxes, scores, nms_thr):
    """Greedy single-class non-maximum suppression in NumPy.

    Args:
        boxes: Array indexed as ``boxes[:, 0..3]`` = (x1, y1, x2, y2).
        scores: One score per box.
        nms_thr: Boxes whose overlap ratio with an already kept box exceeds
            this value are discarded.

    Returns:
        list of kept box indices, ordered by descending score.
    """
    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]

    # The "+ 1" treats coordinates as inclusive pixel indices.
    box_areas = (right - left + 1) * (bottom - top + 1)
    remaining = scores.argsort()[::-1]

    kept = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)

        inter_left = np.maximum(left[best], left[rest])
        inter_top = np.maximum(top[best], top[rest])
        inter_right = np.minimum(right[best], right[rest])
        inter_bottom = np.minimum(bottom[best], bottom[rest])

        inter_w = np.maximum(0.0, inter_right - inter_left + 1)
        inter_h = np.maximum(0.0, inter_bottom - inter_top + 1)
        intersection = inter_w * inter_h
        overlap = intersection / (box_areas[best] + box_areas[rest] - intersection)

        # Only boxes that overlap the current best by at most nms_thr survive.
        remaining = rest[overlap <= nms_thr]

    return kept
34
+
35
def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Class-aware multiclass NMS in NumPy.

    Args:
        boxes: Boxes shared by all classes, one row per box.
        scores: Array with one column of scores per class.
        nms_thr: Overlap threshold forwarded to ``nms``.
        score_thr: Only scores strictly above this value are considered.

    Returns:
        Array whose rows are ``[box..., score, class_index]`` for every kept
        detection, or ``None`` when nothing is kept.
    """
    per_class_dets = []
    for cls_ind in range(scores.shape[1]):
        cls_scores = scores[:, cls_ind]
        selected = cls_scores > score_thr
        if not selected.any():
            continue

        cand_scores = cls_scores[selected]
        cand_boxes = boxes[selected]
        keep = nms(cand_boxes, cand_scores, nms_thr)
        if len(keep) == 0:
            continue

        class_column = np.ones((len(keep), 1)) * cls_ind
        per_class_dets.append(
            np.concatenate(
                [cand_boxes[keep], cand_scores[keep, None], class_column], 1
            )
        )

    if not per_class_dets:
        return None
    return np.concatenate(per_class_dets, 0)
57
+
58
def demo_postprocess(outputs, img_size, p6=False):
    """Decode grid-relative predictions in place.

    For each stride a grid of cell coordinates is built from
    ``img_size // stride``. The first two channels of ``outputs`` become
    ``(value + cell) * stride`` and channels 2:4 become ``exp(value) * stride``.

    Args:
        outputs: Prediction array; it is modified in place and returned.
        img_size: ``(height, width)`` used to size the grids.
        p6: When true, the extra stride 64 is used as well.

    Returns:
        The same ``outputs`` array after decoding.
    """
    strides = [8, 16, 32, 64] if p6 else [8, 16, 32]

    grid_parts = []
    stride_parts = []
    for stride in strides:
        rows = img_size[0] // stride
        cols = img_size[1] // stride
        xs, ys = np.meshgrid(np.arange(cols), np.arange(rows))
        cells = np.stack((xs, ys), 2).reshape(1, -1, 2)
        grid_parts.append(cells)
        stride_parts.append(np.full((*cells.shape[:2], 1), stride))

    all_cells = np.concatenate(grid_parts, 1)
    all_strides = np.concatenate(stride_parts, 1)

    outputs[..., :2] = (outputs[..., :2] + all_cells) * all_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * all_strides
    return outputs
79
+
80
def preprocess(img, input_size, swap=(2, 0, 1)):
    """Resize ``img`` to fit ``input_size`` and pad it with the value 114.

    The image is scaled by a single ratio so that it fits inside
    ``input_size`` and is pasted into the top-left corner of the padded
    canvas.

    Args:
        img: Input image; a 3-dimensional input gets a 3-channel canvas.
        input_size: Target ``(height, width)``.
        swap: Axis order passed to ``transpose`` on the padded image.

    Returns:
        tuple ``(padded_img, r)``: the transposed, contiguous ``float32``
        image and the resize ratio that was applied.
    """
    if len(img.shape) == 3:
        canvas_shape = (input_size[0], input_size[1], 3)
    else:
        canvas_shape = input_size
    padded_img = np.full(canvas_shape, 114, dtype=np.uint8)

    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    new_h = int(img.shape[0] * r)
    new_w = int(img.shape[1] * r)

    resized_img = cv2.resize(
        img, (new_w, new_h), interpolation=cv2.INTER_LINEAR
    ).astype(np.uint8)
    padded_img[:new_h, :new_w] = resized_img

    padded_img = np.ascontiguousarray(padded_img.transpose(swap), dtype=np.float32)
    return padded_img, r
97
+
98
def inference_detector(session, oriImg):
    """Run the detector session and return the filtered boxes.

    Args:
        session: Object providing ``get_inputs()`` and ``run()``; the first
            input is fed the preprocessed image.
        oriImg: Image to run detection on.

    Returns:
        Array of ``(x1, y1, x2, y2)`` boxes for detections of class index 0
        with a score above 0.3, or an empty array when NMS keeps nothing.
    """
    input_shape = (640, 640)
    img, ratio = preprocess(oriImg, input_shape)

    input_name = session.get_inputs()[0].name
    output = session.run(None, {input_name: img[None, :, :, :]})
    predictions = demo_postprocess(output[0], input_shape)[0]

    boxes = predictions[:, :4]
    scores = predictions[:, 4:5] * predictions[:, 5:]

    # Convert (cx, cy, w, h) to corner format, then undo the resize ratio.
    half_w = boxes[:, 2] / 2.
    half_h = boxes[:, 3] / 2.
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - half_w
    boxes_xyxy[:, 1] = boxes[:, 1] - half_h
    boxes_xyxy[:, 2] = boxes[:, 0] + half_w
    boxes_xyxy[:, 3] = boxes[:, 1] + half_h
    boxes_xyxy /= ratio

    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
    if dets is None:
        return np.array([])

    det_boxes, det_scores, det_classes = dets[:, :4], dets[:, 4], dets[:, 5]
    wanted = (det_scores > 0.3) & (det_classes == 0)
    return det_boxes[wanted]
controlnet/annotator/dwpose/onnxpose.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime as ort
6
+
7
def preprocess(
    img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Crop and normalise one model input per bounding box.

    Args:
        img (np.ndarray): Input image.
        out_bbox: Sequence of boxes, each indexed as (x0, y0, x1, y1). When it
            is empty, a single box covering the whole image is used.
        input_size (tuple): Model input size in shape (w, h).

    Returns:
        tuple of three lists with one entry per box:
        - the affine-warped, mean/std-normalised crops,
        - the box centers,
        - the box scales after the aspect-ratio fix.
    """
    img_h, img_w = img.shape[:2]
    if len(out_bbox) == 0:
        out_bbox = [[0, 0, img_w, img_h]]

    # Per-channel normalisation constants.
    mean = np.array([123.675, 116.28, 103.53])
    std = np.array([58.395, 57.12, 57.375])

    crops, centers, scales = [], [], []
    for box in out_bbox:
        bbox = np.array([box[0], box[1], box[2], box[3]])

        # Box center and padded scale.
        center, scale = bbox_xyxy2cs(bbox, padding=1.25)

        # Warp the box region to the model input size.
        crop, scale = top_down_affine(input_size, scale, center, img)

        crops.append((crop - mean) / std)
        centers.append(center)
        scales.append(scale)

    return crops, centers, scales
50
+
51
+
52
def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
    """Run the pose model once for every preprocessed crop.

    Args:
        sess (ort.InferenceSession): ONNXRuntime session.
        img: Sequence of crops. Each crop is transposed with ``(2, 0, 1)``
            and wrapped in a one-element list before being fed to the
            session's first input.

    Returns:
        list with one entry per crop, each being the value returned by
        ``sess.run`` for all of the session's outputs.
    """
    # The input/output names belong to the session, not to a crop, so they
    # are resolved once instead of on every iteration.
    input_name = sess.get_inputs()[0].name
    output_names = [out.name for out in sess.get_outputs()]

    all_out = []
    for crop in img:
        model_input = [crop.transpose(2, 0, 1)]
        all_out.append(sess.run(output_names, {input_name: model_input}))

    return all_out
78
+
79
+
80
def postprocess(outputs: List[np.ndarray],
                model_input_size: Tuple[int, int],
                center: Tuple[int, int],
                scale: Tuple[int, int],
                simcc_split_ratio: float = 2.0
                ) -> Tuple[np.ndarray, np.ndarray]:
    """Decode the model outputs and map keypoints back to image coordinates.

    Args:
        outputs: One ``(simcc_x, simcc_y)`` pair per box.
        model_input_size (tuple): Model input size the keypoints are
            divided by.
        center: Per-box centers, indexed in step with ``outputs``.
        scale: Per-box scales, indexed in step with ``outputs``.
        simcc_split_ratio (float): Split ratio of simcc.

    Returns:
        tuple:
        - keypoints (np.ndarray): Rescaled keypoints, one entry per box.
        - scores (np.ndarray): Model predict scores, one entry per box.
    """
    keypoints_per_box = []
    scores_per_box = []
    for idx, output in enumerate(outputs):
        simcc_x, simcc_y = output
        keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)

        # Move from model-input coordinates into the box, then into the image.
        box_scale = scale[idx]
        keypoints = keypoints / model_input_size * box_scale + center[idx] - box_scale / 2

        keypoints_per_box.append(keypoints[0])
        scores_per_box.append(scores[0])

    return np.array(keypoints_per_box), np.array(scores_per_box)
113
+
114
+
115
def bbox_xyxy2cs(bbox: np.ndarray,
                 padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
    """Convert corner-format boxes (x1, y1, x2, y2) into (center, scale).

    Args:
        bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
            as (left, top, right, bottom).
        padding (float): Factor the box width and height are multiplied by
            to obtain the scale. Default: 1.0

    Returns:
        tuple: A tuple containing center and scale.
        - np.ndarray: Center (x, y) of the bbox in shape (2,) or (n, 2)
        - np.ndarray: Scale (w, h) of the bbox in shape (2,) or (n, 2)
    """
    single_box = bbox.ndim == 1
    boxes = bbox[None, :] if single_box else bbox

    # Column slices keep the trailing axis so hstack yields (n, 2) arrays.
    left, top = boxes[:, 0:1], boxes[:, 1:2]
    right, bottom = boxes[:, 2:3], boxes[:, 3:]

    center = np.hstack([left + right, top + bottom]) * 0.5
    scale = np.hstack([right - left, bottom - top]) * padding

    if single_box:
        return center[0], scale[0]
    return center, scale
147
+
148
+
149
+ def _fix_aspect_ratio(bbox_scale: np.ndarray,
150
+ aspect_ratio: float) -> np.ndarray:
151
+ """Extend the scale to match the given aspect ratio.
152
+
153
+ Args:
154
+ scale (np.ndarray): The image scale (w, h) in shape (2, )
155
+ aspect_ratio (float): The ratio of ``w/h``
156
+
157
+ Returns:
158
+ np.ndarray: The reshaped image scale in (2, )
159
+ """
160
+ w, h = np.hsplit(bbox_scale, [1])
161
+ bbox_scale = np.where(w > h * aspect_ratio,
162
+ np.hstack([w, w / aspect_ratio]),
163
+ np.hstack([h * aspect_ratio, h]))
164
+ return bbox_scale
165
+
166
+
167
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
168
+ """Rotate a point by an angle.
169
+
170
+ Args:
171
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
172
+ angle_rad (float): rotation angle in radian
173
+
174
+ Returns:
175
+ np.ndarray: Rotated point in shape (2, )
176
+ """
177
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
178
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
179
+ return rot_mat @ pt
180
+
181
+
182
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
183
+ """To calculate the affine matrix, three pairs of points are required. This
184
+ function is used to get the 3rd point, given 2D points a & b.
185
+
186
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
187
+ anticlockwise, using b as the rotation center.
188
+
189
+ Args:
190
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
191
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
192
+
193
+ Returns:
194
+ np.ndarray: The 3rd point.
195
+ """
196
+ direction = a - b
197
+ c = b + np.r_[-direction[1], direction[0]]
198
+ return c
199
+
200
+
201
def get_warp_matrix(center: np.ndarray,
                    scale: np.ndarray,
                    rot: float,
                    output_size: Tuple[int, int],
                    shift: Tuple[float, float] = (0., 0.),
                    inv: bool = False) -> np.ndarray:
    """Build the affine matrix that maps the bbox area onto the output size.

    Three reference points are placed in the source image and three in the
    destination; ``cv2.getAffineTransform`` computes the matrix between them.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        scale (np.ndarray[2, ]): Scale of the bounding box
            wrt [width, height].
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ] | list(2,)): Size of the destination,
            indexed as (width, height).
        shift (0-100%): Shift translation ratio wrt the width/height.
            Default (0., 0.).
        inv (bool): Option to inverse the affine transform direction.
            (inv=False: src->dst or inv=True: dst->src)

    Returns:
        np.ndarray: A 2x3 transformation matrix
    """
    shift = np.array(shift)
    src_w = scale[0]
    dst_w, dst_h = output_size[0], output_size[1]

    # Direction vectors from each center to the second reference point.
    rot_rad = np.deg2rad(rot)
    src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
    dst_dir = np.array([0., dst_w * -0.5])

    # Three reference points in the source image.
    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale * shift
    src[1, :] = center + src_dir + scale * shift
    src[2, :] = _get_3rd_point(src[0, :], src[1, :])

    # Matching reference points in the destination.
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5])
    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_dir
    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])

    src_pts, dst_pts = np.float32(src), np.float32(dst)
    if inv:
        return cv2.getAffineTransform(dst_pts, src_pts)
    return cv2.getAffineTransform(src_pts, dst_pts)
253
+
254
+
255
def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
                    img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Warp the bbox region of ``img`` to the model input size.

    Args:
        input_size: The input size of the model, unpacked as ``(w, h)``
            (annotated as ``dict`` but used as a two-element sequence).
        bbox_scale: The bbox scale (w, h) of the img.
        bbox_center: The bbox center of the img.
        img (np.ndarray): The original image.

    Returns:
        tuple:
        - img after affine transform.
        - bbox scale after it was adjusted to the model's aspect ratio.
    """
    w, h = input_size
    warp_size = (int(w), int(h))

    # Match the box to the aspect ratio of the model input.
    bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)

    # No rotation is applied (rot = 0).
    warp_mat = get_warp_matrix(bbox_center, bbox_scale, 0, output_size=(w, h))
    warped = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)

    return warped, bbox_scale
286
+
287
+
288
def get_simcc_maximum(simcc_x: np.ndarray,
                      simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Locate the peak of each SimCC vector and score it.

    Note:
        instance number: N
        num_keypoints: K

    Args:
        simcc_x (np.ndarray): x-axis SimCC in shape (N, K, Wx)
        simcc_y (np.ndarray): y-axis SimCC in shape (N, K, Wy)

    Returns:
        tuple:
        - locs (np.ndarray): float32 argmax locations (x, y) in shape
            (N, K, 2); set to -1 where the score is not positive
        - vals (np.ndarray): per-keypoint score in shape (N, K), the smaller
            of the x-axis and y-axis peak values
    """
    N, K, _ = simcc_x.shape
    flat_x = simcc_x.reshape(N * K, -1)
    flat_y = simcc_y.reshape(N * K, -1)

    # Peak position and value along each axis.
    locs = np.stack(
        (np.argmax(flat_x, axis=1), np.argmax(flat_y, axis=1)), axis=-1
    ).astype(np.float32)
    peak_x = np.amax(flat_x, axis=1)
    peak_y = np.amax(flat_y, axis=1)

    # Where the x peak exceeds the y peak the y peak is used, so the score
    # is the weaker of the two responses.
    vals = np.where(peak_x > peak_y, peak_y, peak_x)
    locs[vals <= 0.] = -1

    return locs.reshape(N, K, 2), vals.reshape(N, K)
331
+
332
+
333
def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
           simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
    """Turn SimCC vectors into keypoint coordinates and scores.

    The peak locations found by ``get_simcc_maximum`` are divided by
    ``simcc_split_ratio``.

    Args:
        simcc_x (np.ndarray): model predicted simcc in x.
        simcc_y (np.ndarray): model predicted simcc in y.
        simcc_split_ratio: The split ratio of simcc.

    Returns:
        tuple:
        - np.ndarray[float32]: keypoints
        - np.ndarray: scores
    """
    peak_locs, peak_vals = get_simcc_maximum(simcc_x, simcc_y)
    # In-place division keeps the dtype of the locations array.
    peak_locs /= simcc_split_ratio
    return peak_locs, peak_vals
351
+
352
+
353
def inference_pose(session, out_bbox, oriImg):
    """Estimate keypoints for every box in ``out_bbox``.

    Args:
        session: Pose model session; the model input size is read from the
            shape of its first input (dimensions 2 and 3 as h, w).
        out_bbox: Boxes forwarded to ``preprocess``.
        oriImg: The original image.

    Returns:
        tuple ``(keypoints, scores)`` as produced by ``postprocess``.
    """
    in_h, in_w = session.get_inputs()[0].shape[2:]
    model_input_size = (in_w, in_h)

    crops, centers, scales = preprocess(oriImg, out_bbox, model_input_size)
    raw_outputs = inference(session, crops)
    keypoints, scores = postprocess(raw_outputs, model_input_size, centers, scales)
    return keypoints, scores
controlnet/annotator/dwpose/util.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import cv2
5
+
6
+
7
+ eps = 0.01
8
+
9
+
10
def smart_resize(x, s):
    """Resize ``x`` to the target size ``s = (height, width)``.

    Arrays with 1 or 3 channels (or 2-D arrays) are resized directly with
    OpenCV, using area interpolation when shrinking and Lanczos otherwise.
    Any other channel count is resized one channel at a time and re-stacked
    along the last axis.
    """
    target_h, target_w = s
    if x.ndim == 2:
        src_h, src_w = x.shape
        channels = 1
    else:
        src_h, src_w, channels = x.shape

    if channels not in (3, 1):
        planes = [smart_resize(x[:, :, c], s) for c in range(channels)]
        return np.stack(planes, axis=2)

    # Ratio of summed target sides to summed source sides picks the filter.
    ratio = float(target_h + target_w) / float(src_h + src_w)
    method = cv2.INTER_AREA if ratio < 1 else cv2.INTER_LANCZOS4
    return cv2.resize(x, (int(target_w), int(target_h)), interpolation=method)
22
+
23
+
24
def smart_resize_k(x, fx, fy):
    """Resize ``x`` by the scale factors ``fx`` (width) and ``fy`` (height).

    Arrays with 1 or 3 channels (or 2-D arrays) are resized directly with
    OpenCV, using area interpolation when shrinking and Lanczos otherwise.
    Any other channel count is resized one channel at a time and re-stacked
    along the last axis.
    """
    if x.ndim == 2:
        src_h, src_w = x.shape
        channels = 1
    else:
        src_h, src_w, channels = x.shape
    target_h, target_w = src_h * fy, src_w * fx

    if channels not in (3, 1):
        planes = [smart_resize_k(x[:, :, c], fx, fy) for c in range(channels)]
        return np.stack(planes, axis=2)

    # Ratio of summed target sides to summed source sides picks the filter.
    ratio = float(target_h + target_w) / float(src_h + src_w)
    method = cv2.INTER_AREA if ratio < 1 else cv2.INTER_LANCZOS4
    return cv2.resize(x, (int(target_w), int(target_h)), interpolation=method)
36
+
37
+
38
def padRightDownCorner(img, stride, padValue):
    """Pad ``img`` on the bottom and right so both sides are multiples of ``stride``.

    Args:
        img: image array indexed as (height, width, ...).
        stride: the multiple each of height and width is rounded up to.
        padValue: value written into the padded area.

    Returns:
        tuple: ``(img_padded, pad)`` where ``pad`` is
        ``[up, left, down, right]`` (``up`` and ``left`` are always 0).

    The padded strips are now filled directly with ``padValue`` instead of
    being tiled from the second-to-last row/column of the image. The old
    approach produced an empty template for images that were one pixel
    high or wide, so the padding was silently dropped while ``pad`` still
    reported it.
    """
    h = img.shape[0]
    w = img.shape[1]

    pad = [
        0,                               # up
        0,                               # left
        (stride - h % stride) % stride,  # down
        (stride - w % stride) % stride,  # right
    ]

    # Keep the dtype the original arithmetic (img * 0 + padValue, then
    # concatenation with img) would have produced.
    fill_dtype = (img[:1, :1] * 0 + padValue).dtype
    out_dtype = np.result_type(img.dtype, fill_dtype)

    padded_shape = (h + pad[2], w + pad[3]) + tuple(img.shape[2:])
    img_padded = np.empty(padded_shape, dtype=out_dtype)
    img_padded[...] = padValue
    img_padded[:h, :w] = img

    return img_padded, pad
59
+
60
+
61
def transfer(model, model_weights):
    """Re-key ``model_weights`` to match the names in ``model.state_dict()``.

    For every name in the model's state dict, the first dot-separated
    component is stripped and the remainder is looked up in
    ``model_weights``.

    Returns:
        dict: state-dict name -> matching entry of ``model_weights``.
    """
    return {
        full_name: model_weights[full_name.partition('.')[2]]
        for full_name in model.state_dict().keys()
    }
66
+
67
+
68
def draw_bodypose(canvas, candidate, subset):
    """Draw body limbs and joints onto ``canvas`` and return the result.

    Args:
        canvas: 3-D image array (height, width, channels).
        candidate: keypoint coordinates; columns 0 and 1 are scaled by the
            canvas width and height respectively before drawing.
        subset: per-person rows of indices into ``candidate``; ``-1`` marks
            a missing keypoint.

    Returns:
        A uint8 array: the limb drawing darkened by a factor of 0.6 with
        the joint circles drawn on top.
    """
    height, width, _ = canvas.shape
    points = np.array(candidate)
    people = np.array(subset)

    stick_half_width = 4

    limb_pairs = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
                  [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
                  [1, 16], [16, 18], [3, 17], [6, 18]]

    palette = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0],
               [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
               [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    # Only the first 17 entries of limb_pairs are drawn.
    for limb_id in range(17):
        slots = np.array(limb_pairs[limb_id]) - 1  # 1-based pairs -> 0-based slots
        for person in people:
            joint_ids = person[slots]
            if -1 in joint_ids:
                continue
            joint_ids = joint_ids.astype(int)
            xs = points[joint_ids, 0] * float(width)
            ys = points[joint_ids, 1] * float(height)
            center = (int(np.mean(xs)), int(np.mean(ys)))
            limb_length = ((ys[0] - ys[1]) ** 2 + (xs[0] - xs[1]) ** 2) ** 0.5
            tilt = math.degrees(math.atan2(ys[0] - ys[1], xs[0] - xs[1]))
            outline = cv2.ellipse2Poly(center, (int(limb_length / 2), stick_half_width), int(tilt), 0, 360, 1)
            cv2.fillConvexPoly(canvas, outline, palette[limb_id])

    # Darken the limbs before the joints are drawn on top.
    canvas = (canvas * 0.6).astype(np.uint8)

    for joint_slot in range(18):
        for person in people:
            point_id = int(person[joint_slot])
            if point_id == -1:
                continue
            px, py = points[point_id][0:2]
            cv2.circle(canvas, (int(px * width), int(py * height)), 4, palette[joint_slot], thickness=-1)

    return canvas
110
+
111
+
112
def draw_handpose(canvas, all_hand_peaks):
    """Draw hand skeletons onto ``canvas`` in place and return it.

    Args:
        canvas: 3-D image array (height, width, channels).
        all_hand_peaks: iterable of hands; each hand is a sequence of
            (x, y) pairs that are scaled by the canvas width and height.

    Edges and points are only drawn when their pixel coordinates exceed
    the module-level ``eps`` threshold.
    """
    height, width, _ = canvas.shape

    bones = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10],
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for hand in all_hand_peaks:
        hand = np.array(hand)

        for bone_id, (start, end) in enumerate(bones):
            ax, ay = hand[start]
            bx, by = hand[end]
            ax, ay = int(ax * width), int(ay * height)
            bx, by = int(bx * width), int(by * height)
            if ax > eps and ay > eps and bx > eps and by > eps:
                # One hue per bone, spread evenly over the colour wheel.
                colour = matplotlib.colors.hsv_to_rgb([bone_id / float(len(bones)), 1.0, 1.0]) * 255
                cv2.line(canvas, (ax, ay), (bx, by), colour, thickness=2)

        for point in hand:
            px, py = point
            px = int(px * width)
            py = int(py * height)
            if px > eps and py > eps:
                cv2.circle(canvas, (px, py), 4, (0, 0, 255), thickness=-1)
    return canvas
138
+
139
+
140
def draw_facepose(canvas, all_lmks):
    """Draw face landmarks as white dots onto ``canvas`` in place and return it.

    Args:
        canvas: 3-D image array (height, width, channels).
        all_lmks: iterable of faces; each face is a sequence of (x, y)
            pairs that are scaled by the canvas width and height.

    A landmark is only drawn when both pixel coordinates exceed the
    module-level ``eps`` threshold.
    """
    height, width, _ = canvas.shape
    for face in all_lmks:
        for landmark in np.array(face):
            px, py = landmark
            px = int(px * width)
            py = int(py * height)
            if px > eps and py > eps:
                cv2.circle(canvas, (px, py), 3, (255, 255, 255), thickness=-1)
    return canvas
151
+
152
+
153
+ # detect hand according to body pose keypoints
154
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
155
def handDetect(candidate, subset, oriImg):
    """Estimate square hand boxes from shoulder/elbow/wrist keypoints.

    Args:
        candidate: keypoint table; the first two columns are x and y.
        subset: per-person rows of indices into ``candidate`` (``-1`` marks
            a missing keypoint). Slots 5, 6, 7 are the left shoulder, elbow
            and wrist; slots 2, 3, 4 are the right ones.
        oriImg: image whose height and width bound the boxes.

    Returns:
        list: ``[[x, y, w, is_left], ...]`` where ``(x, y)`` is the top-left
        corner and ``w`` the side of the square box. For each person the
        left hand (if present) comes before the right hand. Boxes smaller
        than 20 pixels are discarded.
    """
    ratioWristElbow = 0.33
    boxes = []
    image_height, image_width = oriImg.shape[0:2]

    # (shoulder, elbow, wrist) slots for each arm; left is handled first.
    arms = (([5, 6, 7], True), ([2, 3, 4], False))

    for person in subset.astype(int):
        for slots, is_left in arms:
            joint_ids = person[slots]
            if np.any(joint_ids == -1):
                # An arm needs all three joints to be usable.
                continue
            shoulder_id, elbow_id, wrist_id = joint_ids
            sx, sy = candidate[shoulder_id][:2]
            ex, ey = candidate[elbow_id][:2]
            wx, wy = candidate[wrist_id][:2]

            # Hand centre: the wrist pushed further along the forearm.
            left = wx + ratioWristElbow * (wx - ex)
            top = wy + ratioWristElbow * (wy - ey)
            forearm = math.sqrt((wx - ex) ** 2 + (wy - ey) ** 2)
            upper_arm = math.sqrt((ex - sx) ** 2 + (ey - sy) ** 2)
            side = 1.5 * max(forearm, 0.9 * upper_arm)

            # Move from the centre to the top-left corner (box is square).
            left -= side / 2
            top -= side / 2

            # Keep the box inside the image.
            if left < 0:
                left = 0
            if top < 0:
                top = 0
            fit_w = image_width - left if left + side > image_width else side
            fit_h = image_height - top if top + side > image_height else side
            side = min(fit_w, fit_h)

            # Boxes smaller than 20 pixels are dropped.
            if side >= 20:
                boxes.append([int(left), int(top), int(side), is_left])

    return boxes
218
+
219
+
220
+ # Written by Lvmin
221
def faceDetect(candidate, subset, oriImg):
    """Estimate square face boxes from head, eye and ear keypoints.

    Args:
        candidate: keypoint table; the first two columns are x and y.
        subset: per-person rows of indices into ``candidate`` (``-1`` marks
            a missing keypoint). Slot 0 is the head; slots 14, 15, 16, 17
            are the left eye, right eye, left ear and right ear.
        oriImg: image whose height and width bound the boxes.

    Returns:
        list: ``[[x, y, w], ...]`` where ``(x, y)`` is the top-left corner
        and ``w`` the side of the square box. Boxes smaller than 20 pixels
        are discarded.

    The box is centred on the head with a half-size of the largest scaled
    head-to-landmark distance (x3 for eyes, x1.5 for ears). The overflow
    test now uses the full box side: previously it compared ``x + half``
    against the image bounds, so a box could extend past the image edge
    without being clipped.
    """
    detect_result = []
    image_height, image_width = oriImg.shape[0:2]

    # (keypoint slot, distance multiplier): eyes 14/15, ears 16/17.
    landmarks = ((14, 3.0), (15, 3.0), (16, 1.5), (17, 1.5))

    for person in subset.astype(int):
        head = person[0]
        if not head > -1:
            continue

        visible = [(person[slot], scale) for slot, scale in landmarks if person[slot] > -1]
        if not visible:
            continue

        x0, y0 = candidate[head][:2]

        half = 0.0
        for point_id, scale in visible:
            x1, y1 = candidate[point_id][:2]
            d = max(abs(x0 - x1), abs(y0 - y1))
            half = max(half, d * scale)

        # Top-left corner, clamped to the image origin.
        x = x0 - half
        y = y0 - half
        if x < 0:
            x = 0
        if y < 0:
            y = 0

        side = half * 2
        width1 = side
        width2 = side
        # Clip against the right/bottom edge using the full box side.
        if x + side > image_width:
            width1 = image_width - x
        if y + side > image_height:
            width2 = image_height - y

        width = min(width1, width2)

        if width >= 20:
            detect_result.append([int(x), int(y), int(width)])

    return detect_result
289
+
290
+
291
+ # get max index of 2d array
292
def npmax(array):
    """Return the ``(row, column)`` index of the maximum of a 2-D array.

    The row is the first one whose row-maximum is largest; the column is
    the first position of that maximum within the row.
    """
    best_col_per_row = array.argmax(1)
    row = array.max(1).argmax()
    return row, best_col_per_row[row]
controlnet/annotator/dwpose/wholebody.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
+ import onnxruntime as ort
5
+ from .onnxdet import inference_detector
6
+ from .onnxpose import inference_pose
7
+
8
class Wholebody:
    """Whole-body pose estimator built from two ONNX sessions.

    A detector session produces person boxes and a pose session produces
    keypoints for them; the keypoints are then re-ordered and extended
    with a neck joint by ``_to_openpose``.
    """

    def __init__(self, device='cuda:0',
                 onnx_det='annotator/ckpts/yolox_l.onnx',
                 onnx_pose='annotator/ckpts/dw-ll_ucoco_384.onnx'):
        """Create the detector and pose sessions.

        Args:
            device: ``'cpu'`` selects the CPU execution provider; any other
                value selects the CUDA execution provider (the default,
                matching the previous hard-coded behaviour).
            onnx_det: path of the detector ONNX model.
            onnx_pose: path of the pose ONNX model.
        """
        # The provider choice used to be computed and then immediately
        # overwritten with CUDA; it is now driven by `device` alone.
        if device == 'cpu':
            providers = ['CPUExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider']

        self.session_det = ort.InferenceSession(path_or_bytes=onnx_det, providers=providers)
        self.session_pose = ort.InferenceSession(path_or_bytes=onnx_pose, providers=providers)

    def __call__(self, oriImg):
        """Detect people in ``oriImg`` and return ``(keypoints, scores)``."""
        det_result = inference_detector(self.session_det, oriImg)
        keypoints, scores = inference_pose(self.session_pose, det_result, oriImg)
        return self._to_openpose(keypoints, scores)

    @staticmethod
    def _to_openpose(keypoints, scores):
        """Insert a neck joint and permute the first 18 joints.

        Args:
            keypoints: array of shape (N, K, 2).
            scores: array of shape (N, K).

        Returns:
            tuple: keypoints of shape (N, K + 1, 2) and scores of shape
            (N, K + 1).
        """
        keypoints_info = np.concatenate((keypoints, scores[..., None]), axis=-1)

        # Neck = midpoint of joints 5 and 6; its score is 1 only when both
        # of those joints score above 0.3, otherwise 0.
        neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
        neck[:, 2:4] = np.logical_and(
            keypoints_info[:, 5, 2:4] > 0.3,
            keypoints_info[:, 6, 2:4] > 0.3).astype(int)
        keypoints_info = np.insert(keypoints_info, 17, neck, axis=1)

        # Source slots (after the neck was inserted at 17) and the slots
        # they are copied to.
        mmpose_idx = [17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3]
        openpose_idx = [1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17]
        # The right-hand side is a fancy-index copy, so the permutation
        # reads the pre-assignment values.
        keypoints_info[:, openpose_idx] = keypoints_info[:, mmpose_idx]

        return keypoints_info[..., :2], keypoints_info[..., 2]
48
+
49
+
controlnet/annotator/midas/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
controlnet/annotator/midas/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Midas Depth Estimation
2
+ # From https://github.com/isl-org/MiDaS
3
+ # MIT LICENSE
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+
9
+ from einops import rearrange
10
+ from .api import MiDaSInference
11
+
12
+
13
class MidasDetector:
    """Depth-map annotator backed by the ``dpt_hybrid`` MiDaS model on CUDA."""

    def __init__(self):
        self.model = MiDaSInference(model_type="dpt_hybrid").cuda()
        self.rng = np.random.RandomState(0)

    def __call__(self, input_image):
        """Return a uint8 depth image for a 3-D ``input_image`` array.

        The input is mapped with ``x / 127.5 - 1.0``, rearranged from
        ``h w c`` to ``1 c h w`` and passed through the model. The
        prediction is shifted to start at 0, divided by its maximum and
        scaled to 0..255.
        """
        assert input_image.ndim == 3
        with torch.no_grad():
            batch = torch.from_numpy(input_image).float().cuda()
            batch = batch / 127.5 - 1.0
            batch = rearrange(batch, 'h w c -> 1 c h w')
            depth = self.model(batch)[0]

            # Min-max normalise in place.
            depth -= torch.min(depth)
            depth /= torch.max(depth)

            depth_np = depth.cpu().numpy()
            return (depth_np * 255.0).clip(0, 255).astype(np.uint8)
33
+
34
+
35
+
controlnet/annotator/midas/api.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # based on https://github.com/isl-org/MiDaS
2
+
3
+ import cv2
4
+ import os
5
+ import torch
6
+ import torch.nn as nn
7
+ from torchvision.transforms import Compose
8
+
9
+ from .midas.dpt_depth import DPTDepthModel
10
+ from .midas.midas_net import MidasNet
11
+ from .midas.midas_net_custom import MidasNet_small
12
+ from .midas.transforms import Resize, NormalizeImage, PrepareForNet
13
+ from annotator.util import annotator_ckpts_path
14
+
15
+
16
+ ISL_PATHS = {
17
+ "dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"),
18
+ "dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"),
19
+ "midas_v21": "",
20
+ "midas_v21_small": "",
21
+ }
22
+
23
+ remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt"
24
+
25
+
26
def disabled_train(self, mode=True):
    """No-op replacement for ``model.train``.

    Ignores ``mode`` and returns ``self`` unchanged, so assigning this
    function over a model's ``train`` stops later calls from switching
    between train and eval mode.
    """
    return self
30
+
31
+
32
def load_midas_transform(model_type):
    """Build the input transform for ``model_type`` without loading a model.

    Based on https://github.com/isl-org/MiDaS/blob/master/run.py.

    Supported values are ``dpt_large``, ``dpt_hybrid``, ``midas_v21`` and
    ``midas_v21_small``; anything else fails an assertion.

    Returns:
        A ``Compose`` of resize, normalisation and ``PrepareForNet``.
    """
    if model_type in ("dpt_large", "dpt_hybrid"):  # DPT-Large / DPT-Hybrid
        net_w, net_h = 384, 384
        resize_mode = "minimal"
        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
    elif model_type == "midas_v21":
        net_w, net_h = 384, 384
        resize_mode = "upper_bound"
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    elif model_type == "midas_v21_small":
        net_w, net_h = 256, 256
        resize_mode = "upper_bound"
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    else:
        assert False, f"model_type '{model_type}' not implemented, use: --model_type large"

    normalization = NormalizeImage(mean=mean, std=std)
    resize = Resize(
        net_w,
        net_h,
        resize_target=None,
        keep_aspect_ratio=True,
        ensure_multiple_of=32,
        resize_method=resize_mode,
        image_interpolation_method=cv2.INTER_CUBIC,
    )

    return Compose([resize, normalization, PrepareForNet()])
75
+
76
+
77
def load_model(model_type):
    """Build the MiDaS network and input transform for ``model_type``.

    Based on https://github.com/isl-org/MiDaS/blob/master/run.py.

    Args:
        model_type: one of the keys of ``ISL_PATHS`` (``dpt_large``,
            ``dpt_hybrid``, ``midas_v21``, ``midas_v21_small``).

    Returns:
        tuple: ``(model.eval(), transform)``.

    Raises:
        AssertionError: if ``model_type`` is not supported.

    The type is now validated before the checkpoint path is looked up.
    Previously the ``ISL_PATHS[model_type]`` lookup raised ``KeyError`` for
    an unknown type, so the "not implemented" branch was never reached.
    The transform comes from ``load_midas_transform``, which holds the same
    per-type settings that used to be duplicated here.
    """
    not_implemented = f"model_type '{model_type}' not implemented, use: --model_type large"
    if model_type not in ISL_PATHS:
        print(not_implemented)
        raise AssertionError(not_implemented)

    model_path = ISL_PATHS[model_type]

    if model_type == "dpt_large":  # DPT-Large
        model = DPTDepthModel(
            path=model_path,
            backbone="vitl16_384",
            non_negative=True,
        )

    elif model_type == "dpt_hybrid":  # DPT-Hybrid
        # Fetch the checkpoint on first use.
        if not os.path.exists(model_path):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)

        model = DPTDepthModel(
            path=model_path,
            backbone="vitb_rn50_384",
            non_negative=True,
        )

    elif model_type == "midas_v21":
        model = MidasNet(model_path, non_negative=True)

    elif model_type == "midas_v21_small":
        model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
                               non_negative=True, blocks={'expand': True})

    else:
        # A key present in ISL_PATHS without a constructor branch above.
        print(not_implemented)
        raise AssertionError(not_implemented)

    transform = load_midas_transform(model_type)

    return model.eval(), transform
143
+
144
+
145
class MiDaSInference(nn.Module):
    """Inference-only wrapper around a MiDaS model built by ``load_model``."""

    MODEL_TYPES_TORCH_HUB = ["DPT_Large", "DPT_Hybrid", "MiDaS_small"]
    MODEL_TYPES_ISL = ["dpt_large", "dpt_hybrid", "midas_v21", "midas_v21_small"]

    def __init__(self, model_type):
        """Load the network for ``model_type`` (must be in ``MODEL_TYPES_ISL``)."""
        super().__init__()
        assert (model_type in self.MODEL_TYPES_ISL)
        network, _transform = load_model(model_type)
        self.model = network
        # Replace the wrapped model's `train` with the no-op helper.
        self.model.train = disabled_train

    def forward(self, x):
        """Run the wrapped model on ``x`` with gradients disabled."""
        with torch.no_grad():
            return self.model(x)
169
+
controlnet/annotator/midas/midas/__init__.py ADDED
File without changes
controlnet/annotator/midas/midas/base_model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
class BaseModel(torch.nn.Module):
    """Module base class that can restore its weights from a file."""

    def load(self, path):
        """Load model weights from ``path`` into this module.

        The file is read onto the CPU. When the loaded object contains an
        ``"optimizer"`` entry, the weights are taken from its ``"model"``
        entry instead of from the object itself.

        Args:
            path (str): file path
        """
        checkpoint = torch.load(path, map_location=torch.device('cpu'))
        state = checkpoint["model"] if "optimizer" in checkpoint else checkpoint
        self.load_state_dict(state)
controlnet/annotator/midas/midas/blocks.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .vit import (
5
+ _make_pretrained_vitb_rn50_384,
6
+ _make_pretrained_vitl16_384,
7
+ _make_pretrained_vitb16_384,
8
+ forward_vit,
9
+ )
10
+
11
+ def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
12
+ if backbone == "vitl16_384":
13
+ pretrained = _make_pretrained_vitl16_384(
14
+ use_pretrained, hooks=hooks, use_readout=use_readout
15
+ )
16
+ scratch = _make_scratch(
17
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
18
+ ) # ViT-L/16 - 85.0% Top1 (backbone)
19
+ elif backbone == "vitb_rn50_384":
20
+ pretrained = _make_pretrained_vitb_rn50_384(
21
+ use_pretrained,
22
+ hooks=hooks,
23
+ use_vit_only=use_vit_only,
24
+ use_readout=use_readout,
25
+ )
26
+ scratch = _make_scratch(
27
+ [256, 512, 768, 768], features, groups=groups, expand=expand
28
+ ) # ViT-H/16 - 85.0% Top1 (backbone)
29
+ elif backbone == "vitb16_384":
30
+ pretrained = _make_pretrained_vitb16_384(
31
+ use_pretrained, hooks=hooks, use_readout=use_readout
32
+ )
33
+ scratch = _make_scratch(
34
+ [96, 192, 384, 768], features, groups=groups, expand=expand
35
+ ) # ViT-B/16 - 84.6% Top1 (backbone)
36
+ elif backbone == "resnext101_wsl":
37
+ pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
38
+ scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
39
+ elif backbone == "efficientnet_lite3":
40
+ pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
41
+ scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
42
+ else:
43
+ print(f"Backbone '{backbone}' not implemented")
44
+ assert False
45
+
46
+ return pretrained, scratch
47
+
48
+
49
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
50
+ scratch = nn.Module()
51
+
52
+ out_shape1 = out_shape
53
+ out_shape2 = out_shape
54
+ out_shape3 = out_shape
55
+ out_shape4 = out_shape
56
+ if expand==True:
57
+ out_shape1 = out_shape
58
+ out_shape2 = out_shape*2
59
+ out_shape3 = out_shape*4
60
+ out_shape4 = out_shape*8
61
+
62
+ scratch.layer1_rn = nn.Conv2d(
63
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
64
+ )
65
+ scratch.layer2_rn = nn.Conv2d(
66
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
67
+ )
68
+ scratch.layer3_rn = nn.Conv2d(
69
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
70
+ )
71
+ scratch.layer4_rn = nn.Conv2d(
72
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
73
+ )
74
+
75
+ return scratch
76
+
77
+
78
def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
    """Fetch ``tf_efficientnet_lite3`` via ``torch.hub`` and wrap it as a backbone.

    ``use_pretrained`` and ``exportable`` are forwarded to the hub entry
    point as ``pretrained`` and ``exportable``.
    """
    hub_options = {"pretrained": use_pretrained, "exportable": exportable}
    effnet = torch.hub.load(
        "rwightman/gen-efficientnet-pytorch",
        "tf_efficientnet_lite3",
        **hub_options
    )
    return _make_efficientnet_backbone(effnet)
86
+
87
+
88
+ def _make_efficientnet_backbone(effnet):
89
+ pretrained = nn.Module()
90
+
91
+ pretrained.layer1 = nn.Sequential(
92
+ effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
93
+ )
94
+ pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
95
+ pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
96
+ pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
97
+
98
+ return pretrained
99
+
100
+
101
+ def _make_resnet_backbone(resnet):
102
+ pretrained = nn.Module()
103
+ pretrained.layer1 = nn.Sequential(
104
+ resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
105
+ )
106
+
107
+ pretrained.layer2 = resnet.layer2
108
+ pretrained.layer3 = resnet.layer3
109
+ pretrained.layer4 = resnet.layer4
110
+
111
+ return pretrained
112
+
113
+
114
def _make_pretrained_resnext101_wsl(use_pretrained):
    """Fetch ``resnext101_32x8d_wsl`` via ``torch.hub`` and wrap it as a backbone.

    ``use_pretrained`` is accepted for signature parity with the other
    backbone factories but is not used here.
    """
    wsl_resnext = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
    return _make_resnet_backbone(wsl_resnext)
117
+
118
+
119
+
120
class Interpolate(nn.Module):
    """Module wrapper around ``nn.functional.interpolate`` with fixed settings."""

    def __init__(self, scale_factor, mode, align_corners=False):
        """Store the interpolation settings.

        Args:
            scale_factor (float): scaling
            mode (str): interpolation mode
            align_corners (bool): forwarded to the interpolation call
        """
        super().__init__()

        self.interp = nn.functional.interpolate
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Resample ``x`` with the stored settings.

        Args:
            x (tensor): input

        Returns:
            tensor: interpolated data
        """
        return self.interp(
            x,
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners,
        )
153
+
154
+
155
class ResidualConvUnit(nn.Module):
    """Two 3x3 convolutions with ReLU pre-activations and a skip connection."""

    def __init__(self, features):
        """Create the two convolutions and the shared in-place ReLU.

        Args:
            features (int): number of features
        """
        super().__init__()

        conv_options = dict(kernel_size=3, stride=1, padding=1, bias=True)
        self.conv1 = nn.Conv2d(features, features, **conv_options)
        self.conv2 = nn.Conv2d(features, features, **conv_options)

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply relu -> conv1 -> relu -> conv2 and add the input.

        The ReLU is in place, so the first activation also overwrites
        ``x``; the skip connection therefore adds the rectified input.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """
        branch = self.conv1(self.relu(x))
        branch = self.conv2(self.relu(branch))
        return branch + x
192
+
193
+
194
class FeatureFusionBlock(nn.Module):
    """Fuse one or two feature maps and upsample the result by 2."""

    def __init__(self, features):
        """Create the two residual units.

        Args:
            features (int): number of features
        """
        super().__init__()

        self.resConfUnit1 = ResidualConvUnit(features)
        self.resConfUnit2 = ResidualConvUnit(features)

    def forward(self, *xs):
        """Fuse the inputs and upsample.

        With two inputs, the second is passed through ``resConfUnit1`` and
        added in place to the first. The sum (or the single input) then
        goes through ``resConfUnit2`` and a 2x bilinear upsampling with
        ``align_corners=True``.

        Returns:
            tensor: output
        """
        fused = xs[0]

        if len(xs) == 2:
            # In-place add: this writes into the first input tensor.
            fused += self.resConfUnit1(xs[1])

        fused = self.resConfUnit2(fused)

        return nn.functional.interpolate(
            fused, scale_factor=2, mode="bilinear", align_corners=True
        )
227
+
228
+
229
+
230
+
231
class ResidualConvUnit_custom(nn.Module):
    """Residual unit with a configurable activation and optional batch norm."""

    def __init__(self, features, activation, bn):
        """Create the convolutions, optional batch norms and the skip adder.

        Args:
            features (int): number of features
            activation: module applied before each convolution
            bn: batch norm layers are created and used when this equals True
        """
        super().__init__()

        self.bn = bn

        self.groups = 1

        conv_options = dict(kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
        self.conv1 = nn.Conv2d(features, features, **conv_options)
        self.conv2 = nn.Conv2d(features, features, **conv_options)

        if self.bn == True:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)

        self.activation = activation

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Apply activation -> conv1 [-> bn1] -> activation -> conv2 [-> bn2], then add ``x``.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """
        use_bn = self.bn == True

        branch = self.conv1(self.activation(x))
        if use_bn:
            branch = self.bn1(branch)

        branch = self.conv2(self.activation(branch))
        if use_bn:
            branch = self.bn2(branch)

        # NOTE(review): `groups` is fixed to 1 in __init__ and `conv_merge`
        # is never defined in this class, so this branch does not run.
        if self.groups > 1:
            branch = self.conv_merge(branch)

        return self.skip_add.add(branch, x)
289
+
290
+
291
class FeatureFusionBlock_custom(nn.Module):
    """Fuse one or two feature maps, upsample by 2 and project with a 1x1 conv."""

    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
        """Create the residual units and the output projection.

        Args:
            features (int): number of features
            activation: activation module handed to both residual units
            deconv: stored on the instance; not otherwise used in this class
            bn: forwarded to both residual units
            expand: when equal to True the projection halves the channels
            align_corners: forwarded to the bilinear upsampling
        """
        super().__init__()

        self.deconv = deconv
        self.align_corners = align_corners

        self.groups = 1

        self.expand = expand
        out_features = features // 2 if self.expand == True else features

        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)

        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        """Fuse the inputs, upsample and project.

        With two inputs, the second is passed through ``resConfUnit1`` and
        added to the first via ``skip_add``. The result goes through
        ``resConfUnit2``, a 2x bilinear upsampling and ``out_conv``.

        Returns:
            tensor: output
        """
        fused = xs[0]

        if len(xs) == 2:
            refined = self.resConfUnit1(xs[1])
            fused = self.skip_add.add(fused, refined)

        fused = self.resConfUnit2(fused)

        upsampled = nn.functional.interpolate(
            fused, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        )

        return self.out_conv(upsampled)
342
+
controlnet/annotator/midas/midas/dpt_depth.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from .base_model import BaseModel
6
+ from .blocks import (
7
+ FeatureFusionBlock,
8
+ FeatureFusionBlock_custom,
9
+ Interpolate,
10
+ _make_encoder,
11
+ forward_vit,
12
+ )
13
+
14
+
15
def _make_fusion_block(features, use_bn):
    """Create a ``FeatureFusionBlock_custom`` with the DPT defaults.

    The block uses a non-in-place ReLU, no deconvolution, no channel
    expansion and ``align_corners=True``; ``use_bn`` toggles batch norm.
    """
    block_options = dict(
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
    )
    return FeatureFusionBlock_custom(features, nn.ReLU(False), **block_options)
24
+
25
+
26
class DPT(BaseModel):
    """Dense prediction network: ViT encoder, four fusion stages and a head."""

    def __init__(
        self,
        head,
        features=256,
        backbone="vitb_rn50_384",
        readout="project",
        channels_last=False,
        use_bn=False,
    ):
        """Build the encoder, the fusion blocks and attach ``head``.

        Args:
            head: module applied to the last fusion output.
            features: channel width of the fusion stages.
            backbone: one of ``vitb_rn50_384``, ``vitb16_384``,
                ``vitl16_384`` (a ``KeyError`` is raised otherwise).
            readout: forwarded to the encoder as ``use_readout``.
            channels_last: convert the input to channels-last memory
                format in ``forward``.
            use_bn: enable batch norm inside the fusion blocks.
        """
        super(DPT, self).__init__()

        self.channels_last = channels_last

        # Per-backbone hook indices handed to the encoder factory.
        hooks = {
            "vitb_rn50_384": [0, 1, 8, 11],
            "vitb16_384": [2, 5, 8, 11],
            "vitl16_384": [5, 11, 17, 23],
        }

        # Instantiate backbone and reassemble blocks
        self.pretrained, self.scratch = _make_encoder(
            backbone,
            features,
            False,  # use_pretrained
            groups=1,
            expand=False,
            exportable=False,
            hooks=hooks[backbone],
            use_readout=readout,
        )

        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

        self.scratch.output_conv = head

    def forward(self, x):
        """Run the encoder, fuse its four feature maps coarse-to-fine and apply the head."""
        if self.channels_last:
            # `contiguous` returns a new tensor; the result used to be
            # discarded, which made the channels_last option a no-op.
            x = x.contiguous(memory_format=torch.channels_last)

        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Deepest stage first; each stage fuses the previous path with
        # the next shallower feature map.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return out
86
+
87
+
88
class DPTDepthModel(DPT):
    """DPT with a single-channel depth head."""

    def __init__(self, path=None, non_negative=True, **kwargs):
        """Build the depth head, the DPT body and optionally load weights.

        Args:
            path: checkpoint to load after construction; skipped when None.
            non_negative: end the head with a ReLU instead of an identity.
            **kwargs: forwarded to ``DPT``; ``features`` (default 256) also
                sets the width of the head.
        """
        features = kwargs.get("features", 256)
        half_features = features // 2
        final_activation = nn.ReLU(True) if non_negative else nn.Identity()

        head = nn.Sequential(
            nn.Conv2d(features, half_features, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
            nn.Conv2d(half_features, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            final_activation,
            nn.Identity(),
        )

        super().__init__(head, **kwargs)

        if path is not None:
            self.load(path)

    def forward(self, x):
        """Return the DPT output with the channel dimension squeezed out."""
        prediction = super().forward(x)
        return prediction.squeeze(dim=1)
109
+
controlnet/annotator/midas/midas/midas_net.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MidasNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .base_model import BaseModel
9
+ from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
10
+
11
+
12
class MidasNet(BaseModel):
    """Monocular depth estimation network built on a "resnext101_wsl" encoder."""

    def __init__(self, path=None, features=256, non_negative=True):
        """Assemble encoder, fusion blocks and output head.

        Args:
            path (str, optional): Path to saved model; when truthy it is
                passed to ``self.load``. Defaults to None.
            features (int, optional): Number of features. Defaults to 256.
            non_negative (bool, optional): End the head with a ReLU when
                true, with an identity layer otherwise. Defaults to True.
        """
        print("Loading weights: ", path)

        super().__init__()

        # Pretrained encoder weights are requested exactly when a path is
        # supplied (any value other than None).
        use_pretrained = path is not None

        self.pretrained, self.scratch = _make_encoder(
            backbone="resnext101_wsl",
            features=features,
            use_pretrained=use_pretrained,
        )

        # Created in the order 4, 3, 2, 1 to keep parameter initialisation
        # order unchanged.
        for stage in (4, 3, 2, 1):
            setattr(self.scratch, f"refinenet{stage}", FeatureFusionBlock(features))

        final_activation = nn.ReLU(True) if non_negative else nn.Identity()
        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            final_activation,
        )

        if path:
            self.load(path)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input data (image)

        Returns:
            tensor: head output with dimension 1 squeezed out
        """
        encoder = self.pretrained
        decoder = self.scratch

        # Encoder stages are chained: each consumes the previous stage's output.
        enc_1 = encoder.layer1(x)
        enc_2 = encoder.layer2(enc_1)
        enc_3 = encoder.layer3(enc_2)
        enc_4 = encoder.layer4(enc_3)

        skip_1 = decoder.layer1_rn(enc_1)
        skip_2 = decoder.layer2_rn(enc_2)
        skip_3 = decoder.layer3_rn(enc_3)
        skip_4 = decoder.layer4_rn(enc_4)

        # Fuse from the last stage towards the first.
        fused = decoder.refinenet4(skip_4)
        fused = decoder.refinenet3(fused, skip_3)
        fused = decoder.refinenet2(fused, skip_2)
        fused = decoder.refinenet1(fused, skip_1)

        return torch.squeeze(decoder.output_conv(fused), dim=1)
controlnet/annotator/midas/midas/midas_net_custom.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MidasNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .base_model import BaseModel
9
+ from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
10
+
11
+
12
class MidasNet_small(BaseModel):
    """Network for monocular depth estimation with a configurable encoder backbone."""

    def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
        blocks=None):
        """Init.

        Args:
            path (str, optional): Path to saved model; when truthy it is
                passed to ``self.load``. Defaults to None.
            features (int, optional): Number of features. Defaults to 64.
            backbone (str, optional): Backbone network for encoder.
                Defaults to "efficientnet_lite3".
            non_negative (bool, optional): End the head with a ReLU when
                true, with an identity layer otherwise. Defaults to True.
            exportable (bool, optional): Forwarded to ``_make_encoder``.
                Defaults to True.
            channels_last (bool, optional): Convert the input to the
                channels_last memory format in ``forward``. Defaults to False.
            align_corners (bool, optional): Forwarded to every fusion block.
                Defaults to True.
            blocks (dict, optional): Block options; only the "expand" key is
                read here. Defaults to ``{'expand': True}``.
        """
        print("Loading weights: ", path)

        super(MidasNet_small, self).__init__()

        # Build the default per instance: a dict literal in the signature is
        # created once and would be shared (via self.blocks) by every model.
        if blocks is None:
            blocks = {'expand': True}

        # Pretrained encoder weights are only requested when no checkpoint
        # path is given.
        use_pretrained = not path

        self.channels_last = channels_last
        self.blocks = blocks
        self.backbone = backbone

        self.groups = 1

        # With "expand" enabled the fusion blocks get progressively wider
        # (x1, x2, x4, x8); otherwise all four use the same width.
        self.expand = "expand" in self.blocks and self.blocks['expand'] == True
        if self.expand:
            features1 = features
            features2 = features * 2
            features3 = features * 4
            features4 = features * 8
        else:
            features1 = features2 = features3 = features4 = features

        self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)

        # One non-inplace ReLU instance is shared by the fusion blocks and
        # the output head.
        self.scratch.activation = nn.ReLU(False)

        self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        # The last fusion block is built without the expand option.
        self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)

        # Layer count and order are kept exactly (including the trailing
        # Identity) so positional state-dict keys stay unchanged.
        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
            self.scratch.activation,
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
            nn.Identity(),
        )

        if path:
            self.load(path)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input data (image)

        Returns:
            tensor: head output with dimension 1 squeezed out
        """
        if self.channels_last:
            # Tensor.contiguous() returns the converted tensor rather than
            # modifying x; without rebinding the conversion has no effect.
            x = x.contiguous(memory_format=torch.channels_last)

        layer_1 = self.pretrained.layer1(x)
        layer_2 = self.pretrained.layer2(layer_1)
        layer_3 = self.pretrained.layer3(layer_2)
        layer_4 = self.pretrained.layer4(layer_3)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Fuse from the last encoder stage towards the first.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return torch.squeeze(out, dim=1)
106
+
107
+
108
+
109
def fuse_model(m):
    """Fuse Conv2d -> BatchNorm2d (-> ReLU) runs of ``m`` in place.

    The modules are visited in ``m.named_modules()`` order while remembering
    the two modules seen just before the current one. Whenever those two are
    exactly a ``nn.Conv2d`` followed by a ``nn.BatchNorm2d`` they are fused
    through ``torch.quantization.fuse_modules``; the current module joins the
    group when it is exactly a ``nn.ReLU``.

    Because the check only runs when a *following* module is visited, a
    Conv2d/BatchNorm2d pair that ends the traversal is left untouched.

    Args:
        m: The root module; modified in place. Nothing is returned.
    """
    # (qualified name, type) of the module two steps back and one step back.
    two_back = ("", None)
    one_back = ("", None)

    for name, module in m.named_modules():
        current_type = type(module)

        if two_back[1] == nn.Conv2d and one_back[1] == nn.BatchNorm2d:
            group = [two_back[0], one_back[0]]
            if current_type == nn.ReLU:
                group.append(name)
            torch.quantization.fuse_modules(m, group, inplace=True)

        # Slide the window; the types recorded are those of the modules as
        # yielded, not of whatever fusion put in their place.
        two_back, one_back = one_back, (name, current_type)
controlnet/annotator/midas/midas/transforms.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ import math
4
+
5
+
6
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
    """Resize the sample in place to ensure the given size. Keeps aspect ratio.

    The current shape is taken from ``sample["disparity"]``. When either of
    its first two dimensions is smaller than the matching entry of ``size``,
    "image", "disparity" and "mask" are all resized by one common factor so
    that both dimensions reach the requested minimum.

    Args:
        sample (dict): sample with "image", "disparity" and "mask" entries
        size (tuple): minimum size, compared index by index with the
            disparity shape

    Returns:
        tuple: resulting disparity shape (the unchanged shape when the
        sample was already large enough)
    """
    shape = list(sample["disparity"].shape)

    if shape[0] >= size[0] and shape[1] >= size[1]:
        # Nothing to resize. Return the shape, as on the resize path and as
        # documented, instead of the sample dict the old code handed back.
        return tuple(shape)

    # A single factor (the larger of the two) keeps the aspect ratio while
    # guaranteeing both dimensions reach their minimum.
    scale = max(size[0] / shape[0], size[1] / shape[1])

    shape[0] = math.ceil(scale * shape[0])
    shape[1] = math.ceil(scale * shape[1])

    # cv2.resize takes the target size in reversed order relative to the
    # array shape.
    target = tuple(shape[::-1])

    sample["image"] = cv2.resize(
        sample["image"], target, interpolation=image_interpolation_method
    )

    sample["disparity"] = cv2.resize(
        sample["disparity"], target, interpolation=cv2.INTER_NEAREST
    )
    # The mask is resized as float32 and converted back to bool afterwards.
    sample["mask"] = cv2.resize(
        sample["mask"].astype(np.float32),
        target,
        interpolation=cv2.INTER_NEAREST,
    )
    sample["mask"] = sample["mask"].astype(bool)

    return tuple(shape)
46
+
47
+
48
class Resize(object):
    """Resize sample to given size (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as little as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
            image_interpolation_method (int, optional):
                OpenCV interpolation flag used for the image.
                Defaults to cv2.INTER_AREA.
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        """Snap ``x`` to a multiple of ``ensure_multiple_of``.

        ``x`` is rounded to the nearest multiple; if that exceeds ``max_val``
        it is rounded down instead, and if the result is below ``min_val`` it
        is rounded up.

        Returns:
            int: the constrained value
        """
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        """Compute the output size for an input of ``width`` x ``height``.

        Returns:
            tuple: (new_width, new_height)

        Raises:
            ValueError: if ``resize_method`` is not one of "lower_bound",
                "upper_bound" or "minimal".
        """
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            # Pick one of the two scale factors and use it for both axes.
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                fit_width = scale_width > scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                fit_width = scale_width < scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                fit_width = abs(1 - scale_width) < abs(1 - scale_height)
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )

            if fit_width:
                scale_height = scale_width
            else:
                scale_width = scale_height

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, min_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, min_val=self.__width
            )
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, max_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, max_val=self.__width
            )
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        """Resize the sample in place and return it.

        The image is always resized. When ``resize_target`` is true, the
        "disparity", "depth" and "mask" entries are resized too, each only
        if present in the sample.
        """
        width, height = self.get_size(
            sample["image"].shape[1], sample["image"].shape[0]
        )

        # resize sample
        sample["image"] = cv2.resize(
            sample["image"],
            (width, height),
            interpolation=self.__image_interpolation_method,
        )

        if self.__resize_target:
            if "disparity" in sample:
                sample["disparity"] = cv2.resize(
                    sample["disparity"],
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )

            if "depth" in sample:
                sample["depth"] = cv2.resize(
                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
                )

            # Guarded like the other optional targets: with the default
            # resize_target=True an image-only sample used to raise KeyError.
            if "mask" in sample:
                sample["mask"] = cv2.resize(
                    sample["mask"].astype(np.float32),
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )
                sample["mask"] = sample["mask"].astype(bool)

        return sample
195
+
196
+
197
class NormalizeImage(object):
    """Normalize the sample's image with a fixed mean and std."""

    def __init__(self, mean, std):
        """Store the mean to subtract and the std to divide by."""
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        """Replace ``sample["image"]`` by ``(image - mean) / std``; return the sample."""
        centered = sample["image"] - self.__mean
        sample["image"] = centered / self.__std
        return sample
209
+
210
+
211
class PrepareForNet(object):
    """Prepare sample for usage as network input."""

    def __init__(self):
        pass

    def __call__(self, sample):
        """Convert the sample's arrays to contiguous float32 and return it.

        The image has its axes reordered with ``(2, 0, 1)`` (last axis moved
        to the front). "mask", "disparity" and "depth" are converted only
        when present.
        """
        channels_first = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(channels_first).astype(np.float32)

        for key in ("mask", "disparity", "depth"):
            if key in sample:
                as_float = sample[key].astype(np.float32)
                sample[key] = np.ascontiguousarray(as_float)

        return sample
controlnet/annotator/midas/midas/vit.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import timm
4
+ import types
5
+ import math
6
+ import torch.nn.functional as F
7
+
8
+
9
class Slice(nn.Module):
    """Drop the leading entries along dimension 1."""

    def __init__(self, start_index=1):
        """Remember the index of the first entry to keep."""
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return ``x`` without its first ``start_index`` entries on dim 1."""
        kept = slice(self.start_index, None)
        return x[:, kept]
16
+
17
+
18
class AddReadout(nn.Module):
    """Add the readout entry to every remaining entry along dimension 1."""

    def __init__(self, start_index=1):
        """Remember the index of the first non-readout entry."""
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return ``x[:, start_index:]`` with the readout added to each entry.

        The readout is entry 0, or the mean of entries 0 and 1 when
        ``start_index`` is 2.
        """
        first = x[:, 0]
        readout = (first + x[:, 1]) / 2 if self.start_index == 2 else first
        remaining = x[:, self.start_index :]
        # unsqueeze restores dimension 1 so the readout broadcasts over it.
        return remaining + readout.unsqueeze(1)
29
+
30
+
31
class ProjectReadout(nn.Module):
    """Concatenate the readout entry onto every other entry and project back."""

    def __init__(self, in_features, start_index=1):
        """Create the ``Linear(2 * in_features, in_features)`` + GELU projection."""
        super().__init__()
        self.start_index = start_index

        self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())

    def forward(self, x):
        """Return the projection of ``[entry, readout]`` for each kept entry.

        Entry 0 along dimension 1 is the readout; entries from
        ``start_index`` onward are kept.
        """
        remaining = x[:, self.start_index :]
        # Repeat the readout (as a view) so it pairs with every kept entry.
        readout = x[:, 0].unsqueeze(1).expand_as(remaining)
        paired = torch.cat((remaining, readout), -1)
        return self.project(paired)
43
+
44
+
45
class Transpose(nn.Module):
    """Module wrapper around ``Tensor.transpose`` for two fixed dimensions."""

    def __init__(self, dim0, dim1):
        """Remember the two dimensions to swap."""
        super().__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` swapped."""
        return x.transpose(self.dim0, self.dim1)
54
+
55
+
56
def forward_vit(pretrained, x):
    """Run the backbone and post-process the four stored activations.

    Args:
        pretrained: Object exposing ``model`` (with ``forward_flex`` and
            ``patch_size``), an ``activations`` mapping keyed "1".."4" and
            the ``act_postprocess1`` .. ``act_postprocess4`` sequences.
        x: 4-D input; its last two dimensions are divided by the patch size
            to obtain the grid used for unflattening.

    Returns:
        tuple: the four post-processed layers, in order 1 to 4.
    """
    _, _, height, width = x.shape

    # The return value is not used here; the call is made so that
    # pretrained.activations is up to date (presumably filled by forward
    # hooks registered on the model -- confirm against the backbone setup).
    pretrained.model.forward_flex(x)

    layer_ids = (1, 2, 3, 4)
    captured = [pretrained.activations[str(idx)] for idx in layer_ids]
    postprocess = [getattr(pretrained, f"act_postprocess{idx}") for idx in layer_ids]

    # Step 1: the first two modules of every post-processing sequence.
    layers = [seq[0:2](act) for seq, act in zip(postprocess, captured)]

    # Step 2: in place of the sequence's own module at index 2, unflatten
    # dimension 2 to the grid of the actual input; only 3-D tensors need it.
    patch = pretrained.model.patch_size
    to_grid = nn.Unflatten(2, torch.Size([height // patch[1], width // patch[0]]))
    layers = [to_grid(layer) if layer.ndim == 3 else layer for layer in layers]

    # Step 3: everything from index 3 onward.
    layers = [seq[3:](layer) for seq, layer in zip(postprocess, layers)]

    return tuple(layers)
98
+
99
+
100
def _resize_pos_embed(self, posemb, gs_h, gs_w):
    """Resample the grid part of a position embedding to ``gs_h`` x ``gs_w``.

    The first ``self.start_index`` entries along dimension 1 are kept as
    they are; the remaining entries are treated as a square grid, resized
    with bilinear interpolation and flattened again.

    Args:
        posemb: Position embedding; only batch entry 0 of its grid part is used.
        gs_h: Target grid height.
        gs_w: Target grid width.

    Returns:
        The kept leading entries followed by the ``gs_h * gs_w`` resized
        grid entries, concatenated along dimension 1.
    """
    first_grid_entry = self.start_index
    leading = posemb[:, :first_grid_entry]
    grid = posemb[0, first_grid_entry:]

    # The stored grid is assumed square: its side is the integer square
    # root of the number of grid entries.
    old_side = int(math.sqrt(len(grid)))

    # (entries, dim) -> (1, dim, side, side), the layout F.interpolate expects.
    grid = grid.reshape(1, old_side, old_side, -1).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=(gs_h, gs_w), mode="bilinear")
    grid = grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)

    return torch.cat([leading, grid], dim=1)
115
+
116
+
117
def forward_flex(self, x):
    """Forward pass that resizes the position embedding to the input size.

    Written as a free function taking ``self`` so it can be bound to an
    existing model instance.

    Args:
        x: 4-D input batch.

    Returns:
        The normalised sequence after all blocks, including the prepended
        class (and, if the model has one, distillation) token.
    """
    batch, _, height, width = x.shape

    pos_embed = self._resize_pos_embed(
        self.pos_embed, height // self.patch_size[1], width // self.patch_size[0]
    )

    embedder = self.patch_embed
    if hasattr(embedder, "backbone"):
        x = embedder.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features

    x = embedder.proj(x).flatten(2).transpose(1, 2)

    # Prepend the class token and, when the model defines one, the
    # distillation token, each expanded (as a view) over the batch.
    prefix = [self.cls_token.expand(batch, -1, -1)]
    if getattr(self, "dist_token", None) is not None:
        prefix.append(self.dist_token.expand(batch, -1, -1))
    x = torch.cat((*prefix, x), dim=1)

    x = self.pos_drop(x + pos_embed)

    for blk in self.blocks:
        x = blk(x)

    return self.norm(x)
154
+
155
+
156
# Module-level store filled by the hooks created below. It is one dict for
# the whole module, so every hook made by get_activation writes into it.
activations = {}


def get_activation(name):
    """Return a forward hook that stores a module's output under ``name``.

    The hook has the ``(module, input, output)`` signature and saves
    ``output`` in the module-level ``activations`` dict, overwriting any
    earlier value for the same name.
    """

    def store_output(module, inputs, output):
        activations[name] = output

    return store_output
164
+
165
+
166
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Build one readout-handling module per entry of ``features``.

    Args:
        vit_features: Input width given to each ``ProjectReadout``.
        features: Sequence whose length decides how many operators are
            returned; the values themselves are not used.
        use_readout: "ignore" (``Slice``), "add" (``AddReadout``) or
            "project" (``ProjectReadout``).
        start_index: Forwarded to the created modules.

    Returns:
        list: the operators. For "ignore" and "add" a single instance is
        repeated; for "project" each entry is a separate module.

    Raises:
        AssertionError: if ``use_readout`` is not one of the three modes.
    """
    if use_readout == "ignore":
        return [Slice(start_index)] * len(features)
    if use_readout == "add":
        return [AddReadout(start_index)] * len(features)
    if use_readout == "project":
        # ProjectReadout holds its own Linear layer, so each entry gets a
        # separate instance.
        return [ProjectReadout(vit_features, start_index) for _ in features]

    # Raised explicitly instead of `assert False`: assert statements are
    # removed under `python -O`, which turned this check into an
    # UnboundLocalError further down. The exception type is kept as
    # AssertionError so existing callers see the same error as before.
    raise AssertionError(
        "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
    )
181
+
182
+
183
def _make_vit_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout="ignore",
    start_index=1,
):
    """Wrap ``model`` with activation hooks and four post-processing stacks.

    Args:
        model: Model exposing ``blocks``; it is modified (hooks registered,
            attributes and methods attached).
        features: Output channels of the four post-processing stacks.
        size: Reference size; divided by 16 to get the unflatten grid.
        hooks: Indices into ``model.blocks`` to capture, four entries.
        vit_features: Input channels of the 1x1 convolutions.
        use_readout: Readout mode passed to ``get_readout_oper``.
        start_index: Stored on the model and passed to ``get_readout_oper``.

    Returns:
        nn.Module: container with ``model``, ``activations`` and
        ``act_postprocess1`` .. ``act_postprocess4``.
    """
    pretrained = nn.Module()
    pretrained.model = model

    # Capture the outputs of the four chosen blocks under the keys "1".."4".
    for position, key in enumerate(("1", "2", "3", "4")):
        pretrained.model.blocks[hooks[position]].register_forward_hook(
            get_activation(key)
        )

    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)

    grid = torch.Size([size[0] // 16, size[1] // 16])

    def common_layers(stage):
        """Readout handling, transpose, unflatten and 1x1 conv of one stage."""
        return [
            readout_oper[stage],
            Transpose(1, 2),
            nn.Unflatten(2, grid),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[stage],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
        ]

    def upsample(stage, factor):
        """Transposed convolution with kernel size and stride ``factor``."""
        return nn.ConvTranspose2d(
            in_channels=features[stage],
            out_channels=features[stage],
            kernel_size=factor,
            stride=factor,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        )

    # Stages 1 and 2 end in a stride-4 / stride-2 transposed convolution,
    # stage 3 has no extra layer, stage 4 ends in a stride-2 convolution.
    pretrained.act_postprocess1 = nn.Sequential(*common_layers(0), upsample(0, 4))
    pretrained.act_postprocess2 = nn.Sequential(*common_layers(1), upsample(1, 2))
    pretrained.act_postprocess3 = nn.Sequential(*common_layers(2))
    pretrained.act_postprocess4 = nn.Sequential(
        *common_layers(3),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]

    # We inject these functions into the VisionTransformer instance so that
    # we can use it with interpolated position embeddings without modifying
    # the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model
    )

    return pretrained
295
+
296
+
297
def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
    """Create the timm "vit_large_patch16_384" model and wrap it as a backbone.

    Args:
        pretrained: Passed to ``timm.create_model`` as ``pretrained``.
        use_readout: Readout mode forwarded to ``_make_vit_b16_backbone``.
        hooks: Block indices to capture; ``[5, 11, 17, 23]`` when None.

    Returns:
        The module returned by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)

    # Identity test: `== None` would invoke the argument's own __eq__.
    hooks = [5, 11, 17, 23] if hooks is None else hooks
    return _make_vit_b16_backbone(
        model,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
    )
308
+
309
+
310
def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
    """Create the timm "vit_base_patch16_384" model and wrap it as a backbone.

    Args:
        pretrained: Passed to ``timm.create_model`` as ``pretrained``.
        use_readout: Readout mode forwarded to ``_make_vit_b16_backbone``.
        hooks: Block indices to capture; ``[2, 5, 8, 11]`` when None.

    Returns:
        The module returned by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)

    # Identity test: `== None` would invoke the argument's own __eq__.
    hooks = [2, 5, 8, 11] if hooks is None else hooks
    return _make_vit_b16_backbone(
        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
    )
317
+
318
+
319
def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
    """Create the timm "vit_deit_base_patch16_384" model and wrap it as a backbone.

    Args:
        pretrained: Passed to ``timm.create_model`` as ``pretrained``.
        use_readout: Readout mode forwarded to ``_make_vit_b16_backbone``.
        hooks: Block indices to capture; ``[2, 5, 8, 11]`` when None.

    Returns:
        The module returned by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)

    # Identity test: `== None` would invoke the argument's own __eq__.
    hooks = [2, 5, 8, 11] if hooks is None else hooks
    return _make_vit_b16_backbone(
        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
    )
326
+
327
+
328
def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
    """Create the timm "vit_deit_base_distilled_patch16_384" backbone.

    Args:
        pretrained: Passed to ``timm.create_model`` as ``pretrained``.
        use_readout: Readout mode forwarded to ``_make_vit_b16_backbone``.
        hooks: Block indices to capture; ``[2, 5, 8, 11]`` when None.

    Returns:
        The module returned by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model(
        "vit_deit_base_distilled_patch16_384", pretrained=pretrained
    )

    # Identity test: `== None` would invoke the argument's own __eq__.
    hooks = [2, 5, 8, 11] if hooks is None else hooks
    return _make_vit_b16_backbone(
        model,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout,
        # Two leading entries are skipped for this model -- presumably the
        # class and distillation tokens; confirm against the timm model.
        start_index=2,
    )
341
+
342
+
343
def _make_vit_b_rn50_backbone(
    model,
    features=[256, 512, 768, 768],
    size=[384, 384],
    hooks=[0, 1, 8, 11],
    vit_features=768,
    use_vit_only=False,
    use_readout="ignore",
    start_index=1,
):
    """Wrap a hybrid model with activation hooks and post-processing stacks.

    Args:
        model: Model exposing ``blocks`` and ``patch_embed.backbone.stages``;
            it is modified (hooks registered, attributes and methods attached).
        features: Output channels of the post-processing stacks.
        size: Reference size; divided by 16 to get the unflatten grid.
        hooks: Indices into ``model.blocks``. Entries 0 and 1 are only read
            when ``use_vit_only`` is true.
        vit_features: Input channels of the 1x1 convolutions.
        use_vit_only: If true, activations "1" and "2" come from
            ``model.blocks``; otherwise from the first two stages of
            ``model.patch_embed.backbone``.
        use_readout: Readout mode passed to ``get_readout_oper``.
        start_index: Stored on the model and passed to ``get_readout_oper``.

    Returns:
        nn.Module: container with ``model``, ``activations`` and
        ``act_postprocess1`` .. ``act_postprocess4``.
    """
    pretrained = nn.Module()
    pretrained.model = model

    vit_only = use_vit_only == True

    def early_source(position):
        """Module whose output is captured as activation ``position + 1``."""
        if vit_only:
            return pretrained.model.blocks[hooks[position]]
        return pretrained.model.patch_embed.backbone.stages[position]

    for position in (0, 1):
        early_source(position).register_forward_hook(
            get_activation(str(position + 1))
        )

    # Activations "3" and "4" always come from the transformer blocks.
    for position in (2, 3):
        pretrained.model.blocks[hooks[position]].register_forward_hook(
            get_activation(str(position + 1))
        )

    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)

    grid = torch.Size([size[0] // 16, size[1] // 16])

    def common_layers(stage):
        """Readout handling, transpose, unflatten and 1x1 conv of one stage."""
        return [
            readout_oper[stage],
            Transpose(1, 2),
            nn.Unflatten(2, grid),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[stage],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
        ]

    def upsample(stage, factor):
        """Transposed convolution with kernel size and stride ``factor``."""
        return nn.ConvTranspose2d(
            in_channels=features[stage],
            out_channels=features[stage],
            kernel_size=factor,
            stride=factor,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        )

    if vit_only:
        pretrained.act_postprocess1 = nn.Sequential(*common_layers(0), upsample(0, 4))
        pretrained.act_postprocess2 = nn.Sequential(*common_layers(1), upsample(1, 2))
    else:
        # Three-entry pass-through stacks: the sequences are sliced by
        # position elsewhere, so the length is kept.
        pretrained.act_postprocess1 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity()
        )
        pretrained.act_postprocess2 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity()
        )

    pretrained.act_postprocess3 = nn.Sequential(*common_layers(2))
    pretrained.act_postprocess4 = nn.Sequential(
        *common_layers(3),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]

    # We inject these functions into the VisionTransformer instance so that
    # we can use it with interpolated position embeddings without modifying
    # the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model
    )

    return pretrained
476
+
477
+
478
def _make_pretrained_vitb_rn50_384(
    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
):
    """Create the timm "vit_base_resnet50_384" model and wrap it as a backbone.

    Args:
        pretrained: Passed to ``timm.create_model`` as ``pretrained``.
        use_readout: Readout mode forwarded to ``_make_vit_b_rn50_backbone``.
        hooks: Block indices to capture; ``[0, 1, 8, 11]`` when None.
        use_vit_only: Forwarded to ``_make_vit_b_rn50_backbone``.

    Returns:
        The module returned by ``_make_vit_b_rn50_backbone``.
    """
    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)

    # Identity test: `== None` would invoke the argument's own __eq__.
    hooks = [0, 1, 8, 11] if hooks is None else hooks
    return _make_vit_b_rn50_backbone(
        model,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )
controlnet/annotator/midas/utils.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utils for monoDepth."""
2
+ import sys
3
+ import re
4
+ import numpy as np
5
+ import cv2
6
+ import torch
7
+
8
+
9
def read_pfm(path):
    """Read pfm file.

    Args:
        path (str): path to file

    Returns:
        tuple: (data, scale) where data is the image array flipped
            vertically relative to the order stored in the file and scale
            is the absolute value of the scale stored in the header.

    Raises:
        Exception: if the first line is neither "PF" nor "Pf", or if the
            dimension line does not match "<width> <height>".
    """
    with open(path, "rb") as file:
        # Compare raw bytes: decoding first would turn a non-ASCII header
        # (i.e. some other binary format) into a UnicodeDecodeError instead
        # of the intended "Not a PFM file" error.
        header = file.readline().rstrip()
        if header == b"PF":
            color = True
        elif header == b"Pf":
            color = False
        else:
            # format() rather than "+" so non-str paths do not raise TypeError.
            raise Exception("Not a PFM file: {}".format(path))

        # Undecodable bytes become replacement characters, which simply
        # fail the match below.
        dim_line = file.readline().decode("ascii", errors="replace")
        dim_match = re.match(r"^(\d+)\s(\d+)\s$", dim_line)
        if not dim_match:
            raise Exception("Malformed PFM header.")
        width, height = map(int, dim_match.groups())

        scale = float(file.readline().decode("ascii").rstrip())
        if scale < 0:
            # A negative scale marks little-endian sample data.
            endian = "<"
            scale = -scale
        else:
            # big-endian
            endian = ">"

        data = np.fromfile(file, endian + "f")
        shape = (height, width, 3) if color else (height, width)

        data = np.reshape(data, shape)
        data = np.flipud(data)

        return data, scale
56
+
57
+
58
def write_pfm(path, image, scale=1):
    """Write pfm file.

    Args:
        path (str): path to file
        image (array): float32 data of shape H x W x 3, H x W x 1 or H x W
        scale (int, optional): Scale. Defaults to 1.

    Raises:
        Exception: if the image is not float32 or has an unsupported shape.
    """
    # Validate before opening so invalid input does not leave behind a
    # truncated file.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    if image.ndim == 3 and image.shape[2] == 3:  # color image
        color = True
    elif image.ndim == 2 or (image.ndim == 3 and image.shape[2] == 1):  # greyscale
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    # Rows are written in reverse vertical order.
    image = np.flipud(image)

    # The sign of the scale encodes byte order: negative means little-endian.
    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(path, "wb") as file:
        # Encode the whole header; the original only encoded the "Pf" branch
        # and tried to write a str to the binary file for color images.
        file.write(("PF\n" if color else "Pf\n").encode())
        file.write(("%d %d\n" % (image.shape[1], image.shape[0])).encode())
        file.write(("%f\n" % scale).encode())

        image.tofile(file)
95
+
96
+
97
def read_image(path):
    """Read image and output RGB image (0-1).

    Args:
        path (str): path to file

    Returns:
        array: RGB image (0-1)

    Raises:
        OSError: if OpenCV could not load an image from ``path``.
    """
    img = cv2.imread(path)

    # cv2.imread signals failure by returning None rather than raising;
    # fail here with a clear message instead of an AttributeError below.
    if img is None:
        raise OSError("Could not read image: {}".format(path))

    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    # OpenCV loads channels in BGR order; convert and scale to [0, 1].
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0

    return img
114
+
115
+
116
def resize_image(img):
    """Resize image and make it fit for network.

    The longer side is scaled to 384 and each side is then rounded up to
    a multiple of 32.

    Args:
        img (array): image, indexed as (height, width, channel)

    Returns:
        tensor: float tensor with the channel axis moved to the front and a
            leading axis of size 1 added
    """
    height_orig, width_orig = img.shape[0], img.shape[1]

    # Scale factor that maps the longer side onto 384 pixels.
    scale = max(width_orig, height_orig) / 384

    # Round each target side up to the next multiple of 32.
    height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
    width = (np.ceil(width_orig / scale / 32) * 32).astype(int)

    resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)

    # (H, W, C) -> (C, H, W), then add the leading axis.
    channels_first = np.transpose(resized, (2, 0, 1))
    tensor = torch.from_numpy(channels_first).contiguous().float()
    return tensor.unsqueeze(0)
144
+
145
+
146
def resize_depth(depth, width, height):
    """Resize depth map and bring to CPU (numpy).

    Args:
        depth (tensor): depth; only the first element along the leading
            axis is used
        width (int): image width
        height (int): image height

    Returns:
        array: processed depth
    """
    # Take the first entry of the 4-D input, drop size-1 axes, move to CPU.
    first = depth[0, :, :, :]
    depth_cpu = torch.squeeze(first).to("cpu")

    return cv2.resize(
        depth_cpu.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
    )
164
+
165
def write_depth(path, depth, bits=1):
    """Write depth map to pfm and png file.

    Args:
        path (str): filepath without extension
        depth (array): depth
        bits (int, optional): bytes per PNG sample; 1 writes uint8 and 2
            writes uint16. For any other value only the pfm file is
            written. Defaults to 1.
    """
    write_pfm(path + ".pfm", depth.astype(np.float32))

    depth_min = depth.min()
    depth_max = depth.max()

    # Largest value representable with the requested number of bytes.
    max_val = (2 ** (8 * bits)) - 1

    if depth_max - depth_min > np.finfo("float").eps:
        # Stretch the depth range linearly onto [0, max_val].
        out = max_val * (depth - depth_min) / (depth_max - depth_min)
    else:
        # (Near-)constant depth: emit an all-zero image. ndarrays expose
        # ``dtype``; the previous ``depth.type`` raised AttributeError here.
        out = np.zeros(depth.shape, dtype=depth.dtype)

    if bits == 1:
        cv2.imwrite(path + ".png", out.astype("uint8"))
    elif bits == 2:
        cv2.imwrite(path + ".png", out.astype("uint16"))

    return
controlnet/annotator/util.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import numpy as np
4
+ import cv2
5
+ import os
6
+ import PIL
7
+
8
+ annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
9
+
10
def HWC3(x):
    """Return a uint8 image with exactly three channels.

    A 2-D input is treated as a single channel. Single-channel input is
    replicated three times, three-channel input is returned unchanged, and
    four-channel input is blended over a white (255) background using the
    fourth channel as alpha.

    Args:
        x (np.ndarray): uint8 array of shape (H, W) or (H, W, C) with
            C in {1, 3, 4}.

    Returns:
        np.ndarray: uint8 array of shape (H, W, 3).
    """
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    channels = x.shape[2]
    assert channels in (1, 3, 4)

    if channels == 3:
        return x
    if channels == 1:
        return np.concatenate([x, x, x], axis=2)

    # channels == 4: composite the colour planes over white.
    rgb = x[:, :, 0:3].astype(np.float32)
    opacity = x[:, :, 3:4].astype(np.float32) / 255.0
    blended = rgb * opacity + 255.0 * (1.0 - opacity)
    return blended.clip(0, 255).astype(np.uint8)
27
+
28
+
29
def resize_image(input_image, resolution, short=False, interpolation=None):
    """Resize an image so one side is about ``resolution``, snapped to 64.

    Args:
        input_image (PIL.Image.Image | np.ndarray): image to resize. Arrays
            may be (H, W) or (H, W, C).
        resolution (int | float): target length of the reference side.
        short (bool): if True the shorter side is the reference side,
            otherwise the longer side is.
        interpolation: resampling filter passed to the backend. When None,
            Lanczos is used for upscaling; for downscaling cv2.INTER_AREA
            (arrays) or PIL bilinear (PIL images) is used.

    Returns:
        The resized image, of the same kind (PIL image or array) as the input.

    Raises:
        TypeError: if ``input_image`` is neither a PIL image nor an ndarray.
        ValueError: if an ndarray input is not 2-D or 3-D.
    """
    # A bare "import PIL" does not load the Image submodule; import it
    # explicitly so PIL.Image is always available here.
    import PIL.Image

    if isinstance(input_image, PIL.Image.Image):
        mode = 'pil'
        W, H = input_image.size
    elif isinstance(input_image, np.ndarray):
        if input_image.ndim not in (2, 3):
            raise ValueError(
                "input_image must be a 2-D or 3-D array, got %d dimensions"
                % input_image.ndim
            )
        mode = 'cv2'
        # shape[:2] also accepts single-channel (H, W) arrays.
        H, W = input_image.shape[:2]
    else:
        # Previously fell through and failed later with an unbound name.
        raise TypeError(
            "input_image must be a PIL image or a numpy array, got %s"
            % type(input_image).__name__
        )

    H = float(H)
    W = float(W)
    # k > 1 upscales, k < 1 downscales.
    reference_side = min(H, W) if short else max(H, W)
    k = float(resolution) / reference_side

    # Snap both sides to the nearest multiple of 64, never below 64, so an
    # extreme aspect ratio cannot round a side down to zero pixels.
    H = max(64, int(np.round(H * k / 64.0)) * 64)
    W = max(64, int(np.round(W * k / 64.0)) * 64)

    if mode == 'cv2':
        if interpolation is None:
            interpolation = cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA
        img = cv2.resize(input_image, (W, H), interpolation=interpolation)
    else:
        if interpolation is None:
            interpolation = PIL.Image.LANCZOS if k > 1 else PIL.Image.BILINEAR
        img = input_image.resize((W, H), resample=interpolation)

    return img
60
+
61
+ # def resize_image(input_image, resolution):
62
+ # H, W, C = input_image.shape
63
+ # H = float(H)
64
+ # W = float(W)
65
+ # k = float(resolution) / min(H, W)
66
+ # H *= k
67
+ # W *= k
68
+ # H = int(np.round(H / 64.0)) * 64
69
+ # W = int(np.round(W / 64.0)) * 64
70
+ # img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
71
+ # return img
72
+
73
+
74
def nms(x, t, s):
    """Thin a response map by directional non-maximum suppression.

    The input is Gaussian-blurred with sigma ``s``. A pixel keeps its
    blurred value if it equals the dilation of the blurred map under at
    least one of four 3x3 line-shaped kernels (horizontal, vertical and the
    two diagonals). Kept values above ``t`` become 255, everything else 0.

    Args:
        x (np.ndarray): input map; converted to float32 before blurring.
        t: threshold applied to the kept values.
        s: Gaussian sigma passed to ``cv2.GaussianBlur``.

    Returns:
        np.ndarray: uint8 array containing only 0 and 255.
    """
    blurred = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    line_kernels = (
        np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8),
        np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8),
        np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8),
        np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8),
    )

    kept = np.zeros_like(blurred)
    for kernel in line_kernels:
        # Pixels unchanged by dilation are maxima along this kernel's line.
        is_line_max = cv2.dilate(blurred, kernel=kernel) == blurred
        np.putmask(kept, is_line_max, blurred)

    result = np.zeros_like(kept, dtype=np.uint8)
    result[kept > t] = 255
    return result
90
+
91
+
92
def make_noise_disk(H, W, C, F):
    """Generate smooth random noise rescaled so its minimum is 0 and maximum is 1.

    Uniform noise is drawn on a coarse grid, upsampled with cubic
    interpolation, cropped to H x W, and then shifted and divided so the
    values span 0 to 1.

    Args:
        H (int): output height.
        W (int): output width.
        C (int): number of channels.
        F (int): coarseness factor; the coarse grid has roughly H // F by
            W // F cells.

    Returns:
        np.ndarray: array of shape (H, W, C).
    """
    coarse_shape = ((H // F) + 2, (W // F) + 2, C)
    field = np.random.uniform(low=0, high=1, size=coarse_shape)

    # Upsample with a margin of F pixels on every side, then crop it away.
    field = cv2.resize(field, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
    field = field[F: F + H, F: F + W]

    field -= np.min(field)
    field /= np.max(field)

    if C == 1:
        # Presumably cv2.resize drops the single channel axis; restore it.
        field = field[:, :, None]
    return field
101
+
102
+
103
def min_max_norm(x):
    """Shift and scale ``x`` in place so that its values lie in [0, 1].

    The minimum is subtracted first, then the result is divided by its
    maximum, floored at 1e-5 to avoid dividing by zero.

    Args:
        x: array to normalise; it is modified by the augmented assignments.

    Returns:
        The normalised ``x``.
    """
    lowest = np.min(x)
    x -= lowest

    # Floor the divisor so a constant input does not divide by zero.
    divisor = np.maximum(np.max(x), 1e-5)
    x /= divisor
    return x
107
+
108
+
109
def safe_step(x, step=2):
    """Quantise ``x`` into discrete levels.

    The input is multiplied by ``step + 1``, truncated through an int32
    cast, and divided by ``step``.

    Args:
        x (np.ndarray): input values.
        step (int): quantisation parameter. Defaults to 2.

    Returns:
        np.ndarray: float32 array of quantised values.
    """
    scaled = x.astype(np.float32) * float(step + 1)
    # The int32 cast truncates toward zero.
    truncated = scaled.astype(np.int32)
    return truncated.astype(np.float32) / float(step)
113
+
114
+
115
def img2mask(img, H, W, low=10, high=90):
    """Derive a random boolean mask from a uint8 image.

    One channel is picked at random (for 3-D input), resized to W x H with
    cubic interpolation, inverted with probability 0.5, and thresholded at
    a randomly chosen percentile of its own values.

    Args:
        img (np.ndarray): uint8 image, 2-D or 3-D.
        H (int): mask height.
        W (int): mask width.
        low (int): lower bound (inclusive) of the random percentile.
        high (int): upper bound (exclusive) of the random percentile.

    Returns:
        np.ndarray: boolean array, True where the plane is below the
            chosen percentile.
    """
    assert img.ndim in (3, 2)
    assert img.dtype == np.uint8

    if img.ndim == 3:
        channel = random.randrange(0, img.shape[2])
        plane = img[:, :, channel]
    else:
        plane = img

    plane = cv2.resize(plane, (W, H), interpolation=cv2.INTER_CUBIC)

    invert = random.uniform(0, 1) < 0.5
    if invert:
        plane = 255 - plane

    percentile = random.randrange(low, high)
    cutoff = np.percentile(plane, percentile)
    return plane < cutoff
controlnet/assets/bird.png ADDED

Git LFS Details

  • SHA256: e74821365819a2141455e85d5a1c4fa443167dc707e296059c6f4a9d3d93b2f5
  • Pointer size: 131 Bytes
  • Size of remote file: 612 kB
controlnet/assets/dog.png ADDED

Git LFS Details

  • SHA256: a48c9d517b9a9bd27f31c7fa7e6e4128e27e485168c566dc88db9ece60703338
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
controlnet/assets/woman_1.png ADDED
controlnet/assets/woman_2.png ADDED

Git LFS Details

  • SHA256: 747d7dd713fc6c50494dcfbb66ffb5bc8fbbb2e84e638a976b5aaca29f9b85f3
  • Pointer size: 131 Bytes
  • Size of remote file: 684 kB
controlnet/assets/woman_3.png ADDED

Git LFS Details

  • SHA256: 0a46fa8e9d61907f4b8088ab39b7d53a1cc1e57f739305e04026509a00b6e4f1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
controlnet/assets/woman_4.png ADDED