Add link to paper

#2
by nielsr HF Staff - opened
Files changed (1) hide show
  1. README.md +4 -222
README.md CHANGED
@@ -4,226 +4,8 @@ library_name: diffusers
4
  pipeline_tag: image-to-3d
5
  ---
6
 
7
- # File information
8
 
9
- The repository contains the following file information:
10
-
11
- Filename: model_index.json
12
- Content: {
13
- "_class_name": "MVDiffusionImagePipeline",
14
- "_diffusers_version": "0.30.3",
15
- "feature_extractor": [
16
- "transformers",
17
- "CLIPImageProcessor"
18
- ],
19
- "image_encoder": [
20
- "transformers",
21
- "CLIPVisionModelWithProjection"
22
- ],
23
- "requires_safety_checker": true,
24
- "safety_checker": [
25
- null,
26
- null
27
- ],
28
- "scheduler": [
29
- "diffusers",
30
- "DDIMScheduler"
31
- ],
32
- "unet": [
33
- "mv_unet",
34
- "UnifieldWrappedUNet"
35
- ],
36
- "vae": [
37
- "diffusers",
38
- "AutoencoderKL"
39
- ]
40
- }
41
-
42
- Filename: config.json
43
- Content: {
44
- "_class_name": "AutoencoderKL",
45
- "_diffusers_version": "0.30.3",
46
- "_name_or_path": "Luffuly/unique3d-mvimage-diffuser",
47
- "act_fn": "silu",
48
- "block_out_channels": [
49
- 128,
50
- 256,
51
- 512,
52
- 512
53
- ],
54
- "down_block_types": [
55
- "DownEncoderBlock2D",
56
- "DownEncoderBlock2D",
57
- "DownEncoderBlock2D",
58
- "DownEncoderBlock2D"
59
- ],
60
- "force_upcast": true,
61
- "in_channels": 3,
62
- "latent_channels": 4,
63
- "latents_mean": null,
64
- "latents_std": null,
65
- "layers_per_block": 2,
66
- "mid_block_add_attention": true,
67
- "norm_num_groups": 32,
68
- "out_channels": 3,
69
- "sample_size": 256,
70
- "scaling_factor": 0.18215,
71
- "shift_factor": null,
72
- "up_block_types": [
73
- "UpDecoderBlock2D",
74
- "UpDecoderBlock2D",
75
- "UpDecoderBlock2D",
76
- "UpDecoderBlock2D"
77
- ],
78
- "use_post_quant_conv": true,
79
- "use_quant_conv": true
80
- }
81
-
82
- Filename: scheduler_config.json
83
- Content: {
84
- "_class_name": "DDIMScheduler",
85
- "_diffusers_version": "0.30.3",
86
- "beta_end": 0.012,
87
- "beta_schedule": "scaled_linear",
88
- "beta_start": 0.00085,
89
- "clip_sample": false,
90
- "clip_sample_range": 1.0,
91
- "dynamic_thresholding_ratio": 0.995,
92
- "num_train_timesteps": 1000,
93
- "prediction_type": "epsilon",
94
- "rescale_betas_zero_snr": false,
95
- "sample_max_value": 1.0,
96
- "set_alpha_to_one": false,
97
- "skip_prk_steps": true,
98
- "steps_offset": 1,
99
- "thresholding": false,
100
- "timestep_spacing": "leading",
101
- "trained_betas": null
102
- }
103
-
104
- Filename: config.json
105
- Content: {
106
- "_class_name": "UnifieldWrappedUNet",
107
- "_diffusers_version": "0.30.3",
108
- "_name_or_path": "outputs/vroid-mvimage-6view/checkpoint",
109
- "act_fn": "silu",
110
- "addition_embed_type": null,
111
- "addition_embed_type_num_heads": 64,
112
- "addition_time_embed_dim": null,
113
- "attention_head_dim": 8,
114
- "attention_type": "default",
115
- "block_out_channels": [
116
- 320,
117
- 640,
118
- 1280,
119
- 1280
120
- ],
121
- "center_input_sample": false,
122
- "class_embed_type": null,
123
- "class_embeddings_concat": false,
124
- "conv_in_kernel": 3,
125
- "conv_out_kernel": 3,
126
- "cross_attention_dim": 768,
127
- "cross_attention_norm": null,
128
- "down_block_types": [
129
- "CrossAttnDownBlock2D",
130
- "CrossAttnDownBlock2D",
131
- "CrossAttnDownBlock2D",
132
- "DownBlock2D"
133
- ],
134
- "downsample_padding": 1,
135
- "dropout": 0.0,
136
- "dual_cross_attention": false,
137
- "encoder_hid_dim": null,
138
- "encoder_hid_dim_type": null,
139
- "flip_sin_to_cos": true,
140
- "freq_shift": 0,
141
- "in_channels": 8,
142
- "layers_per_block": 2,
143
- "mid_block_only_cross_attention": null,
144
- "mid_block_scale_factor": 1,
145
- "mid_block_type": "UNetMidBlock2DCrossAttn",
146
- "norm_eps": 1e-05,
147
- "norm_num_groups": 32,
148
- "num_attention_heads": null,
149
- "num_class_embeds": 8,
150
- "only_cross_attention": false,
151
- "out_channels": 4,
152
- "projection_class_embeddings_input_dim": null,
153
- "resnet_out_scale_factor": 1.0,
154
- "resnet_skip_time_act": false,
155
- "resnet_time_scale_shift": "default",
156
- "reverse_transformer_layers_per_block": null,
157
- "sample_size": 64,
158
- "time_cond_proj_dim": null,
159
- "time_embedding_act_fn": null,
160
- "time_embedding_dim": null,
161
- "time_embedding_type": "positional",
162
- "timestep_post_act": null,
163
- "transformer_layers_per_block": 1,
164
- "up_block_types": [
165
- "UpBlock2D",
166
- "CrossAttnUpBlock2D",
167
- "CrossAttnUpBlock2D",
168
- "CrossAttnUpBlock2D"
169
- ],
170
- "n_views": 6,
171
- "upcast_attention": false,
172
- "use_linear_projection": false
173
- }
174
-
175
- Filename: preprocessor_config.json
176
- Content: {
177
- "crop_size": {
178
- "height": 224,
179
- "width": 224
180
- },
181
- "do_center_crop": true,
182
- "do_convert_rgb": true,
183
- "do_normalize": true,
184
- "do_rescale": true,
185
- "do_resize": true,
186
- "image_mean": [
187
- 0.48145466,
188
- 0.4578275,
189
- 0.40821073
190
- ],
191
- "image_processor_type": "CLIPImageProcessor",
192
- "image_std": [
193
- 0.26862954,
194
- 0.26130258,
195
- 0.27577711
196
- ],
197
- "resample": 3,
198
- "rescale_factor": 0.00392156862745098,
199
- "size": {
200
- "shortest_edge": 224
201
- }
202
- }
203
-
204
- Filename: config.json
205
- Content: {
206
- "_name_or_path": "Luffuly/unique3d-mvimage-diffuser",
207
- "architectures": [
208
- "CLIPVisionModelWithProjection"
209
- ],
210
- "attention_dropout": 0.0,
211
- "dropout": 0.0,
212
- "hidden_act": "quick_gelu",
213
- "hidden_size": 1024,
214
- "image_size": 224,
215
- "initializer_factor": 1.0,
216
- "initializer_range": 0.02,
217
- "intermediate_size": 4096,
218
- "layer_norm_eps": 1e-05,
219
- "model_type": "clip_vision_model",
220
- "num_attention_heads": 16,
221
- "num_channels": 3,
222
- "num_hidden_layers": 24,
223
- "patch_size": 14,
224
- "projection_dim": 768,
225
- "torch_dtype": "float16",
226
- "transformers_version": "4.45.2"
227
- }
228
-
229
- # Code: https://github.com/TingtingLiao/soap
 
4
  pipeline_tag: image-to-3d
5
  ---
6
 
7
+ # SOAP: Style-Omniscient Animatable Portraits
8
 
9
+ This repository contains the model presented in the paper [SOAP: Style-Omniscient Animatable Portraits](https://huggingface.co/papers/2505.05022).
10
+
11
+ Code: https://github.com/TingtingLiao/soap