""" Emu3VisionVQ model configuration """ |
|
|
|
from typing import List |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.utils import logging |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class Emu3VisionVQConfig(PretrainedConfig): |
|
r""" |
|
This is the configuration class to store the configuration of a [`Emu3VisionVQ`]. It is used to instantiate an video movq |
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the |
|
defaults will yield a configuration to the VQ model presented in Emu3 paper. |
|
|
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
|
documentation from [`PretrainedConfig`] for more information. |
|
|
|
|
|
Args: |
|
codebook_size (`int`, *optional*, defaults to 32768): |
|
Codebook size of the VQ model. |
|
embed_dim (`int`, *optional*, defaults to 4): |
|
Dimension of the quantized vector in codebook. |
|
z_channels (`int`, *optional*, defaults to 4): |
|
Dimension of the output channel of encoder and the input channel of decoder |
|
double_z (`bool`, *optional*, defaults to False): |
|
Whether double the output dim of the encoder. |
|
in_channels (`int`, *optional*, defaults to 3): |
|
Input channel of encoder. |
|
out_channels (`int`, *optional*, defaults to 3): |
|
Output channel of decoder. |
|
temporal_downsample_factor (`int`, *optional*, defaults to 4): |
|
Temporal downsample factor. |
|
ch (`int`, *optional*, defaults to 256): |
|
Basic channel number of the intermediate blocks. |
|
ch_mult (`List[int]`, *optional*, defaults to `[1, 2, 2, 4]`): |
|
Channel scaling factor of the intermediate blocks. |
|
num_res_blocks (`int`, *optional*, defaults to 2): |
|
Residual block number in each stage. |
|
attn_resolutions (`List[int]`, *optional*, defaults to 3): |
|
Stage indices to apply attention. |
|
dropout (`float`, *optional*, defaults to 0.0): |
|
Dropout probability. |
|
|
|
```python |
|
>>> from transformers import Emu3VisionVQ, Emu3VisionVQConfig |
|
|
|
>>> # Initializing a video VQ model of Emu3 configuration |
|
>>> configuration = Emu3VisionVQConfig() |
|
|
|
>>> # Initializing a model from the Emu3 VQ model style configuration |
|
>>> model = Emu3VisionVQModel(configuration) |
|
|
|
>>> # Accessing the model configuration |
|
>>> configuration = model.config |
|
```""" |

    model_type = "Emu3VisionVQ"

    def __init__(
        self,
        codebook_size: int = 32768,
        embed_dim: int = 4,
        z_channels: int = 4,
        double_z: bool = False,
        in_channels: int = 3,
        out_channels: int = 3,
        temporal_downsample_factor: int = 4,
        ch: int = 256,
        ch_mult: List[int] = [1, 2, 2, 4],
        num_res_blocks: int = 2,
        attn_resolutions: List[int] = [3],
        dropout: float = 0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.codebook_size = codebook_size
        self.embed_dim = embed_dim
        self.z_channels = z_channels
        self.double_z = double_z
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.temporal_downsample_factor = temporal_downsample_factor
        self.ch = ch
        self.ch_mult = ch_mult
        self.num_res_blocks = num_res_blocks
        self.attn_resolutions = attn_resolutions
        self.dropout = dropout
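

# A minimal usage sketch (not part of the original module): it instantiates the config and
# derives a spatial downsample factor from `ch_mult`, assuming the usual VQ-GAN style encoder
# in which every stage except the last halves the spatial resolution. The
# 2 ** (len(ch_mult) - 1) relation is an illustrative assumption, not something stated by
# this configuration file.
if __name__ == "__main__":
    config = Emu3VisionVQConfig()

    # With the defaults, ch_mult = [1, 2, 2, 4], so the assumed spatial factor is 2 ** 3 = 8.
    spatial_downsample_factor = 2 ** (len(config.ch_mult) - 1)

    print(f"codebook size:              {config.codebook_size}")
    print(f"temporal downsample factor: {config.temporal_downsample_factor}")
    print(f"spatial downsample factor:  {spatial_downsample_factor} (assumed, see comment above)")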