from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig

__all__ = ["VoRAConfig"]

class VoRAConfig(PretrainedConfig):
    """Configuration for VoRA models: the LLM backbone, auxiliary vision model,
    LoRA settings, and vision-embedding/patch options."""

    model_type = "vora"
    _auto_class = "AutoConfig"  # expose this config through the AutoConfig auto-class mechanism

    def __init__(
        self,
        llm: str = "",
        aux_vision: str = "",
        lora: Optional[dict] = None,
        image_size: int = 448,
        vision_embedding_type: str = "",
        vision_embedding_intermediate_size: int = 1536,
        patch_size: int = 14,
        vision_attention_mask: str = "bidirectional",
        rms_norm_eps: float = 1e-5,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.llm = llm
        self.aux_vision = aux_vision
        # Avoid a shared mutable default: fall back to an empty LoRA config.
        self.lora = lora if lora is not None else {}
        self.image_size = image_size
        self.vision_embedding_type = vision_embedding_type
        self.vision_embedding_intermediate_size = vision_embedding_intermediate_size
        self.patch_size = patch_size
        self.vision_attention_mask = vision_attention_mask
        self.rms_norm_eps = rms_norm_eps
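

# Usage sketch (not part of the library): builds a VoRAConfig, round-trips it
# through save_pretrained / from_pretrained, and registers it with AutoConfig so
# that model_type "vora" resolves automatically. The backbone name and LoRA
# hyperparameters below are illustrative assumptions, not values from this repo.
if __name__ == "__main__":
    from transformers import AutoConfig

    config = VoRAConfig(
        llm="Qwen/Qwen2.5-1.5B-Instruct",    # assumed LLM backbone identifier
        aux_vision="",                        # optional auxiliary vision model
        lora={"r": 64, "lora_alpha": 128},    # assumed LoRA hyperparameters
        image_size=448,
        patch_size=14,
    )

    config.save_pretrained("./vora-config")   # writes ./vora-config/config.json
    reloaded = VoRAConfig.from_pretrained("./vora-config")
    assert reloaded.image_size == config.image_size

    # Register the custom config so AutoConfig maps model_type "vora" to it.
    AutoConfig.register("vora", VoRAConfig)
    auto_loaded = AutoConfig.from_pretrained("./vora-config")
    assert isinstance(auto_loaded, VoRAConfig)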