"""TIPSv2 model configuration.""" from transformers import PretrainedConfig _VISION_FN_BY_GEOMETRY = { (768, 12): "vit_base", (1024, 24): "vit_large", (1152, 27): "vit_so400m", (1536, 40): "vit_giant2", } class TIPSv2Config(PretrainedConfig): """Configuration for TIPSv2 vision-language model.""" model_type = "tipsv2" def __init__( self, vision_config=None, text_config=None, temperature_init_value=0.01, **kwargs, ): super().__init__(**kwargs) vision_config = vision_config or {} text_config = text_config or {} hidden_size = vision_config.get("hidden_size", 768) num_hidden_layers = vision_config.get("num_hidden_layers", 12) self.vision_fn = _VISION_FN_BY_GEOMETRY[(hidden_size, num_hidden_layers)] self.embed_dim = hidden_size self.patch_size = vision_config.get("patch_size", 14) self.img_size = vision_config.get("image_size", 448) self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp" self.init_values = vision_config.get("layerscale_value", 1.0) self.num_register_tokens = vision_config.get("num_register_tokens", 1) self.text_hidden_size = text_config.get("hidden_size", 768) self.text_mlp_dim = text_config.get("intermediate_size", 3072) self.text_num_heads = text_config.get("num_attention_heads", 12) self.text_num_layers = text_config.get("num_hidden_layers", 12) self.vocab_size = text_config.get("vocab_size", 32000) self.max_len = text_config.get("max_position_embeddings", 64) self.temperature = temperature_init_value