| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """PyTorch Aria vision transformer.""" |
| |
|
| | from typing import Optional, Tuple, Union |
| |
|
| | import torch |
| | import torch.utils.checkpoint |
| | from transformers import SiglipVisionConfig, SiglipVisionModel |
| | from transformers.modeling_outputs import BaseModelOutputWithPooling |
| | from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer |
| |
|
| |
|
| | class AriaVisionConfig(SiglipVisionConfig): |
| | """Configuration class for AriaVisionModel.""" |
| |
|
| | model_type = "aria_vision_model" |
| |
|
| | def __init__( |
| | self, |
| | **kwargs, |
| | ): |
| | super().__init__(**kwargs) |
| |
|
| |
|
| | class IdentityOp(torch.nn.Module): |
| | """ |
| | An identity operation that returns the input unchanged. |
| | |
| | This can be used as a placeholder or to maintain architectural consistency |
| | when a specific operation is not needed. |
| | """ |
| |
|
| | def __init__(self, *args, **kwargs): |
| | super().__init__() |
| |
|
| | def forward(self, x, *args, **kwargs): |
| | return x |
| |
|
| |
|
| | class AriaVisionTransformer(Idefics2VisionTransformer): |
| | """ |
| | Aria Vision Transformer model based on Idefics2VisionTransformer. |
| | |
| | This class extends the original Idefics2VisionTransformer by removing the post-layernorm operation. |
| | """ |
| |
|
| | def __init__(self, config: AriaVisionConfig): |
| | super().__init__(config) |
| | self.post_layernorm = IdentityOp() |
| |
|
| |
|
| | class AriaVisionModel(SiglipVisionModel): |
| | """ |
| | Aria Vision Model extends SiglipVisionModel to support pixel_mask. |
| | |
| | The pixel_mask is a 2D boolean tensor that indicates which pixels in the input |
| | image are actual content and which are padding. It has the same height and width |
| | as the input image, where: |
| | - True (1) values represent pixels from the original image |
| | - False (0) values represent padding pixels |
| | |
| | This mask helps the model focus on the relevant parts of the image during processing. |
| | """ |
| |
|
| | config_class = AriaVisionConfig |
| | main_input_name = "pixel_values" |
| | _supports_sdpa = False |
| |
|
| | def __init__(self, config: AriaVisionConfig): |
| | super().__init__(config) |
| | self.vision_model = AriaVisionTransformer(config) |
| |
|
| | |
| | self.post_init() |
| |
|
| | def forward( |
| | self, |
| | pixel_values: torch.Tensor, |
| | pixel_mask: Optional[torch.BoolTensor] = None, |
| | output_attentions: Optional[bool] = None, |
| | output_hidden_states: Optional[bool] = None, |
| | return_dict: Optional[bool] = None, |
| | ) -> Union[Tuple, BaseModelOutputWithPooling]: |
| | """ |
| | Forward pass of the AriaVisionModel. |
| | |
| | Args: |
| | pixel_values (torch.Tensor): The pixel values of the input images. |
| | pixel_mask (Optional[torch.BoolTensor]): Mask for the pixel values. |
| | output_attentions (Optional[bool]): Whether to output attentions. |
| | output_hidden_states (Optional[bool]): Whether to output hidden states. |
| | return_dict (Optional[bool]): Whether to return a ModelOutput object. |
| | |
| | Returns: |
| | Union[Tuple, BaseModelOutputWithPooling]: The model's output. |
| | """ |
| | return_dict = ( |
| | return_dict if return_dict is not None else self.config.use_return_dict |
| | ) |
| | patch_attention_mask = self._create_patch_attention_mask(pixel_mask) |
| |
|
| | vit_oup = self.vision_model( |
| | pixel_values=pixel_values, |
| | patch_attention_mask=patch_attention_mask, |
| | output_attentions=output_attentions, |
| | output_hidden_states=output_hidden_states, |
| | return_dict=return_dict, |
| | ) |
| |
|
| | image_atts = self._create_image_attention_mask(patch_attention_mask) |
| |
|
| | return vit_oup, image_atts |
| |
|
| | def _create_patch_attention_mask(self, pixel_mask): |
| | if pixel_mask is None: |
| | return None |
| |
|
| | patches_subgrid = pixel_mask.unfold( |
| | dimension=1, |
| | size=self.vision_model.config.patch_size, |
| | step=self.vision_model.config.patch_size, |
| | ).unfold( |
| | dimension=2, |
| | size=self.vision_model.config.patch_size, |
| | step=self.vision_model.config.patch_size, |
| | ) |
| | return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() |
| |
|
| | def _create_image_attention_mask(self, patch_attention_mask): |
| | if patch_attention_mask is None: |
| | return None |
| |
|
| | flattened_mask = patch_attention_mask.flatten(1) |
| | return torch.logical_not(flattened_mask) |
| |
|