| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from typing import Optional |
|
|
| import attrs |
|
|
| from .discrete_video import DiscreteVideoFSQStateDictTokenizer |
| from .ar_networks import CausalDiscreteVideoTokenizer |
| from .lazy_config_init import LazyCall as L |
| from .lazy_config_init import LazyDict |
|
|
|
|
def create_discrete_video_fsq_tokenizer_state_dict_config(
    ckpt_path: str, pixel_chunk_duration: int = 33, compression_ratio: Optional[list] = None
) -> LazyDict:
    """Build the lazy config for a state-dict-based discrete video FSQ tokenizer.

    Args:
        ckpt_path: Path to the EMA JIT checkpoint ("...ema.jit"); the encoder and
            decoder paths are derived from it by substring replacement.
        pixel_chunk_duration: Number of pixel frames processed per chunk.
        compression_ratio: [temporal, height, width] compression factors.
            Defaults to [8, 16, 16]. (A None sentinel is used instead of a
            mutable list default to avoid cross-call sharing.)

    Returns:
        LazyDict: Lazy instantiation config for DiscreteVideoFSQStateDictTokenizer.
    """
    # Avoid the mutable-default-argument pitfall: resolve the default here.
    if compression_ratio is None:
        compression_ratio = [8, 16, 16]

    # Inner network config for the causal, factorized FSQ video tokenizer.
    CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
        attn_resolutions=[32],
        channels=128,
        channels_mult=[2, 4, 4],
        dropout=0.0,
        in_channels=3,
        num_res_blocks=2,
        out_channels=3,
        resolution=1024,
        patch_size=4,
        patch_method="haar",
        z_channels=16,
        z_factor=1,
        num_groups=1,
        legacy_mode=False,
        spatial_compression=16,
        temporal_compression=8,
        embedding_dim=6,
        levels=[8, 8, 8, 5, 5, 5],
        name="CausalDiscreteFactorizedVideoTokenizer",
    )

    return L(DiscreteVideoFSQStateDictTokenizer)(
        # Encoder/decoder JIT paths are derived from the EMA checkpoint path.
        enc_fp=ckpt_path.replace("ema.jit", "encoder.jit"),
        dec_fp=ckpt_path.replace("ema.jit", "decoder.jit"),
        tokenizer_module=CausalDiscreteFactorizedVideoTokenizerConfig,
        name="discrete_video_fsq",
        latent_ch=6,
        is_bf16=True,
        pixel_chunk_duration=pixel_chunk_duration,
        # One latent frame for the first pixel frame, then one per temporal-compression window.
        latent_chunk_duration=1 + (pixel_chunk_duration - 1) // compression_ratio[0],
        max_enc_batch_size=8,
        max_dec_batch_size=4,
        levels=[8, 8, 8, 5, 5, 5],
        compression_ratio=compression_ratio,
    )
|
|
|
|
@attrs.define(slots=False)
class TextTokenizerConfig:
    """
    Text tokenizer config

    Args:
        config: Config file to define the text tokenizer class.
        data_key (str): The input key from data_dict that will be passed to the text tokenizer.
        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization.
        tokenizer_offset (int): Offset that is added to the tokens.
        vocab_size (int): Vocabulary size of the tokenizer.
    """

    # Lazy-instantiation config describing the text tokenizer class to build.
    config: LazyDict
    # Key into data_dict whose value is fed to the text tokenizer.
    data_key: str = ""
    # When True, tokenization happens online inside the pipeline.
    tokenize_here: bool = False
    # Constant added to every produced token id (e.g. to avoid id collisions).
    tokenizer_offset: int = 0
    # Size of the tokenizer's vocabulary; 0 means unset.
    vocab_size: int = 0
|
|
|
|
@attrs.define(slots=False)
class VideoTokenizerConfig:
    """
    Video tokenizer config

    Args:
        config: Config file to define the video tokenizer class.
        data_key (str): The input key from data_dict that will be passed to the video tokenizer.
        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization.
        tokenizer_offset (int): Offset that is added to the tokens. In case of joint text-video tokenizers, we
            add an offset to make sure that video tokens and text tokens don't overlap.
        vocab_size (int): Vocabulary size of the tokenizer.
        max_seq_len (int): Maximum token length for an input video.
    """

    # Lazy-instantiation config describing the video tokenizer class to build.
    config: LazyDict
    # Key into data_dict whose value is fed to the video tokenizer.
    data_key: str = ""
    # When True, tokenization happens online inside the pipeline.
    tokenize_here: bool = True
    # Constant added to every produced token id; keeps video tokens disjoint
    # from text tokens when a joint text-video vocabulary is used.
    tokenizer_offset: int = 0
    # Size of the tokenizer's vocabulary; 0 means unset.
    vocab_size: int = 0
    # Maximum token length for one input video; -1 means no limit is set here.
    max_seq_len: int = -1
|
|
|
|
@attrs.define(slots=False)
class TokenizerConfig:
    """
    Joint tokenizer config

    Args:
        text_tokenizer (TextTokenizerConfig): Text tokenizer config file
        video_tokenizer (VideoTokenizerConfig): Video tokenizer config file
        seq_len (int): Final token sequence length
        training_type (str): Type of training we use. Supports ["text_only", "text_to_video", "class_to_image", "image_text_interleaved"]
        add_special_tokens (bool): Whether to add special tokens to the output tokens
        pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
    """

    # Per-modality tokenizer configs; None disables that modality.
    text_tokenizer: Optional[TextTokenizerConfig] = None
    video_tokenizer: Optional[VideoTokenizerConfig] = None
    # Final (joint) token sequence length.
    seq_len: int = 4096
    # Training mode selector; no default is chosen here, so callers must set it.
    # (Annotation fixed: the default is None, so the type is Optional[str].)
    training_type: Optional[str] = None
    # Whether special tokens are appended to the output token stream.
    add_special_tokens: bool = True
    # Pad the token sequence length up to the nearest multiple of this value.
    pad_to_multiple_of: Optional[int] = 64
|
|