Skip to content

Config

Include all available vision encoder configurations.

PEConfig dataclass

Vision Tower Config.

Source code in inference/models/perception_encoder/vision_encoder/config.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
@dataclass
class PEConfig:
    """Vision Tower Config.

    Plain container for the vision tower's hyperparameters. The first six
    fields have no default and must always be supplied (``output_dim`` may be
    passed explicitly as ``None``); every other field is optional.
    """

    # --- Required fields (no defaults) ---
    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    output_dim: Optional[int]

    # ``None`` is the default, so the annotation is Optional.
    # NOTE(review): presumably ``None`` disables layer scale -- confirm
    # against the model code that consumes this config.
    ls_init_value: Optional[float] = None
    drop_path: float = 0.0

    # Must be a plain int to agree with the annotation; a one-element tuple
    # here (e.g. from a stray trailing comma) breaks integer arithmetic on it.
    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True

    pool_type: str = "attn"
    attn_pooler_heads: int = 8

    use_ln_pre: bool = True
    use_ln_post: bool = True

PETextConfig dataclass

Text Tower Config.

Source code in inference/models/perception_encoder/vision_encoder/config.py
55
56
57
58
59
60
61
62
63
64
65
66
67
@dataclass
class PETextConfig:
    """Text Tower Config.

    Hyperparameter record for the text tower. Five fields are mandatory;
    ``mlp_ratio`` and ``vocab_size`` fall back to the defaults below.
    """

    # --- Mandatory fields: supplied by the caller, positionally or by name ---
    context_length: int
    width: int
    heads: int
    layers: int

    output_dim: int

    # --- Optional fields ---
    mlp_ratio: float = 4.0
    # NOTE(review): presumably must match the tokenizer's vocabulary size --
    # confirm against the tokenizer used with this config.
    vocab_size: int = 49408