Da3

DepthAnything3Net

Bases: Module

Depth Anything 3 network for depth estimation. Simplified for single-view depth-only inference.

This network consists of:

  • Backbone: DinoV2 feature extractor
  • Head: DualDPT for depth prediction

Returns:

Dict[str, Tensor]: Dictionary containing:

  • depth: Predicted depth map (B, H, W)
  • depth_conf: Depth confidence map (B, H, W)
Source code in inference/models/depth_anything_v3/architecture/da3.py
class DepthAnything3Net(nn.Module):
    """
    Depth Anything 3 network for depth estimation.
    Simplified for single-view depth-only inference.

    This network consists of:
    - Backbone: DinoV2 feature extractor
    - Head: DualDPT for depth prediction

    Returns:
        Dictionary containing:
        - depth: Predicted depth map (B, H, W)
        - depth_conf: Depth confidence map (B, H, W)
    """

    PATCH_SIZE = 14

    def __init__(
        self,
        backbone_name: str,
        out_layers: list,
        alt_start: int,
        qknorm_start: int,
        rope_start: int,
        cat_token: bool,
        head_dim_in: int,
        head_output_dim: int,
        head_features: int,
        head_out_channels: list,
    ):
        """
        Initialize DepthAnything3Net.

        Args:
            backbone_name: DinoV2 backbone variant ("vits" or "vitb")
            out_layers: Layer indices to extract features from
            alt_start: Layer index to start alternating attention
            qknorm_start: Layer index to start QK normalization
            rope_start: Layer index to start RoPE
            cat_token: Whether to concatenate local and global tokens
            head_dim_in: Input dimension for the head
            head_output_dim: Output dimension for the head
            head_features: Feature dimension in the head
            head_out_channels: Output channel dimensions per stage
        """
        super().__init__()
        self.backbone = DinoV2(
            name=backbone_name,
            out_layers=out_layers,
            alt_start=alt_start,
            qknorm_start=qknorm_start,
            rope_start=rope_start,
            cat_token=cat_token,
        )
        self.head = DualDPT(
            dim_in=head_dim_in,
            output_dim=head_output_dim,
            features=head_features,
            out_channels=head_out_channels,
        )
        self.device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )

    def forward(
        self,
        x: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """
        Forward pass through the network.

        Args:
            x: Input images (B, N, 3, H, W) where N=1 for single-view

        Returns:
            Dictionary containing depth predictions
        """
        # Extract features using backbone
        feats, _ = self.backbone(x)
        H, W = x.shape[-2], x.shape[-1]

        # Process features through depth head
        with torch.autocast(device_type=x.device.type, enabled=False):
            output = self._process_depth_head(feats, H, W)

        return output

    def _process_depth_head(
        self, feats: list[torch.Tensor], H: int, W: int
    ) -> Dict[str, torch.Tensor]:
        """Process features through the depth prediction head."""
        return self.head(feats, H, W, patch_start_idx=0)
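
Because both returned maps share the (B, H, W) layout, the confidence map can gate the depth map element-wise. A minimal sketch of that pattern; the 0.5 threshold and the higher-is-more-confident convention are illustrative assumptions, not part of the documented API:

import torch

def mask_low_confidence(output: dict, threshold: float = 0.5) -> torch.Tensor:
    # `output` is the dictionary returned by DepthAnything3Net.forward.
    # Both entries are (B, H, W); the cutoff value is an assumption.
    depth = output["depth"]
    conf = output["depth_conf"]
    return torch.where(conf >= threshold, depth, torch.zeros_like(depth))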

__init__(backbone_name, out_layers, alt_start, qknorm_start, rope_start, cat_token, head_dim_in, head_output_dim, head_features, head_out_channels)

Initialize DepthAnything3Net.

Parameters:

Name               Type  Description                                      Default
backbone_name      str   DinoV2 backbone variant ("vits" or "vitb")       required
out_layers         list  Layer indices to extract features from           required
alt_start          int   Layer index to start alternating attention       required
qknorm_start       int   Layer index to start QK normalization            required
rope_start         int   Layer index to start RoPE                        required
cat_token          bool  Whether to concatenate local and global tokens   required
head_dim_in        int   Input dimension for the head                     required
head_output_dim    int   Output dimension for the head                    required
head_features      int   Feature dimension in the head                    required
head_out_channels  list  Output channel dimensions per stage              required
Source code in inference/models/depth_anything_v3/architecture/da3.py
def __init__(
    self,
    backbone_name: str,
    out_layers: list,
    alt_start: int,
    qknorm_start: int,
    rope_start: int,
    cat_token: bool,
    head_dim_in: int,
    head_output_dim: int,
    head_features: int,
    head_out_channels: list,
):
    """
    Initialize DepthAnything3Net.

    Args:
        backbone_name: DinoV2 backbone variant ("vits" or "vitb")
        out_layers: Layer indices to extract features from
        alt_start: Layer index to start alternating attention
        qknorm_start: Layer index to start QK normalization
        rope_start: Layer index to start RoPE
        cat_token: Whether to concatenate local and global tokens
        head_dim_in: Input dimension for the head
        head_output_dim: Output dimension for the head
        head_features: Feature dimension in the head
        head_out_channels: Output channel dimensions per stage
    """
    super().__init__()
    self.backbone = DinoV2(
        name=backbone_name,
        out_layers=out_layers,
        alt_start=alt_start,
        qknorm_start=qknorm_start,
        rope_start=rope_start,
        cat_token=cat_token,
    )
    self.head = DualDPT(
        dim_in=head_dim_in,
        output_dim=head_output_dim,
        features=head_features,
        out_channels=head_out_channels,
    )
    self.device = (
        torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    )
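
A hedged construction sketch. Every hyperparameter below is an illustrative guess at a ViT-S-style configuration, and the import path is inferred from the source location shown above; consult the repository's model configs for the actual values:

from inference.models.depth_anything_v3.architecture.da3 import (
    DepthAnything3Net,  # import path inferred from the source path above
)

model = DepthAnything3Net(
    backbone_name="vits",
    out_layers=[2, 5, 8, 11],               # hypothetical feature taps
    alt_start=0,                            # hypothetical schedule starts
    qknorm_start=0,
    rope_start=0,
    cat_token=False,
    head_dim_in=384,                        # ViT-S embedding width (assumed)
    head_output_dim=2,                      # depth + confidence (assumed)
    head_features=128,                      # assumed head width
    head_out_channels=[96, 192, 384, 384],  # assumed per-stage channels
)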

forward(x)

Forward pass through the network.

Parameters:

Name  Type    Description                                              Default
x     Tensor  Input images (B, N, 3, H, W) where N=1 for single-view   required

Returns:

Type               Description
Dict[str, Tensor]  Dictionary containing depth predictions

Source code in inference/models/depth_anything_v3/architecture/da3.py
def forward(
    self,
    x: torch.Tensor,
) -> Dict[str, torch.Tensor]:
    """
    Forward pass through the network.

    Args:
        x: Input images (B, N, 3, H, W) where N=1 for single-view

    Returns:
        Dictionary containing depth predictions
    """
    # Extract features using backbone
    feats, _ = self.backbone(x)
    H, W = x.shape[-2], x.shape[-1]

    # Process features through depth head
    with torch.autocast(device_type=x.device.type, enabled=False):
        output = self._process_depth_head(feats, H, W)

    return output
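
Putting it together, a single-view call sketch. It assumes the `model` built in the constructor example above, and that H and W must be divisible by PATCH_SIZE (14), which is typical for ViT patch embeddings but not stated explicitly in this class:

import torch

model.to(model.device).eval()

H = W = 14 * 20                       # 280, divisible by PATCH_SIZE = 14
x = torch.randn(1, 1, 3, H, W)        # (B, N, 3, H, W) with N=1
x = x.to(model.device)

with torch.no_grad():
    output = model(x)

print(output["depth"].shape)          # expected: torch.Size([1, 280, 280])
print(output["depth_conf"].shape)     # expected: torch.Size([1, 280, 280])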