Da3

DepthAnything3Net

Bases: Module

Depth Anything 3 network for depth estimation. Simplified for single-view depth-only inference.

This network consists of:

  • Backbone: DinoV2 feature extractor
  • Head: DualDPT for depth prediction

Returns:

Dict[str, Tensor]: Dictionary containing:

  • depth: Predicted depth map (B, H, W)
  • depth_conf: Depth confidence map (B, H, W)
Source code in inference/models/depth_anything_v3/architecture/da3.py
class DepthAnything3Net(nn.Module):
    """
    Depth Anything 3 network for depth estimation.
    Simplified for single-view depth-only inference.

    This network consists of:
    - Backbone: DinoV2 feature extractor
    - Head: DualDPT for depth prediction

    Returns:
        Dictionary containing:
        - depth: Predicted depth map (B, H, W)
        - depth_conf: Depth confidence map (B, H, W)
    """

    PATCH_SIZE = 14

    def __init__(
        self,
        backbone_name: str,
        out_layers: list,
        alt_start: int,
        qknorm_start: int,
        rope_start: int,
        cat_token: bool,
        head_dim_in: int,
        head_output_dim: int,
        head_features: int,
        head_out_channels: list,
    ):
        """
        Initialize DepthAnything3Net.

        Args:
            backbone_name: DinoV2 backbone variant ("vits" or "vitb")
            out_layers: Layer indices to extract features from
            alt_start: Layer index to start alternating attention
            qknorm_start: Layer index to start QK normalization
            rope_start: Layer index to start RoPE
            cat_token: Whether to concatenate local and global tokens
            head_dim_in: Input dimension for the head
            head_output_dim: Output dimension for the head
            head_features: Feature dimension in the head
            head_out_channels: Output channel dimensions per stage
        """
        super().__init__()
        self.backbone = DinoV2(
            name=backbone_name,
            out_layers=out_layers,
            alt_start=alt_start,
            qknorm_start=qknorm_start,
            rope_start=rope_start,
            cat_token=cat_token,
        )
        self.head = DualDPT(
            dim_in=head_dim_in,
            output_dim=head_output_dim,
            features=head_features,
            out_channels=head_out_channels,
        )
        self.device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )

    def forward(
        self,
        x: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """
        Forward pass through the network.

        Args:
            x: Input images (B, N, 3, H, W) where N=1 for single-view

        Returns:
            Dictionary containing depth predictions
        """
        # Extract features using backbone
        feats, _ = self.backbone(x)
        H, W = x.shape[-2], x.shape[-1]

        # Process features through depth head
        with torch.autocast(device_type=x.device.type, enabled=False):
            output = self._process_depth_head(feats, H, W)

        return output

    def _process_depth_head(
        self, feats: list[torch.Tensor], H: int, W: int
    ) -> Dict[str, torch.Tensor]:
        """Process features through the depth prediction head."""
        return self.head(feats, H, W, patch_start_idx=0)
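
Because both returned maps share the (B, H, W) layout, the confidence map can gate the depth map element-wise. A minimal sketch of that pattern; the 0.5 threshold and the higher-is-more-confident convention are illustrative assumptions, not part of the documented API:

import torch

def mask_low_confidence(output: dict, threshold: float = 0.5) -> torch.Tensor:
    # `output` is the dictionary returned by DepthAnything3Net.forward.
    # Both entries are (B, H, W); the cutoff value is an assumption.
    depth = output["depth"]
    conf = output["depth_conf"]
    return torch.where(conf >= threshold, depth, torch.zeros_like(depth))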

__init__(backbone_name, out_layers, alt_start, qknorm_start, rope_start, cat_token, head_dim_in, head_output_dim, head_features, head_out_channels)

Initialize DepthAnything3Net.

Parameters:

Name               Type  Description                                      Default
backbone_name      str   DinoV2 backbone variant ("vits" or "vitb")       required
out_layers         list  Layer indices to extract features from           required
alt_start          int   Layer index to start alternating attention       required
qknorm_start       int   Layer index to start QK normalization            required
rope_start         int   Layer index to start RoPE                        required
cat_token          bool  Whether to concatenate local and global tokens   required
head_dim_in        int   Input dimension for the head                     required
head_output_dim    int   Output dimension for the head                    required
head_features      int   Feature dimension in the head                    required
head_out_channels  list  Output channel dimensions per stage              required
Source code in inference/models/depth_anything_v3/architecture/da3.py
def __init__(
    self,
    backbone_name: str,
    out_layers: list,
    alt_start: int,
    qknorm_start: int,
    rope_start: int,
    cat_token: bool,
    head_dim_in: int,
    head_output_dim: int,
    head_features: int,
    head_out_channels: list,
):
    """
    Initialize DepthAnything3Net.

    Args:
        backbone_name: DinoV2 backbone variant ("vits" or "vitb")
        out_layers: Layer indices to extract features from
        alt_start: Layer index to start alternating attention
        qknorm_start: Layer index to start QK normalization
        rope_start: Layer index to start RoPE
        cat_token: Whether to concatenate local and global tokens
        head_dim_in: Input dimension for the head
        head_output_dim: Output dimension for the head
        head_features: Feature dimension in the head
        head_out_channels: Output channel dimensions per stage
    """
    super().__init__()
    self.backbone = DinoV2(
        name=backbone_name,
        out_layers=out_layers,
        alt_start=alt_start,
        qknorm_start=qknorm_start,
        rope_start=rope_start,
        cat_token=cat_token,
    )
    self.head = DualDPT(
        dim_in=head_dim_in,
        output_dim=head_output_dim,
        features=head_features,
        out_channels=head_out_channels,
    )
    self.device = (
        torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    )
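
A hedged construction sketch. Every hyperparameter below is an illustrative guess at a ViT-S-style configuration, and the import path is inferred from the source location shown above; consult the repository's model configs for the actual values:

from inference.models.depth_anything_v3.architecture.da3 import (
    DepthAnything3Net,  # import path inferred from the source path above
)

model = DepthAnything3Net(
    backbone_name="vits",
    out_layers=[2, 5, 8, 11],               # hypothetical feature taps
    alt_start=0,                            # hypothetical schedule starts
    qknorm_start=0,
    rope_start=0,
    cat_token=False,
    head_dim_in=384,                        # ViT-S embedding width (assumed)
    head_output_dim=2,                      # depth + confidence (assumed)
    head_features=128,                      # assumed head width
    head_out_channels=[96, 192, 384, 384],  # assumed per-stage channels
)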

forward(x)

Forward pass through the network.

Parameters:

Name  Type    Description                                              Default
x     Tensor  Input images (B, N, 3, H, W) where N=1 for single-view   required

Returns:

Type               Description
Dict[str, Tensor]  Dictionary containing depth predictions

Source code in inference/models/depth_anything_v3/architecture/da3.py
def forward(
    self,
    x: torch.Tensor,
) -> Dict[str, torch.Tensor]:
    """
    Forward pass through the network.

    Args:
        x: Input images (B, N, 3, H, W) where N=1 for single-view

    Returns:
        Dictionary containing depth predictions
    """
    # Extract features using backbone
    feats, _ = self.backbone(x)
    H, W = x.shape[-2], x.shape[-1]

    # Process features through depth head
    with torch.autocast(device_type=x.device.type, enabled=False):
        output = self._process_depth_head(feats, H, W)

    return output
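
Putting it together, a single-view call sketch. It assumes the `model` built in the constructor example above, and that H and W must be divisible by PATCH_SIZE (14), which is typical for ViT patch embeddings but not stated explicitly in this class:

import torch

model.to(model.device).eval()

H = W = 14 * 20                       # 280, divisible by PATCH_SIZE = 14
x = torch.randn(1, 1, 3, H, W)        # (B, N, 3, H, W) with N=1
x = x.to(model.device)

with torch.no_grad():
    output = model(x)

print(output["depth"].shape)          # expected: torch.Size([1, 280, 280])
print(output["depth_conf"].shape)     # expected: torch.Size([1, 280, 280])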