Skip to content

Roboflow Inference

Segment anything3

roboflow/inference

Segment anything3

`SegmentAnything3` ¶

Bases: RoboflowCoreModel

SAM3 wrapper with a similar interface to SAM2 in this codebase.

Source code in `inference/models/sam3/segment_anything3.py`:

class SegmentAnything3(RoboflowCoreModel):
    """SAM3 wrapper with a similar interface to SAM2 in this codebase."""

    def __init__(
        self,
        *args,
        model_id: str = "sam3/sam3_final",
        **kwargs,
    ):
        """Set up the SAM3 model and its image preprocessing pipeline.

        Args:
            *args: Positional arguments passed through to the parent initializer.
            model_id: Model identifier passed to the parent initializer;
                defaults to ``"sam3/sam3_final"``.
            **kwargs: Keyword arguments passed through to the parent initializer.
        """
        super().__init__(*args, model_id=model_id, **kwargs)

        # Imported here rather than at module level so the sam3 package is
        # only required when this model is actually instantiated.
        from sam3 import build_sam3_image_model

        # Both artifact paths are obtained from the model cache.
        weights_file = self.cache_file("weights.pt")
        vocab_file = self.cache_file("bpe_simple_vocab_16e6.txt.gz")

        self.sam3_lock = threading.RLock()

        # Use the GPU when one is available, otherwise run on the CPU.
        target_device = "cpu"
        if torch.cuda.is_available():
            target_device = "cuda"

        self.model = build_sam3_image_model(
            bpe_path=vocab_file,
            checkpoint_path=weights_file,
            device=target_device,
            load_from_HF=False,
            compile=False,
        )

        # Preprocessing steps for the PCS image path, applied in this order:
        # resize -> tensor conversion -> normalisation.
        resize_step = RandomResizeAPI(
            sizes=SAM3_IMAGE_SIZE,
            max_size=SAM3_IMAGE_SIZE,
            square=True,
            consistent_transform=False,
        )
        tensor_step = ToTensorAPI()
        normalize_step = NormalizeAPI(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        self.transform = ComposeAPI(
            transforms=[resize_step, tensor_step, normalize_step]
        )

        self.image_size = SAM3_IMAGE_SIZE
        self.task_type = "unsupervised-segmentation"

    def _is_core_sam3_endpoint(self) -> bool:
        return isinstance(self.endpoint, str) and self.endpoint.startswith("sam3/")

    @property
    def model_artifact_bucket(self):
        """Bucket to read this model's artifacts from.

        Returns ``CORE_MODEL_BUCKET`` for base SAM3 endpoints (ids starting
        with ``"sam3/"``) and ``INFER_BUCKET`` for every other (fine-tuned)
        endpoint.
        """
        if self._is_core_sam3_endpoint():
            return CORE_MODEL_BUCKET
        return INFER_BUCKET

    def download_weights(self) -> None:
        """Make sure the SAM3 artifacts for ``self.endpoint`` are in the local cache.

        The steps run in this order:

        1. If ``MODELS_CACHE_AUTH_ENABLED`` is set, check that the API key may
           access the model (core-model endpoint type for ``sam3/...`` ids,
           ORT endpoint type otherwise).
        2. Return early when every file listed by
           ``get_infer_bucket_file_list`` is already cached.
        3. Download from the artefacts bucket when it is available.
        4. Otherwise fall back to the Roboflow API: the parent implementation
           for base SAM3, or the ORT endpoint's ``weights`` map
           (filename -> URL) for fine-tuned models.

        Raises:
            RoboflowAPINotAuthorizedError: The API key has no access to the model.
            ModelArtefactError: The ORT response is not a dict with an ``ort``
                dict, or that dict carries no non-empty ``weights`` map.
        """
        infer_bucket_files = self.get_infer_bucket_file_list()
        is_core_endpoint = self._is_core_sam3_endpoint()

        # Auth check aligned with chosen endpoint type
        if MODELS_CACHE_AUTH_ENABLED:
            endpoint_type = (
                ModelEndpointType.CORE_MODEL
                if is_core_endpoint
                else ModelEndpointType.ORT
            )
            if not _check_if_api_key_has_access_to_model(
                api_key=self.api_key,
                model_id=self.endpoint,
                endpoint_type=endpoint_type,
            ):
                # NOTE(review): this message embeds the raw API key; confirm
                # that is acceptable wherever the exception is logged or
                # returned to clients.
                raise RoboflowAPINotAuthorizedError(
                    f"API key {self.api_key} does not have access to model {self.endpoint}"
                )

        # Already cached
        if are_all_files_cached(files=infer_bucket_files, model_id=self.endpoint):
            return None

        # S3 path works for both; keys are {endpoint}/<file>
        if is_model_artefacts_bucket_available():
            self.download_model_artefacts_from_s3()
            return None

        # API fallback
        if is_core_endpoint:
            # Base SAM3 from core_model endpoint; preserves filenames
            return super().download_model_from_roboflow_api()

        # Fine-tuned SAM3: use the ORT endpoint to fetch the weights map
        api_data = get_roboflow_model_data(
            api_key=self.api_key,
            model_id=self.endpoint,
            endpoint_type=ModelEndpointType.ORT,
            device_id=self.device_id,
        )

        ort = api_data.get("ort") if isinstance(api_data, dict) else None
        if not isinstance(ort, dict):
            raise ModelArtefactError("ORT response malformed for fine-tuned SAM3")

        # Explicit weights map of filename -> URL is the only supported layout
        weights_map = ort.get("weights")
        if not isinstance(weights_map, dict) or not weights_map:
            raise ModelArtefactError(
                "ORT response missing 'weights' for fine-tuned SAM3"
            )

        for filename, url in weights_map.items():
            resp = get_from_url(url, json_response=False, verify_content_length=True)
            save_bytes_in_cache(
                content=resp.content,
                file=str(filename),
                model_id=self.endpoint,
            )
        return None

    def get_infer_bucket_file_list(self) -> List[str]:
        # SAM3 weights managed by env; no core bucket artifacts

        return [
            "weights.pt",
            "bpe_simple_vocab_16e6.txt.gz",
        ]

    def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
        """Load ``image`` through ``load_image_rgb`` and return the result unchanged."""
        return load_image_rgb(image)

    def infer_from_request(self, request: Sam3InferenceRequest):
        """Dispatch a SAM3 request to ``segment_image`` and return its result.

        Args:
            request: Must be a ``Sam3SegmentationRequest``. A missing
                ``output_prob_thresh`` (``None``) falls back to ``0.5``; a
                missing or empty ``format`` falls back to ``"polygon"``.

        Returns:
            Whatever ``segment_image`` returns for the request.

        Raises:
            ValueError: ``request`` is not a ``Sam3SegmentationRequest``.
        """
        # NOTE(review): acquisition of self.sam3_lock was commented out in the
        # original code, so requests are not serialised here; confirm that
        # concurrent calls are safe.
        if not isinstance(request, Sam3SegmentationRequest):
            raise ValueError(f"Invalid request type {type(request)}")

        # Only substitute the default when no threshold was supplied; an
        # explicit 0.0 is a valid threshold and must not be replaced.
        threshold = request.output_prob_thresh
        if threshold is None:
            threshold = 0.5

        # Pass strongly-typed fields to preserve Sam3Prompt objects
        return self.segment_image(
            image=request.image,
            image_id=request.image_id,
            prompts=request.prompts,
            output_prob_thresh=threshold,
            format=request.format or "polygon",
        )

    def segment_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        prompts: Optional[List[Sam3Prompt]] = None,
        output_prob_thresh: float = 0.5,
        format: Optional[str] = "polygon",
        **kwargs,
    ):
        """Run SAM3 on one image for a list of prompts and collect per-prompt results.

        Args:
            image: Image to segment; loaded through ``load_image_rgb``.
            image_id: Accepted but not used by this method.
            prompts: Prompts to evaluate, in order. A prompt with a non-empty
                ``boxes`` attribute is turned into a visual query, any other
                prompt into a text query. ``None`` is treated as no prompts.
            output_prob_thresh: Detection threshold given to ``PostProcessImage``.
            format: Output format forwarded to ``_masks_to_predictions``.
            **kwargs: Accepted and ignored.

        Returns:
            A ``Sam3SegmentationResponse`` holding one ``Sam3PromptResult`` per
            prompt (``prompt_index`` is the prompt's position in ``prompts``)
            and ``time``, measured from just before the queries are built.
        """
        np_image = load_image_rgb(image)
        h, w = np_image.shape[:2]
        pil_image = Image.fromarray(np_image)
        prompts = prompts or []

        # Decide the device once so autocast and the batch transfer agree.
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")

        # Inference-only path; disable autograd throughout. CUDA autocast is
        # switched on only when CUDA is present: requesting it on a CPU-only
        # host makes torch warn on every call and disable it anyway.
        with torch.inference_mode(), torch.autocast(
            device_type="cuda", dtype=torch.bfloat16, enabled=use_cuda
        ):
            start_ts = perf_counter()

            # TODO this can also take tensor directly instead of PIL image, so we want to avoid double conversion
            # TODO: this also supports multiple images for multi batch inference
            datapoint = Sam3Datapoint(
                find_queries=[],
                images=[Sam3ImageDP(data=pil_image, objects=[], size=(h, w))],
            )

            # Build one query per prompt, in order; the prompt's position is
            # used as its coco_id so results can be looked up by index later.
            for idx, p in enumerate(prompts):
                if getattr(p, "boxes", None):
                    q = _build_visual_query(
                        coco_id=idx,
                        h=h,
                        w=w,
                        boxes=p.boxes,
                        labels=p.box_labels or [],
                        text=p.text,
                    )
                else:
                    q = _build_text_query(
                        coco_id=idx,
                        h=h,
                        w=w,
                        text=p.text,
                    )
                datapoint.find_queries.append(q)

            # Transform and collate to BatchedDatapoint
            datapoint = self.transform(datapoint)
            batch = collate_fn_api(batch=[datapoint], dict_key="dummy")["dummy"]
            batch = copy_data_to_device(batch, device, non_blocking=True)

            # Forward
            output = self.model(batch)

            # Postprocess to original size and build per-prompt results.
            # NOTE(review): the None fallback here is 0.35 while the signature
            # default is 0.5 — confirm which value is intended.
            post = PostProcessImage(
                max_dets_per_img=-1,
                iou_type="segm",
                use_original_sizes_box=True,
                use_original_sizes_mask=True,
                convert_mask_to_rle=False,
                detection_threshold=float(
                    output_prob_thresh if output_prob_thresh is not None else 0.35
                ),
                to_cpu=True,
            )
            processed = post.process_results(output, batch.find_metadatas)

        # Batched prompt response (even for a single prompt)
        prompt_results: List[Sam3PromptResult] = []
        for idx, prompt in enumerate(prompts):
            boxes = getattr(prompt, "boxes", None)
            echo = Sam3PromptEcho(
                prompt_index=idx,
                type=("visual" if boxes else "text"),
                text=prompt.text,
                num_boxes=len(boxes) if boxes else 0,
            )
            # Results are keyed by the coco_id assigned above (the prompt index).
            result = processed[idx]
            masks_np = _to_numpy_masks(result.get("masks"))
            scores = list(result.get("scores", []))
            preds = _masks_to_predictions(masks_np, scores, format)
            prompt_results.append(
                Sam3PromptResult(prompt_index=idx, echo=echo, predictions=preds)
            )
        return Sam3SegmentationResponse(
            time=perf_counter() - start_ts, prompt_results=prompt_results
        )