Segment anything

`SegmentAnything` ¶

Bases: RoboflowCoreModel

SegmentAnything class for handling segmentation tasks.

Attributes:

Name	Type	Description
`sam`		The segmentation model.
`predictor`		The predictor for the segmentation model.
`ort_session`		ONNX runtime inference session.
`embedding_cache`		Cache for embeddings.
`image_size_cache`		Cache for image sizes.
`embedding_cache_keys`		Keys for the embedding cache.
`low_res_logits_cache`		Cache for low resolution logits.
`segmentation_cache_keys`		Keys for the segmentation cache.

Source code in inference/models/sam/segment_anything.py

class SegmentAnything(RoboflowCoreModel):
    """SegmentAnything class for handling segmentation tasks.

    Attributes:
        sam: The segmentation model.
        predictor: The predictor for the segmentation model.
        ort_session: ONNX runtime inference session.
        embedding_cache: Cache for embeddings.
        image_size_cache: Cache for image sizes.
        embedding_cache_keys: Keys for the embedding cache.
        low_res_logits_cache: Cache for low resolution logits.
        segmentation_cache_keys: Keys for the segmentation cache.
    """

    def __init__(self, *args, model_id: str = f"sam/{SAM_VERSION_ID}", **kwargs):
        """Initializes the SegmentAnything.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        super().__init__(*args, model_id=model_id, **kwargs)
        self.sam = sam_model_registry[self.version_id](
            checkpoint=self.cache_file("encoder.pth")
        )
        self.sam.to(device="cuda" if torch.cuda.is_available() else "cpu")
        self.predictor = SamPredictor(self.sam)
        self.ort_session = onnxruntime.InferenceSession(
            self.cache_file("decoder.onnx"),
            providers=[
                "CUDAExecutionProvider",
                "OpenVINOExecutionProvider",
                "CPUExecutionProvider",
            ],
        )
        self.embedding_cache = {}
        self.image_size_cache = {}
        self.embedding_cache_keys = []

        self.low_res_logits_cache = {}
        self.segmentation_cache_keys = []
        self.task_type = "unsupervised-segmentation"

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: List of file names.
        """
        return ["encoder.pth", "decoder.onnx"]

    def embed_image(self, image: Any, image_id: Optional[str] = None, **kwargs):
        """
        Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached,
        the cached result will be returned.

        Args:
            image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
            image_id (Optional[str]): An identifier for the image. If provided, the embedding result will be cached
                                      with this ID. Defaults to None.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image
                                               and the second element is the shape (height, width) of the processed image.

        Notes:
            - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
            - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
              the oldest entries are removed.

        Example:
            >>> img_array = ... # some image array
            >>> embed_image(img_array, image_id="sample123")
            (array([...]), (224, 224))
        """
        if image_id and image_id in self.embedding_cache:
            return (
                self.embedding_cache[image_id],
                self.image_size_cache[image_id],
            )
        img_in = self.preproc_image(image)
        self.predictor.set_image(img_in)
        embedding = self.predictor.get_image_embedding().cpu().numpy()
        if image_id:
            self.embedding_cache[image_id] = embedding
            self.image_size_cache[image_id] = img_in.shape[:2]
            self.embedding_cache_keys.append(image_id)
            if len(self.embedding_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
                cache_key = self.embedding_cache_keys.pop(0)
                del self.embedding_cache[cache_key]
                del self.image_size_cache[cache_key]
        return (embedding, img_in.shape[:2])

    def infer_from_request(self, request: SamInferenceRequest):
        """Performs inference based on the request type.

        Args:
            request (SamInferenceRequest): The inference request.

        Returns:
            Union[SamEmbeddingResponse, SamSegmentationResponse]: The inference response.
        """
        t1 = perf_counter()
        if isinstance(request, SamEmbeddingRequest):
            embedding, _ = self.embed_image(**request.dict())
            inference_time = perf_counter() - t1
            if request.format == "json":
                return SamEmbeddingResponse(
                    embeddings=embedding.tolist(), time=inference_time
                )
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.save(binary_vector, embedding)
                binary_vector.seek(0)
                return SamEmbeddingResponse(
                    embeddings=binary_vector.getvalue(), time=inference_time
                )
        elif isinstance(request, SamSegmentationRequest):
            masks, low_res_masks = self.segment_image(**request.dict())
            if request.format == "json":
                masks = masks > self.predictor.model.mask_threshold
                masks = masks2poly(masks)
                low_res_masks = low_res_masks > self.predictor.model.mask_threshold
                low_res_masks = masks2poly(low_res_masks)
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.savez_compressed(
                    binary_vector, masks=masks, low_res_masks=low_res_masks
                )
                binary_vector.seek(0)
                binary_data = binary_vector.getvalue()
                return binary_data
            else:
                raise ValueError(f"Invalid format {request.format}")

            response = SamSegmentationResponse(
                masks=[m.tolist() for m in masks],
                low_res_masks=[m.tolist() for m in low_res_masks],
                time=perf_counter() - t1,
            )
            return response

    def preproc_image(self, image: InferenceRequestImage):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            np.array: The preprocessed image.
        """
        np_image = load_image_rgb(image)
        return np_image

    def segment_image(
        self,
        image: Any,
        embeddings: Optional[Union[np.ndarray, List[List[float]]]] = None,
        embeddings_format: Optional[str] = "json",
        has_mask_input: Optional[bool] = False,
        image_id: Optional[str] = None,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        mask_input_format: Optional[str] = "json",
        orig_im_size: Optional[List[int]] = None,
        point_coords: Optional[List[List[float]]] = [],
        point_labels: Optional[List[int]] = [],
        use_mask_input_cache: Optional[bool] = True,
        **kwargs,
    ):
        """
        Segments an image based on provided embeddings, points, masks, or cached results.
        If embeddings are not directly provided, the function can derive them from the input image or cache.

        Args:
            image (Any): The image to be segmented.
            embeddings (Optional[Union[np.ndarray, List[List[float]]]]): The embeddings of the image.
                Defaults to None, in which case the image is used to compute embeddings.
            embeddings_format (Optional[str]): Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.
            has_mask_input (Optional[bool]): Specifies whether mask input is provided. Defaults to False.
            image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
            mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input mask for the image.
            mask_input_format (Optional[str]): Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.
            orig_im_size (Optional[List[int]]): Original size of the image when providing embeddings directly.
            point_coords (Optional[List[List[float]]]): Coordinates of points in the image. Defaults to an empty list.
            point_labels (Optional[List[int]]): Labels associated with the provided points. Defaults to an empty list.
            use_mask_input_cache (Optional[bool]): Flag to determine if cached mask input should be used. Defaults to True.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image
                                          and the second element is the low resolution segmentation masks.

        Raises:
            ValueError: If necessary inputs are missing or inconsistent.

        Notes:
            - Embeddings, segmentations, and low-resolution logits can be cached to improve performance
              on repeated requests for the same image.
            - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
              the oldest entries are removed.
        """
        if not embeddings:
            if not image and not image_id:
                raise ValueError(
                    "Must provide either image, cached image_id, or embeddings"
                )
            elif image_id and not image and image_id not in self.embedding_cache:
                raise ValueError(
                    f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
                )
            embedding, original_image_size = self.embed_image(
                image=image, image_id=image_id
            )
        else:
            if not orig_im_size:
                raise ValueError(
                    "Must provide original image size if providing embeddings"
                )
            original_image_size = orig_im_size
            if embeddings_format == "json":
                embedding = np.array(embeddings)
            elif embeddings_format == "binary":
                embedding = np.load(BytesIO(embeddings))

        point_coords = point_coords
        point_coords.append([0, 0])
        point_coords = np.array(point_coords, dtype=np.float32)
        point_coords = np.expand_dims(point_coords, axis=0)
        point_coords = self.predictor.transform.apply_coords(
            point_coords,
            original_image_size,
        )

        point_labels = point_labels
        point_labels.append(-1)
        point_labels = np.array(point_labels, dtype=np.float32)
        point_labels = np.expand_dims(point_labels, axis=0)

        if has_mask_input:
            if (
                image_id
                and image_id in self.low_res_logits_cache
                and use_mask_input_cache
            ):
                mask_input = self.low_res_logits_cache[image_id]
            elif not mask_input and (
                not image_id or image_id not in self.low_res_logits_cache
            ):
                raise ValueError("Must provide either mask_input or cached image_id")
            else:
                if mask_input_format == "json":
                    polys = mask_input
                    mask_input = np.zeros((1, len(polys), 256, 256), dtype=np.uint8)
                    for i, poly in enumerate(polys):
                        poly = ShapelyPolygon(poly)
                        raster = rasterio.features.rasterize(
                            [poly], out_shape=(256, 256)
                        )
                        mask_input[0, i, :, :] = raster
                elif mask_input_format == "binary":
                    binary_data = base64.b64decode(mask_input)
                    mask_input = np.load(BytesIO(binary_data))
        else:
            mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)

        ort_inputs = {
            "image_embeddings": embedding.astype(np.float32),
            "point_coords": point_coords.astype(np.float32),
            "point_labels": point_labels,
            "mask_input": mask_input.astype(np.float32),
            "has_mask_input": (
                np.zeros(1, dtype=np.float32)
                if not has_mask_input
                else np.ones(1, dtype=np.float32)
            ),
            "orig_im_size": np.array(original_image_size, dtype=np.float32),
        }
        masks, _, low_res_logits = self.ort_session.run(None, ort_inputs)
        if image_id:
            self.low_res_logits_cache[image_id] = low_res_logits
            if image_id not in self.segmentation_cache_keys:
                self.segmentation_cache_keys.append(image_id)
            if len(self.segmentation_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
                cache_key = self.segmentation_cache_keys.pop(0)
                del self.low_res_logits_cache[cache_key]
        masks = masks[0]
        low_res_masks = low_res_logits[0]

        return masks, low_res_masks

`init(*args, model_id=f'sam/{SAM_VERSION_ID}', **kwargs)` ¶

Initializes the SegmentAnything.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/sam/segment_anything.py

def __init__(self, *args, model_id: str = f"sam/{SAM_VERSION_ID}", **kwargs):
    """Initializes the SegmentAnything.

    Args:
        *args: Variable length argument list.
        **kwargs: Arbitrary keyword arguments.
    """
    super().__init__(*args, model_id=model_id, **kwargs)
    self.sam = sam_model_registry[self.version_id](
        checkpoint=self.cache_file("encoder.pth")
    )
    self.sam.to(device="cuda" if torch.cuda.is_available() else "cpu")
    self.predictor = SamPredictor(self.sam)
    self.ort_session = onnxruntime.InferenceSession(
        self.cache_file("decoder.onnx"),
        providers=[
            "CUDAExecutionProvider",
            "OpenVINOExecutionProvider",
            "CPUExecutionProvider",
        ],
    )
    self.embedding_cache = {}
    self.image_size_cache = {}
    self.embedding_cache_keys = []

    self.low_res_logits_cache = {}
    self.segmentation_cache_keys = []
    self.task_type = "unsupervised-segmentation"

`embed_image(image, image_id=None, **kwargs)` ¶

Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached, the cached result will be returned.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be embedded. The format should be compatible with the preproc_image method.	required
`image_id`	`Optional[str]`	An identifier for the image. If provided, the embedding result will be cached with this ID. Defaults to None.	`None`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
	Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image and the second element is the shape (height, width) of the processed image.

Notes

Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.

Example

img_array = ... # some image array embed_image(img_array, image_id="sample123") (array([...]), (224, 224))

Source code in inference/models/sam/segment_anything.py

def embed_image(self, image: Any, image_id: Optional[str] = None, **kwargs):
    """
    Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached,
    the cached result will be returned.

    Args:
        image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
        image_id (Optional[str]): An identifier for the image. If provided, the embedding result will be cached
                                  with this ID. Defaults to None.
        **kwargs: Additional keyword arguments.

    Returns:
        Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image
                                           and the second element is the shape (height, width) of the processed image.

    Notes:
        - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
        - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
          the oldest entries are removed.

    Example:
        >>> img_array = ... # some image array
        >>> embed_image(img_array, image_id="sample123")
        (array([...]), (224, 224))
    """
    if image_id and image_id in self.embedding_cache:
        return (
            self.embedding_cache[image_id],
            self.image_size_cache[image_id],
        )
    img_in = self.preproc_image(image)
    self.predictor.set_image(img_in)
    embedding = self.predictor.get_image_embedding().cpu().numpy()
    if image_id:
        self.embedding_cache[image_id] = embedding
        self.image_size_cache[image_id] = img_in.shape[:2]
        self.embedding_cache_keys.append(image_id)
        if len(self.embedding_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
            cache_key = self.embedding_cache_keys.pop(0)
            del self.embedding_cache[cache_key]
            del self.image_size_cache[cache_key]
    return (embedding, img_in.shape[:2])

`get_infer_bucket_file_list()` ¶

Gets the list of files required for inference.

Returns:

Type	Description
`List[str]`	List[str]: List of file names.

Source code in inference/models/sam/segment_anything.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Gets the list of files required for inference.

    Returns:
        List[str]: List of file names.
    """
    return ["encoder.pth", "decoder.onnx"]

`infer_from_request(request)` ¶

Performs inference based on the request type.

Parameters:

Name	Type	Description	Default
`request`	`SamInferenceRequest`	The inference request.	required

Returns:

Type	Description
	Union[SamEmbeddingResponse, SamSegmentationResponse]: The inference response.

Source code in inference/models/sam/segment_anything.py

def infer_from_request(self, request: SamInferenceRequest):
    """Performs inference based on the request type.

    Args:
        request (SamInferenceRequest): The inference request.

    Returns:
        Union[SamEmbeddingResponse, SamSegmentationResponse]: The inference response.
    """
    t1 = perf_counter()
    if isinstance(request, SamEmbeddingRequest):
        embedding, _ = self.embed_image(**request.dict())
        inference_time = perf_counter() - t1
        if request.format == "json":
            return SamEmbeddingResponse(
                embeddings=embedding.tolist(), time=inference_time
            )
        elif request.format == "binary":
            binary_vector = BytesIO()
            np.save(binary_vector, embedding)
            binary_vector.seek(0)
            return SamEmbeddingResponse(
                embeddings=binary_vector.getvalue(), time=inference_time
            )
    elif isinstance(request, SamSegmentationRequest):
        masks, low_res_masks = self.segment_image(**request.dict())
        if request.format == "json":
            masks = masks > self.predictor.model.mask_threshold
            masks = masks2poly(masks)
            low_res_masks = low_res_masks > self.predictor.model.mask_threshold
            low_res_masks = masks2poly(low_res_masks)
        elif request.format == "binary":
            binary_vector = BytesIO()
            np.savez_compressed(
                binary_vector, masks=masks, low_res_masks=low_res_masks
            )
            binary_vector.seek(0)
            binary_data = binary_vector.getvalue()
            return binary_data
        else:
            raise ValueError(f"Invalid format {request.format}")

        response = SamSegmentationResponse(
            masks=[m.tolist() for m in masks],
            low_res_masks=[m.tolist() for m in low_res_masks],
            time=perf_counter() - t1,
        )
        return response

`preproc_image(image)` ¶

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
	np.array: The preprocessed image.

Source code in inference/models/sam/segment_anything.py

def preproc_image(self, image: InferenceRequestImage):
    """Preprocesses an image.

    Args:
        image (InferenceRequestImage): The image to preprocess.

    Returns:
        np.array: The preprocessed image.
    """
    np_image = load_image_rgb(image)
    return np_image

`segment_image(image, embeddings=None, embeddings_format='json', has_mask_input=False, image_id=None, mask_input=None, mask_input_format='json', orig_im_size=None, point_coords=[], point_labels=[], use_mask_input_cache=True, **kwargs)` ¶

Segments an image based on provided embeddings, points, masks, or cached results. If embeddings are not directly provided, the function can derive them from the input image or cache.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be segmented.	required
`embeddings`	`Optional[Union[ndarray, List[List[float]]]]`	The embeddings of the image. Defaults to None, in which case the image is used to compute embeddings.	`None`
`embeddings_format`	`Optional[str]`	Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.	`'json'`
`has_mask_input`	`Optional[bool]`	Specifies whether mask input is provided. Defaults to False.	`False`
`image_id`	`Optional[str]`	A cached identifier for the image. Useful for accessing cached embeddings or masks.	`None`
`mask_input`	`Optional[Union[ndarray, List[List[List[float]]]]]`	Input mask for the image.	`None`
`mask_input_format`	`Optional[str]`	Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.	`'json'`
`orig_im_size`	`Optional[List[int]]`	Original size of the image when providing embeddings directly.	`None`
`point_coords`	`Optional[List[List[float]]]`	Coordinates of points in the image. Defaults to an empty list.	`[]`
`point_labels`	`Optional[List[int]]`	Labels associated with the provided points. Defaults to an empty list.	`[]`
`use_mask_input_cache`	`Optional[bool]`	Flag to determine if cached mask input should be used. Defaults to True.	`True`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
	Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image and the second element is the low resolution segmentation masks.

Raises:

Type	Description
`ValueError`	If necessary inputs are missing or inconsistent.

Notes

Embeddings, segmentations, and low-resolution logits can be cached to improve performance on repeated requests for the same image.
The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.

Source code in inference/models/sam/segment_anything.py

def segment_image(
    self,
    image: Any,
    embeddings: Optional[Union[np.ndarray, List[List[float]]]] = None,
    embeddings_format: Optional[str] = "json",
    has_mask_input: Optional[bool] = False,
    image_id: Optional[str] = None,
    mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
    mask_input_format: Optional[str] = "json",
    orig_im_size: Optional[List[int]] = None,
    point_coords: Optional[List[List[float]]] = [],
    point_labels: Optional[List[int]] = [],
    use_mask_input_cache: Optional[bool] = True,
    **kwargs,
):
    """
    Segments an image based on provided embeddings, points, masks, or cached results.
    If embeddings are not directly provided, the function can derive them from the input image or cache.

    Args:
        image (Any): The image to be segmented.
        embeddings (Optional[Union[np.ndarray, List[List[float]]]]): The embeddings of the image.
            Defaults to None, in which case the image is used to compute embeddings.
        embeddings_format (Optional[str]): Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.
        has_mask_input (Optional[bool]): Specifies whether mask input is provided. Defaults to False.
        image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
        mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input mask for the image.
        mask_input_format (Optional[str]): Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.
        orig_im_size (Optional[List[int]]): Original size of the image when providing embeddings directly.
        point_coords (Optional[List[List[float]]]): Coordinates of points in the image. Defaults to an empty list.
        point_labels (Optional[List[int]]): Labels associated with the provided points. Defaults to an empty list.
        use_mask_input_cache (Optional[bool]): Flag to determine if cached mask input should be used. Defaults to True.
        **kwargs: Additional keyword arguments.

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image
                                      and the second element is the low resolution segmentation masks.

    Raises:
        ValueError: If necessary inputs are missing or inconsistent.

    Notes:
        - Embeddings, segmentations, and low-resolution logits can be cached to improve performance
          on repeated requests for the same image.
        - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
          the oldest entries are removed.
    """
    if not embeddings:
        if not image and not image_id:
            raise ValueError(
                "Must provide either image, cached image_id, or embeddings"
            )
        elif image_id and not image and image_id not in self.embedding_cache:
            raise ValueError(
                f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
            )
        embedding, original_image_size = self.embed_image(
            image=image, image_id=image_id
        )
    else:
        if not orig_im_size:
            raise ValueError(
                "Must provide original image size if providing embeddings"
            )
        original_image_size = orig_im_size
        if embeddings_format == "json":
            embedding = np.array(embeddings)
        elif embeddings_format == "binary":
            embedding = np.load(BytesIO(embeddings))

    point_coords = point_coords
    point_coords.append([0, 0])
    point_coords = np.array(point_coords, dtype=np.float32)
    point_coords = np.expand_dims(point_coords, axis=0)
    point_coords = self.predictor.transform.apply_coords(
        point_coords,
        original_image_size,
    )

    point_labels = point_labels
    point_labels.append(-1)
    point_labels = np.array(point_labels, dtype=np.float32)
    point_labels = np.expand_dims(point_labels, axis=0)

    if has_mask_input:
        if (
            image_id
            and image_id in self.low_res_logits_cache
            and use_mask_input_cache
        ):
            mask_input = self.low_res_logits_cache[image_id]
        elif not mask_input and (
            not image_id or image_id not in self.low_res_logits_cache
        ):
            raise ValueError("Must provide either mask_input or cached image_id")
        else:
            if mask_input_format == "json":
                polys = mask_input
                mask_input = np.zeros((1, len(polys), 256, 256), dtype=np.uint8)
                for i, poly in enumerate(polys):
                    poly = ShapelyPolygon(poly)
                    raster = rasterio.features.rasterize(
                        [poly], out_shape=(256, 256)
                    )
                    mask_input[0, i, :, :] = raster
            elif mask_input_format == "binary":
                binary_data = base64.b64decode(mask_input)
                mask_input = np.load(BytesIO(binary_data))
    else:
        mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)

    ort_inputs = {
        "image_embeddings": embedding.astype(np.float32),
        "point_coords": point_coords.astype(np.float32),
        "point_labels": point_labels,
        "mask_input": mask_input.astype(np.float32),
        "has_mask_input": (
            np.zeros(1, dtype=np.float32)
            if not has_mask_input
            else np.ones(1, dtype=np.float32)
        ),
        "orig_im_size": np.array(original_image_size, dtype=np.float32),
    }
    masks, _, low_res_logits = self.ort_session.run(None, ort_inputs)
    if image_id:
        self.low_res_logits_cache[image_id] = low_res_logits
        if image_id not in self.segmentation_cache_keys:
            self.segmentation_cache_keys.append(image_id)
        if len(self.segmentation_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
            cache_key = self.segmentation_cache_keys.pop(0)
            del self.low_res_logits_cache[cache_key]
    masks = masks[0]
    low_res_masks = low_res_logits[0]

    return masks, low_res_masks

Segment anything