Skip to content

Segment anything

SegmentAnything

Bases: RoboflowCoreModel

SegmentAnything class for handling segmentation tasks.

Attributes:

Name Type Description
sam

The segmentation model.

predictor

The predictor for the segmentation model.

ort_session

ONNX runtime inference session.

embedding_cache

Cache for embeddings.

image_size_cache

Cache for image sizes.

embedding_cache_keys

Keys for the embedding cache.

low_res_logits_cache

Cache for low resolution logits.

segmentation_cache_keys

Keys for the segmentation cache.

Source code in inference/models/sam/segment_anything.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
class SegmentAnything(RoboflowCoreModel):
    """Segment Anything (SAM) wrapper with embedding and mask-logit caches.

    Image embeddings come from the SAM model loaded from ``encoder.pth``
    (through ``SamPredictor``); masks are decoded by an ONNX Runtime session
    loaded from ``decoder.onnx``.

    Attributes:
        sam: The segmentation model.
        predictor: The predictor for the segmentation model.
        ort_session: ONNX runtime inference session.
        embedding_cache: Cache for embeddings, keyed by image id.
        image_size_cache: Cache for image sizes, keyed by image id.
        embedding_cache_keys: Keys for the embedding cache, oldest first.
        low_res_logits_cache: Cache for low resolution logits, keyed by image id.
        segmentation_cache_keys: Keys for the segmentation cache, oldest first.
    """

    def __init__(self, *args, model_id: str = f"sam/{SAM_VERSION_ID}", **kwargs):
        """Initializes the SegmentAnything.

        Args:
            *args: Variable length argument list, forwarded to the parent class.
            model_id (str): Model identifier forwarded to the parent class.
            **kwargs: Arbitrary keyword arguments, forwarded to the parent class.
        """
        super().__init__(*args, model_id=model_id, **kwargs)
        self.sam = sam_model_registry[self.version_id](
            checkpoint=self.cache_file("encoder.pth")
        )
        self.sam.to(device="cuda" if torch.cuda.is_available() else "cpu")
        self.predictor = SamPredictor(self.sam)
        self.ort_session = onnxruntime.InferenceSession(
            self.cache_file("decoder.onnx"),
            providers=[
                "CUDAExecutionProvider",
                "OpenVINOExecutionProvider",
                "CPUExecutionProvider",
            ],
        )
        self.embedding_cache = {}
        self.image_size_cache = {}
        self.embedding_cache_keys = []

        self.low_res_logits_cache = {}
        self.segmentation_cache_keys = []
        self.task_type = "unsupervised-segmentation"

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: List of file names.
        """
        return ["encoder.pth", "decoder.onnx"]

    def embed_image(self, image: Any, image_id: Optional[str] = None, **kwargs):
        """
        Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached,
        the cached result will be returned.

        Args:
            image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
                Not used when ``image_id`` is already in the embedding cache.
            image_id (Optional[str]): An identifier for the image. If provided, the embedding result will be cached
                                      with this ID. Defaults to None.
            **kwargs: Additional keyword arguments (ignored).

        Returns:
            Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image
                                               and the second element is the shape (height, width) of the processed image.

        Notes:
            - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
            - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
              the oldest entry is removed.

        Example:
            >>> img_array = ... # some image array
            >>> embed_image(img_array, image_id="sample123")
            (array([...]), (224, 224))
        """
        if image_id and image_id in self.embedding_cache:
            return (
                self.embedding_cache[image_id],
                self.image_size_cache[image_id],
            )
        img_in = self.preproc_image(image)
        self.predictor.set_image(img_in)
        embedding = self.predictor.get_image_embedding().cpu().numpy()
        if image_id:
            self.embedding_cache[image_id] = embedding
            self.image_size_cache[image_id] = img_in.shape[:2]
            self.embedding_cache_keys.append(image_id)
            # Evict in insertion order once the key list outgrows the limit.
            if len(self.embedding_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
                cache_key = self.embedding_cache_keys.pop(0)
                del self.embedding_cache[cache_key]
                del self.image_size_cache[cache_key]
        return (embedding, img_in.shape[:2])

    def infer_from_request(self, request: SamInferenceRequest):
        """Performs inference based on the request type.

        Args:
            request (SamInferenceRequest): The inference request.

        Returns:
            SamEmbeddingResponse for embedding requests; SamSegmentationResponse
            for segmentation requests with format "json"; the raw bytes of a
            compressed ``.npz`` archive (arrays ``masks`` and ``low_res_masks``)
            for segmentation requests with format "binary". None is returned
            when the request is neither request type, or when an embedding
            request has a format other than "json" or "binary".

        Raises:
            ValueError: If a segmentation request has a format other than
                "json" or "binary".
        """
        t1 = perf_counter()
        if isinstance(request, SamEmbeddingRequest):
            embedding, _ = self.embed_image(**request.dict())
            inference_time = perf_counter() - t1
            if request.format == "json":
                return SamEmbeddingResponse(
                    embeddings=embedding.tolist(), time=inference_time
                )
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.save(binary_vector, embedding)
                binary_vector.seek(0)
                return SamEmbeddingResponse(
                    embeddings=binary_vector.getvalue(), time=inference_time
                )
        elif isinstance(request, SamSegmentationRequest):
            masks, low_res_masks = self.segment_image(**request.dict())
            if request.format == "json":
                # Binarize the decoder outputs, then convert them to polygons.
                masks = masks > self.predictor.model.mask_threshold
                masks = masks2poly(masks)
                low_res_masks = low_res_masks > self.predictor.model.mask_threshold
                low_res_masks = masks2poly(low_res_masks)
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.savez_compressed(
                    binary_vector, masks=masks, low_res_masks=low_res_masks
                )
                binary_vector.seek(0)
                binary_data = binary_vector.getvalue()
                return binary_data
            else:
                raise ValueError(f"Invalid format {request.format}")

            response = SamSegmentationResponse(
                masks=[m.tolist() for m in masks],
                low_res_masks=[m.tolist() for m in low_res_masks],
                time=perf_counter() - t1,
            )
            return response

    def preproc_image(self, image: InferenceRequestImage):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            np.array: The image as returned by ``load_image_rgb``.
        """
        np_image = load_image_rgb(image)
        return np_image

    def segment_image(
        self,
        image: Any,
        embeddings: Optional[Union[np.ndarray, List[List[float]]]] = None,
        embeddings_format: Optional[str] = "json",
        has_mask_input: Optional[bool] = False,
        image_id: Optional[str] = None,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        mask_input_format: Optional[str] = "json",
        orig_im_size: Optional[List[int]] = None,
        point_coords: Optional[List[List[float]]] = None,
        point_labels: Optional[List[int]] = None,
        use_mask_input_cache: Optional[bool] = True,
        **kwargs,
    ):
        """
        Segments an image based on provided embeddings, points, masks, or cached results.
        If embeddings are not directly provided, the function can derive them from the input image or cache.

        Caller-supplied ``point_coords`` and ``point_labels`` are copied before the
        trailing ``[0, 0]`` / ``-1`` prompt is added, so they are never modified.

        Args:
            image (Any): The image to be segmented.
            embeddings (Optional[Union[np.ndarray, List[List[float]]]]): The embeddings of the image.
                Defaults to None, in which case the image (or the embedding cache) is used.
            embeddings_format (Optional[str]): Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.
            has_mask_input (Optional[bool]): Specifies whether mask input is provided. Defaults to False.
            image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
            mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input mask for the image. With format
                'json' it is a list of polygons that are rasterized to 256x256; with 'binary' it is a
                base64-encoded ``np.save`` payload.
            mask_input_format (Optional[str]): Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.
            orig_im_size (Optional[List[int]]): Original size of the image when providing embeddings directly.
            point_coords (Optional[List[List[float]]]): Coordinates of points in the image. None (the default) is
                treated as an empty list.
            point_labels (Optional[List[int]]): Labels associated with the provided points. None (the default) is
                treated as an empty list.
            use_mask_input_cache (Optional[bool]): Flag to determine if cached mask input should be used. Defaults to True.
            **kwargs: Additional keyword arguments (ignored).

        Returns:
            Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image
                                          and the second element is the low resolution segmentation masks
                                          (first batch element of each decoder output).

        Raises:
            ValueError: If necessary inputs are missing or inconsistent, or if
                ``embeddings_format`` / ``mask_input_format`` is not 'json' or 'binary'.

        Notes:
            - When image_id is given, the low-resolution logits are cached under it so they can be reused
              as mask input on later requests for the same image.
            - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
              the oldest entry is removed.
        """

        def _is_missing(value: Any) -> bool:
            # ``not value`` raises for multi-element numpy arrays, so test
            # emptiness through len() and fall back to plain truthiness only
            # for objects that have no length.
            if value is None:
                return True
            try:
                return len(value) == 0
            except TypeError:
                return not value

        if _is_missing(embeddings):
            image_missing = _is_missing(image)
            if image_missing and not image_id:
                raise ValueError(
                    "Must provide either image, cached image_id, or embeddings"
                )
            elif image_id and image_missing and image_id not in self.embedding_cache:
                raise ValueError(
                    f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
                )
            embedding, original_image_size = self.embed_image(
                image=image, image_id=image_id
            )
        else:
            if _is_missing(orig_im_size):
                raise ValueError(
                    "Must provide original image size if providing embeddings"
                )
            original_image_size = orig_im_size
            if embeddings_format == "json":
                embedding = np.array(embeddings)
            elif embeddings_format == "binary":
                embedding = np.load(BytesIO(embeddings))
            else:
                # Previously fell through and failed later with an unbound local.
                raise ValueError(f"Invalid embeddings format {embeddings_format}")

        # Work on copies so neither the caller's lists nor a shared default
        # accumulate the extra prompt across calls.
        # NOTE(review): the trailing [0, 0] / -1 pair is presumably the padding
        # prompt the exported SAM decoder expects when no box is given - confirm.
        coords = list(point_coords) if point_coords is not None else []
        coords.append([0, 0])
        point_coords = np.expand_dims(np.array(coords, dtype=np.float32), axis=0)
        point_coords = self.predictor.transform.apply_coords(
            point_coords,
            original_image_size,
        )

        labels = list(point_labels) if point_labels is not None else []
        labels.append(-1)
        point_labels = np.expand_dims(np.array(labels, dtype=np.float32), axis=0)

        if has_mask_input:
            has_cached_logits = bool(image_id) and image_id in self.low_res_logits_cache
            if has_cached_logits and use_mask_input_cache:
                mask_input = self.low_res_logits_cache[image_id]
            elif _is_missing(mask_input):
                # Also covers "cached logits exist but the cache is disabled",
                # which used to fail with a TypeError on the missing mask.
                raise ValueError("Must provide either mask_input or cached image_id")
            elif mask_input_format == "json":
                polys = mask_input
                mask_input = np.zeros((1, len(polys), 256, 256), dtype=np.uint8)
                for i, poly in enumerate(polys):
                    poly = ShapelyPolygon(poly)
                    raster = rasterio.features.rasterize(
                        [poly], out_shape=(256, 256)
                    )
                    mask_input[0, i, :, :] = raster
            elif mask_input_format == "binary":
                binary_data = base64.b64decode(mask_input)
                mask_input = np.load(BytesIO(binary_data))
            else:
                raise ValueError(f"Invalid mask input format {mask_input_format}")
        else:
            mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)

        ort_inputs = {
            "image_embeddings": embedding.astype(np.float32),
            "point_coords": point_coords.astype(np.float32),
            "point_labels": point_labels,
            "mask_input": mask_input.astype(np.float32),
            "has_mask_input": (
                np.zeros(1, dtype=np.float32)
                if not has_mask_input
                else np.ones(1, dtype=np.float32)
            ),
            "orig_im_size": np.array(original_image_size, dtype=np.float32),
        }
        masks, _, low_res_logits = self.ort_session.run(None, ort_inputs)
        if image_id:
            self.low_res_logits_cache[image_id] = low_res_logits
            if image_id not in self.segmentation_cache_keys:
                self.segmentation_cache_keys.append(image_id)
            # Evict in insertion order once the key list outgrows the limit.
            if len(self.segmentation_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
                cache_key = self.segmentation_cache_keys.pop(0)
                del self.low_res_logits_cache[cache_key]
        masks = masks[0]
        low_res_masks = low_res_logits[0]

        return masks, low_res_masks

__init__(*args, model_id=f'sam/{SAM_VERSION_ID}', **kwargs)

Initializes the SegmentAnything.

Parameters:

Name Type Description Default
*args

Variable length argument list.

()
**kwargs

Arbitrary keyword arguments.

{}
Source code in inference/models/sam/segment_anything.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def __init__(self, *args, model_id: str = f"sam/{SAM_VERSION_ID}", **kwargs):
    """Load the SAM model and ONNX decoder, then start with empty caches.

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        model_id: Model identifier forwarded to the parent initializer.
        **kwargs: Keyword arguments forwarded to the parent initializer.
    """
    super().__init__(*args, model_id=model_id, **kwargs)

    # SAM model built from the cached checkpoint, moved to GPU when available.
    build_sam = sam_model_registry[self.version_id]
    self.sam = build_sam(checkpoint=self.cache_file("encoder.pth"))
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    self.sam.to(device=target_device)
    self.predictor = SamPredictor(self.sam)

    # ONNX decoder session; providers are listed in order of preference.
    decoder_path = self.cache_file("decoder.onnx")
    execution_providers = [
        "CUDAExecutionProvider",
        "OpenVINOExecutionProvider",
        "CPUExecutionProvider",
    ]
    self.ort_session = onnxruntime.InferenceSession(
        decoder_path, providers=execution_providers
    )

    # Embedding cache: values by image id, plus insertion-ordered keys.
    self.embedding_cache = {}
    self.image_size_cache = {}
    self.embedding_cache_keys = []

    # Low-resolution logits cache, same layout.
    self.low_res_logits_cache = {}
    self.segmentation_cache_keys = []

    self.task_type = "unsupervised-segmentation"

embed_image(image, image_id=None, **kwargs)

Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached, the cached result will be returned.

Parameters:

Name Type Description Default
image Any

The image to be embedded. The format should be compatible with the preproc_image method.

required
image_id Optional[str]

An identifier for the image. If provided, the embedding result will be cached with this ID. Defaults to None.

None
**kwargs

Additional keyword arguments.

{}

Returns:

Type Description

Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image and the second element is the shape (height, width) of the processed image.

Notes
  • Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
  • The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.
Example

>>> img_array = ...  # some image array
>>> embed_image(img_array, image_id="sample123")
(array([...]), (224, 224))

Source code in inference/models/sam/segment_anything.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def embed_image(self, image: Any, image_id: Optional[str] = None, **kwargs):
    """
    Return the embedding and size of an image, reusing cached results by image_id.

    Args:
        image (Any): The image to embed, in a format accepted by preproc_image.
            Not used when image_id is already present in the embedding cache.
        image_id (Optional[str]): Cache key for the image. When given, the computed
            embedding and image size are stored under it. Defaults to None.
        **kwargs: Additional keyword arguments (ignored).

    Returns:
        Tuple[np.ndarray, Tuple[int, int]]: The image embedding and the first two
            dimensions (height, width) of the preprocessed image.

    Notes:
        - A cache hit returns the stored embedding and size without touching the predictor.
        - Once more than SAM_MAX_EMBEDDING_CACHE_SIZE ids are tracked, the oldest
          one is evicted from both caches.

    Example:
        >>> img_array = ... # some image array
        >>> embed_image(img_array, image_id="sample123")
        (array([...]), (224, 224))
    """
    use_cache = bool(image_id)
    if use_cache and image_id in self.embedding_cache:
        cached_embedding = self.embedding_cache[image_id]
        cached_size = self.image_size_cache[image_id]
        return (cached_embedding, cached_size)

    pixels = self.preproc_image(image)
    self.predictor.set_image(pixels)
    features = self.predictor.get_image_embedding().cpu().numpy()

    if not use_cache:
        return (features, pixels.shape[:2])

    self.embedding_cache[image_id] = features
    self.image_size_cache[image_id] = pixels.shape[:2]
    self.embedding_cache_keys.append(image_id)
    # Drop the oldest entry once the tracked ids exceed the configured limit.
    if len(self.embedding_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
        oldest_id = self.embedding_cache_keys.pop(0)
        del self.embedding_cache[oldest_id]
        del self.image_size_cache[oldest_id]
    return (features, pixels.shape[:2])

get_infer_bucket_file_list()

Gets the list of files required for inference.

Returns:

Type Description
List[str]

List[str]: List of file names.

Source code in inference/models/sam/segment_anything.py
72
73
74
75
76
77
78
def get_infer_bucket_file_list(self) -> List[str]:
    """Name the model artifacts that must be available for inference.

    Returns:
        List[str]: The encoder checkpoint and decoder file names, in that order.
    """
    required_files = ["encoder.pth", "decoder.onnx"]
    return required_files

infer_from_request(request)

Performs inference based on the request type.

Parameters:

Name Type Description Default
request SamInferenceRequest

The inference request.

required

Returns:

Type Description

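Union[SamEmbeddingResponse, SamSegmentationResponse, bytes, None]: The inference response. Segmentation requests with format "binary" return the raw bytes of a compressed .npz archive (arrays `masks` and `low_res_masks`) instead of a SamSegmentationResponse. None is returned for an unrecognized request type or for an embedding request whose format is neither "json" nor "binary".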

Source code in inference/models/sam/segment_anything.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def infer_from_request(self, request: SamInferenceRequest):
    """Dispatch an embedding or segmentation request and build its response.

    Args:
        request (SamInferenceRequest): The inference request.

    Returns:
        SamEmbeddingResponse for embedding requests; SamSegmentationResponse for
        segmentation requests with format "json"; the raw bytes of a compressed
        .npz archive for segmentation requests with format "binary". None when
        the request is neither request type, or when an embedding request has a
        format other than "json" or "binary".

    Raises:
        ValueError: If a segmentation request has a format other than "json"
            or "binary".
    """
    started = perf_counter()

    if isinstance(request, SamEmbeddingRequest):
        embedding, _ = self.embed_image(**request.dict())
        elapsed = perf_counter() - started
        if request.format == "json":
            payload = embedding.tolist()
        elif request.format == "binary":
            # Serialize in .npy format; getvalue() returns the whole buffer.
            buffer = BytesIO()
            np.save(buffer, embedding)
            payload = buffer.getvalue()
        else:
            return None
        return SamEmbeddingResponse(embeddings=payload, time=elapsed)

    if not isinstance(request, SamSegmentationRequest):
        return None

    masks, low_res_masks = self.segment_image(**request.dict())

    if request.format == "binary":
        buffer = BytesIO()
        np.savez_compressed(buffer, masks=masks, low_res_masks=low_res_masks)
        return buffer.getvalue()
    if request.format != "json":
        raise ValueError(f"Invalid format {request.format}")

    # Binarize each decoder output, then convert it to polygons.
    mask_polygons = masks2poly(masks > self.predictor.model.mask_threshold)
    low_res_polygons = masks2poly(
        low_res_masks > self.predictor.model.mask_threshold
    )
    return SamSegmentationResponse(
        masks=[polygon.tolist() for polygon in mask_polygons],
        low_res_masks=[polygon.tolist() for polygon in low_res_polygons],
        time=perf_counter() - started,
    )

preproc_image(image)

Preprocesses an image.

Parameters:

Name Type Description Default
image InferenceRequestImage

The image to preprocess.

required

Returns:

Type Description

np.array: The preprocessed image.

Source code in inference/models/sam/segment_anything.py
172
173
174
175
176
177
178
179
180
181
182
def preproc_image(self, image: InferenceRequestImage):
    """Load a request image through ``load_image_rgb``.

    Args:
        image (InferenceRequestImage): The image to preprocess.

    Returns:
        np.array: The image as returned by ``load_image_rgb``.
    """
    return load_image_rgb(image)

segment_image(image, embeddings=None, embeddings_format='json', has_mask_input=False, image_id=None, mask_input=None, mask_input_format='json', orig_im_size=None, point_coords=[], point_labels=[], use_mask_input_cache=True, **kwargs)

Segments an image based on provided embeddings, points, masks, or cached results. If embeddings are not directly provided, the function can derive them from the input image or cache.

Parameters:

Name Type Description Default
image Any

The image to be segmented.

required
embeddings Optional[Union[ndarray, List[List[float]]]]

The embeddings of the image. Defaults to None, in which case the image is used to compute embeddings.

None
embeddings_format Optional[str]

Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.

'json'
has_mask_input Optional[bool]

Specifies whether mask input is provided. Defaults to False.

False
image_id Optional[str]

A cached identifier for the image. Useful for accessing cached embeddings or masks.

None
mask_input Optional[Union[ndarray, List[List[List[float]]]]]

Input mask for the image.

None
mask_input_format Optional[str]

Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.

'json'
orig_im_size Optional[List[int]]

Original size of the image when providing embeddings directly.

None
point_coords Optional[List[List[float]]]

Coordinates of points in the image. Defaults to an empty list.

[]
point_labels Optional[List[int]]

Labels associated with the provided points. Defaults to an empty list.

[]
use_mask_input_cache Optional[bool]

Flag to determine if cached mask input should be used. Defaults to True.

True
**kwargs

Additional keyword arguments.

{}

Returns:

Type Description

Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image and the second element is the low resolution segmentation masks.

Raises:

Type Description
ValueError

If necessary inputs are missing or inconsistent.

Notes
  • Embeddings, segmentations, and low-resolution logits can be cached to improve performance on repeated requests for the same image.
  • The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.
Source code in inference/models/sam/segment_anything.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
def segment_image(
    self,
    image: Any,
    embeddings: Optional[Union[np.ndarray, List[List[float]]]] = None,
    embeddings_format: Optional[str] = "json",
    has_mask_input: Optional[bool] = False,
    image_id: Optional[str] = None,
    mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
    mask_input_format: Optional[str] = "json",
    orig_im_size: Optional[List[int]] = None,
    point_coords: Optional[List[List[float]]] = None,
    point_labels: Optional[List[int]] = None,
    use_mask_input_cache: Optional[bool] = True,
    **kwargs,
):
    """
    Segments an image based on provided embeddings, points, masks, or cached results.
    If embeddings are not directly provided, the function can derive them from the input image or cache.

    Caller-supplied ``point_coords`` and ``point_labels`` are copied before the
    trailing ``[0, 0]`` / ``-1`` prompt is added, so they are never modified.

    Args:
        image (Any): The image to be segmented.
        embeddings (Optional[Union[np.ndarray, List[List[float]]]]): The embeddings of the image.
            Defaults to None, in which case the image (or the embedding cache) is used.
        embeddings_format (Optional[str]): Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.
        has_mask_input (Optional[bool]): Specifies whether mask input is provided. Defaults to False.
        image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
        mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input mask for the image. With format
            'json' it is a list of polygons that are rasterized to 256x256; with 'binary' it is a
            base64-encoded ``np.save`` payload.
        mask_input_format (Optional[str]): Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.
        orig_im_size (Optional[List[int]]): Original size of the image when providing embeddings directly.
        point_coords (Optional[List[List[float]]]): Coordinates of points in the image. None (the default) is
            treated as an empty list.
        point_labels (Optional[List[int]]): Labels associated with the provided points. None (the default) is
            treated as an empty list.
        use_mask_input_cache (Optional[bool]): Flag to determine if cached mask input should be used. Defaults to True.
        **kwargs: Additional keyword arguments (ignored).

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image
                                      and the second element is the low resolution segmentation masks
                                      (first batch element of each decoder output).

    Raises:
        ValueError: If necessary inputs are missing or inconsistent, or if
            ``embeddings_format`` / ``mask_input_format`` is not 'json' or 'binary'.

    Notes:
        - When image_id is given, the low-resolution logits are cached under it so they can be reused
          as mask input on later requests for the same image.
        - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
          the oldest entry is removed.
    """

    def _is_missing(value: Any) -> bool:
        # ``not value`` raises for multi-element numpy arrays, so test
        # emptiness through len() and fall back to plain truthiness only
        # for objects that have no length.
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            return not value

    if _is_missing(embeddings):
        image_missing = _is_missing(image)
        if image_missing and not image_id:
            raise ValueError(
                "Must provide either image, cached image_id, or embeddings"
            )
        elif image_id and image_missing and image_id not in self.embedding_cache:
            raise ValueError(
                f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
            )
        embedding, original_image_size = self.embed_image(
            image=image, image_id=image_id
        )
    else:
        if _is_missing(orig_im_size):
            raise ValueError(
                "Must provide original image size if providing embeddings"
            )
        original_image_size = orig_im_size
        if embeddings_format == "json":
            embedding = np.array(embeddings)
        elif embeddings_format == "binary":
            embedding = np.load(BytesIO(embeddings))
        else:
            # Previously fell through and failed later with an unbound local.
            raise ValueError(f"Invalid embeddings format {embeddings_format}")

    # Work on copies so neither the caller's lists nor a shared default
    # accumulate the extra prompt across calls.
    # NOTE(review): the trailing [0, 0] / -1 pair is presumably the padding
    # prompt the exported SAM decoder expects when no box is given - confirm.
    coords = list(point_coords) if point_coords is not None else []
    coords.append([0, 0])
    point_coords = np.expand_dims(np.array(coords, dtype=np.float32), axis=0)
    point_coords = self.predictor.transform.apply_coords(
        point_coords,
        original_image_size,
    )

    labels = list(point_labels) if point_labels is not None else []
    labels.append(-1)
    point_labels = np.expand_dims(np.array(labels, dtype=np.float32), axis=0)

    if has_mask_input:
        has_cached_logits = bool(image_id) and image_id in self.low_res_logits_cache
        if has_cached_logits and use_mask_input_cache:
            mask_input = self.low_res_logits_cache[image_id]
        elif _is_missing(mask_input):
            # Also covers "cached logits exist but the cache is disabled",
            # which used to fail with a TypeError on the missing mask.
            raise ValueError("Must provide either mask_input or cached image_id")
        elif mask_input_format == "json":
            polys = mask_input
            mask_input = np.zeros((1, len(polys), 256, 256), dtype=np.uint8)
            for i, poly in enumerate(polys):
                poly = ShapelyPolygon(poly)
                raster = rasterio.features.rasterize(
                    [poly], out_shape=(256, 256)
                )
                mask_input[0, i, :, :] = raster
        elif mask_input_format == "binary":
            binary_data = base64.b64decode(mask_input)
            mask_input = np.load(BytesIO(binary_data))
        else:
            raise ValueError(f"Invalid mask input format {mask_input_format}")
    else:
        mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)

    ort_inputs = {
        "image_embeddings": embedding.astype(np.float32),
        "point_coords": point_coords.astype(np.float32),
        "point_labels": point_labels,
        "mask_input": mask_input.astype(np.float32),
        "has_mask_input": (
            np.zeros(1, dtype=np.float32)
            if not has_mask_input
            else np.ones(1, dtype=np.float32)
        ),
        "orig_im_size": np.array(original_image_size, dtype=np.float32),
    }
    masks, _, low_res_logits = self.ort_session.run(None, ort_inputs)
    if image_id:
        self.low_res_logits_cache[image_id] = low_res_logits
        if image_id not in self.segmentation_cache_keys:
            self.segmentation_cache_keys.append(image_id)
        # Evict in insertion order once the key list outgrows the limit.
        if len(self.segmentation_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
            cache_key = self.segmentation_cache_keys.pop(0)
            del self.low_res_logits_cache[cache_key]
    masks = masks[0]
    low_res_masks = low_res_logits[0]

    return masks, low_res_masks