`inference` API Reference¶

Base class for ONNX models for Roboflow classification inference.

Attributes:

Name	Type	Description
`multiclass`	`bool`	Whether the classification is multi-class or not.

Methods:

Name	Description
`get_infer_bucket_file_list`	Get the list of required files for inference.
`softmax`	Compute softmax values for a given set of scores.
`infer`	Perform inference on the provided image(s) and return the response(s): a `ClassificationInferenceResponse` or `MultiLabelClassificationInferenceResponse`, or a list of them.
`draw_predictions`	Draw prediction visuals on an image.

Source code in inference/core/models/classification_base.py

class ClassificationBaseOnnxRoboflowInferenceModel(OnnxRoboflowInferenceModel):
    """Base class for ONNX models for Roboflow classification inference.

    Attributes:
        multiclass: Value of the "MULTICLASS" entry of `self.environment`
            (False when absent). When truthy, `make_response` builds
            multi-label responses instead of single-label ones.

    Methods:
        draw_predictions(inference_request, inference_response): Draw prediction visuals on an image.
        get_infer_bucket_file_list() -> list: Get the list of required files for inference.
        infer(image, ...): Run inference on an image or a list of images.
        infer_from_request(request): Run inference for a request object and return the response(s).
        make_response(predictions, img_dims, confidence): Build response objects from raw model output.
        softmax(x): Compute softmax values for a given set of scores.
    """

    task_type = "classification"

    # Per-channel mean / std applied by `preprocess` after scaling to [0, 1].
    preprocess_means = [0.5, 0.5, 0.5]
    preprocess_stds = [0.5, 0.5, 0.5]

    def __init__(self, *args, **kwargs):
        """Initialize the model, setting whether it is multiclass or not.

        All arguments are forwarded to the parent constructor; afterwards the
        "MULTICLASS" entry of `self.environment` is read (default False).
        """
        super().__init__(*args, **kwargs)
        self.multiclass = self.environment.get("MULTICLASS", False)

    def draw_predictions(self, inference_request, inference_response):
        """Draw prediction visuals on an image.

        A frame is drawn around the whole image and one filled text label per
        prediction is pasted in the top-left corner. For a list of predictions
        only the first entry is drawn; for a mapping of class name to
        prediction, every entry is drawn, stacked vertically in descending
        confidence order.

        Args:
            inference_request: The request object containing the image and
                `visualization_stroke_width`.
            inference_response: The response object whose `predictions` is
                either a list or a mapping of class name to prediction.

        Returns:
            bytes: The bytes of the visualized image in JPEG format. When the
            prediction list is empty the image is returned without annotations.
        """
        image = Image.fromarray(load_image_rgb(inference_request.image))
        draw = ImageDraw.Draw(image)
        font = ImageFont.load_default()
        default_color = "#4892EA"
        stroke_width = inference_request.visualization_stroke_width
        # PIL reports `size` as (width, height) and `rectangle` takes inclusive
        # [x0, y0, x1, y1] corners, so the frame must end on the last pixel.
        frame = [0, 0, image.size[0] - 1, image.size[1] - 1]

        def paste_label(text, color, top):
            """Paste a filled label with 10px margins at (0, top); return its height."""
            text_size = font.getbbox(text)
            button_size = (text_size[2] + 20, text_size[3] + 20)
            button_img = Image.new("RGBA", button_size, color)
            ImageDraw.Draw(button_img).text(
                (10, 10), text, font=font, fill=(255, 255, 255, 255)
            )
            image.paste(button_img, (0, top))
            return button_size[1]

        predictions = inference_response.predictions
        if isinstance(predictions, list):
            # `make_response` can produce an empty list when no class reaches
            # the confidence threshold; there is nothing to draw in that case.
            if predictions:
                prediction = predictions[0]
                color = self.colors.get(prediction.class_name, default_color)
                draw.rectangle(frame, outline=color, width=stroke_width)
                paste_label(
                    f"{prediction.class_id} - {prediction.class_name} {prediction.confidence:.2f}",
                    color,
                    0,
                )
        else:
            if len(predictions) > 0:
                draw.rectangle(frame, outline=default_color, width=stroke_width)
            row = 0
            ranked = sorted(
                predictions.items(), key=lambda item: item[1].confidence, reverse=True
            )
            for cls_name, pred in ranked:
                row += paste_label(
                    f"{cls_name} {pred.confidence:.2f}",
                    self.colors.get(cls_name, default_color),
                    row,
                )

        buffered = BytesIO()
        image.convert("RGB").save(buffered, format="JPEG")
        return buffered.getvalue()

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, e.g., ["environment.json"].
        """
        return ["environment.json"]

    def infer(
        self,
        image: Any,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        return_image_dims: bool = False,
        **kwargs,
    ):
        """
        Perform inference on the provided image(s) and return the predictions.

        This is a thin wrapper that forwards every argument to the parent
        class's `infer`.

        Args:
            image (Any): The image or list of images to be processed.
                - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
            disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
            return_image_dims (bool, optional): Forwarded unchanged to the parent `infer`. Defaults to False.
            **kwargs: Additional parameters to customize the inference process.

        Returns:
            Whatever the parent class's `infer` returns. `infer_from_request`
            in this class iterates over that value and treats every item as a
            response object.
        """
        forwarded = {
            "disable_preproc_auto_orient": disable_preproc_auto_orient,
            "disable_preproc_contrast": disable_preproc_contrast,
            "disable_preproc_grayscale": disable_preproc_grayscale,
            "disable_preproc_static_crop": disable_preproc_static_crop,
            "return_image_dims": return_image_dims,
        }
        return super().infer(image, **forwarded, **kwargs)

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        return_image_dims=False,
        **kwargs,
    ) -> Union[ClassificationInferenceResponse, List[ClassificationInferenceResponse]]:
        """Turn the tuple produced by `predict` into response objects.

        Args:
            predictions: Tuple whose first element holds the raw model output.
            preprocess_return_metadata: Metadata from `preprocess`; its
                "img_dims" entry is passed on to `make_response`.
            return_image_dims: Accepted but not used here.
            **kwargs: Forwarded to `make_response` (e.g. `confidence`).

        Returns:
            The list of responses built by `make_response`.
        """
        predictions = predictions[0]
        return self.make_response(
            predictions, preprocess_return_metadata["img_dims"], **kwargs
        )

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        """Run the ONNX session on a preprocessed batch.

        The session call is made while holding `self._session_lock`.

        Args:
            img_in: Preprocessed input as returned by `preprocess`.
            **kwargs: Ignored.

        Returns:
            A one-element tuple wrapping the session output.
        """
        with self._session_lock:
            predictions = run_session_via_iobinding(
                self.onnx_session, self.input_name, img_in
            )
        return (predictions,)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Load, batch and normalize the input image(s).

        Each image goes through `self.preproc_image`; a list of images is
        concatenated along axis 0. The batch is divided by 255 and then
        normalized per channel with `preprocess_means` / `preprocess_stds`.

        Args:
            image: A single image or a list of images.
            **kwargs: May carry the four `disable_preproc_*` flags, each
                defaulting to False.

        Returns:
            The normalized batch and metadata holding "img_dims", one entry
            per image.

        Raises:
            ValueError: If an empty list is given, or the preprocessed images
                are neither numpy arrays nor (with
                USE_PYTORCH_FOR_PREPROCESSING) torch tensors.
        """
        preproc_flags = {
            name: kwargs.get(name, False)
            for name in (
                "disable_preproc_auto_orient",
                "disable_preproc_contrast",
                "disable_preproc_grayscale",
                "disable_preproc_static_crop",
            )
        }
        if isinstance(image, list):
            if not image:
                # Previously surfaced as an opaque tuple-unpacking ValueError.
                raise ValueError("Received an empty list of images")
            imgs_with_dims = [self.preproc_image(i, **preproc_flags) for i in image]
            imgs, img_dims = zip(*imgs_with_dims)
            if isinstance(imgs[0], np.ndarray):
                img_in = np.concatenate(imgs, axis=0)
            elif USE_PYTORCH_FOR_PREPROCESSING:
                img_in = torch.cat(imgs, dim=0)
            else:
                raise ValueError(
                    f"Received a list of images of unknown type, {type(imgs[0])}; "
                    "This is most likely a bug. Contact Roboflow team through github issues "
                    "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                )
        else:
            img_in, img_dims = self.preproc_image(image, **preproc_flags)
            img_dims = [img_dims]

        # NOTE(review): in-place division happens before the float cast below,
        # so this presumably relies on `preproc_image` returning a float
        # array/tensor - confirm.
        img_in /= 255.0

        mean = self.preprocess_means
        std = self.preprocess_stds
        if isinstance(img_in, np.ndarray):
            img_in = img_in.astype(np.float32)
        elif USE_PYTORCH_FOR_PREPROCESSING:
            img_in = img_in.float()
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(img_in)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

        # Normalize the first three entries of axis 1 (the channel axis).
        for channel in range(3):
            img_in[:, channel, :, :] = (
                img_in[:, channel, :, :] - mean[channel]
            ) / std[channel]
        return img_in, PreprocessReturnMetadata({"img_dims": img_dims})

    def infer_from_request(
        self,
        request: ClassificationInferenceRequest,
    ) -> Union[List[InferenceResponse], InferenceResponse]:
        """
        Handle an inference request to produce an appropriate response.

        Args:
            request (ClassificationInferenceRequest): The request object encapsulating the image(s) and relevant parameters.

        Returns:
            Union[List[InferenceResponse], InferenceResponse]: The response object(s) containing the predictions, visualization, and other pertinent details. If a list of images was provided, a list of responses is returned. Otherwise, a single response is returned.

        Notes:
            - Starts a timer at the beginning to calculate inference time.
            - Processes the image(s) through the `infer` method, passing the request fields as keyword arguments.
            - Sets the elapsed time and the request id on every response.
            - If visualization is requested, the predictions are drawn on the image.
        """
        t1 = perf_counter()
        kwargs = request.dict()
        confidence = kwargs.get("confidence")
        # A string-valued confidence is dropped unless USE_INFERENCE_MODELS is
        # set - presumably so the downstream default threshold applies.
        if isinstance(confidence, str) and not USE_INFERENCE_MODELS:
            kwargs.pop("confidence")
        responses = self.infer(**kwargs, return_image_dims=True)
        for response in responses:
            response.time = perf_counter() - t1
            response.inference_id = getattr(request, "id", None)

        if request.visualize_predictions:
            for response in responses:
                response.visualization = self.draw_predictions(request, response)

        # A single (non-list) input yields a single response, not a list.
        if not isinstance(request.image, list):
            responses = responses[0]

        return responses

    def make_response(
        self,
        predictions,
        img_dims,
        confidence: float = 0.5,
        **kwargs,
    ) -> Union[ClassificationInferenceResponse, List[ClassificationInferenceResponse]]:
        """
        Create response objects for the given predictions and image dimensions.

        Args:
            predictions (list): Prediction arrays, one per image; the first element of each is used as the score vector.
            img_dims (list): Per-image dimensions, indexed like `predictions`.
                NOTE(review): the multi-label branch reads each entry as
                (width, height) while the single-label branch reads it as
                (height, width) - confirm which order `preproc_image` produces.
            confidence (float, optional): Confidence threshold for filtering predictions. Defaults to 0.5.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A list with one response object per prediction.

        Notes:
            - If the model is multiclass, a `MultiLabelClassificationInferenceResponse` is generated for each image; every class is reported and those scoring strictly above the threshold are listed in `predicted_classes`.
            - If the model is not multiclass, a `ClassificationInferenceResponse` is generated for each image; scores go through softmax and classes below the threshold are dropped.
        """
        responses = []
        confidence_threshold = float(confidence)
        for ind, prediction in enumerate(predictions):
            if self.multiclass:
                preds = prediction[0]
                results = dict()
                predicted_classes = []
                for i, o in enumerate(preds):
                    cls_name = self.class_names[i]
                    score = float(o)
                    results[cls_name] = {"confidence": score, "class_id": i}
                    if score > confidence_threshold:
                        predicted_classes.append(cls_name)
                response = MultiLabelClassificationInferenceResponse(
                    image=InferenceResponseImage(
                        width=img_dims[ind][0], height=img_dims[ind][1]
                    ),
                    predicted_classes=predicted_classes,
                    predictions=results,
                )
            else:
                preds = prediction[0]
                preds = self.softmax(preds)
                results = []
                for i, cls_name in enumerate(self.class_names):
                    score = float(preds[i])
                    if score < confidence_threshold:
                        continue
                    pred = {
                        "class_id": i,
                        "class": cls_name,
                        "confidence": round(score, 4),
                    }
                    results.append(pred)
                results = sorted(results, key=lambda x: x["confidence"], reverse=True)

                # `results` may be empty when nothing reaches the threshold,
                # hence the fallbacks for `top` and `confidence`.
                response = ClassificationInferenceResponse(
                    image=InferenceResponseImage(
                        width=img_dims[ind][1], height=img_dims[ind][0]
                    ),
                    predictions=results,
                    top=results[0]["class"] if results else "",
                    confidence=results[0]["confidence"] if results else 0.0,
                )
            responses.append(response)

        return responses

    @staticmethod
    def softmax(x):
        """Compute softmax values over all scores in x.

        The maximum is subtracted before exponentiation for numerical
        stability; normalization runs over the whole array.

        Args:
            x (np.array): The input array containing the scores.

        Returns:
            np.array: The softmax values, same shape as `x`, summing to 1.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def get_model_output_shape(self) -> Tuple[int, ...]:
        """Return the shape of the model output for a random 1024x1024 RGB test image.

        The image is passed through `preprocess` and `predict`, and the
        resulting tuple is converted to a numpy array to read its shape.
        """
        test_image = (np.random.rand(1024, 1024, 3) * 255).astype(np.uint8)
        test_image, _ = self.preprocess(test_image)
        output = np.array(self.predict(test_image))
        return output.shape

    def validate_model_classes(self) -> None:
        """Check that the model's class count matches `self.num_classes`.

        The class count is read from index 3 of the shape returned by
        `get_model_output_shape`.

        Raises:
            ValueError: If the two counts differ.
        """
        output_shape = self.get_model_output_shape()
        num_classes = output_shape[3]
        # Explicit comparison instead of `assert`, which is removed under -O.
        if num_classes != self.num_classes:
            raise ValueError(
                f"Number of classes in model ({num_classes}) does not match the number of classes in the environment ({self.num_classes})"
            )

Methods:¶

`__init__` ¶

__init__(*args, **kwargs)

Initialize the model, setting whether it is multiclass or not.

Source code in inference/core/models/classification_base.py

def __init__(self, *args, **kwargs):
    """Initialize the model, setting whether it is multiclass or not.

    Every argument is handed to the parent constructor; afterwards the
    "MULTICLASS" entry of `self.environment` is read, defaulting to False.
    """
    super().__init__(*args, **kwargs)
    is_multiclass = self.environment.get("MULTICLASS", False)
    self.multiclass = is_multiclass

draw_predictions ¶

draw_predictions(inference_request, inference_response)

Draw prediction visuals on an image.

This method overlays the predictions on the input image, including drawing rectangles and text to visualize the predicted classes.

Parameters:

Name	Type	Description	Default
`inference_request`		The request object containing the image and parameters.	required
`inference_response`		The response object containing the predictions and other details.	required

Returns:

Name	Type	Description
`bytes`		The bytes of the visualized image in JPEG format.

Source code in inference/core/models/classification_base.py

def draw_predictions(self, inference_request, inference_response):
    """Draw prediction visuals on an image.

    A frame is drawn around the whole image and one filled text label per
    prediction is pasted in the top-left corner. For a list of predictions
    only the first entry is drawn; for a mapping of class name to prediction,
    every entry is drawn, stacked vertically in descending confidence order.

    Args:
        inference_request: The request object containing the image and
            `visualization_stroke_width`.
        inference_response: The response object whose `predictions` is either
            a list or a mapping of class name to prediction.

    Returns:
        bytes: The bytes of the visualized image in JPEG format. When the
        prediction list is empty the image is returned without annotations.
    """
    image = Image.fromarray(load_image_rgb(inference_request.image))
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    default_color = "#4892EA"
    stroke_width = inference_request.visualization_stroke_width
    # PIL reports `size` as (width, height) and `rectangle` takes inclusive
    # [x0, y0, x1, y1] corners, so the frame must end on the last pixel.
    frame = [0, 0, image.size[0] - 1, image.size[1] - 1]

    def paste_label(text, color, top):
        """Paste a filled label with 10px margins at (0, top); return its height."""
        text_size = font.getbbox(text)
        button_size = (text_size[2] + 20, text_size[3] + 20)
        button_img = Image.new("RGBA", button_size, color)
        ImageDraw.Draw(button_img).text(
            (10, 10), text, font=font, fill=(255, 255, 255, 255)
        )
        image.paste(button_img, (0, top))
        return button_size[1]

    predictions = inference_response.predictions
    if isinstance(predictions, list):
        # The list can be empty (e.g. nothing reached the confidence
        # threshold); there is nothing to draw in that case.
        if predictions:
            prediction = predictions[0]
            color = self.colors.get(prediction.class_name, default_color)
            draw.rectangle(frame, outline=color, width=stroke_width)
            paste_label(
                f"{prediction.class_id} - {prediction.class_name} {prediction.confidence:.2f}",
                color,
                0,
            )
    else:
        if len(predictions) > 0:
            draw.rectangle(frame, outline=default_color, width=stroke_width)
        row = 0
        ranked = sorted(
            predictions.items(), key=lambda item: item[1].confidence, reverse=True
        )
        for cls_name, pred in ranked:
            row += paste_label(
                f"{cls_name} {pred.confidence:.2f}",
                self.colors.get(cls_name, default_color),
                row,
            )

    buffered = BytesIO()
    image.convert("RGB").save(buffered, format="JPEG")
    return buffered.getvalue()

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name	Type	Description
`list`	`list`	A list of required files for inference, e.g., ["environment.json"].

Source code in inference/core/models/classification_base.py

def get_infer_bucket_file_list(self) -> list:
    """Get the list of required files for inference.

    Returns:
        list: A freshly built list of required file names; currently only
        "environment.json".
    """
    required_files = ["environment.json"]
    return required_files

infer ¶

infer(
    image,
    disable_preproc_auto_orient=False,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
    return_image_dims=False,
    **kwargs
)

Perform inference on the provided image(s) and return the predictions.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image or list of images to be processed. - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.	required
`disable_preproc_auto_orient`	`bool`	If true, the auto orient preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_contrast`	`bool`	If true, the auto contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`
`return_image_dims`	`bool`	If set to True, the function will also return the dimensions of the image. Defaults to False.	`False`
`**kwargs`		Additional parameters to customize the inference process.	`{}`

Returns:

Type	Description
	Union[List[np.array], np.array, Tuple[List[np.array], List[Tuple[int, int]]], Tuple[np.array, Tuple[int, int]]]:
	If `return_image_dims` is True and a list of images is provided, a tuple containing a list of prediction arrays and a list of image dimensions (width, height) is returned.
	If `return_image_dims` is True and a single image is provided, a tuple containing the prediction array and image dimensions (width, height) is returned.
	If `return_image_dims` is False and a list of images is provided, only the list of prediction arrays is returned.
	If `return_image_dims` is False and a single image is provided, only the prediction array is returned.

Notes

The input image(s) will be preprocessed (normalized and reshaped) before inference.
This function uses an ONNX session to perform inference on the input image(s).

Source code in inference/core/models/classification_base.py

def infer(
    self,
    image: Any,
    disable_preproc_auto_orient: bool = False,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
    return_image_dims: bool = False,
    **kwargs,
):
    """
    Perform inference on the provided image(s) and return the predictions.

    Thin wrapper: every argument is forwarded unchanged to the parent class's
    `infer`, and its result is returned as-is.

    Args:
        image (Any): The image or list of images to be processed.
            - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
        disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
        return_image_dims (bool, optional): Forwarded to the parent `infer`. Defaults to False.
        **kwargs: Additional parameters to customize the inference process.

    Returns:
        Whatever the parent class's `infer` returns for these arguments.
    """
    # Named options are collected in their original order, then passed on
    # together with any extra keyword arguments.
    forwarded = {
        "disable_preproc_auto_orient": disable_preproc_auto_orient,
        "disable_preproc_contrast": disable_preproc_contrast,
        "disable_preproc_grayscale": disable_preproc_grayscale,
        "disable_preproc_static_crop": disable_preproc_static_crop,
        "return_image_dims": return_image_dims,
    }
    return super().infer(image, **forwarded, **kwargs)

infer_from_request ¶

infer_from_request(request)

Handle an inference request to produce an appropriate response.

Parameters:

Name	Type	Description	Default
`request`	`ClassificationInferenceRequest`	The request object encapsulating the image(s) and relevant parameters.	required

Returns:

Type	Description
`Union[List[InferenceResponse], InferenceResponse]`	The response object(s) containing the predictions, visualization, and other pertinent details. If a list of images was provided, a list of responses is returned. Otherwise, a single response is returned.

Notes

Starts a timer at the beginning to calculate inference time.
Processes the image(s) through the infer method.
Generates the appropriate response object(s) using make_response.
Calculates and sets the time taken for inference.
If visualization is requested, the predictions are drawn on the image.

Source code in inference/core/models/classification_base.py

def infer_from_request(
    self,
    request: ClassificationInferenceRequest,
) -> Union[List[InferenceResponse], InferenceResponse]:
    """
    Handle an inference request to produce an appropriate response.

    Args:
        request (ClassificationInferenceRequest): The request object encapsulating the image(s) and relevant parameters.

    Returns:
        Union[List[InferenceResponse], InferenceResponse]: A list of responses when `request.image` is a list, otherwise the single first response.

    Notes:
        - The request fields (from `request.dict()`) are passed to `infer` as keyword arguments, together with `return_image_dims=True`.
        - Each response gets the elapsed time since the start of this call and the request's `id` (None when absent).
        - If `request.visualize_predictions` is set, each response also gets the output of `draw_predictions`.
    """
    started_at = perf_counter()
    infer_kwargs = request.dict()
    # A string-valued confidence is removed unless USE_INFERENCE_MODELS is set.
    if isinstance(infer_kwargs.get("confidence"), str) and not USE_INFERENCE_MODELS:
        del infer_kwargs["confidence"]

    results = self.infer(**infer_kwargs, return_image_dims=True)
    for item in results:
        item.time = perf_counter() - started_at
        item.inference_id = getattr(request, "id", None)

    if request.visualize_predictions:
        for item in results:
            item.visualization = self.draw_predictions(request, item)

    # Unwrap the single response when the caller did not send a list.
    return results if isinstance(request.image, list) else results[0]

make_response ¶

make_response(
    predictions, img_dims, confidence=0.5, **kwargs
)

Create response objects for the given predictions and image dimensions.

Parameters:

Name	Type	Description	Default
`predictions`	`list`	List of prediction arrays from the inference process.	required
`img_dims`	`list`	List of tuples indicating the dimensions (width, height) of each image.	required
`confidence`	`float`	Confidence threshold for filtering predictions. Defaults to 0.5.	`0.5`
`**kwargs`		Additional parameters to influence the response creation process.	`{}`

Returns:

Type	Description
`Union[ClassificationInferenceResponse, List[ClassificationInferenceResponse]]`	A response object or a list of response objects encapsulating the prediction details.

Notes

If the model is multiclass, a MultiLabelClassificationInferenceResponse is generated for each image.
If the model is not multiclass, a ClassificationInferenceResponse is generated for each image.
Predictions below the confidence threshold are filtered out.

Source code in inference/core/models/classification_base.py

def make_response(
    self,
    predictions,
    img_dims,
    confidence: float = 0.5,
    **kwargs,
) -> Union[ClassificationInferenceResponse, List[ClassificationInferenceResponse]]:
    """
    Build one response object per image from the raw model outputs.

    Args:
        predictions (list): Per-image model outputs. Only element ``0`` of each entry is read as that image's score vector.
        img_dims (list): Per-image dimension pairs, indexed in step with ``predictions``.
        confidence (float, optional): Score threshold; coerced with ``float()``. Defaults to 0.5.
        **kwargs: Accepted for interface compatibility; not read by this method.

    Returns:
        list: One response per entry of ``predictions``, in the same order.

    Notes:
        - Multi-label models (``self.multiclass`` truthy): raw scores are reported for every class in
          ``predictions``; only classes scoring strictly above the threshold appear in ``predicted_classes``.
        - Single-label models: scores go through ``self.softmax``; classes scoring below the threshold are
          dropped, the rest are sorted by rounded confidence (descending). ``top``/``confidence`` come from
          the first surviving entry, or ``""``/``0.0`` when none survive.
    """
    threshold = float(confidence)
    built = []
    for image_index, raw_output in enumerate(predictions):
        scores = raw_output[0]
        if self.multiclass:
            per_class = {}
            above_threshold = []
            for class_id, raw_score in enumerate(scores):
                label = self.class_names[class_id]
                value = float(raw_score)
                per_class[label] = {"confidence": value, "class_id": class_id}
                if value > threshold:
                    above_threshold.append(label)
            dims = img_dims[image_index]
            # NOTE(review): this branch reads dims as (width, height) while the
            # single-label branch below reads them as (height, width). One of the
            # two is presumably swapped - confirm against what the caller passes.
            item = MultiLabelClassificationInferenceResponse(
                image=InferenceResponseImage(width=dims[0], height=dims[1]),
                predicted_classes=above_threshold,
                predictions=per_class,
            )
        else:
            probabilities = self.softmax(scores)
            ranked = []
            for class_id, label in enumerate(self.class_names):
                probability = float(probabilities[class_id])
                # Written as `not <` (rather than `>=`) so NaN scores are kept,
                # exactly as a `< threshold -> skip` filter would keep them.
                if not probability < threshold:
                    ranked.append(
                        {
                            "class_id": class_id,
                            "class": label,
                            "confidence": round(probability, 4),
                        }
                    )
            ranked.sort(key=lambda entry: entry["confidence"], reverse=True)

            if ranked:
                top_class = ranked[0]["class"]
                top_confidence = ranked[0]["confidence"]
            else:
                top_class = ""
                top_confidence = 0.0

            dims = img_dims[image_index]
            item = ClassificationInferenceResponse(
                image=InferenceResponseImage(width=dims[1], height=dims[0]),
                predictions=ranked,
                top=top_class,
                confidence=top_confidence,
            )
        built.append(item)

    return built

softmax `staticmethod` ¶

softmax(x)

Compute softmax values for each set of scores in x.

Parameters:

Name	Type	Description	Default
`x`	`array`	The input array containing the scores.	required

Returns:

Type	Description
`array`	The softmax values for each set of scores.

Source code in inference/core/models/classification_base.py

@staticmethod
def softmax(x):
    """Compute softmax values for each set of scores in x.

    Args:
        x (np.array): The input array containing the scores.

    Returns:
        np.array: The softmax values for each set of scores.
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

inference.core.models.inference_models_adapters ¶

Classes¶

InferenceModelsClassificationAdapter ¶

Bases: Model

Source code in inference/core/models/inference_models_adapters.py

class InferenceModelsClassificationAdapter(Model):
    """Adapter that serves an `AutoModel`-loaded classification model through the `Model` interface.

    The wrapped model may be single-label or multi-label; `postprocess` picks
    the response builder based on the loaded model's type.
    """

    def __init__(self, model_id: str, api_key: str = None, **kwargs):
        """Resolve the model alias and load the underlying classification model.

        Args:
            model_id (str): Model identifier or alias; resolved through `resolve_roboflow_model_alias`.
            api_key (str, optional): API key used when loading the model. Falls back to `API_KEY` when falsy.
            **kwargs: Forwarded to `AutoModel.from_pretrained`. `countinference` and `service_secret`
                are additionally read to build the extra weights-provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.task_type = "classification"
        self.api_key = api_key or API_KEY

        resolved_model_id = resolve_roboflow_model_alias(model_id=model_id)
        provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Every valid backend that has not been explicitly disabled.
        enabled_backends = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: Union[ClassificationModel, MultiLabelClassificationModel] = (
            AutoModel.from_pretrained(
                model_id_or_path=resolved_model_id,
                api_key=self.api_key,
                allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
                allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
                weights_provider_extra_headers=provider_headers,
                backend=enabled_backends,
                **kwargs,
            )
        )
        self.class_names = list(self._model.class_names)

    def map_inference_kwargs(self, kwargs: dict) -> dict:
        """Add the model-facing options to ``kwargs`` and return it.

        The dictionary is modified in place: ``input_color_format`` is set to
        ``"bgr"`` and ``pre_processing_overrides`` is built from the
        ``disable_preproc_*`` flags (each defaulting to ``False``).

        Args:
            kwargs (dict): Request-level keyword arguments.

        Returns:
            dict: The same dictionary object that was passed in.
        """
        kwargs["input_color_format"] = "bgr"
        kwargs["pre_processing_overrides"] = PreProcessingOverrides(
            disable_contrast_enhancement=kwargs.get("disable_preproc_contrast", False),
            disable_grayscale=kwargs.get("disable_preproc_grayscale", False),
            disable_static_crop=kwargs.get("disable_preproc_static_crop", False),
        )
        return kwargs

    def preprocess(self, image: Any, **kwargs):
        """Load the image(s) as BGR arrays and run the model's pre-processing.

        Args:
            image (Any): A single image reference or a list of them.
            **kwargs: Request options; ``disable_preproc_auto_orient`` is passed to the image loader.

        Returns:
            tuple: ``(pre_processed, shapes)`` where ``shapes`` holds the first
            two entries of each loaded array's ``.shape``.
        """
        batch = image if isinstance(image, list) else [image]
        skip_auto_orient = kwargs.get("disable_preproc_auto_orient", False)
        decoded: List[np.ndarray] = [
            load_image_bgr(item, disable_preproc_auto_orient=skip_auto_orient)
            for item in batch
        ]
        original_shapes = [frame.shape[:2] for frame in decoded]
        model_kwargs = self.map_inference_kwargs(kwargs)
        pre_processed = self._model.pre_process(decoded, **model_kwargs)
        return pre_processed, original_shapes

    def predict(self, img_in, **kwargs):
        """Run the model's forward pass on pre-processed input."""
        return self._model.forward(img_in, **self.map_inference_kwargs(kwargs))

    def postprocess(
        self,
        predictions: Tuple[List[KeyPoints], Optional[List[Detections]]],
        returned_metadata: List[Tuple[int, int]],
        **kwargs,
    ) -> Union[
        List[MultiLabelClassificationInferenceResponse],
        List[ClassificationInferenceResponse],
    ]:
        """Post-process raw outputs and wrap them in response objects.

        Args:
            predictions: Raw output of `predict`.
                NOTE(review): the annotation mentions key points / detections, which looks
                copied from another adapter - confirm the real type.
            returned_metadata: Per-image sizes produced by `preprocess`.
            **kwargs: Request options; ``confidence`` is read for the single-label path.

        Returns:
            list: Multi-label responses when the wrapped model is a
            `MultiLabelClassificationModel`, single-label responses otherwise.
        """
        model_kwargs = self.map_inference_kwargs(kwargs)
        processed = self._model.post_process(predictions, **model_kwargs)

        if isinstance(self._model, MultiLabelClassificationModel):
            return prepare_multi_label_classification_response(
                processed,
                image_sizes=returned_metadata,
                class_names=self.class_names,
            )

        # Single-label case: the top-1 class is reported whatever its score, so
        # the threshold only controls which alternative classes are listed.
        # A non-numeric value (e.g. the strings "best"/"default") has no
        # meaningful mapping here, hence the 0.5 fallback.
        threshold = kwargs.get("confidence")
        if not isinstance(threshold, (int, float)):
            threshold = 0.5
        return prepare_classification_response(
            processed,
            image_sizes=returned_metadata,
            class_names=self.class_names,
            confidence_threshold=threshold,
        )

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """No-op placeholder for cache clearing.

        TODO: Implement this to delete the cache from the experimental model.

        Args:
            delete_from_disk (bool, optional): Whether to delete cached files from disk. Currently ignored. Defaults to True.
        """
        return None

    def infer_from_request(
        self,
        request: ClassificationInferenceRequest,
    ) -> Union[List[InferenceResponse], InferenceResponse]:
        """
        Run inference for a request and return the response(s).

        Args:
            request (ClassificationInferenceRequest): The request object encapsulating the image(s) and relevant parameters.

        Returns:
            Union[List[InferenceResponse], InferenceResponse]: A list of responses when ``request.image`` is a list, otherwise the single response.

        Notes:
            - The request fields are passed to `infer` as keyword arguments.
            - Each response gets the elapsed time since the start of this call and the request's ``id`` (``None`` when absent).
            - When ``request.visualize_predictions`` is truthy, a visualization is attached to every response.
        """
        started_at = perf_counter()
        responses = self.infer(**request.dict(), return_image_dims=True)

        request_id = getattr(request, "id", None)
        for response in responses:
            response.time = perf_counter() - started_at
            response.inference_id = request_id

        if request.visualize_predictions:
            for response in responses:
                response.visualization = draw_predictions(
                    request, response, self.class_names
                )

        return responses if isinstance(request.image, list) else responses[0]

Methods:¶

clear_cache ¶

clear_cache(delete_from_disk=True)

Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

Parameters:

Name	Type	Description	Default
`delete_from_disk`	`bool`	Whether to delete cached files from disk. Defaults to True.	`True`

Source code in inference/core/models/inference_models_adapters.py

def clear_cache(self, delete_from_disk: bool = True) -> None:
    """No-op placeholder for cache clearing.

    TODO: Implement this to delete the cache from the experimental model.

    Args:
        delete_from_disk (bool, optional): Whether to delete cached files from disk. Currently ignored. Defaults to True.
    """
    return None

infer_from_request ¶

infer_from_request(request)

Handle an inference request to produce an appropriate response.

Parameters:

Name	Type	Description	Default
`request`	`ClassificationInferenceRequest`	The request object encapsulating the image(s) and relevant parameters.	required

Returns:

Type	Description
`Union[List[InferenceResponse], InferenceResponse]`	The response object(s) containing the predictions, visualization, and other pertinent details. If a list of images was provided, a list of responses is returned. Otherwise, a single response is returned.

Notes

Starts a timer at the beginning to calculate inference time.
Processes the image(s) through the infer method.
Generates the appropriate response object(s) using make_response.
Calculates and sets the time taken for inference.
If visualization is requested, the predictions are drawn on the image.

Source code in inference/core/models/inference_models_adapters.py

def infer_from_request(
    self,
    request: ClassificationInferenceRequest,
) -> Union[List[InferenceResponse], InferenceResponse]:
    """
    Run inference for a request and return the response(s).

    Args:
        request (ClassificationInferenceRequest): The request object encapsulating the image(s) and relevant parameters.

    Returns:
        Union[List[InferenceResponse], InferenceResponse]: A list of responses when ``request.image`` is a list, otherwise the single response.

    Notes:
        - The request fields are passed to `infer` as keyword arguments.
        - Each response gets the elapsed time since the start of this call and the request's ``id`` (``None`` when absent).
        - When ``request.visualize_predictions`` is truthy, a visualization is attached to every response.
    """
    started_at = perf_counter()
    responses = self.infer(**request.dict(), return_image_dims=True)

    request_id = getattr(request, "id", None)
    for response in responses:
        response.time = perf_counter() - started_at
        response.inference_id = request_id

    if request.visualize_predictions:
        for response in responses:
            response.visualization = draw_predictions(
                request, response, self.class_names
            )

    return responses if isinstance(request.image, list) else responses[0]

InferenceModelsInstanceSegmentationAdapter ¶

Bases: Model

Source code in inference/core/models/inference_models_adapters.py

class InferenceModelsInstanceSegmentationAdapter(Model):
    def __init__(self, model_id: str, api_key: str = None, **kwargs):
        """Load the instance-segmentation model and initialise stream-pipeline state.

        Args:
            model_id (str): Model identifier or alias; resolved through `resolve_roboflow_model_alias`.
            api_key (str, optional): API key used when loading the model. Falls back to `API_KEY` when falsy.
            **kwargs: Forwarded to `AutoModel.from_pretrained`. `countinference` and `service_secret`
                are additionally read to build the extra weights-provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.task_type = "instance-segmentation"
        self.api_key = api_key or API_KEY

        resolved_model_id = resolve_roboflow_model_alias(model_id=model_id)
        provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Every valid backend that has not been explicitly disabled.
        enabled_backends = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: InstanceSegmentationModel = AutoModel.from_pretrained(
            model_id_or_path=resolved_model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=provider_headers,
            backend=enabled_backends,
            rf_detr_max_input_resolution=RFDETR_ONNX_MAX_RESOLUTION,
            **kwargs,
        )
        self.class_names = list(self._model.class_names)

        # Stream pipelining. A depth of 1 keeps the synchronous behaviour
        # (preprocess, forward and postprocess run in order for each frame).
        # A depth of 2 overlaps two stages: while the GPU works on the current
        # frame, the CPU prepares/submits the next one and then collects the
        # previous response. Only models that explicitly support the deferred
        # GPU hand-off may use it; any other backend is held at depth 1 even
        # when RFDETR_PIPELINE_DEPTH asks for more.
        self._pipeline_depth = self._resolve_pipeline_depth()
        self._response_delay = max(1, self._pipeline_depth - 1)

        # In-flight futures plus their metadata, kept per adapter. These queues
        # are not thread-safe: the InferencePipeline is single-producer and a
        # single worker owns the adapter.
        self._pending_gpu_submissions: Deque[
            Tuple[InferenceFuture, PreprocessingMetadata, dict]
        ] = deque()
        self._pending_futures: Deque[
            Tuple[InferenceFuture, PreprocessingMetadata, dict]
        ] = deque()
        self._gpu_submit_generation = 0

        # The response-building executor is created on first use.
        self._response_executor: Optional[ThreadPoolExecutor] = None
        self._response_executor_finalizer = None
        self._response_futures: Deque[
            Tuple[
                Future[List[InstanceSegmentationInferenceResponse]],
                Optional[str],
            ]
        ] = deque()

    def _resolve_pipeline_depth(self) -> int:
        """Return the pipeline depth this adapter will actually use.

        The configured depth is capped at `MAX_RFDETR_PIPELINE_DEPTH`. A capped
        value of 1 or less is returned as-is; a larger value is returned only
        when the loaded model reports stream-pipeline support, otherwise 1.
        """
        capped_depth = min(get_rfdetr_pipeline_depth(), MAX_RFDETR_PIPELINE_DEPTH)
        depth_allowed = capped_depth <= 1 or self._model_supports_stream_pipeline()
        return capped_depth if depth_allowed else 1

    def _model_supports_stream_pipeline(self) -> bool:
        supports_stream_pipeline = getattr(
            self._model, "supports_stream_pipeline", False
        )
        if callable(supports_stream_pipeline):
            return bool(supports_stream_pipeline())
        return bool(supports_stream_pipeline)

    def map_inference_kwargs(self, kwargs: dict) -> dict:
        """Translate request options into model-facing options, in place.

        Effects on ``kwargs``:
            - ``input_color_format`` is set to ``"bgr"``.
            - ``pre_processing_overrides`` is built from the ``disable_preproc_*`` flags.
            - ``mask_format`` is set to ``"rle"`` when the model supports RLE masks and
              dense masks are not enforced. Enforcement comes from
              ``enforce_dense_masks_in_inference_models`` and is always off when
              `GCP_SERVERLESS` is truthy.
            - The `STREAM_PIPELINE_CONTEXT_ID_KWARG` entry is removed if present.

        Args:
            kwargs (dict): Request-level keyword arguments.

        Returns:
            dict: The same dictionary object that was passed in.
        """
        kwargs["input_color_format"] = "bgr"
        overrides = PreProcessingOverrides(
            disable_contrast_enhancement=kwargs.get("disable_preproc_contrast", False),
            disable_grayscale=kwargs.get("disable_preproc_grayscale", False),
            disable_static_crop=kwargs.get("disable_preproc_static_crop", False),
        )
        enforce_dense = (
            False
            if GCP_SERVERLESS
            else kwargs.get("enforce_dense_masks_in_inference_models", False)
        )
        kwargs["pre_processing_overrides"] = overrides
        if "rle" in self._model.supported_mask_formats:
            if not enforce_dense:
                kwargs["mask_format"] = "rle"
        # The context id is adapter-side bookkeeping and is not passed on.
        kwargs.pop(STREAM_PIPELINE_CONTEXT_ID_KWARG, None)
        return kwargs

    def preprocess(self, image: Any, **kwargs):
        """Load the image(s) as BGR arrays and run the model's pre-processing.

        Args:
            image (Any): A single image reference or a list of them.
            **kwargs: Request options; ``disable_preproc_auto_orient`` is passed to the image loader.

        Returns:
            Whatever ``self._model.pre_process`` returns for the loaded batch.
        """
        batch = image if isinstance(image, list) else [image]
        skip_auto_orient = kwargs.get("disable_preproc_auto_orient", False)
        decoded: List[np.ndarray] = [
            load_image_bgr(item, disable_preproc_auto_orient=skip_auto_orient)
            for item in batch
        ]
        return self._model.pre_process(decoded, **self.map_inference_kwargs(kwargs))

    def _request_batch_size(self, img_in: Any) -> int:
        pre_processing_meta = getattr(img_in, "_pre_processing_meta", None)
        if isinstance(pre_processing_meta, (list, tuple)):
            return len(pre_processing_meta)
        shape = getattr(img_in, "shape", None)
        if shape is not None and len(shape) > 0:
            return int(shape[0])
        if isinstance(img_in, (list, tuple)):
            return len(img_in)
        return 1

    def predict(self, img_in, **kwargs):
        """Run the forward pass, synchronously or through the stream pipeline.

        Args:
            img_in: Pre-processed model input, as returned by `preprocess`.
            **kwargs: Request options. They are mapped through `map_inference_kwargs`
                before reaching the model; ``response_mask_format`` and the
                `STREAM_PIPELINE_CONTEXT_ID_KWARG` entry are also read here.

        Returns:
            The result of ``self._model.forward`` when the pipeline depth is 1 or
            less, or when the input holds more than one image. Otherwise the
            future returned by ``self._model.forward_async``.
        """
        if self._pipeline_depth <= 1 or self._request_batch_size(img_in) > 1:
            # Synchronous path: forward on the current input, nothing deferred.
            return self._model.forward(img_in, **self.map_inference_kwargs(kwargs))

        # Read the context id BEFORE mapping: map_inference_kwargs() edits
        # `kwargs` in place and removes this key, so reading it afterwards
        # would always yield None and the id would never reach the future.
        stream_pipeline_context_id = kwargs.get(STREAM_PIPELINE_CONTEXT_ID_KWARG)
        if not isinstance(stream_pipeline_context_id, str):
            stream_pipeline_context_id = None

        mapped_kwargs = self.map_inference_kwargs(kwargs)
        mapped_kwargs["defer_count_to_adapter"] = (
            kwargs.get("response_mask_format") != "rle"
        )
        mapped_kwargs["defer_postprocess_sync"] = True
        mapped_kwargs["reuse_trt_graph_outputs"] = True
        # Pipelined path: before launching this frame's forward, enqueue the
        # oldest frame whose postprocess metadata is already known. That keeps
        # postprocess off the current frame's postprocess() host path while
        # still preserving the correctness dependency for reused TRT outputs.
        self._submit_next_pending_gpu_work()
        pre_processing_meta = getattr(img_in, "_pre_processing_meta", None)
        fut = self._model.forward_async(img_in, pre_processing_meta, **mapped_kwargs)
        attach_adapter_mapped_kwargs(
            fut,
            mapped_kwargs,
            stream_pipeline_context_id=stream_pipeline_context_id,
        )
        if pre_processing_meta is not None:
            # Metadata travelled with the input, so GPU work can be submitted now.
            self._submit_future_gpu_work(fut, pre_processing_meta, mapped_kwargs)
        self._submit_ready_responses()
        return fut

    def flush(self) -> List[InstanceSegmentationInferenceResponse]:
        """Drain everything still queued in the stream pipeline.

        Submits all pending GPU work and all pending response builds, then
        resolves every queued response future in order and concatenates the
        results. With a pipeline depth of 1 or less there is nothing queued and
        an empty list is returned.

        Callers running with `RFDETR_PIPELINE_DEPTH>=2` MUST call this at stream
        end, otherwise the final frames will be dropped.
        """
        if self._pipeline_depth <= 1:
            return []

        self._submit_all_pending_gpu_work()
        self._submit_all_pending_responses()

        drained: List[InstanceSegmentationInferenceResponse] = []
        while self._response_futures:
            # The queued context id is not needed when draining.
            queued_future, _context_id = self._response_futures.popleft()
            resolved = _resolve_response_future(
                future=queued_future,
                context="RF-DETR stream pipeline flush",
            )
            drained.extend(resolved)
        return drained

    def shutdown_pipeline(self) -> None:
        """Shut down the response executor, if one was created.

        A still-alive finalizer is detached first so it does not fire later.
        The executor is then shut down without waiting, and both the executor
        and finalizer references are cleared.
        """
        executor = self._response_executor
        if executor is None:
            return None

        registered_finalizer = self._response_executor_finalizer
        if registered_finalizer is not None and registered_finalizer.alive:
            registered_finalizer.detach()

        executor.shutdown(wait=False)
        self._response_executor = self._response_executor_finalizer = None
        return None

    def _get_response_executor(self) -> ThreadPoolExecutor:
        if self._response_executor is None:
            self._response_executor = ThreadPoolExecutor(max_workers=1)
            self._response_executor_finalizer = finalize(
                self,
                self._response_executor.shutdown,
                wait=False,
            )
        return self._response_executor

    def _submit_future_gpu_work(
        self,
        fut: InferenceFuture,
        meta: PreprocessingMetadata,
        mapped_kwargs: dict,
    ) -> None:
        """Submit a future's GPU work once, and stamp it with a generation number.

        Does nothing when the future is already marked as submitted. Otherwise
        the metadata and mapped kwargs are stored on the future. If the future
        exposes a callable ``submit_gpu_work``, it is called with ``meta``, the
        adapter's submit counter is incremented, and the future is marked with
        the new counter value.

        Args:
            fut: The in-flight inference future.
            meta: Pre-processing metadata for the frame the future belongs to.
            mapped_kwargs: Model-facing keyword arguments for that frame.
        """
        if adapter_gpu_work_submitted(fut):
            return None

        fut._meta = meta  # type: ignore[attr-defined]
        fut._kwargs = mapped_kwargs  # type: ignore[attr-defined]

        submit = getattr(fut, "submit_gpu_work", None)
        if not callable(submit):
            # Nothing to submit; the future stays unmarked.
            return None

        submit(meta)
        generation = getattr(self, "_gpu_submit_generation", 0) + 1
        self._gpu_submit_generation = generation
        mark_adapter_gpu_work_submitted(fut, generation)
        return None

    def _submit_next_pending_gpu_work(self) -> None:
        if not self._pending_gpu_submissions:
            return None
        self._submit_future_gpu_work(*self._pending_gpu_submissions.popleft())

    def _submit_all_pending_gpu_work(self) -> None:
        while self._pending_gpu_submissions:
            self._submit_future_gpu_work(*self._pending_gpu_submissions.popleft())

    def _submit_response_build(
        self,
        fut: InferenceFuture,
        meta: PreprocessingMetadata,
        mapped_kwargs: dict,
    ) -> None:
        """Queue response construction for a future on the response executor.

        The metadata and mapped kwargs are stored on the future, then
        `_finalize_future` is scheduled on the executor. The resulting
        executor future is appended to ``self._response_futures`` together
        with the stream-pipeline context id recorded on ``fut``.

        Args:
            fut: The in-flight inference future.
            meta: Pre-processing metadata for the frame the future belongs to.
            mapped_kwargs: Model-facing keyword arguments for that frame.
        """
        fut._meta = meta  # type: ignore[attr-defined]
        fut._kwargs = mapped_kwargs  # type: ignore[attr-defined]

        executor = self._get_response_executor()
        build_job = executor.submit(self._finalize_future, fut, meta, mapped_kwargs)
        job_context_id = get_adapter_stream_pipeline_context_id(fut)
        self._response_futures.append((build_job, job_context_id))

    def _submit_ready_responses(self) -> None:
        while self._pending_futures:
            fut, meta, mapped_kwargs = self._pending_futures[0]
            submit_generation = get_adapter_gpu_submit_generation(fut)
            if submit_generation is None:
                self._submit_future_gpu_work(fut, meta, mapped_kwargs)
                submit_generation = get_adapter_gpu_submit_generation(fut)
            if submit_generation is None:
                break
            gpu_submit_generation = getattr(self, "_gpu_submit_generation", 0)
            if gpu_submit_generation < submit_generation + self._response_delay:
                break
            self._submit_response_build(*self._pending_futures.popleft())

    def _submit_all_pending_responses(self) -> None:
        while self._pending_futures:
            self._submit_response_build(*self._pending_futures.popleft())

    def postprocess(
        self,
        predictions,
        preprocess_return_metadata: PreprocessingMetadata,
        **kwargs,
    ) -> List[InstanceSegmentationInferenceResponse]:
        """Turn `predict` output into responses, synchronously or via the pipeline.

        Args:
            predictions: Output of `predict`: raw model output, or an
                `InferenceFuture` when the pipelined path was taken.
            preprocess_return_metadata: Pre-processing metadata for the current frame(s).
            **kwargs: Request options; ``source`` is read to detect workflow execution.

        Returns:
            List[InstanceSegmentationInferenceResponse]:
                - Synchronous path (depth of 1 or less, or ``predictions`` is not a
                  future): the result of `_postprocess_sync`.
                - Pipelined path with no queued response yet: empty placeholder
                  responses sized from the current metadata.
                - Pipelined path, workflow execution: placeholder responses, the
                  first of which carries the oldest queued response future.
                - Pipelined path otherwise: the resolved oldest queued response.
        """
        if self._pipeline_depth <= 1 or not isinstance(predictions, InferenceFuture):
            return self._postprocess_sync(
                predictions, preprocess_return_metadata, **kwargs
            )

        fut: InferenceFuture = predictions
        queue_entry = (
            fut,
            preprocess_return_metadata,
            get_adapter_mapped_kwargs(fut),
        )
        self._pending_gpu_submissions.append(queue_entry)
        self._pending_futures.append(queue_entry)

        # Only start pushing work forward once more frames are pending than
        # the configured response delay.
        if len(self._pending_futures) > self._response_delay:
            self._submit_next_pending_gpu_work()
            self._submit_ready_responses()

        for_workflow = kwargs.get("source") == "workflow-execution"

        if not self._response_futures:
            return self._empty_responses_for_metadata(
                preprocess_return_metadata=preprocess_return_metadata,
                workflow_execution=for_workflow,
            )

        response_future, context_id = self._response_futures.popleft()
        if not for_workflow:
            return _resolve_response_future(
                future=response_future,
                context="RF-DETR stream pipeline response finalization",
            )

        placeholders = self._empty_responses_for_metadata(
            preprocess_return_metadata=preprocess_return_metadata,
            workflow_execution=True,
        )
        if placeholders:
            attach_async_response_future(
                response=placeholders[0],
                response_future=response_future,
                context_id=context_id,
            )
        return placeholders

    def _empty_responses_for_metadata(
        self,
        preprocess_return_metadata: PreprocessingMetadata,
        workflow_execution: bool,
    ) -> List[InstanceSegmentationInferenceResponse]:
        """Build one prediction-free response per metadata entry.

        Args:
            preprocess_return_metadata: Iterable of metadata entries; each entry's
                ``original_size`` supplies the response image width and height.
            workflow_execution: When truthy, the ``*DC`` response and image classes
                are used instead of the regular ones.

        Returns:
            list: Responses with empty ``predictions``, in metadata order.
        """
        if workflow_execution:
            response_cls = InstanceSegmentationInferenceResponseDC
            image_cls = InferenceResponseImageDC
        else:
            response_cls = InstanceSegmentationInferenceResponse
            image_cls = InferenceResponseImage

        return [
            response_cls(
                predictions=[],
                image=image_cls(
                    width=entry.original_size.width,
                    height=entry.original_size.height,
                ),
            )
            for entry in preprocess_return_metadata
        ]

    def _finalize_future(
        self,
        fut: InferenceFuture,
        preprocess_return_metadata: PreprocessingMetadata,
        mapped_kwargs: dict,
    ) -> List[InstanceSegmentationInferenceResponse]:
        """Resolve a future and build the responses for its frame.

        Args:
            fut: The in-flight inference future.
            preprocess_return_metadata: Metadata of the frame the future belongs to.
            mapped_kwargs: Model-facing keyword arguments for that frame.

        Returns:
            List[InstanceSegmentationInferenceResponse]: Responses built from ``fut.result()``.
        """
        # Re-stamp the future with this frame's metadata and kwargs before
        # resolving it, so the result is computed against the right frame.
        # NOTE(review): this writes to private attributes of the future and
        # presumably relies on how the future caches its post-processing -
        # confirm against the future implementation.
        fut._meta = preprocess_return_metadata  # type: ignore[attr-defined]
        fut._kwargs = mapped_kwargs  # type: ignore[attr-defined]

        resolved_detections = fut.result()
        return self._build_responses_from_detections(
            resolved_detections, preprocess_return_metadata, **mapped_kwargs
        )

    def _postprocess_sync(
        self,
        predictions: List[InstanceDetections],
        preprocess_return_metadata: PreprocessingMetadata,
        **kwargs,
    ) -> List[InstanceSegmentationInferenceResponse]:
        """Post-process raw model output immediately and build the responses.

        Args:
            predictions: Raw forward-pass output.
            preprocess_return_metadata: Pre-processing metadata for the frame(s).
            **kwargs: Request options; ``response_mask_format`` decides whether
                counting is deferred to the adapter (deferred unless it is ``"rle"``).

        Returns:
            List[InstanceSegmentationInferenceResponse]: One response per frame.
        """
        # Read the mask format before the options are mapped.
        wants_rle = kwargs.get("response_mask_format") == "rle"

        model_kwargs = self.map_inference_kwargs(kwargs)
        model_kwargs["defer_count_to_adapter"] = not wants_rle

        post_processed = self._model.post_process(
            predictions, preprocess_return_metadata, **model_kwargs
        )
        return self._build_responses_from_detections(
            post_processed, preprocess_return_metadata, **kwargs
        )

    def _build_responses_from_detections(
        self,
        detections_list: List[InstanceDetections],
        preprocess_return_metadata: PreprocessingMetadata,
        **kwargs,
    ) -> List[InstanceSegmentationInferenceResponse]:
        return_in_rle = kwargs.get("response_mask_format") == "rle"
        # Workflow callers consume a plain dict via `_is_response_dc_to_dict`;
        # dataclasses avoid pydantic validation + `model_dump` overhead per
        # frame. Keep the pydantic path for RLE responses and for non-workflow
        # callers that rely on the response model type.
        use_dc = (
            kwargs.get("source") == "workflow-execution"
            and not return_in_rle
            and getattr(self, "_pipeline_depth", 1) > 1
        )

        responses: List[InstanceSegmentationInferenceResponse] = []
        for preproc_metadata, det in zip(preprocess_return_metadata, detections_list):
            finalize_pending = get_deferred_postprocess_finalizer(det)
            if callable(finalize_pending):
                det = finalize_pending()
            H = preproc_metadata.original_size.height
            W = preproc_metadata.original_size.width

            combined_gpu = getattr(det, "_combined_gpu", None)
            mask_gpu = getattr(det, "_mask_gpu", None)
            mask_packed_gpu = getattr(det, "_mask_packed_gpu", None)
            mask_cpu = getattr(det, "_mask_cpu", None)
            defer_count_to_adapter = getattr(det, "_defer_count_to_adapter", False)
            done_event = get_deferred_postprocess_done_event(det)
            dense_mask_cuda = isinstance(mask_gpu, torch.Tensor) and mask_gpu.is_cuda
            packed_mask_cuda = (
                isinstance(mask_packed_gpu, torch.Tensor) and mask_packed_gpu.is_cuda
            )
            if (
                not return_in_rle
                and done_event is not None
                and (dense_mask_cuda or packed_mask_cuda)
            ):
                device = mask_gpu.device if dense_mask_cuda else mask_packed_gpu.device
                stream = torch.cuda.current_stream(device)
                done_event.wait(stream)

                if (
                    defer_count_to_adapter
                    and isinstance(combined_gpu, torch.Tensor)
                    and combined_gpu.is_cuda
                ):
                    # combined_np / class_column / combined_slice are scratch views;
                    # use only for survivor counting and in-loop scalar extraction.
                    combined_host = get_pinned_buffer(
                        "combined_full",
                        tuple(combined_gpu.shape),
                        combined_gpu.dtype,
                    )
                    combined_host.copy_(combined_gpu, non_blocking=True)
                    stream.synchronize()
                    combined_np = combined_host.numpy()
                    class_column = combined_np[:, 5]
                    inactive_indices = np.flatnonzero(class_column < 0)
                    n_survivors = (
                        int(inactive_indices[0])
                        if inactive_indices.size > 0
                        else int(class_column.shape[0])
                    )
                    if n_survivors == 0:
                        xyxy = np.empty((0, 4), dtype=np.int32)
                        confs = np.empty((0,), dtype=np.float32)
                        class_ids = np.empty((0,), dtype=np.int32)
                        polys_or_rles = []
                    else:
                        combined_slice = combined_np[:n_survivors]
                        xyxy = combined_slice[:, :4]
                        confs = combined_slice[:, 4].view(np.float32)
                        class_ids = combined_slice[:, 5]
                        if packed_mask_cuda:
                            packed_slice = mask_packed_gpu[:n_survivors]
                            packed_host = get_pinned_buffer(
                                "mask_packed",
                                tuple(packed_slice.shape),
                                packed_slice.dtype,
                            )
                            packed_host.copy_(packed_slice, non_blocking=True)
                            stream.synchronize()
                            polys_or_rles = bitpacked_masks2poly(
                                packed_host.numpy(), width=W
                            )
                        else:
                            mask_slice = mask_gpu[:n_survivors]
                            mask_host = get_pinned_buffer(
                                "mask", tuple(mask_slice.shape), mask_slice.dtype
                            )
                            mask_host.copy_(mask_slice, non_blocking=True)
                            stream.synchronize()
                            polys_or_rles = masks2poly(mask_host.numpy())
                else:
                    n_survivors = int(det.xyxy.shape[0])
                    if n_survivors == 0:
                        xyxy = np.empty((0, 4), dtype=np.int32)
                        confs = np.empty((0,), dtype=np.float32)
                        class_ids = np.empty((0,), dtype=np.int32)
                        polys_or_rles = []
                    else:
                        mask_slice = mask_gpu[:n_survivors]
                        mask_host = get_pinned_buffer(
                            "mask", tuple(mask_slice.shape), mask_slice.dtype
                        )
                        if (
                            isinstance(combined_gpu, torch.Tensor)
                            and combined_gpu.is_cuda
                            and tuple(combined_gpu.shape)
                            == (n_survivors, det.xyxy.shape[1] + 2)
                        ):
                            combined_slice = combined_gpu[:n_survivors]
                            combined_host = get_pinned_buffer(
                                "combined",
                                tuple(combined_slice.shape),
                                combined_slice.dtype,
                            )
                            combined_host.copy_(combined_slice, non_blocking=True)
                            mask_host.copy_(mask_slice, non_blocking=True)
                            stream.synchronize()
                            combined_np = combined_host.numpy()
                            xyxy = combined_np[:, :4]
                            confs = combined_np[:, 4].view(np.float32)
                            class_ids = combined_np[:, 5]
                            polys_or_rles = masks2poly(mask_host.numpy())
                        else:
                            xyxy_host = get_pinned_buffer(
                                "xyxy", tuple(det.xyxy.shape), det.xyxy.dtype
                            )
                            conf_host = get_pinned_buffer(
                                "conf",
                                tuple(det.confidence.shape),
                                det.confidence.dtype,
                            )
                            class_host = get_pinned_buffer(
                                "class_id",
                                tuple(det.class_id.shape),
                                det.class_id.dtype,
                            )
                            xyxy_host.copy_(det.xyxy, non_blocking=True)
                            conf_host.copy_(det.confidence, non_blocking=True)
                            class_host.copy_(det.class_id, non_blocking=True)
                            mask_host.copy_(mask_slice, non_blocking=True)
                            stream.synchronize()
                            xyxy = xyxy_host.numpy()
                            confs = conf_host.numpy()
                            class_ids = class_host.numpy()
                            polys_or_rles = masks2poly(mask_host.numpy())
            elif not return_in_rle and isinstance(mask_cpu, np.ndarray):
                xyxy = det.xyxy.detach().cpu().numpy()
                confs = det.confidence.detach().cpu().numpy()
                class_ids = det.class_id.detach().cpu().numpy()
                polys_or_rles = masks2poly(mask_cpu)
            else:
                xyxy = det.xyxy.detach().cpu().numpy()
                confs = det.confidence.detach().cpu().numpy()
                if isinstance(det.mask, torch.Tensor):
                    masks = det.mask.detach().cpu().numpy()
                    if return_in_rle:
                        polys_or_rles = [
                            torch_mask_to_coco_rle(mask=mask) for mask in masks
                        ]
                    else:
                        polys_or_rles = masks2poly(masks)
                else:
                    if return_in_rle:
                        polys_or_rles = det.mask.to_coco_rle_masks()
                    else:
                        polys_or_rles = rle_masks2poly(det.mask)
                class_ids = det.class_id.detach().cpu().numpy()

            # Some branches above intentionally keep numpy views into
            # thread-local pinned scratch buffers. Only scalar values and
            # polygon/RLE lists may be stored on responses below; do not return
            # those arrays or any view derived from them.
            predictions: List[
                Union[InstanceSegmentationPrediction, InstanceSegmentationRLEPrediction]
            ] = []

            for (x1, y1, x2, y2), mask_as_poly_or_rle, conf, class_id in zip(
                xyxy, polys_or_rles, confs, class_ids
            ):
                cx = (float(x1) + float(x2)) / 2.0
                cy = (float(y1) + float(y2)) / 2.0
                w = float(x2) - float(x1)
                h = float(y2) - float(y1)
                class_id_int = int(class_id)
                class_name = (
                    self.class_names[class_id_int]
                    if 0 <= class_id_int < len(self.class_names)
                    else str(class_id_int)
                )
                if (
                    kwargs.get("class_filter")
                    and class_name not in kwargs["class_filter"]
                ):
                    continue
                if use_dc:
                    predictions.append(
                        InstanceSegmentationPredictionDC(
                            x=cx,
                            y=cy,
                            width=w,
                            height=h,
                            confidence=float(conf),
                            class_name=class_name,
                            class_id=class_id_int,
                            points=[
                                PointDC(x=float(point[0]), y=float(point[1]))
                                for point in mask_as_poly_or_rle
                            ],
                        )
                    )
                else:
                    if not return_in_rle:
                        predictions.append(
                            InstanceSegmentationPrediction(
                                x=cx,
                                y=cy,
                                width=w,
                                height=h,
                                confidence=float(conf),
                                points=[
                                    Point(x=point[0], y=point[1])
                                    for point in mask_as_poly_or_rle
                                ],
                                **{"class": class_name},
                                class_id=class_id_int,
                            )
                        )
                    else:
                        if isinstance(mask_as_poly_or_rle["counts"], bytes):
                            mask_as_poly_or_rle["counts"] = mask_as_poly_or_rle[
                                "counts"
                            ].decode("ascii")
                        predictions.append(
                            InstanceSegmentationRLEPrediction(
                                x=cx,
                                y=cy,
                                width=w,
                                height=h,
                                confidence=float(conf),
                                rle=mask_as_poly_or_rle,
                                **{"class": class_name},
                                class_id=class_id_int,
                            )
                        )

            if use_dc:
                responses.append(
                    InstanceSegmentationInferenceResponseDC(
                        predictions=predictions,
                        image=InferenceResponseImageDC(width=W, height=H),
                    )
                )
            else:
                responses.append(
                    InstanceSegmentationInferenceResponse(
                        predictions=predictions,
                        image=InferenceResponseImage(width=W, height=H),
                    )
                )
        return responses

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """No-op placeholder for cache clearing.

        TODO: Implement this to delete the cache from the experimental model.

        Args:
            delete_from_disk (bool, optional): Whether cached files should also be
                deleted from disk. Currently ignored. Defaults to True.
        """
        # Nothing is cleared yet; the argument is accepted only for interface parity.
        return None

    def draw_predictions(
        self,
        inference_request: InferenceRequest,
        inference_response: InferenceResponse,
    ) -> bytes:
        """Draw predictions from an inference response onto the image of the request.

        Args:
            inference_request (InferenceRequest): The inference request containing the
                image on which to draw predictions.
            inference_response (InferenceResponse): The inference response containing
                the predictions to be drawn.

        Returns:
            bytes: The value produced by `draw_detection_predictions` (described
                upstream as a base64 encoded image).
        """
        # One colour per class index; the palette is reused cyclically when the
        # model has more classes than the palette has entries.
        class_id_2_color = {
            class_index: DEFAULT_COLOR_PALETTE[
                class_index % len(DEFAULT_COLOR_PALETTE)
            ]
            for class_index, _ in enumerate(self._model.class_names)
        }
        return draw_detection_predictions(
            inference_request=inference_request,
            inference_response=inference_response,
            colors=class_id_2_color,
        )

Methods:¶

clear_cache ¶

clear_cache(delete_from_disk=True)

Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

Parameters:

Name	Type	Description	Default
`delete_from_disk`	`bool`	Whether to delete cached files from disk. Defaults to True.	`True`

Source code in inference/core/models/inference_models_adapters.py

def clear_cache(self, delete_from_disk: bool = True) -> None:
    """No-op placeholder for cache clearing.

    TODO: Implement this to delete the cache from the experimental model.

    Args:
        delete_from_disk (bool, optional): Whether cached files should also be
            deleted from disk. Currently ignored. Defaults to True.
    """
    # Nothing is cleared yet; the argument is accepted only for interface parity.
    return None

draw_predictions ¶

draw_predictions(inference_request, inference_response)

Draw predictions from an inference response onto the original image provided by an inference request

Parameters:

Name	Type	Description	Default
`inference_request`	`ObjectDetectionInferenceRequest`	The inference request containing the image on which to draw predictions	required
`inference_response`	`ObjectDetectionInferenceResponse`	The inference response containing predictions to be drawn	required

Returns:

Name	Type	Description
`str`	`bytes`	A base64 encoded image string

Source code in inference/core/models/inference_models_adapters.py

def draw_predictions(
    self,
    inference_request: InferenceRequest,
    inference_response: InferenceResponse,
) -> bytes:
    """Draw predictions from an inference response onto the image of the request.

    Args:
        inference_request (InferenceRequest): The inference request containing the
            image on which to draw predictions.
        inference_response (InferenceResponse): The inference response containing
            the predictions to be drawn.

    Returns:
        bytes: The value produced by `draw_detection_predictions` (described
            upstream as a base64 encoded image).
    """
    # One colour per class index; the palette is reused cyclically when the
    # model has more classes than the palette has entries.
    class_id_2_color = {
        class_index: DEFAULT_COLOR_PALETTE[class_index % len(DEFAULT_COLOR_PALETTE)]
        for class_index, _ in enumerate(self._model.class_names)
    }
    return draw_detection_predictions(
        inference_request=inference_request,
        inference_response=inference_response,
        colors=class_id_2_color,
    )

flush ¶

flush()

Drain the tail of the pipelined queue.

Returns responses for any in-flight frames whose forward/postprocess GPU work was submitted but whose CPU-visible response has not yet been materialized. Callers that use RFDETR_PIPELINE_DEPTH>=2 MUST invoke this at stream end or the final frames will be dropped.

Source code in inference/core/models/inference_models_adapters.py

def flush(self) -> List[InstanceSegmentationInferenceResponse]:
    """Drain every response still in flight in the pipelined queue.

    Frames whose forward/postprocess GPU work has been submitted, but whose
    CPU-visible response has not been materialized yet, are resolved and
    returned here. Callers running with `RFDETR_PIPELINE_DEPTH>=2` MUST call
    this at stream end, otherwise the final frames are dropped.

    Returns:
        The responses of all remaining in-flight frames, in queue order; an
        empty list when the pipeline depth is 1 or lower.
    """
    drained: List[InstanceSegmentationInferenceResponse] = []
    if self._pipeline_depth <= 1:
        # Nothing is ever queued without pipelining.
        return drained

    # Push any work that is still pending into the queue before draining it.
    self._submit_all_pending_gpu_work()
    self._submit_all_pending_responses()

    while self._response_futures:
        pending_future, _ignored = self._response_futures.popleft()
        resolved = _resolve_response_future(
            future=pending_future,
            context="RF-DETR stream pipeline flush",
        )
        drained.extend(resolved)
    return drained

InferenceModelsKeyPointsDetectionAdapter ¶

Bases: Model

Source code in inference/core/models/inference_models_adapters.py

class InferenceModelsKeyPointsDetectionAdapter(Model):
    """Adapter exposing an `inference-models` keypoint-detection model as a `Model`.

    The wrapped model is loaded with `AutoModel.from_pretrained` and the
    preprocess / predict / postprocess stages delegate to its `pre_process`,
    `forward` and `post_process` methods. `postprocess` converts the model
    output into `KeypointsDetectionInferenceResponse` objects.
    """

    def __init__(self, model_id: str, api_key: Optional[str] = None, **kwargs):
        """Resolve the model alias and load the underlying model.

        Args:
            model_id (str): Model identifier or alias; resolved through
                `resolve_roboflow_model_alias` before loading.
            api_key (Optional[str]): API key used to fetch the model. Falls back to
                the module-level `API_KEY` when empty or not given.
            **kwargs: Forwarded to `AutoModel.from_pretrained`. `countinference`
                and `service_secret` are additionally read to build extra
                weights-provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}

        self.api_key = api_key if api_key else API_KEY
        model_id = resolve_roboflow_model_alias(model_id=model_id)

        self.task_type = "keypoint-detection"

        extra_weights_provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Allow every valid backend that has not been explicitly disabled.
        backend = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: KeyPointsDetectionModel = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=extra_weights_provider_headers,
            backend=backend,
            **kwargs,
        )
        self.class_names = list(self._model.class_names)

    def map_inference_kwargs(self, kwargs: dict) -> dict:
        """Translate request-level kwargs into the kwargs the wrapped model expects.

        The dictionary is modified in place and the same object is returned.

        Args:
            kwargs (dict): Keyword arguments received by a pipeline stage.

        Returns:
            dict: The same dictionary with `input_color_format`,
                `pre_processing_overrides` and, when a `request` entry is present,
                `key_points_threshold` set.
        """
        kwargs["input_color_format"] = "bgr"
        if "request" in kwargs:
            keypoint_confidence_threshold = kwargs["request"].keypoint_confidence
            kwargs["key_points_threshold"] = keypoint_confidence_threshold
        pre_processing_overrides = PreProcessingOverrides(
            disable_contrast_enhancement=kwargs.get("disable_preproc_contrast", False),
            disable_grayscale=kwargs.get("disable_preproc_grayscale", False),
            disable_static_crop=kwargs.get("disable_preproc_static_crop", False),
        )
        kwargs["pre_processing_overrides"] = pre_processing_overrides
        return kwargs

    def preprocess(self, image: Any, **kwargs):
        """Load the input image(s) as BGR arrays and run the model's pre-processing.

        Args:
            image (Any): A single image or a list of images, in any form accepted
                by `load_image_bgr`. A single image is treated as a batch of one.
            **kwargs: Stage kwargs; `disable_preproc_auto_orient` is passed to the
                image loader, the rest is mapped via `map_inference_kwargs`.

        Returns:
            Whatever `self._model.pre_process` returns.
        """
        is_batch = isinstance(image, list)
        images = image if is_batch else [image]
        np_images: List[np.ndarray] = [
            load_image_bgr(
                v,
                disable_preproc_auto_orient=kwargs.get(
                    "disable_preproc_auto_orient", False
                ),
            )
            for v in images
        ]
        mapped_kwargs = self.map_inference_kwargs(kwargs)
        return self._model.pre_process(np_images, **mapped_kwargs)

    def predict(self, img_in, **kwargs):
        """Run the wrapped model's forward pass on pre-processed input.

        Args:
            img_in: Pre-processed input as produced by `preprocess`.
            **kwargs: Stage kwargs, mapped via `map_inference_kwargs`.

        Returns:
            Whatever `self._model.forward` returns.
        """
        mapped_kwargs = self.map_inference_kwargs(kwargs)
        return self._model.forward(img_in, **mapped_kwargs)

    def postprocess(
        self,
        predictions: Tuple[List[KeyPoints], Optional[List[Detections]]],
        preprocess_return_metadata: PreprocessingMetadata,
        **kwargs,
    ) -> List[KeypointsDetectionInferenceResponse]:
        """Convert raw model output into keypoint-detection responses.

        Args:
            predictions: Raw output of `predict`, handed to the model's
                `post_process`.
            preprocess_return_metadata: Pre-processing metadata; iterated in
                lockstep with the post-processed results, one entry per image.
            **kwargs: Stage kwargs. `class_filter`, when truthy, restricts the
                returned predictions to the listed class names.

        Returns:
            List[KeypointsDetectionInferenceResponse]: One response per image.

        Raises:
            RuntimeError: If the model's post-processing yields no detections list.
        """
        mapped_kwargs = self.map_inference_kwargs(kwargs)
        keypoints_list, detections_list = self._model.post_process(
            predictions, preprocess_return_metadata, **mapped_kwargs
        )
        if detections_list is None:
            raise RuntimeError(
                "Keypoints detection model does not provide instances detection - this is not supported for "
                "models from `inference-models` package which are adapted to work with `inference`."
            )
        key_points_classes = self._model.key_points_classes
        # Looked up once; it does not change while responses are being built.
        class_filter = kwargs.get("class_filter")
        responses: List[KeypointsDetectionInferenceResponse] = []
        for preproc_metadata, keypoints, det in zip(
            preprocess_return_metadata, keypoints_list, detections_list
        ):

            H = preproc_metadata.original_size.height
            W = preproc_metadata.original_size.width

            xyxy = det.xyxy.detach().cpu().numpy()
            confs = det.confidence.detach().cpu().numpy()
            class_ids = det.class_id.detach().cpu().numpy()
            keypoints_xy = keypoints.xy.detach().cpu().tolist()
            keypoints_class_id = keypoints.class_id.detach().cpu().tolist()
            keypoints_confidence = keypoints.confidence.detach().cpu().tolist()
            # Named distinctly so the `predictions` argument is not rebound.
            image_predictions: List[KeypointsPrediction] = []

            for (
                (x1, y1, x2, y2),
                conf,
                class_id,
                instance_keypoints_xy,
                instance_keypoints_class_id,
                instance_keypoints_confidence,
            ) in zip(
                xyxy,
                confs,
                class_ids,
                keypoints_xy,
                keypoints_class_id,
                keypoints_confidence,
            ):
                # Corner format (x1, y1, x2, y2) -> centre point plus size.
                cx = (float(x1) + float(x2)) / 2.0
                cy = (float(y1) + float(y2)) / 2.0
                w = float(x2) - float(x1)
                h = float(y2) - float(y1)
                class_id_int = int(class_id)
                # Ids outside the known class list fall back to their string form.
                class_name = (
                    self.class_names[class_id_int]
                    if 0 <= class_id_int < len(self.class_names)
                    else str(class_id_int)
                )
                if class_filter and class_name not in class_filter:
                    continue
                image_predictions.append(
                    KeypointsPrediction(
                        x=cx,
                        y=cy,
                        width=w,
                        height=h,
                        confidence=float(conf),
                        **{"class": class_name},
                        class_id=class_id_int,
                        keypoints=model_keypoints_to_response(
                            instance_keypoints_xy=instance_keypoints_xy,
                            instance_keypoints_confidence=instance_keypoints_confidence,
                            instance_keypoints_class_id=instance_keypoints_class_id,
                            key_points_classes=key_points_classes,
                        ),
                    )
                )

            responses.append(
                KeypointsDetectionInferenceResponse(
                    predictions=image_predictions,
                    image=InferenceResponseImage(width=W, height=H),
                )
            )

        return responses

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

        Args:
            delete_from_disk (bool, optional): Whether to delete cached files from disk. Defaults to True.
        """
        pass

    def draw_predictions(
        self,
        inference_request: InferenceRequest,
        inference_response: InferenceResponse,
    ) -> bytes:
        """Draw predictions from an inference response onto the image of the request.

        Args:
            inference_request (InferenceRequest): The inference request containing the
                image on which to draw predictions.
            inference_response (InferenceResponse): The inference response containing
                the predictions to be drawn.

        Returns:
            bytes: The value produced by `draw_detection_predictions` (described
                upstream as a base64 encoded image).
        """
        # One colour per class index; the palette is reused cyclically when the
        # model has more classes than the palette has entries.
        class_id_2_color = {
            class_index: DEFAULT_COLOR_PALETTE[
                class_index % len(DEFAULT_COLOR_PALETTE)
            ]
            for class_index, _ in enumerate(self._model.class_names)
        }
        return draw_detection_predictions(
            inference_request=inference_request,
            inference_response=inference_response,
            colors=class_id_2_color,
        )

Methods:¶

clear_cache ¶

clear_cache(delete_from_disk=True)

Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

Parameters:

Name	Type	Description	Default
`delete_from_disk`	`bool`	Whether to delete cached files from disk. Defaults to True.	`True`

Source code in inference/core/models/inference_models_adapters.py

def clear_cache(self, delete_from_disk: bool = True) -> None:
    """No-op placeholder for cache clearing.

    TODO: Implement this to delete the cache from the experimental model.

    Args:
        delete_from_disk (bool, optional): Whether cached files should also be
            deleted from disk. Currently ignored. Defaults to True.
    """
    # Nothing is cleared yet; the argument is accepted only for interface parity.
    return None

draw_predictions ¶

draw_predictions(inference_request, inference_response)

Draw predictions from an inference response onto the original image provided by an inference request

Parameters:

Name	Type	Description	Default
`inference_request`	`ObjectDetectionInferenceRequest`	The inference request containing the image on which to draw predictions	required
`inference_response`	`ObjectDetectionInferenceResponse`	The inference response containing predictions to be drawn	required

Returns:

Name	Type	Description
`str`	`bytes`	A base64 encoded image string

Source code in inference/core/models/inference_models_adapters.py

def draw_predictions(
    self,
    inference_request: InferenceRequest,
    inference_response: InferenceResponse,
) -> bytes:
    """Draw predictions from an inference response onto the image of the request.

    Args:
        inference_request (InferenceRequest): The inference request containing the
            image on which to draw predictions.
        inference_response (InferenceResponse): The inference response containing
            the predictions to be drawn.

    Returns:
        bytes: The value produced by `draw_detection_predictions` (described
            upstream as a base64 encoded image).
    """
    # One colour per class index; the palette is reused cyclically when the
    # model has more classes than the palette has entries.
    class_id_2_color = {
        class_index: DEFAULT_COLOR_PALETTE[class_index % len(DEFAULT_COLOR_PALETTE)]
        for class_index, _ in enumerate(self._model.class_names)
    }
    return draw_detection_predictions(
        inference_request=inference_request,
        inference_response=inference_response,
        colors=class_id_2_color,
    )

InferenceModelsObjectDetectionAdapter ¶

Bases: Model

Source code in inference/core/models/inference_models_adapters.py

class InferenceModelsObjectDetectionAdapter(Model):
    """Adapter exposing an `inference-models` object-detection model as a `Model`.

    The wrapped model is loaded with `AutoModel.from_pretrained` and the
    preprocess / predict / postprocess stages delegate to its `pre_process`,
    `forward` and `post_process` methods. `postprocess` converts the model
    output into `ObjectDetectionInferenceResponse` objects.
    """

    def __init__(self, model_id: str, api_key: Optional[str] = None, **kwargs):
        """Resolve the model alias and load the underlying model.

        Args:
            model_id (str): Model identifier or alias; resolved through
                `resolve_roboflow_model_alias` before loading.
            api_key (Optional[str]): API key used to fetch the model. Falls back to
                the module-level `API_KEY` when empty or not given.
            **kwargs: Forwarded to `AutoModel.from_pretrained`. `countinference`
                and `service_secret` are additionally read to build extra
                weights-provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}

        self.api_key = api_key if api_key else API_KEY
        model_id = resolve_roboflow_model_alias(model_id=model_id)

        self.task_type = "object-detection"

        extra_weights_provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Allow every valid backend that has not been explicitly disabled.
        backend = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: ObjectDetectionModel = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=extra_weights_provider_headers,
            backend=backend,
            rf_detr_max_input_resolution=RFDETR_ONNX_MAX_RESOLUTION,
            **kwargs,
        )
        # NOTE(review): presumably reports whether `pre_process` accepts the
        # `independent_stage_execution` keyword - confirm against the helper.
        self._preprocess_supports_independent_stage_execution = (
            _supports_independent_stage_execution(self._model.pre_process)
        )
        self.class_names = list(self._model.class_names)

    def map_inference_kwargs(self, kwargs: dict) -> dict:
        """Translate request-level kwargs into the kwargs the wrapped model expects.

        The dictionary is modified in place and the same object is returned.

        Args:
            kwargs (dict): Keyword arguments received by a pipeline stage.

        Returns:
            dict: The same dictionary with `input_color_format` and
                `pre_processing_overrides` set.
        """
        kwargs["input_color_format"] = "bgr"
        pre_processing_overrides = PreProcessingOverrides(
            disable_contrast_enhancement=kwargs.get("disable_preproc_contrast", False),
            disable_grayscale=kwargs.get("disable_preproc_grayscale", False),
            disable_static_crop=kwargs.get("disable_preproc_static_crop", False),
        )
        kwargs["pre_processing_overrides"] = pre_processing_overrides
        return kwargs

    def preprocess(self, image: Any, **kwargs):
        """Load the input image(s) as BGR arrays and run the model's pre-processing.

        Args:
            image (Any): A single image or a list of images, in any form accepted
                by `load_image_bgr`. A single image is treated as a batch of one.
            **kwargs: Stage kwargs; `disable_preproc_auto_orient` is passed to the
                image loader, the rest is mapped via `map_inference_kwargs`.

        Returns:
            Whatever `self._model.pre_process` returns.
        """
        is_batch = isinstance(image, list)
        images = image if is_batch else [image]
        np_images: List[np.ndarray] = [
            load_image_bgr(
                v,
                disable_preproc_auto_orient=kwargs.get(
                    "disable_preproc_auto_orient", False
                ),
            )
            for v in images
        ]
        mapped_kwargs = self.map_inference_kwargs(kwargs)
        if self._preprocess_supports_independent_stage_execution:
            # Only passed when the flag computed in __init__ is truthy.
            mapped_kwargs["independent_stage_execution"] = False
        return self._model.pre_process(np_images, **mapped_kwargs)

    def predict(self, img_in, **kwargs):
        """Run the wrapped model's forward pass on pre-processed input.

        Args:
            img_in: Pre-processed input as produced by `preprocess`.
            **kwargs: Stage kwargs, mapped via `map_inference_kwargs`.

        Returns:
            Whatever `self._model.forward` returns.
        """
        mapped_kwargs = self.map_inference_kwargs(kwargs)
        return self._model.forward(img_in, **mapped_kwargs)

    def postprocess(
        self,
        predictions: List[Detections],
        preprocess_return_metadata: PreprocessingMetadata,
        **kwargs,
    ) -> List[ObjectDetectionInferenceResponse]:
        """Convert raw model output into object-detection responses.

        Args:
            predictions: Raw output of `predict`, handed to the model's
                `post_process`.
            preprocess_return_metadata: Pre-processing metadata; iterated in
                lockstep with the post-processed detections, one entry per image.
            **kwargs: Stage kwargs. `class_filter`, when truthy, restricts the
                returned predictions to the listed class names.

        Returns:
            List[ObjectDetectionInferenceResponse]: One response per image.
        """
        mapped_kwargs = self.map_inference_kwargs(kwargs)
        detections_list = self._model.post_process(
            predictions, preprocess_return_metadata, **mapped_kwargs
        )

        # Looked up once; it does not change while responses are being built.
        class_filter = kwargs.get("class_filter")
        responses: List[ObjectDetectionInferenceResponse] = []
        for preproc_metadata, det in zip(preprocess_return_metadata, detections_list):
            H = preproc_metadata.original_size.height
            W = preproc_metadata.original_size.width

            xyxy = det.xyxy.detach().cpu().numpy()
            confs = det.confidence.detach().cpu().numpy()
            class_ids = det.class_id.detach().cpu().numpy()

            # Named distinctly so the `predictions` argument is not rebound.
            image_predictions: List[ObjectDetectionPrediction] = []

            for (x1, y1, x2, y2), conf, class_id in zip(xyxy, confs, class_ids):
                # Corner format (x1, y1, x2, y2) -> centre point plus size.
                cx = (float(x1) + float(x2)) / 2.0
                cy = (float(y1) + float(y2)) / 2.0
                w = float(x2) - float(x1)
                h = float(y2) - float(y1)
                class_id_int = int(class_id)
                # Ids outside the known class list fall back to their string form.
                class_name = (
                    self.class_names[class_id_int]
                    if 0 <= class_id_int < len(self.class_names)
                    else str(class_id_int)
                )
                if class_filter and class_name not in class_filter:
                    continue
                image_predictions.append(
                    ObjectDetectionPrediction(
                        x=cx,
                        y=cy,
                        width=w,
                        height=h,
                        confidence=float(conf),
                        **{"class": class_name},
                        class_id=class_id_int,
                    )
                )

            responses.append(
                ObjectDetectionInferenceResponse(
                    predictions=image_predictions,
                    image=InferenceResponseImage(width=W, height=H),
                )
            )
        return responses

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

        Args:
            delete_from_disk (bool, optional): Whether to delete cached files from disk. Defaults to True.
        """
        pass

    def draw_predictions(
        self,
        inference_request: InferenceRequest,
        inference_response: InferenceResponse,
    ) -> bytes:
        """Draw predictions from an inference response onto the image of the request.

        Args:
            inference_request (InferenceRequest): The inference request containing the
                image on which to draw predictions.
            inference_response (InferenceResponse): The inference response containing
                the predictions to be drawn.

        Returns:
            bytes: The value produced by `draw_detection_predictions` (described
                upstream as a base64 encoded image).
        """
        # One colour per class index; the palette is reused cyclically when the
        # model has more classes than the palette has entries.
        class_id_2_color = {
            class_index: DEFAULT_COLOR_PALETTE[
                class_index % len(DEFAULT_COLOR_PALETTE)
            ]
            for class_index, _ in enumerate(self._model.class_names)
        }
        return draw_detection_predictions(
            inference_request=inference_request,
            inference_response=inference_response,
            colors=class_id_2_color,
        )

Methods:¶

clear_cache ¶

clear_cache(delete_from_disk=True)

Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

Parameters:

Name	Type	Description	Default
`delete_from_disk`	`bool`	Whether to delete cached files from disk. Defaults to True.	`True`

Source code in inference/core/models/inference_models_adapters.py

def clear_cache(self, delete_from_disk: bool = True) -> None:
    """Placeholder for cache clearing; currently does nothing.

    TODO: Implement this to delete the cache from the experimental model.

    Args:
        delete_from_disk (bool, optional): Whether to delete cached files from disk. Currently ignored. Defaults to True.
    """
    return None

draw_predictions ¶

draw_predictions(inference_request, inference_response)

Draw predictions from an inference response onto the original image provided by an inference request

Parameters:

Name	Type	Description	Default
`inference_request`	`ObjectDetectionInferenceRequest`	The inference request containing the image on which to draw predictions	required
`inference_response`	`ObjectDetectionInferenceResponse`	The inference response containing predictions to be drawn	required

Returns:

Name	Type	Description
`bytes`	`bytes`	The encoded image with the predictions drawn on it

Source code in inference/core/models/inference_models_adapters.py

def draw_predictions(
    self,
    inference_request: InferenceRequest,
    inference_response: InferenceResponse,
) -> bytes:
    """Draw predictions from an inference response onto the original image provided by an inference request

    Args:
        inference_request (ObjectDetectionInferenceRequest): The inference request containing the image on which to draw predictions
        inference_response (ObjectDetectionInferenceResponse): The inference response containing predictions to be drawn

    Returns:
        bytes: The visualization returned by `draw_detection_predictions`
    """
    # One palette colour per class index; the modulo makes the palette
    # repeat when there are more classes than colours.
    class_id_2_color = {
        class_id: DEFAULT_COLOR_PALETTE[class_id % len(DEFAULT_COLOR_PALETTE)]
        for class_id, _ in enumerate(self._model.class_names)
    }
    return draw_detection_predictions(
        inference_request=inference_request,
        inference_response=inference_response,
        colors=class_id_2_color,
    )

InferenceModelsSemanticSegmentationAdapter ¶

Bases: Model

Source code in inference/core/models/inference_models_adapters.py

class InferenceModelsSemanticSegmentationAdapter(Model):
    """Adapter serving an `inference_models` semantic segmentation model through the `Model` interface."""

    def __init__(self, model_id: str, api_key: str = None, **kwargs):
        """Load the wrapped segmentation model.

        Args:
            model_id (str): Model identifier; resolved with `resolve_roboflow_model_alias` before loading.
            api_key (str, optional): API key used when loading the model. A falsy value falls back to `API_KEY`.
            **kwargs: Forwarded to `AutoModel.from_pretrained`. `countinference` and `service_secret`
                are additionally read to build the extra weights-provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}

        self.api_key = api_key or API_KEY
        model_id = resolve_roboflow_model_alias(model_id=model_id)

        self.task_type = "semantic-segmentation"

        provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Offer the loader every valid backend that has not been disabled.
        enabled_backends = VALID_INFERENCE_MODELS_BACKENDS.difference(
            DISABLED_INFERENCE_MODELS_BACKENDS
        )
        self._model: SemanticSegmentationModel = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=provider_headers,
            backend=list(enabled_backends),
            **kwargs,
        )
        self.class_names = list(self._model.class_names)

    @property
    def class_map(self):
        """Mapping of stringified class index to class name."""
        # match segment.roboflow.com
        mapping = {}
        for index, name in enumerate(self.class_names):
            mapping[str(index)] = name
        return mapping

    def map_inference_kwargs(self, kwargs: dict) -> dict:
        """Translate request-style kwargs into the ones passed to the wrapped model.

        The given dict is updated in place and also returned.

        Args:
            kwargs (dict): Inference kwargs; the `disable_preproc_contrast`, `disable_preproc_grayscale`
                and `disable_preproc_static_crop` entries (default False) feed `PreProcessingOverrides`.

        Returns:
            dict: The same dict with `input_color_format` and `pre_processing_overrides` set.
        """
        kwargs["input_color_format"] = "bgr"
        kwargs["pre_processing_overrides"] = PreProcessingOverrides(
            disable_contrast_enhancement=kwargs.get("disable_preproc_contrast", False),
            disable_grayscale=kwargs.get("disable_preproc_grayscale", False),
            disable_static_crop=kwargs.get("disable_preproc_static_crop", False),
        )
        return kwargs

    def preprocess(self, image: Any, **kwargs):
        """Load the input image(s) as BGR arrays and run the model's pre-processing.

        Args:
            image (Any): A single image or a list of images; a non-list input is wrapped into a one-element list.
            **kwargs: Inference kwargs; `disable_preproc_auto_orient` (default False) is passed to `load_image_bgr`.

        Returns:
            Whatever `self._model.pre_process` returns for the loaded images.
        """
        batch = image if isinstance(image, list) else [image]
        skip_auto_orient = kwargs.get("disable_preproc_auto_orient", False)
        np_images: List[np.ndarray] = []
        for item in batch:
            np_images.append(
                load_image_bgr(item, disable_preproc_auto_orient=skip_auto_orient)
            )
        return self._model.pre_process(np_images, **self.map_inference_kwargs(kwargs))

    def predict(self, img_in, **kwargs):
        """Run the wrapped model's forward pass on pre-processed input."""
        return self._model.forward(img_in, **self.map_inference_kwargs(kwargs))

    def postprocess(
        self,
        predictions: torch.Tensor,
        preprocess_return_metadata: PreprocessingMetadata,
        **kwargs,
    ) -> List[SemanticSegmentationInferenceResponse]:
        """Convert raw model output into one response per image.

        Args:
            predictions (torch.Tensor): Output of `predict`.
            preprocess_return_metadata (PreprocessingMetadata): Metadata from pre-processing; iterated
                per image and read for `original_size`.
            **kwargs: Inference kwargs, mapped with `map_inference_kwargs` before being passed on.

        Returns:
            List[SemanticSegmentationInferenceResponse]: Responses carrying base64 PNG masks.
        """
        results = self._model.post_process(
            predictions,
            preprocess_return_metadata,
            **self.map_inference_kwargs(kwargs),
        )

        responses: List[SemanticSegmentationInferenceResponse] = []
        for metadata, result in zip(preprocess_return_metadata, results):
            original_size = metadata.original_size
            height = original_size.height
            width = original_size.width
            response_image = InferenceResponseImage(width=width, height=height)
            # WARNING! The uint8 conversion below is hazardous: if the background class is missing
            # from class names, certain pre-processing yields -1 values that wrap to 255, and only
            # 256 classes can be represented - those constraints should be fine until inference 2.0
            class_mask_b64 = self.img_to_b64_str(
                result.segmentation_map.to(torch.uint8)
            )
            # Confidence is scaled by 255 before the uint8 cast.
            confidence_mask_b64 = self.img_to_b64_str(
                (result.confidence * 255).to(torch.uint8)
            )
            response_predictions = SemanticSegmentationPrediction(
                segmentation_mask=class_mask_b64,
                confidence_mask=confidence_mask_b64,
                class_map=self.class_map,
                image=dict(response_image),
            )
            responses.append(
                SemanticSegmentationInferenceResponse(
                    predictions=response_predictions,
                    image=response_image,
                )
            )
        return responses

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Placeholder for cache clearing; currently does nothing.

        TODO: Implement this to delete the cache from the experimental model.

        Args:
            delete_from_disk (bool, optional): Whether to delete cached files from disk. Currently ignored. Defaults to True.
        """
        return None

    def img_to_b64_str(self, img: torch.Tensor) -> str:
        """Encode a uint8 tensor as a base64 PNG string.

        Args:
            img (torch.Tensor): Tensor to encode; must have dtype `torch.uint8`.

        Returns:
            str: ASCII base64 text of the PNG-encoded image.

        Raises:
            ValueError: If `img` is not a uint8 tensor.
        """
        if img.dtype != torch.uint8:
            raise ValueError(
                f"img_to_b64_str requires uint8 tensor but got dtype {img.dtype}"
            )

        pil_image = Image.fromarray(img.cpu().numpy())
        png_buffer = io.BytesIO()
        pil_image.save(png_buffer, format="PNG")

        return base64.b64encode(png_buffer.getvalue()).decode("ascii")

    def draw_predictions(
        self,
        inference_request: InferenceRequest,
        inference_response: InferenceResponse,
    ) -> bytes:
        """Unsupported for semantic segmentation; always raises.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "draw_predictions(...) is not implemented for semantic segmentation models - responses contain "
            "visualization already."
        )

Methods:¶

clear_cache ¶

clear_cache(delete_from_disk=True)

Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.

Parameters:

Name	Type	Description	Default
`delete_from_disk`	`bool`	Whether to delete cached files from disk. Defaults to True.	`True`

Source code in inference/core/models/inference_models_adapters.py

def clear_cache(self, delete_from_disk: bool = True) -> None:
    """Placeholder for cache clearing; currently does nothing.

    TODO: Implement this to delete the cache from the experimental model.

    Args:
        delete_from_disk (bool, optional): Whether to delete cached files from disk. Currently ignored. Defaults to True.
    """
    return None

Functions:¶

draw_predictions ¶

draw_predictions(
    inference_request, inference_response, class_names
)

Draw prediction visuals on an image.

This function overlays the predictions on the input image, including drawing rectangles and text to visualize the predicted classes.

Parameters:

Name	Type	Description	Default
`inference_request`		The request object containing the image and parameters.	required
`inference_response`		The response object containing the predictions and other details.	required
`class_names`	`List[str]`	List of class names corresponding to the model's classes.	required

Returns:

Name	Type	Description
`bytes`		The bytes of the visualized image in JPEG format.

Source code in inference/core/models/inference_models_adapters.py

def draw_predictions(inference_request, inference_response, class_names: List[str]):
    """Draw prediction visuals on an image.

    This function overlays the predictions on the input image: a frame around the
    whole image plus one text label per drawn prediction, stacked from the top-left corner.

    When `inference_response.predictions` is a list, only its first element is drawn;
    an empty list yields the unannotated image. Otherwise predictions are treated as a
    mapping of class name to prediction and every entry is drawn, highest confidence first.

    Args:
        inference_request: The request object containing the image and parameters.
        inference_response: The response object containing the predictions and other details.
        class_names: List of class names corresponding to the model's classes.

    Returns:
        bytes: The bytes of the visualized image in JPEG format.
    """
    image = load_image_rgb(inference_request.image)
    image = Image.fromarray(image)
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    fallback_color = "#4892EA"
    class_id_2_color = {
        class_id: DEFAULT_COLOR_PALETTE[class_id % len(DEFAULT_COLOR_PALETTE)]
        for class_id, _ in enumerate(class_names)
    }
    # PIL reports size as (width, height) and rectangle() expects
    # [x0, y0, x1, y1], so the frame must be given as width first, then height.
    image_width, image_height = image.size
    frame = [0, 0, image_width, image_height]
    stroke_width = inference_request.visualization_stroke_width

    def paste_label(text, color, top):
        """Paste a filled text label at (0, top) and return the label height."""
        text_size = font.getbbox(text)
        # set button size + 10px margins
        button_size = (text_size[2] + 20, text_size[3] + 20)
        button_img = Image.new("RGBA", button_size, color)
        # put text on button with 10px margins
        button_draw = ImageDraw.Draw(button_img)
        button_draw.text((10, 10), text, font=font, fill=(255, 255, 255, 255))
        image.paste(button_img, (0, top))
        return button_size[1]

    predictions = inference_response.predictions
    if isinstance(predictions, list):
        # Guard against an empty list instead of failing on predictions[0].
        if predictions:
            prediction = predictions[0]
            color = class_id_2_color.get(prediction.class_id, fallback_color)
            draw.rectangle(frame, outline=color, width=stroke_width)
            paste_label(
                f"{prediction.class_id} - {prediction.class_name} {prediction.confidence:.2f}",
                color,
                0,
            )
    else:
        if len(predictions) > 0:
            draw.rectangle(frame, outline=fallback_color, width=stroke_width)
        ranked = sorted(
            predictions.items(), key=lambda item: item[1].confidence, reverse=True
        )
        row = 0
        for cls_name, pred in ranked:
            # The colour table is keyed by integer class id, so look up by the
            # prediction's class_id (not its name); fall back when it is absent.
            color = class_id_2_color.get(
                getattr(pred, "class_id", None), fallback_color
            )
            row += paste_label(f"{cls_name} {pred.confidence:.2f}", color, row)

    buffered = BytesIO()
    rgb_image = image.convert("RGB")
    rgb_image.save(buffered, format="JPEG")
    return buffered.getvalue()

get_pinned_buffer ¶

get_pinned_buffer(name, shape, dtype)

Return a thread-local pinned CPU scratch tensor for async DtoH copies.

Response finalization can run on a worker thread while the inference thread submits later GPU work. Keeping this cache thread-local avoids two workers writing into the same scratch tensor. The small LRU cap prevents retaining a new pinned allocation for every transient shape.

The cache is keyed by (name, dtype) only. When a cached buffer is large enough, this returns a view into that entry, not a fresh allocation. .numpy() and any slices alias the scratch memory until the next copy_ into the same cache slot. Values that outlive the current finalization must be copied or reduced to scalars / fresh polygon arrays before returning.

Source code in inference/core/models/inference_models_adapters.py

def get_pinned_buffer(name: str, shape, dtype: torch.dtype) -> torch.Tensor:
    """Return a thread-local pinned CPU scratch tensor for async DtoH copies.

    Response finalization can run on a worker thread while the inference thread
    submits later GPU work. Keeping this cache thread-local avoids two workers
    writing into the same scratch tensor. The small LRU cap prevents retaining a
    new pinned allocation for every transient shape.

    The cache is keyed by ``(name, dtype)`` only. When a cached buffer has the
    same number of dimensions and is large enough along every one of them, this
    returns a **view** into that entry, not a fresh allocation. Otherwise a new
    buffer of exactly ``shape`` replaces the cached entry.
    ``.numpy()`` and any slices alias the scratch memory until the next
    ``copy_`` into the same cache slot. Values that outlive the current
    finalization must be copied or reduced to scalars / fresh polygon arrays
    before returning.

    Args:
        name (str): Logical name of the scratch slot; part of the cache key.
        shape: Sequence of dimension sizes for the requested tensor.
        dtype (torch.dtype): Element type; part of the cache key.

    Returns:
        torch.Tensor: A tensor of the requested shape and dtype, either a view
        into the cached buffer or a newly allocated pinned buffer.
    """
    cache = getattr(_PINNED_HOST_BUFFER_CONTEXT, "cache", None)
    if cache is None:
        cache = OrderedDict()
        _PINNED_HOST_BUFFER_CONTEXT.cache = cache
    key = (name, dtype)
    buf = cache.get(key)
    # The key ignores shape, so the cached entry may have a different rank than
    # the request. Reuse it only when the ranks match: indexing past a
    # lower-rank buffer would fail, and slicing a higher-rank one would return
    # a tensor with extra trailing dimensions.
    if (
        buf is not None
        and buf.dim() == len(shape)
        and all(have >= need for have, need in zip(buf.shape, shape))
    ):
        cache.move_to_end(key)
        return buf[tuple(slice(0, s) for s in shape)]
    buf = torch.empty(shape, dtype=dtype, pin_memory=True)
    cache[key] = buf
    cache.move_to_end(key)
    # Evict least-recently-used entries beyond the configured cap.
    while len(cache) > _PINNED_HOST_BUFFER_CACHE_SIZE:
        cache.popitem(last=False)
    return buf

prepare_multi_label_classification_response ¶

prepare_multi_label_classification_response(
    post_processed_predictions, image_sizes, class_names
)

Build the API response from a model's post-processed predictions.

prediction.class_ids is the authoritative list of "passed" classes — the model's post_process already applied the full priority chain (user → per-class → global → default), so the response builder doesn't re-threshold here. The full per-class score vector is still emitted in image_predictions_dict for UI display.

Source code in inference/core/models/inference_models_adapters.py

def prepare_multi_label_classification_response(
    post_processed_predictions: List[MultiLabelClassificationPrediction],
    image_sizes: List[Tuple[int, int]],
    class_names: List[str],
) -> List[MultiLabelClassificationInferenceResponse]:
    """Build the API response from a model's post-processed predictions.

    `prediction.class_ids` is the authoritative list of "passed" classes —
    the model's `post_process` already applied the
    full priority chain (user → per-class → global → default), so the
    response builder doesn't re-threshold here. The full per-class score
    vector is still emitted in the per-class predictions dict for UI display.

    Args:
        post_processed_predictions: One post-processed prediction per image.
        image_sizes: Per-image sizes; index 0 is used as height and index 1 as width.
        class_names: Class names, indexed by class id.

    Returns:
        One `MultiLabelClassificationInferenceResponse` per (prediction, image size) pair.
    """
    responses = []
    for prediction, image_size in zip(post_processed_predictions, image_sizes):
        # The helper is asked for a single image's worth of scores; row 0 is that image.
        reshaped = _reshape_classification_confidences(
            confidence=prediction.confidence.cpu(),
            expected_num_images=1,
            class_names=class_names,
        )
        scores = reshaped[0].tolist()

        per_class = {}
        for class_id, confidence in enumerate(scores):
            per_class[class_names[class_id]] = {
                "confidence": confidence,
                "class_id": class_id,
            }

        passed_class_names = []
        for class_id in prediction.class_ids.tolist():
            passed_class_names.append(class_names[class_id])

        response = MultiLabelClassificationInferenceResponse(
            predictions=per_class,
            predicted_classes=passed_class_names,
            image=InferenceResponseImage(width=image_size[1], height=image_size[0]),
        )
        responses.append(response)
    return responses

inference.core.models.instance_segmentation_base ¶

Classes¶

InstanceSegmentationBaseOnnxRoboflowInferenceModel ¶

Roboflow ONNX Instance Segmentation model.

This class implements an instance segmentation specific inference method for ONNX models provided by Roboflow.

Source code in inference/core/models/instance_segmentation_base.py

class InstanceSegmentationBaseOnnxRoboflowInferenceModel(OnnxRoboflowInferenceModel):
    """Roboflow ONNX Instance Segmentation model.

    This class implements an instance segmentation specific inference method
    for ONNX models provided by Roboflow.
    """

    task_type = "instance-segmentation"
    # Passed to NMS (`num_masks`) and to the class-count validation (`masks`).
    num_masks = 32

    def infer(
        self,
        image: Any,
        class_agnostic_nms: bool = False,
        confidence: float = DEFAULT_CONFIDENCE,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        iou_threshold: float = DEFAULT_IOU_THRESH,
        mask_decode_mode: str = DEFAULT_MASK_DECODE_MODE,
        max_candidates: int = DEFAULT_MAX_CANDIDATES,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        return_image_dims: bool = False,
        tradeoff_factor: float = DEFAULT_TRADEOFF_FACTOR,
        **kwargs,
    ) -> Union[PREDICTIONS_TYPE, Tuple[PREDICTIONS_TYPE, List[Tuple[int, int]]]]:
        """
        Process an image or list of images for instance segmentation.

        Args:
            image (Any): An image or a list of images for processing.
                - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
            class_agnostic_nms (bool, optional): Whether to use class-agnostic non-maximum suppression. Defaults to False.
            confidence (float, optional): Confidence threshold for predictions. Defaults to 0.4.
            iou_threshold (float, optional): IoU threshold for non-maximum suppression. Defaults to 0.3.
            mask_decode_mode (str, optional): Decoding mode for masks. Choices are "accurate", "tradeoff", and "fast". Defaults to "accurate".
            max_candidates (int, optional): Maximum number of candidate detections. Defaults to 3000.
            max_detections (int, optional): Maximum number of detections after non-maximum suppression. Defaults to 300.
            return_image_dims (bool, optional): Whether to return the dimensions of the processed images. Defaults to False.
            tradeoff_factor (float, optional): Tradeoff factor used when `mask_decode_mode` is set to "tradeoff". Must be in [0.0, 1.0]. Defaults to 0.5.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
            disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
            **kwargs: Additional parameters to customize the inference process.

        Returns:
            Union[List[List[List[float]]], Tuple[List[List[List[float]]], List[Tuple[int, int]]]]: The list of predictions, with each prediction being a list of lists. Optionally, also returns the dimensions of the processed images.

        Raises:
            InvalidMaskDecodeArgument: If an invalid `mask_decode_mode` is provided or if the `tradeoff_factor` is outside the allowed range.

        Notes:
            - Processes input images and normalizes them.
            - Makes predictions using the ONNX runtime.
            - Applies non-maximum suppression to the predictions.
            - Decodes the masks according to the specified mode.
        """
        return super().infer(
            image,
            class_agnostic_nms=class_agnostic_nms,
            confidence=confidence,
            disable_preproc_auto_orient=disable_preproc_auto_orient,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
            iou_threshold=iou_threshold,
            mask_decode_mode=mask_decode_mode,
            max_candidates=max_candidates,
            max_detections=max_detections,
            return_image_dims=return_image_dims,
            tradeoff_factor=tradeoff_factor,
            **kwargs,
        )

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Union[
        InstanceSegmentationInferenceResponse,
        List[InstanceSegmentationInferenceResponse],
    ]:
        """Apply NMS, decode masks into polygons and build the responses.

        Args:
            predictions (Tuple[np.ndarray, np.ndarray]): The `(predictions, protos)` pair produced by `predict`.
            preprocess_return_metadata (PreprocessReturnMetadata): Metadata from `preprocess`; the
                `img_dims`, `im_shape` and `disable_preproc_static_crop` entries are read.
            **kwargs: Must contain `confidence`, `iou_threshold`, `class_agnostic_nms`, `max_detections`,
                `max_candidates`, `mask_decode_mode` and `tradeoff_factor`; all kwargs are also
                forwarded to `make_response`.

        Returns:
            The value returned by `make_response` for the processed batch.

        Raises:
            InvalidMaskDecodeArgument: If `mask_decode_mode` is not one of "accurate", "tradeoff", "fast",
                or if `tradeoff_factor` is outside [0.0, 1.0] in "tradeoff" mode.
        """
        predictions, protos = predictions
        predictions = w_np_non_max_suppression(
            predictions,
            conf_thresh=kwargs["confidence"],
            iou_thresh=kwargs["iou_threshold"],
            class_agnostic=kwargs["class_agnostic_nms"],
            max_detections=kwargs["max_detections"],
            max_candidate_detections=kwargs["max_candidates"],
            num_masks=self.num_masks,
        )
        infer_shape = (self.img_size_h, self.img_size_w)
        masks = []
        mask_decode_mode = kwargs["mask_decode_mode"]
        tradeoff_factor = kwargs["tradeoff_factor"]
        img_in_shape = preprocess_return_metadata["im_shape"]

        predictions = [np.array(p) for p in predictions]

        for pred, proto, img_dim in zip(
            predictions, protos, preprocess_return_metadata["img_dims"]
        ):
            # No detections for this image: keep an empty placeholder so masks
            # stays aligned with predictions.
            if pred.size == 0:
                masks.append([])
                continue
            # NOTE(review): columns 0-3 are used as the box and columns 7+ are
            # presumably the mask coefficients - confirm against the NMS output layout.
            if mask_decode_mode == "accurate":
                batch_masks = process_mask_accurate(
                    proto, pred[:, 7:], pred[:, :4], img_in_shape[2:]
                )
                output_mask_shape = img_in_shape[2:]
            elif mask_decode_mode == "tradeoff":
                if not 0 <= tradeoff_factor <= 1:
                    raise InvalidMaskDecodeArgument(
                        f"Invalid tradeoff_factor: {tradeoff_factor}. Must be in [0.0, 1.0]"
                    )
                batch_masks = process_mask_tradeoff(
                    proto,
                    pred[:, 7:],
                    pred[:, :4],
                    img_in_shape[2:],
                    tradeoff_factor,
                )
                output_mask_shape = batch_masks.shape[1:]
            elif mask_decode_mode == "fast":
                batch_masks = process_mask_fast(
                    proto, pred[:, 7:], pred[:, :4], img_in_shape[2:]
                )
                output_mask_shape = batch_masks.shape[1:]
            else:
                raise InvalidMaskDecodeArgument(
                    f"Invalid mask_decode_mode: {mask_decode_mode}. Must be one of ['accurate', 'fast', 'tradeoff']"
                )
            polys = masks2poly(batch_masks)
            # Boxes are rewritten in place only after the masks were decoded
            # from the original box values above.
            pred[:, :4] = post_process_bboxes(
                [pred[:, :4]],
                infer_shape,
                [img_dim],
                self.preproc,
                resize_method=self.resize_method,
                disable_preproc_static_crop=preprocess_return_metadata[
                    "disable_preproc_static_crop"
                ],
            )[0]
            polys = post_process_polygons(
                img_dim,
                polys,
                output_mask_shape,
                self.preproc,
                resize_method=self.resize_method,
            )
            masks.append(polys)
        return self.make_response(
            predictions, masks, preprocess_return_metadata["img_dims"], **kwargs
        )

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Load the image(s), scale pixel values by 1/255 and collect metadata for `postprocess`.

        Args:
            image (Any): An image or a list of images, passed to `self.load_image`.
            **kwargs: The `disable_preproc_*` flags are read with `kwargs.get` (None when absent)
                and forwarded to `self.load_image`.

        Returns:
            Tuple[np.ndarray, PreprocessReturnMetadata]: The scaled input array and metadata holding
            `img_dims`, `im_shape` and `disable_preproc_static_crop`.
        """
        img_in, img_dims = self.load_image(
            image,
            disable_preproc_auto_orient=kwargs.get("disable_preproc_auto_orient"),
            disable_preproc_contrast=kwargs.get("disable_preproc_contrast"),
            disable_preproc_grayscale=kwargs.get("disable_preproc_grayscale"),
            disable_preproc_static_crop=kwargs.get("disable_preproc_static_crop"),
        )

        # In-place division: the array returned by load_image is modified.
        img_in /= 255.0
        return img_in, PreprocessReturnMetadata(
            {
                "img_dims": img_dims,
                "im_shape": img_in.shape,
                "disable_preproc_static_crop": kwargs.get(
                    "disable_preproc_static_crop"
                ),
            }
        )

    def make_response(
        self,
        predictions: List[List[List[float]]],
        masks: List[List[List[float]]],
        img_dims: List[Tuple[int, int]],
        class_filter: Optional[List[str]] = None,
        **kwargs,
    ) -> Union[
        InstanceSegmentationInferenceResponse,
        List[InstanceSegmentationInferenceResponse],
    ]:
        """
        Create instance segmentation inference response objects for the provided predictions and masks.

        Args:
            predictions (List[List[List[float]]]): List of prediction data, one for each image.
            masks (List[List[List[float]]]): List of masks corresponding to the predictions.
            img_dims (List[Tuple[int, int]]): List of image dimensions corresponding to the processed images.
            class_filter (Optional[List[str]]): List of class names to filter predictions by. Defaults to None (no filtering).

        Returns:
            Union[InstanceSegmentationInferenceResponse, List[InstanceSegmentationInferenceResponse]]: This implementation always returns a list with one response per processed image.

        Notes:
            - For each image, constructs an `InstanceSegmentationInferenceResponse` object.
            - Each response contains a list of `InstanceSegmentationPrediction` objects.
            - Each prediction row is read as: indices 0-3 box corners, index 4 confidence, index 6 class index.
        """
        responses = []
        for ind, (batch_predictions, batch_masks) in enumerate(zip(predictions, masks)):
            # Collected under a separate name so the `predictions` argument is not rebound.
            image_predictions = []
            for pred, mask in zip(batch_predictions, batch_masks):
                class_id = int(pred[6])
                class_name = self.class_names[class_id]
                if class_filter and class_name not in class_filter:
                    # TODO: logger.debug
                    continue
                # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                image_predictions.append(
                    InstanceSegmentationPrediction(
                        **{
                            "x": pred[0] + (pred[2] - pred[0]) / 2,
                            "y": pred[1] + (pred[3] - pred[1]) / 2,
                            "width": pred[2] - pred[0],
                            "height": pred[3] - pred[1],
                            "points": [Point(x=point[0], y=point[1]) for point in mask],
                            "confidence": pred[4],
                            "class": class_name,
                            "class_id": class_id,
                        }
                    )
                )
            response = InstanceSegmentationInferenceResponse(
                predictions=image_predictions,
                image=InferenceResponseImage(
                    width=img_dims[ind][1], height=img_dims[ind][0]
                ),
            )
            responses.append(response)
        return responses

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """Runs inference on the ONNX model.

        Args:
            img_in (np.ndarray): The preprocessed image(s) to run inference on.

        Returns:
            Tuple[np.ndarray, np.ndarray]: The ONNX model predictions and the ONNX model protos.

        Raises:
            NotImplementedError: This method must be implemented by a subclass.
        """
        raise NotImplementedError("predict must be implemented by a subclass")

    def validate_model_classes(self) -> None:
        """Check that the model's output shape agrees with the configured class count.

        Raises:
            ValueError: If the class count derived from the model output shape differs from `self.num_classes`.
        """
        output_shape = self.get_model_output_shape()
        num_classes = get_num_classes_from_model_prediction_shape(
            output_shape[2], masks=self.num_masks
        )
        # Explicit comparison instead of `assert`, so the check still runs
        # when Python is started with -O.
        if num_classes != self.num_classes:
            raise ValueError(
                f"Number of classes in model ({num_classes}) does not match the number of classes in the environment ({self.num_classes})"
            )

Methods:¶

infer ¶

infer(
    image,
    class_agnostic_nms=False,
    confidence=DEFAULT_CONFIDENCE,
    disable_preproc_auto_orient=False,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
    iou_threshold=DEFAULT_IOU_THRESH,
    mask_decode_mode=DEFAULT_MASK_DECODE_MODE,
    max_candidates=DEFAULT_MAX_CANDIDATES,
    max_detections=DEFAUlT_MAX_DETECTIONS,
    return_image_dims=False,
    tradeoff_factor=DEFAULT_TRADEOFF_FACTOR,
    **kwargs
)

Process an image or list of images for instance segmentation.

Parameters:

Name	Type	Description	Default
`image`	`Any`	An image or a list of images for processing. Each image can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.	required
`class_agnostic_nms`	`bool`	Whether to use class-agnostic non-maximum suppression. Defaults to False.	`False`
`confidence`	`float`	Confidence threshold for predictions. Defaults to 0.4.	`DEFAULT_CONFIDENCE`
`iou_threshold`	`float`	IoU threshold for non-maximum suppression. Defaults to 0.3.	`DEFAULT_IOU_THRESH`
`mask_decode_mode`	`str`	Decoding mode for masks. Choices are "accurate", "tradeoff", and "fast". Defaults to "accurate".	`DEFAULT_MASK_DECODE_MODE`
`max_candidates`	`int`	Maximum number of candidate detections. Defaults to 3000.	`DEFAULT_MAX_CANDIDATES`
`max_detections`	`int`	Maximum number of detections after non-maximum suppression. Defaults to 300.	`DEFAUlT_MAX_DETECTIONS`
`return_image_dims`	`bool`	Whether to return the dimensions of the processed images. Defaults to False.	`False`
`tradeoff_factor`	`float`	Tradeoff factor used when `mask_decode_mode` is set to "tradeoff". Must be in [0.0, 1.0]. Defaults to 0.5.	`DEFAULT_TRADEOFF_FACTOR`
`disable_preproc_auto_orient`	`bool`	If true, the auto orient preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_contrast`	`bool`	If true, the auto contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`
`**kwargs`		Additional parameters to customize the inference process.	`{}`

Returns:

Type	Description
`Union[PREDICTIONS_TYPE, Tuple[PREDICTIONS_TYPE, List[Tuple[int, int]]]]`	Union[List[List[List[float]]], Tuple[List[List[List[float]]], List[Tuple[int, int]]]]: The list of predictions, with each prediction being a list of lists. Optionally, also returns the dimensions of the processed images.

Raises:

Type	Description
`InvalidMaskDecodeArgument`	If an invalid `mask_decode_mode` is provided or if the `tradeoff_factor` is outside the allowed range.

Notes

Processes input images and normalizes them.
Makes predictions using the ONNX runtime.
Applies non-maximum suppression to the predictions.
Decodes the masks according to the specified mode.

Source code in inference/core/models/instance_segmentation_base.py

def infer(
    self,
    image: Any,
    class_agnostic_nms: bool = False,
    confidence: float = DEFAULT_CONFIDENCE,
    disable_preproc_auto_orient: bool = False,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
    iou_threshold: float = DEFAULT_IOU_THRESH,
    mask_decode_mode: str = DEFAULT_MASK_DECODE_MODE,
    max_candidates: int = DEFAULT_MAX_CANDIDATES,
    max_detections: int = DEFAUlT_MAX_DETECTIONS,
    return_image_dims: bool = False,
    tradeoff_factor: float = DEFAULT_TRADEOFF_FACTOR,
    **kwargs,
) -> Union[PREDICTIONS_TYPE, Tuple[PREDICTIONS_TYPE, List[Tuple[int, int]]]]:
    """
    Run instance segmentation on an image or list of images.

    This method performs no work of its own: it forwards `image` positionally
    and every named option by keyword to the parent class' `infer`, followed
    by any extra `**kwargs`.

    Args:
        image (Any): An image or a list of images for processing.
            - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        class_agnostic_nms (bool, optional): Whether to use class-agnostic non-maximum suppression. Defaults to False.
        confidence (float, optional): Confidence threshold for predictions. Defaults to `DEFAULT_CONFIDENCE`.
        disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Defaults to False.
        disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Defaults to False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Defaults to False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Defaults to False.
        iou_threshold (float, optional): IoU threshold for non-maximum suppression. Defaults to `DEFAULT_IOU_THRESH`.
        mask_decode_mode (str, optional): Decoding mode for masks. Choices are "accurate", "tradeoff", and "fast". Defaults to `DEFAULT_MASK_DECODE_MODE`.
        max_candidates (int, optional): Maximum number of candidate detections. Defaults to `DEFAULT_MAX_CANDIDATES`.
        max_detections (int, optional): Maximum number of detections after non-maximum suppression. Defaults to `DEFAUlT_MAX_DETECTIONS`.
        return_image_dims (bool, optional): Whether to return the dimensions of the processed images. Defaults to False.
        tradeoff_factor (float, optional): Tradeoff factor used when `mask_decode_mode` is set to "tradeoff". Must be in [0.0, 1.0]. Defaults to `DEFAULT_TRADEOFF_FACTOR`.
        **kwargs: Additional parameters, passed through unchanged.

    Returns:
        Union[PREDICTIONS_TYPE, Tuple[PREDICTIONS_TYPE, List[Tuple[int, int]]]]: Whatever the parent `infer` returns: the predictions, optionally together with the dimensions of the processed images.

    Raises:
        InvalidMaskDecodeArgument: Documented as raised for an invalid `mask_decode_mode` or an out-of-range `tradeoff_factor`; that validation happens downstream, not in this method's body.
    """
    # Collect the named options once so the delegation is a single short call.
    forwarded_options = dict(
        class_agnostic_nms=class_agnostic_nms,
        confidence=confidence,
        disable_preproc_auto_orient=disable_preproc_auto_orient,
        disable_preproc_contrast=disable_preproc_contrast,
        disable_preproc_grayscale=disable_preproc_grayscale,
        disable_preproc_static_crop=disable_preproc_static_crop,
        iou_threshold=iou_threshold,
        mask_decode_mode=mask_decode_mode,
        max_candidates=max_candidates,
        max_detections=max_detections,
        return_image_dims=return_image_dims,
        tradeoff_factor=tradeoff_factor,
    )
    return super().infer(image, **forwarded_options, **kwargs)

make_response ¶

make_response(
    predictions,
    masks,
    img_dims,
    class_filter=None,
    **kwargs
)

Create instance segmentation inference response objects for the provided predictions and masks.

Parameters:

Name	Type	Description	Default
`predictions`	`List[List[List[float]]]`	List of prediction data, one for each image.	required
`masks`	`List[List[List[float]]]`	List of masks corresponding to the predictions.	required
`img_dims`	`List[Tuple[int, int]]`	List of image dimensions corresponding to the processed images.	required
`class_filter`	`Optional[List[str]]`	List of class names to filter predictions by. Defaults to None (no filtering).	`None`

Returns:

Type	Description
`Union[InstanceSegmentationInferenceResponse, List[InstanceSegmentationInferenceResponse]]`	Union[InstanceSegmentationInferenceResponse, List[InstanceSegmentationInferenceResponse]]: A single instance segmentation response or a list of instance segmentation responses based on the number of processed images.

Notes

For each image, constructs an InstanceSegmentationInferenceResponse object.
Each response contains a list of InstanceSegmentationPrediction objects.

Source code in inference/core/models/instance_segmentation_base.py

def make_response(
    self,
    predictions: List[List[List[float]]],
    masks: List[List[List[float]]],
    img_dims: List[Tuple[int, int]],
    class_filter: Optional[List[str]] = None,
    **kwargs,
) -> Union[
    InstanceSegmentationInferenceResponse,
    List[InstanceSegmentationInferenceResponse],
]:
    """
    Build one instance segmentation response per image.

    Args:
        predictions (List[List[List[float]]]): Per-image lists of detections. Each detection is indexed as
            `[x_min, y_min, x_max, y_max, confidence, <unused here>, class_id, ...]`.
        masks (List[List[List[float]]]): Per-image lists of polygons, paired with the detections; each polygon is a sequence of points whose first two entries are used as `x` and `y`.
        img_dims (List[Tuple[int, int]]): Per-image dimensions, read as `(height, width)`.
        class_filter (Optional[List[str]]): Class names to keep. A falsy value (None or empty) disables filtering.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List[InstanceSegmentationInferenceResponse]: One response per (predictions, masks) pair, in input order. A list is always returned, even for a single image.

    Notes:
        - Images are paired with `zip`, so iteration stops at the shorter of `predictions` and `masks`.
        - Box centres are derived from the corner coordinates.
    """
    responses = []
    for image_index, (image_detections, image_polygons) in enumerate(
        zip(predictions, masks)
    ):
        kept = []
        for detection, polygon in zip(image_detections, image_polygons):
            class_id = int(detection[6])
            class_name = self.class_names[class_id]
            if class_filter and class_name not in class_filter:
                # TODO: logger.debug
                continue
            x_min, y_min = detection[0], detection[1]
            box_width = detection[2] - x_min
            box_height = detection[3] - y_min
            # 'class' is a reserved word, so the fields travel as a dictionary.
            fields = {
                "x": x_min + box_width / 2,
                "y": y_min + box_height / 2,
                "width": box_width,
                "height": box_height,
                "points": [Point(x=vertex[0], y=vertex[1]) for vertex in polygon],
                "confidence": detection[4],
                "class": class_name,
                "class_id": class_id,
            }
            kept.append(InstanceSegmentationPrediction(**fields))
        dims = img_dims[image_index]
        image_info = InferenceResponseImage(width=dims[1], height=dims[0])
        responses.append(
            InstanceSegmentationInferenceResponse(predictions=kept, image=image_info)
        )
    return responses

predict ¶

predict(img_in, **kwargs)

Runs inference on the ONNX model.

Parameters:

Name	Type	Description	Default
`img_in`	`ndarray`	The preprocessed image(s) to run inference on.	required

Returns:

Type	Description
`Tuple[ndarray, ndarray]`	Tuple[np.ndarray, np.ndarray]: The ONNX model predictions and the ONNX model protos.

Raises:

Type	Description
`NotImplementedError`	This method must be implemented by a subclass.

Source code in inference/core/models/instance_segmentation_base.py

def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """Abstract hook for running inference on the ONNX model.

    Args:
        img_in (np.ndarray): The preprocessed image(s) to run inference on.
        **kwargs: Extra options; ignored by this base implementation.

    Returns:
        Tuple[np.ndarray, np.ndarray]: The ONNX model predictions and the ONNX model protos (in subclasses; this base version never returns).

    Raises:
        NotImplementedError: Always, since subclasses must provide the implementation.
    """
    message = "predict must be implemented by a subclass"
    raise NotImplementedError(message)

Functions:¶

inference.core.models.keypoints_detection_base ¶

Classes¶

KeypointsDetectionBaseOnnxRoboflowInferenceModel ¶

Bases: ObjectDetectionBaseOnnxRoboflowInferenceModel

Roboflow ONNX keypoints detection model. This class extends the object detection base with keypoints-specific postprocessing and response construction.

Source code in inference/core/models/keypoints_detection_base.py

class KeypointsDetectionBaseOnnxRoboflowInferenceModel(
    ObjectDetectionBaseOnnxRoboflowInferenceModel
):
    """Roboflow ONNX keypoints detection model.

    Extends the object detection base with keypoints-specific postprocessing
    and response construction.
    """

    task_type = "keypoint-detection"

    def __init__(self, model_id: str, *args, **kwargs):
        """Initialize the model by delegating to the parent constructor."""
        super().__init__(model_id, *args, **kwargs)

    def get_infer_bucket_file_list(self) -> list:
        """Returns the list of files to be downloaded from the inference bucket for ONNX model.

        Returns:
            list: A list of filenames specific to ONNX models.
        """
        return ["environment.json", "class_names.txt", "keypoints_metadata.json"]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preproc_return_metadata: PreprocessReturnMetadata,
        class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
        confidence: float = DEFAULT_CONFIDENCE,
        iou_threshold: float = DEFAULT_IOU_THRESH,
        max_candidates: int = DEFAULT_MAX_CANDIDATES,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        return_image_dims: bool = False,
        **kwargs,
    ) -> List[KeypointsDetectionInferenceResponse]:
        """Postprocesses the keypoints detection predictions.

        Args:
            predictions (Tuple[np.ndarray]): Raw model outputs; only the first element is used.
            preproc_return_metadata (PreprocessReturnMetadata): Metadata from preprocessing. The keys
                `"img_dims"` and `"disable_preproc_static_crop"` are read from it.
            class_agnostic_nms (bool): Whether to apply class-agnostic non-max suppression. Default is `DEFAULT_CLASS_AGNOSTIC_NMS`.
            confidence (float): Confidence threshold for filtering detections. Default is `DEFAULT_CONFIDENCE`.
            iou_threshold (float): IoU threshold for non-max suppression. Default is `DEFAULT_IOU_THRESH`.
            max_candidates (int): Maximum number of candidate detections. Default is `DEFAULT_MAX_CANDIDATES`.
            max_detections (int): Maximum number of final detections. Default is `DEFAUlT_MAX_DETECTIONS`.
            return_image_dims (bool): Accepted for interface compatibility; not used by this method.
            **kwargs: Forwarded to `make_response` (e.g. `class_filter`, `request`).

        Returns:
            List[KeypointsDetectionInferenceResponse]: The post-processed predictions.
        """
        predictions = predictions[0]
        number_of_classes = len(self.get_class_names)
        # Values per detection left over after 5 leading entries and the
        # per-class entries; these trailing values are treated as keypoints.
        num_masks = predictions.shape[2] - 5 - number_of_classes
        predictions = w_np_non_max_suppression(
            predictions,
            conf_thresh=confidence,
            iou_thresh=iou_threshold,
            class_agnostic=class_agnostic_nms,
            max_detections=max_detections,
            max_candidate_detections=max_candidates,
            num_masks=num_masks,
        )

        infer_shape = (self.img_size_h, self.img_size_w)
        img_dims = preproc_return_metadata["img_dims"]
        predictions = post_process_bboxes(
            predictions=predictions,
            infer_shape=infer_shape,
            img_dims=img_dims,
            preproc=self.preproc,
            resize_method=self.resize_method,
            disable_preproc_static_crop=preproc_return_metadata[
                "disable_preproc_static_crop"
            ],
        )
        predictions = post_process_keypoints(
            predictions=predictions,
            keypoints_start_index=-num_masks,
            infer_shape=infer_shape,
            img_dims=img_dims,
            preproc=self.preproc,
            resize_method=self.resize_method,
            disable_preproc_static_crop=preproc_return_metadata[
                "disable_preproc_static_crop"
            ],
        )
        return self.make_response(predictions, img_dims, **kwargs)

    def make_response(
        self,
        predictions: List[List[float]],
        img_dims: List[Tuple[int, int]],
        class_filter: Optional[List[str]] = None,
        *args,
        **kwargs,
    ) -> List[KeypointsDetectionInferenceResponse]:
        """Constructs keypoints detection response objects based on predictions.

        Args:
            predictions (List[List[float]]): Per-image lists of detections. Each detection is indexed as
                `[x_min, y_min, x_max, y_max, confidence, <unused here>, class_id, *keypoints]`.
            img_dims (List[Tuple[int, int]]): Per-image dimensions, read as `(height, width)`. A dict
                containing an `"img_dims"` key is also accepted and unwrapped.
            class_filter (Optional[List[str]]): A list of class names to keep; a falsy value disables filtering.
            **kwargs: If a `"request"` entry is present, its `keypoint_confidence` attribute is used as
                the keypoint confidence threshold (otherwise 0.0).

        Returns:
            List[KeypointsDetectionInferenceResponse]: A list of response objects containing keypoints detection predictions.
        """
        if isinstance(img_dims, dict) and "img_dims" in img_dims:
            img_dims = img_dims["img_dims"]

        # If the batch size was fixed we have empty preds at the end; drop them
        # so that indexing img_dims below cannot run past the real images
        # (mirrors the object detection make_response).
        predictions = predictions[: len(img_dims)]

        keypoint_confidence_threshold = 0.0
        if "request" in kwargs:
            keypoint_confidence_threshold = kwargs["request"].keypoint_confidence
        responses = [
            KeypointsDetectionInferenceResponse(
                predictions=[
                    KeypointsPrediction(
                        # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                        **{
                            "x": (pred[0] + pred[2]) / 2,
                            "y": (pred[1] + pred[3]) / 2,
                            "width": pred[2] - pred[0],
                            "height": pred[3] - pred[1],
                            "confidence": pred[4],
                            "class": self.class_names[int(pred[6])],
                            "class_id": int(pred[6]),
                            "keypoints": model_keypoints_to_response(
                                keypoints_metadata=self.keypoints_metadata,
                                keypoints=pred[7:],
                                predicted_object_class_id=int(pred[6]),
                                keypoint_confidence_threshold=keypoint_confidence_threshold,
                            ),
                        }
                    )
                    for pred in batch_predictions
                    if not class_filter
                    or self.class_names[int(pred[6])] in class_filter
                ],
                image=InferenceResponseImage(
                    width=img_dims[ind][1], height=img_dims[ind][0]
                ),
            )
            for ind, batch_predictions in enumerate(predictions)
        ]
        return responses

    def keypoints_count(self) -> int:
        """Return the number of keypoints; subclasses must override this.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    def validate_model_classes(self) -> None:
        """Check that the model output implies the expected number of classes.

        The class count is derived from the third dimension of the model output
        shape and the keypoint count, then compared with `self.num_classes`.

        Raises:
            ValueError: If the derived class count differs from `self.num_classes`.
        """
        num_keypoints = self.keypoints_count()
        output_shape = self.get_model_output_shape()
        num_classes = get_num_classes_from_model_prediction_shape(
            len_prediction=output_shape[2], keypoints=num_keypoints
        )
        if num_classes != self.num_classes:
            raise ValueError(
                f"Number of classes in model ({num_classes}) does not match the number of classes in the environment ({self.num_classes})"
            )

Methods:¶

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Returns the list of files to be downloaded from the inference bucket for ONNX model.

Returns:

Name	Type	Description
`list`	`list`	A list of filenames specific to ONNX models.

Source code in inference/core/models/keypoints_detection_base.py

def get_infer_bucket_file_list(self) -> list:
    """List the files to download from the inference bucket for this ONNX model.

    Returns:
        list: A new list with the environment file, the class names file and
            the keypoints metadata file, in that order.
    """
    required_files = [
        "environment.json",
        "class_names.txt",
        "keypoints_metadata.json",
    ]
    return required_files

make_response ¶

make_response(
    predictions,
    img_dims,
    class_filter=None,
    *args,
    **kwargs
)

Constructs keypoints detection response objects based on predictions.

Parameters:

Name	Type	Description	Default
`predictions`	`List[List[float]]`	The list of predictions.	required
`img_dims`	`List[Tuple[int, int]]`	Dimensions of the images.	required
`class_filter`	`Optional[List[str]]`	A list of class names to filter, if provided.	`None`

Returns:

Type	Description
`List[KeypointsDetectionInferenceResponse]`	List[KeypointsDetectionInferenceResponse]: A list of response objects containing keypoints detection predictions.

Source code in inference/core/models/keypoints_detection_base.py

def make_response(
    self,
    predictions: List[List[float]],
    img_dims: List[Tuple[int, int]],
    class_filter: Optional[List[str]] = None,
    *args,
    **kwargs,
) -> List[KeypointsDetectionInferenceResponse]:
    """Constructs keypoints detection response objects based on predictions.

    Args:
        predictions (List[List[float]]): Per-image lists of detections. Each detection is indexed as
            `[x_min, y_min, x_max, y_max, confidence, <unused here>, class_id, *keypoints]`.
        img_dims (List[Tuple[int, int]]): Per-image dimensions, read as `(height, width)`. A dict
            containing an `"img_dims"` key is also accepted and unwrapped.
        class_filter (Optional[List[str]]): A list of class names to keep; a falsy value disables filtering.
        **kwargs: If a `"request"` entry is present, its `keypoint_confidence` attribute is used as
            the keypoint confidence threshold (otherwise 0.0).

    Returns:
        List[KeypointsDetectionInferenceResponse]: A list of response objects containing keypoints detection predictions.
    """
    if isinstance(img_dims, dict) and "img_dims" in img_dims:
        img_dims = img_dims["img_dims"]

    # If the batch size was fixed we have empty preds at the end; drop them so
    # that indexing img_dims below cannot run past the real images.
    predictions = predictions[: len(img_dims)]

    keypoint_confidence_threshold = 0.0
    if "request" in kwargs:
        keypoint_confidence_threshold = kwargs["request"].keypoint_confidence

    responses = []
    for ind, batch_predictions in enumerate(predictions):
        image_predictions = []
        for pred in batch_predictions:
            class_id = int(pred[6])
            class_name = self.class_names[class_id]
            if class_filter and class_name not in class_filter:
                continue
            image_predictions.append(
                KeypointsPrediction(
                    # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                    **{
                        "x": (pred[0] + pred[2]) / 2,
                        "y": (pred[1] + pred[3]) / 2,
                        "width": pred[2] - pred[0],
                        "height": pred[3] - pred[1],
                        "confidence": pred[4],
                        "class": class_name,
                        "class_id": class_id,
                        "keypoints": model_keypoints_to_response(
                            keypoints_metadata=self.keypoints_metadata,
                            keypoints=pred[7:],
                            predicted_object_class_id=class_id,
                            keypoint_confidence_threshold=keypoint_confidence_threshold,
                        ),
                    }
                )
            )
        responses.append(
            KeypointsDetectionInferenceResponse(
                predictions=image_predictions,
                image=InferenceResponseImage(
                    width=img_dims[ind][1], height=img_dims[ind][0]
                ),
            )
        )
    return responses

postprocess ¶

postprocess(
    predictions,
    preproc_return_metadata,
    class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
    confidence=DEFAULT_CONFIDENCE,
    iou_threshold=DEFAULT_IOU_THRESH,
    max_candidates=DEFAULT_MAX_CANDIDATES,
    max_detections=DEFAUlT_MAX_DETECTIONS,
    return_image_dims=False,
    **kwargs
)

Postprocesses the keypoints detection predictions.

Parameters:

Name	Type	Description	Default
`predictions`	`ndarray`	Raw predictions from the model.	required
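`preproc_return_metadata`	`PreprocessReturnMetadata`	Metadata returned by preprocessing; the image dimensions (`img_dims`) and the `disable_preproc_static_crop` flag are read from it.	required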
`class_agnostic_nms`	`bool`	Whether to apply class-agnostic non-max suppression. Default is False.	`DEFAULT_CLASS_AGNOSTIC_NMS`
`confidence`	`float`	Confidence threshold for filtering detections. Default is 0.5.	`DEFAULT_CONFIDENCE`
`iou_threshold`	`float`	IoU threshold for non-max suppression. Default is 0.5.	`DEFAULT_IOU_THRESH`
`max_candidates`	`int`	Maximum number of candidate detections. Default is 3000.	`DEFAULT_MAX_CANDIDATES`
`max_detections`	`int`	Maximum number of final detections. Default is 300.	`DEFAUlT_MAX_DETECTIONS`

Returns:

Type	Description
`List[KeypointsDetectionInferenceResponse]`	List[KeypointsDetectionInferenceResponse]: The post-processed predictions.

Source code in inference/core/models/keypoints_detection_base.py

def postprocess(
    self,
    predictions: Tuple[np.ndarray],
    preproc_return_metadata: PreprocessReturnMetadata,
    class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
    confidence: float = DEFAULT_CONFIDENCE,
    iou_threshold: float = DEFAULT_IOU_THRESH,
    max_candidates: int = DEFAULT_MAX_CANDIDATES,
    max_detections: int = DEFAUlT_MAX_DETECTIONS,
    return_image_dims: bool = False,
    **kwargs,
) -> List[KeypointsDetectionInferenceResponse]:
    """Turn raw model output into keypoints detection responses.

    Steps: non-max suppression, rescaling of boxes, rescaling of keypoints,
    then response construction via `make_response`.

    Args:
        predictions (Tuple[np.ndarray]): Raw model outputs; only the first element is used.
        preproc_return_metadata (PreprocessReturnMetadata): Metadata from preprocessing. The keys
            `"img_dims"` and `"disable_preproc_static_crop"` are read from it.
        class_agnostic_nms (bool): Whether to apply class-agnostic non-max suppression. Default is `DEFAULT_CLASS_AGNOSTIC_NMS`.
        confidence (float): Confidence threshold for filtering detections. Default is `DEFAULT_CONFIDENCE`.
        iou_threshold (float): IoU threshold for non-max suppression. Default is `DEFAULT_IOU_THRESH`.
        max_candidates (int): Maximum number of candidate detections. Default is `DEFAULT_MAX_CANDIDATES`.
        max_detections (int): Maximum number of final detections. Default is `DEFAUlT_MAX_DETECTIONS`.
        return_image_dims (bool): Accepted for interface compatibility; not used by this method.
        **kwargs: Forwarded to `make_response`.

    Returns:
        List[KeypointsDetectionInferenceResponse]: The post-processed predictions.
    """
    raw_output = predictions[0]
    class_count = len(self.get_class_names)
    # Values per detection left over after 5 leading entries and the per-class
    # entries; these trailing values are treated as keypoints below.
    trailing_values = raw_output.shape[2] - 5 - class_count
    detections = w_np_non_max_suppression(
        raw_output,
        conf_thresh=confidence,
        iou_thresh=iou_threshold,
        class_agnostic=class_agnostic_nms,
        max_detections=max_detections,
        max_candidate_detections=max_candidates,
        num_masks=trailing_values,
    )

    img_dims = preproc_return_metadata["img_dims"]
    # Both rescaling helpers take the same geometry arguments.
    rescale_args = dict(
        infer_shape=(self.img_size_h, self.img_size_w),
        img_dims=img_dims,
        preproc=self.preproc,
        resize_method=self.resize_method,
        disable_preproc_static_crop=preproc_return_metadata[
            "disable_preproc_static_crop"
        ],
    )
    detections = post_process_bboxes(predictions=detections, **rescale_args)
    detections = post_process_keypoints(
        predictions=detections,
        keypoints_start_index=-trailing_values,
        **rescale_args,
    )
    return self.make_response(detections, img_dims, **kwargs)

Functions:¶

inference.core.models.object_detection_base ¶

Classes¶

ObjectDetectionBaseOnnxRoboflowInferenceModel ¶

Roboflow ONNX Object detection model. This class implements an object detection specific infer method.

Source code in inference/core/models/object_detection_base.py

class ObjectDetectionBaseOnnxRoboflowInferenceModel(OnnxRoboflowInferenceModel):
    """Roboflow ONNX Object detection model. This class implements an object detection specific infer method."""

    task_type = "object-detection"
    box_format = "xywh"

    def infer(
        self,
        image: Any,
        class_agnostic_nms: bool = DEFAULT_CLASS_AGNOSTIC_NMS,
        confidence: float = DEFAULT_CONFIDENCE,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        iou_threshold: float = DEFAULT_IOU_THRESH,
        fix_batch_size: bool = False,
        max_candidates: int = DEFAULT_MAX_CANDIDATES,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        return_image_dims: bool = False,
        **kwargs,
    ) -> Any:
        """
        Run object detection on one or multiple images.

        This method performs no work of its own: it forwards `image`
        positionally and every named option by keyword to the parent class'
        `infer`, followed by any extra `**kwargs`.

        Args:
            image (Any): The input image or a list of images to process.
                - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
            class_agnostic_nms (bool, optional): Whether to use class-agnostic non-maximum suppression. Defaults to `DEFAULT_CLASS_AGNOSTIC_NMS`.
            confidence (float, optional): Confidence threshold for predictions. Defaults to `DEFAULT_CONFIDENCE`.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Defaults to False.
            disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Defaults to False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Defaults to False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Defaults to False.
            iou_threshold (float, optional): IoU threshold for non-maximum suppression. Defaults to `DEFAULT_IOU_THRESH`.
            fix_batch_size (bool, optional): If True, fix the batch size for predictions. Useful when the model requires a fixed batch size. Defaults to False.
            max_candidates (int, optional): Maximum number of candidate detections. Defaults to `DEFAULT_MAX_CANDIDATES`.
            max_detections (int, optional): Maximum number of detections after non-maximum suppression. Defaults to `DEFAUlT_MAX_DETECTIONS`.
            return_image_dims (bool, optional): Whether to return the dimensions of the processed images along with the predictions. Defaults to False.
            **kwargs: Arbitrary keyword arguments, passed through unchanged.

        Returns:
            Any: Whatever the parent `infer` returns; documented as one or multiple object detection inference responses, each containing a list of predictions.

        Raises:
            ValueError: Per the existing documentation, raised when more than one image is passed and batching is not enabled; that check is not performed in this method's body.
        """
        # Collect the named options once so the delegation is a single short call.
        forwarded_options = dict(
            class_agnostic_nms=class_agnostic_nms,
            confidence=confidence,
            disable_preproc_auto_orient=disable_preproc_auto_orient,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
            iou_threshold=iou_threshold,
            fix_batch_size=fix_batch_size,
            max_candidates=max_candidates,
            max_detections=max_detections,
            return_image_dims=return_image_dims,
        )
        return super().infer(image, **forwarded_options, **kwargs)

    def make_response(
        self,
        predictions: List[List[float]],
        img_dims: List[Tuple[int, int]],
        class_filter: Optional[List[str]] = None,
        *args,
        **kwargs,
    ) -> List[ObjectDetectionInferenceResponse]:
        """Build one object detection response per image.

        Args:
            predictions (List[List[float]]): Per-image lists of detections. Each detection is indexed as
                `[x_min, y_min, x_max, y_max, confidence, <unused here>, class_id, ...]`.
            img_dims (List[Tuple[int, int]]): Per-image dimensions, read as `(height, width)`. A dict
                containing an `"img_dims"` key is also accepted and unwrapped.
            class_filter (Optional[List[str]]): Class names to keep; a falsy value disables filtering.

        Returns:
            List[ObjectDetectionInferenceResponse]: A list of response objects containing object detection predictions.
        """
        if isinstance(img_dims, dict) and "img_dims" in img_dims:
            img_dims = img_dims["img_dims"]

        # If the batch size was fixed we have empty preds at the end
        image_count = len(img_dims)
        predictions = predictions[:image_count]

        responses = []
        for image_index, image_detections in enumerate(predictions):
            kept = []
            for detection in image_detections:
                class_id = int(detection[6])
                class_name = self.class_names[class_id]
                if class_filter and class_name not in class_filter:
                    continue
                # 'class' is a reserved word, so the fields travel as a dictionary.
                fields = {
                    "x": (detection[0] + detection[2]) / 2,
                    "y": (detection[1] + detection[3]) / 2,
                    "width": detection[2] - detection[0],
                    "height": detection[3] - detection[1],
                    "confidence": detection[4],
                    "class": class_name,
                    "class_id": class_id,
                }
                kept.append(ObjectDetectionPrediction(**fields))
            dims = img_dims[image_index]
            image_info = InferenceResponseImage(width=dims[1], height=dims[0])
            responses.append(
                ObjectDetectionInferenceResponse(predictions=kept, image=image_info)
            )
        return responses

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, ...],
        preproc_return_metadata: PreprocessReturnMetadata,
        class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
        confidence: float = DEFAULT_CONFIDENCE,
        iou_threshold: float = DEFAULT_IOU_THRESH,
        max_candidates: int = DEFAULT_MAX_CANDIDATES,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        return_image_dims: bool = False,
        **kwargs,
    ) -> List[ObjectDetectionInferenceResponse]:
        """Turn raw model output into object detection responses.

        Steps: non-max suppression, rescaling of boxes, then response
        construction via `make_response`.

        Args:
            predictions (Tuple[np.ndarray, ...]): Raw model outputs; only the first element is used.
            preproc_return_metadata (PreprocessReturnMetadata): Metadata from preprocessing. The keys
                `"img_dims"` and `"disable_preproc_static_crop"` are read from it.
            class_agnostic_nms (bool): Whether to apply class-agnostic non-max suppression. Default is `DEFAULT_CLASS_AGNOSTIC_NMS`.
            confidence (float): Confidence threshold for filtering detections. Default is `DEFAULT_CONFIDENCE`.
            iou_threshold (float): IoU threshold for non-max suppression. Default is `DEFAULT_IOU_THRESH`.
            max_candidates (int): Maximum number of candidate detections. Default is `DEFAULT_MAX_CANDIDATES`.
            max_detections (int): Maximum number of final detections. Default is `DEFAUlT_MAX_DETECTIONS`.
            return_image_dims (bool): Accepted for interface compatibility; not used by this method.
            **kwargs: Forwarded to `make_response`.

        Returns:
            List[ObjectDetectionInferenceResponse]: The post-processed predictions.
        """
        raw_output = predictions[0]
        detections = w_np_non_max_suppression(
            raw_output,
            conf_thresh=confidence,
            iou_thresh=iou_threshold,
            class_agnostic=class_agnostic_nms,
            max_detections=max_detections,
            max_candidate_detections=max_candidates,
            box_format=self.box_format,
        )

        inference_shape = (self.img_size_h, self.img_size_w)
        img_dims = preproc_return_metadata["img_dims"]
        detections = post_process_bboxes(
            detections,
            inference_shape,
            img_dims,
            self.preproc,
            resize_method=self.resize_method,
            disable_preproc_static_crop=preproc_return_metadata[
                "disable_preproc_static_crop"
            ],
        )
        return self.make_response(detections, img_dims, **kwargs)

    def preprocess(
        self,
        image: Any,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        fix_batch_size: bool = False,
        **kwargs,
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Loads the image(s), rescales them by 1/255 and pads them for inference.

        Padding is only applied when `self.batching_enabled` is true: the batch
        axis is optionally padded up to `MAX_BATCH_SIZE` and axes 2 and 3 are
        padded up to the next multiple of 32.

        Args:
            image (Any): The input image or list of images, passed to `self.load_image`.
            disable_preproc_auto_orient (bool, optional): Passed to `self.load_image`. Default is False.
            disable_preproc_contrast (bool, optional): Passed to `self.load_image`. Default is False.
            disable_preproc_grayscale (bool, optional): Passed to `self.load_image`. Default is False.
            disable_preproc_static_crop (bool, optional): Passed to `self.load_image` and recorded in the returned metadata. Default is False.
            fix_batch_size (bool, optional): If True (or if `FIX_BATCH_SIZE` is set), the batch is padded up to `MAX_BATCH_SIZE`. Default is False.
            **kwargs: Not used by this method.

        Returns:
            Tuple[np.ndarray, PreprocessReturnMetadata]: The preprocessed input and metadata
                holding `img_dims` and `disable_preproc_static_crop`.

        Raises:
            ValueError: If a fixed batch size is requested with more images than
                `MAX_BATCH_SIZE`, or if the loaded image is not a numpy array and
                PyTorch preprocessing is not enabled.
        """
        img_in, img_dims = self.load_image(
            image,
            disable_preproc_auto_orient=disable_preproc_auto_orient,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )

        img_in /= 255.0

        if self.batching_enabled:
            batch_padding = 0
            if FIX_BATCH_SIZE or fix_batch_size:
                if MAX_BATCH_SIZE == float("inf"):
                    # No upper bound configured, so there is nothing to pad up to
                    logger.warning(
                        "Requested fix_batch_size but MAX_BATCH_SIZE is not set. Using dynamic batching."
                    )
                else:
                    batch_padding = MAX_BATCH_SIZE - img_in.shape[0]
            if batch_padding < 0:
                raise ValueError(
                    f"Requested fix_batch_size but passed in {img_in.shape[0]} images "
                    f"when the model's batch size is {MAX_BATCH_SIZE}\n"
                    f"Consider turning off fix_batch_size, changing `MAX_BATCH_SIZE` in "
                    f"your inference server config, or passing at most {MAX_BATCH_SIZE} images at a time"
                )
            # Amount needed to bring axes 2 and 3 up to the next multiple of 32
            # (zero when the axis is already a multiple of 32).
            # NOTE(review): if the layout is NCHW, axis 2 is height and axis 3 is
            # width, i.e. the names are swapped; each padding is nevertheless
            # applied to the axis it was computed from, so the result is unaffected.
            width_padding = (32 - img_in.shape[2] % 32) % 32
            height_padding = (32 - img_in.shape[3] % 32) % 32

            if isinstance(img_in, np.ndarray):
                img_in = np.pad(
                    img_in,
                    (
                        (0, batch_padding),
                        (0, 0),
                        (0, width_padding),
                        (0, height_padding),
                    ),
                    "constant",
                )
            elif USE_PYTORCH_FOR_PREPROCESSING:
                # torch pads are listed starting from the last axis
                img_in = torch.nn.functional.pad(
                    img_in,
                    (
                        0,
                        height_padding,  # axis 3
                        0,
                        width_padding,  # axis 2
                        0,
                        0,  # channels
                        0,
                        batch_padding,
                    ),  # batch
                    mode="constant",
                    value=0,
                )
            else:
                raise ValueError(
                    f"Received an image of unknown type, {type(img_in)}; "
                    "This is most likely a bug. Contact Roboflow team through github issues "
                    "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                )

        return img_in, PreprocessReturnMetadata(
            {
                "img_dims": img_dims,
                "disable_preproc_static_crop": disable_preproc_static_crop,
            }
        )

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        """Placeholder for the model forward pass; subclasses provide the real one.

        Args:
            img_in (np.ndarray): The preprocessed image(s) to run inference on.
            **kwargs: Not used by this base implementation.

        Returns:
            Tuple[np.ndarray]: The ONNX model predictions (in overriding implementations).

        Raises:
            NotImplementedError: Always raised by this base implementation.
        """
        message = "predict must be implemented by a subclass"
        raise NotImplementedError(message)

    def validate_model_classes(self) -> None:
        """Checks that the model output encodes the configured number of classes.

        The class count is derived from index 2 of the model output shape (via
        `get_num_classes_from_model_prediction_shape` with `masks=0`) and compared
        with `self.num_classes`.

        Raises:
            ValueError: If the two class counts differ.
        """
        output_shape = self.get_model_output_shape()
        num_classes = get_num_classes_from_model_prediction_shape(
            output_shape[2], masks=0
        )
        # Explicit comparison rather than `assert`: asserts are removed when
        # Python runs with -O, which would silently disable this validation.
        if num_classes != self.num_classes:
            raise ValueError(
                f"Number of classes in model ({num_classes}) does not match the number of classes in the environment ({self.num_classes})"
            )

Methods:¶

infer ¶

infer(
    image,
    class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
    confidence=DEFAULT_CONFIDENCE,
    disable_preproc_auto_orient=False,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
    iou_threshold=DEFAULT_IOU_THRESH,
    fix_batch_size=False,
    max_candidates=DEFAULT_MAX_CANDIDATES,
    max_detections=DEFAUlT_MAX_DETECTIONS,
    return_image_dims=False,
    **kwargs
)

Runs object detection inference on one or multiple images and returns the detections.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The input image or a list of images to process. - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.	required
`class_agnostic_nms`	`bool`	Whether to use class-agnostic non-maximum suppression. Defaults to False.	`DEFAULT_CLASS_AGNOSTIC_NMS`
`confidence`	`float`	Confidence threshold for predictions. Defaults to 0.4.	`DEFAULT_CONFIDENCE`
`iou_threshold`	`float`	IoU threshold for non-maximum suppression. Defaults to 0.3.	`DEFAULT_IOU_THRESH`
`fix_batch_size`	`bool`	If True, fix the batch size for predictions. Useful when the model requires a fixed batch size. Defaults to False.	`False`
`max_candidates`	`int`	Maximum number of candidate detections. Defaults to 3000.	`DEFAULT_MAX_CANDIDATES`
`max_detections`	`int`	Maximum number of detections after non-maximum suppression. Defaults to 300.	`DEFAUlT_MAX_DETECTIONS`
`return_image_dims`	`bool`	Whether to return the dimensions of the processed images along with the predictions. Defaults to False.	`False`
`disable_preproc_auto_orient`	`bool`	If true, the auto orient preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_contrast`	`bool`	If true, the auto contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`
`*args`		Variable length argument list.	required
`**kwargs`		Arbitrary keyword arguments.	`{}`

Returns:

Type	Description
`Any`	Union[List[ObjectDetectionInferenceResponse], ObjectDetectionInferenceResponse]: One or multiple object detection inference responses based on the number of processed images. Each response contains a list of predictions. If `return_image_dims` is True, it will return a tuple with predictions and image dimensions.

Raises:

Type	Description
`ValueError`	If batching is not enabled for the model and more than one image is passed for processing.

Source code in inference/core/models/object_detection_base.py

def infer(
    self,
    image: Any,
    class_agnostic_nms: bool = DEFAULT_CLASS_AGNOSTIC_NMS,
    confidence: float = DEFAULT_CONFIDENCE,
    disable_preproc_auto_orient: bool = False,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
    iou_threshold: float = DEFAULT_IOU_THRESH,
    fix_batch_size: bool = False,
    max_candidates: int = DEFAULT_MAX_CANDIDATES,
    max_detections: int = DEFAUlT_MAX_DETECTIONS,
    return_image_dims: bool = False,
    **kwargs,
) -> Any:
    """
    Runs object detection inference by delegating to the parent class `infer`.

    Every named option is forwarded unchanged as a keyword argument, together
    with any extra `**kwargs`.

    Args:
        image (Any): The input image or a list of images to process.
            - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        class_agnostic_nms (bool, optional): Whether to use class-agnostic non-maximum suppression. Defaults to `DEFAULT_CLASS_AGNOSTIC_NMS`.
        confidence (float, optional): Confidence threshold for predictions. Defaults to `DEFAULT_CONFIDENCE`.
        disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
        disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
        iou_threshold (float, optional): IoU threshold for non-maximum suppression. Defaults to `DEFAULT_IOU_THRESH`.
        fix_batch_size (bool, optional): If True, fix the batch size for predictions. Useful when the model requires a fixed batch size. Defaults to False.
        max_candidates (int, optional): Maximum number of candidate detections. Defaults to `DEFAULT_MAX_CANDIDATES`.
        max_detections (int, optional): Maximum number of detections after non-maximum suppression. Defaults to `DEFAUlT_MAX_DETECTIONS`.
        return_image_dims (bool, optional): Whether to return the dimensions of the processed images along with the predictions. Defaults to False.
        **kwargs: Arbitrary keyword arguments, forwarded to the parent `infer`.

    Returns:
        Union[List[ObjectDetectionInferenceResponse], ObjectDetectionInferenceResponse]: One or multiple object detection inference responses based on the number of processed images. Each response contains a list of predictions. If `return_image_dims` is True, it will return a tuple with predictions and image dimensions.

    Raises:
        ValueError: If batching is not enabled for the model and more than one image is passed for processing.
    """
    # Collected in the same order the parent call receives them
    forwarded_options = {
        "class_agnostic_nms": class_agnostic_nms,
        "confidence": confidence,
        "disable_preproc_auto_orient": disable_preproc_auto_orient,
        "disable_preproc_contrast": disable_preproc_contrast,
        "disable_preproc_grayscale": disable_preproc_grayscale,
        "disable_preproc_static_crop": disable_preproc_static_crop,
        "iou_threshold": iou_threshold,
        "fix_batch_size": fix_batch_size,
        "max_candidates": max_candidates,
        "max_detections": max_detections,
        "return_image_dims": return_image_dims,
    }
    return super().infer(image, **forwarded_options, **kwargs)

make_response ¶

make_response(
    predictions,
    img_dims,
    class_filter=None,
    *args,
    **kwargs
)

Constructs object detection response objects based on predictions.

Parameters:

Name	Type	Description	Default
`predictions`	`List[List[float]]`	The list of predictions.	required
`img_dims`	`List[Tuple[int, int]]`	Dimensions of the images.	required
`class_filter`	`Optional[List[str]]`	A list of class names to filter, if provided.	`None`

Returns:

Type	Description
`List[ObjectDetectionInferenceResponse]`	List[ObjectDetectionInferenceResponse]: A list of response objects containing object detection predictions.

Source code in inference/core/models/object_detection_base.py

def make_response(
    self,
    predictions: List[List[float]],
    img_dims: List[Tuple[int, int]],
    class_filter: Optional[List[str]] = None,
    *args,
    **kwargs,
) -> List[ObjectDetectionInferenceResponse]:
    """Builds one object detection response per image from post-processed predictions.

    Each prediction is read as corner coordinates at indices 0-3, confidence at
    index 4 and class id at index 6, and is converted to a centre/width/height box.

    Args:
        predictions (List[List[float]]): Per-image lists of predictions.
        img_dims (List[Tuple[int, int]]): Per-image dimensions; index 0 is used as
            height and index 1 as width. A dict containing an "img_dims" key is
            also accepted and unwrapped.
        class_filter (Optional[List[str]]): If provided and non-empty, only
            predictions whose class name is in this list are kept.
        *args: Not used.
        **kwargs: Not used.

    Returns:
        List[ObjectDetectionInferenceResponse]: One response per entry of `img_dims`.
    """
    if isinstance(img_dims, dict) and "img_dims" in img_dims:
        img_dims = img_dims["img_dims"]

    # If the batch size was fixed we have empty preds at the end
    kept_batches = predictions[: len(img_dims)]

    responses = []
    for image_index, image_detections in enumerate(kept_batches):
        detections = []
        for det in image_detections:
            class_id = int(det[6])
            class_name = self.class_names[class_id]
            if class_filter and class_name not in class_filter:
                continue
            # Passed as a dictionary because 'class' is a reserved word in Python
            fields = {
                "x": (det[0] + det[2]) / 2,
                "y": (det[1] + det[3]) / 2,
                "width": det[2] - det[0],
                "height": det[3] - det[1],
                "confidence": det[4],
                "class": class_name,
                "class_id": class_id,
            }
            detections.append(ObjectDetectionPrediction(**fields))
        image_info = InferenceResponseImage(
            width=img_dims[image_index][1], height=img_dims[image_index][0]
        )
        responses.append(
            ObjectDetectionInferenceResponse(
                predictions=detections,
                image=image_info,
            )
        )
    return responses

postprocess ¶

postprocess(
    predictions,
    preproc_return_metadata,
    class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
    confidence=DEFAULT_CONFIDENCE,
    iou_threshold=DEFAULT_IOU_THRESH,
    max_candidates=DEFAULT_MAX_CANDIDATES,
    max_detections=DEFAUlT_MAX_DETECTIONS,
    return_image_dims=False,
    **kwargs
)

Postprocesses the object detection predictions.

Parameters:

Name	Type	Description	Default
`predictions`	`ndarray`	Raw predictions from the model.	required
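`preproc_return_metadata`	`PreprocessReturnMetadata`	Metadata returned by `preprocess`; provides `img_dims` (the image dimensions) and `disable_preproc_static_crop`.	required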
`class_agnostic_nms`	`bool`	Whether to apply class-agnostic non-max suppression. Default is False.	`DEFAULT_CLASS_AGNOSTIC_NMS`
`confidence`	`float`	Confidence threshold for filtering detections. Default is 0.5.	`DEFAULT_CONFIDENCE`
`iou_threshold`	`float`	IoU threshold for non-max suppression. Default is 0.5.	`DEFAULT_IOU_THRESH`
`max_candidates`	`int`	Maximum number of candidate detections. Default is 3000.	`DEFAULT_MAX_CANDIDATES`
`max_detections`	`int`	Maximum number of final detections. Default is 300.	`DEFAUlT_MAX_DETECTIONS`

Returns:

Type	Description
`List[ObjectDetectionInferenceResponse]`	List[ObjectDetectionInferenceResponse]: The post-processed predictions.

Source code in inference/core/models/object_detection_base.py

def postprocess(
    self,
    predictions: Tuple[np.ndarray, ...],
    preproc_return_metadata: PreprocessReturnMetadata,
    class_agnostic_nms=DEFAULT_CLASS_AGNOSTIC_NMS,
    confidence: float = DEFAULT_CONFIDENCE,
    iou_threshold: float = DEFAULT_IOU_THRESH,
    max_candidates: int = DEFAULT_MAX_CANDIDATES,
    max_detections: int = DEFAUlT_MAX_DETECTIONS,
    return_image_dims: bool = False,
    **kwargs,
) -> List[ObjectDetectionInferenceResponse]:
    """Turns raw model output into object detection responses.

    The first element of `predictions` is run through non-max suppression, the
    resulting boxes are passed to `post_process_bboxes` together with the
    inference shape and the original image dimensions, and the result is wrapped
    by `self.make_response`.

    Args:
        predictions (Tuple[np.ndarray, ...]): Raw predictions from the model; only the first element is used.
        preproc_return_metadata (PreprocessReturnMetadata): Metadata providing `img_dims` and `disable_preproc_static_crop`.
        class_agnostic_nms (bool): Whether to apply class-agnostic non-max suppression. Defaults to `DEFAULT_CLASS_AGNOSTIC_NMS`.
        confidence (float): Confidence threshold for filtering detections. Defaults to `DEFAULT_CONFIDENCE`.
        iou_threshold (float): IoU threshold for non-max suppression. Defaults to `DEFAULT_IOU_THRESH`.
        max_candidates (int): Maximum number of candidate detections. Defaults to `DEFAULT_MAX_CANDIDATES`.
        max_detections (int): Maximum number of final detections. Defaults to `DEFAUlT_MAX_DETECTIONS`.
        return_image_dims (bool): Accepted but not used by this method.
        **kwargs: Forwarded to `self.make_response`.

    Returns:
        List[ObjectDetectionInferenceResponse]: The post-processed predictions.
    """
    raw_output = predictions[0]
    detections = w_np_non_max_suppression(
        raw_output,
        conf_thresh=confidence,
        iou_thresh=iou_threshold,
        class_agnostic=class_agnostic_nms,
        max_detections=max_detections,
        max_candidate_detections=max_candidates,
        box_format=self.box_format,
    )

    infer_shape = (self.img_size_h, self.img_size_w)
    img_dims = preproc_return_metadata["img_dims"]
    static_crop_disabled = preproc_return_metadata["disable_preproc_static_crop"]
    detections = post_process_bboxes(
        detections,
        infer_shape,
        img_dims,
        self.preproc,
        resize_method=self.resize_method,
        disable_preproc_static_crop=static_crop_disabled,
    )
    return self.make_response(detections, img_dims, **kwargs)

predict ¶

predict(img_in, **kwargs)

Runs inference on the ONNX model.

Parameters:

Name	Type	Description	Default
`img_in`	`ndarray`	The preprocessed image(s) to run inference on.	required

Returns:

Type	Description
`Tuple[ndarray]`	Tuple[np.ndarray]: The ONNX model predictions.

Raises:

Type	Description
`NotImplementedError`	This method must be implemented by a subclass.

Source code in inference/core/models/object_detection_base.py

def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
    """Placeholder for the model forward pass; subclasses provide the real one.

    Args:
        img_in (np.ndarray): The preprocessed image(s) to run inference on.
        **kwargs: Not used by this base implementation.

    Returns:
        Tuple[np.ndarray]: The ONNX model predictions (in overriding implementations).

    Raises:
        NotImplementedError: Always raised by this base implementation.
    """
    message = "predict must be implemented by a subclass"
    raise NotImplementedError(message)

preprocess ¶

preprocess(
    image,
    disable_preproc_auto_orient=False,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
    fix_batch_size=False,
    **kwargs
)

Preprocesses an object detection inference request.

Parameters:

Name	Type	Description	Default
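`image`	`Any`	The input image or list of images to load and preprocess.	required
`disable_preproc_auto_orient`	`bool`	If true, the auto orient preprocessing step is disabled for this call.	`False`
`disable_preproc_contrast`	`bool`	If true, the auto contrast preprocessing step is disabled for this call.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call.	`False`
`fix_batch_size`	`bool`	If True, pad the batch up to `MAX_BATCH_SIZE` when batching is enabled.	`False`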

Returns:

Type	Description
`Tuple[ndarray, PreprocessReturnMetadata]`	Tuple[np.ndarray, PreprocessReturnMetadata]: Preprocessed image inputs and metadata containing `img_dims` and `disable_preproc_static_crop`.

Source code in inference/core/models/object_detection_base.py

def preprocess(
    self,
    image: Any,
    disable_preproc_auto_orient: bool = False,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
    fix_batch_size: bool = False,
    **kwargs,
) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
    """Loads the image(s), rescales them by 1/255 and pads them for inference.

    Padding is only applied when `self.batching_enabled` is true: the batch
    axis is optionally padded up to `MAX_BATCH_SIZE` and axes 2 and 3 are
    padded up to the next multiple of 32.

    Args:
        image (Any): The input image or list of images, passed to `self.load_image`.
        disable_preproc_auto_orient (bool, optional): Passed to `self.load_image`. Default is False.
        disable_preproc_contrast (bool, optional): Passed to `self.load_image`. Default is False.
        disable_preproc_grayscale (bool, optional): Passed to `self.load_image`. Default is False.
        disable_preproc_static_crop (bool, optional): Passed to `self.load_image` and recorded in the returned metadata. Default is False.
        fix_batch_size (bool, optional): If True (or if `FIX_BATCH_SIZE` is set), the batch is padded up to `MAX_BATCH_SIZE`. Default is False.
        **kwargs: Not used by this method.

    Returns:
        Tuple[np.ndarray, PreprocessReturnMetadata]: The preprocessed input and metadata
            holding `img_dims` and `disable_preproc_static_crop`.

    Raises:
        ValueError: If a fixed batch size is requested with more images than
            `MAX_BATCH_SIZE`, or if the loaded image is not a numpy array and
            PyTorch preprocessing is not enabled.
    """
    img_in, img_dims = self.load_image(
        image,
        disable_preproc_auto_orient=disable_preproc_auto_orient,
        disable_preproc_contrast=disable_preproc_contrast,
        disable_preproc_grayscale=disable_preproc_grayscale,
        disable_preproc_static_crop=disable_preproc_static_crop,
    )

    img_in /= 255.0

    if self.batching_enabled:
        batch_padding = 0
        if FIX_BATCH_SIZE or fix_batch_size:
            if MAX_BATCH_SIZE == float("inf"):
                # No upper bound configured, so there is nothing to pad up to
                logger.warning(
                    "Requested fix_batch_size but MAX_BATCH_SIZE is not set. Using dynamic batching."
                )
            else:
                batch_padding = MAX_BATCH_SIZE - img_in.shape[0]
        if batch_padding < 0:
            raise ValueError(
                f"Requested fix_batch_size but passed in {img_in.shape[0]} images "
                f"when the model's batch size is {MAX_BATCH_SIZE}\n"
                f"Consider turning off fix_batch_size, changing `MAX_BATCH_SIZE` in "
                f"your inference server config, or passing at most {MAX_BATCH_SIZE} images at a time"
            )
        # Amount needed to bring axes 2 and 3 up to the next multiple of 32
        # (zero when the axis is already a multiple of 32).
        # NOTE(review): if the layout is NCHW, axis 2 is height and axis 3 is
        # width, i.e. the names are swapped; each padding is nevertheless
        # applied to the axis it was computed from, so the result is unaffected.
        width_padding = (32 - img_in.shape[2] % 32) % 32
        height_padding = (32 - img_in.shape[3] % 32) % 32

        if isinstance(img_in, np.ndarray):
            img_in = np.pad(
                img_in,
                (
                    (0, batch_padding),
                    (0, 0),
                    (0, width_padding),
                    (0, height_padding),
                ),
                "constant",
            )
        elif USE_PYTORCH_FOR_PREPROCESSING:
            # torch pads are listed starting from the last axis
            img_in = torch.nn.functional.pad(
                img_in,
                (
                    0,
                    height_padding,  # axis 3
                    0,
                    width_padding,  # axis 2
                    0,
                    0,  # channels
                    0,
                    batch_padding,
                ),  # batch
                mode="constant",
                value=0,
            )
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(img_in)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

    return img_in, PreprocessReturnMetadata(
        {
            "img_dims": img_dims,
            "disable_preproc_static_crop": disable_preproc_static_crop,
        }
    )

Functions:¶

inference.core.models.roboflow ¶

Classes¶

OnnxRoboflowCoreModel ¶

Bases: RoboflowCoreModel

Roboflow Inference Model that operates using an ONNX model file.

Source code in inference/core/models/roboflow.py

class OnnxRoboflowCoreModel(RoboflowCoreModel):
    """Roboflow Inference Model that operates using an ONNX model file.

    Defines no attributes or methods of its own; everything is inherited from
    `RoboflowCoreModel`.
    """

OnnxRoboflowInferenceModel ¶

Roboflow Inference Model that operates using an ONNX model file.

Source code in inference/core/models/roboflow.py

class OnnxRoboflowInferenceModel(RoboflowInferenceModel):
    """Roboflow Inference Model that operates using an ONNX model file."""

    def __init__(
        self,
        model_id: str,
        onnxruntime_execution_providers: List[
            str
        ] = get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS),
        *args,
        **kwargs,
    ):
        """Initializes the OnnxRoboflowInferenceModel instance.

        Stores the execution providers (expanding the TensorRT provider with its
        engine-cache options), then initializes and validates the model.

        Args:
            model_id (str): The identifier for the specific ONNX model.
            onnxruntime_execution_providers (List[str]): ONNX Runtime execution
                providers to use. The default is computed once, when the function
                is defined; the list is never mutated here.
            *args: Variable length argument list, forwarded to the parent initializer.
            **kwargs: Arbitrary keyword arguments, forwarded to the parent
                initializer and to `initialize_model`.

        Raises:
            ModelArtefactError: If model initialization or validation fails.
        """
        super().__init__(model_id, *args, **kwargs)
        if self.load_weights or not self.has_model_metadata:
            self.onnxruntime_execution_providers = onnxruntime_execution_providers
            expanded_execution_providers = []
            for ep in self.onnxruntime_execution_providers:
                if ep == "TensorrtExecutionProvider":
                    # Replace the bare name with a (name, options) pair that
                    # enables the engine cache and FP16
                    ep = (
                        "TensorrtExecutionProvider",
                        {
                            "trt_engine_cache_enable": True,
                            "trt_engine_cache_path": os.path.join(
                                TENSORRT_CACHE_PATH, self.endpoint
                            ),
                            "trt_fp16_enable": True,
                        },
                    )
                expanded_execution_providers.append(ep)
            self.onnxruntime_execution_providers = expanded_execution_providers

        self.image_loader_threadpool = ThreadPoolExecutor(max_workers=None)
        self._session_lock = Lock()
        try:
            self.initialize_model(**kwargs)
            self.validate_model()
        except ModelArtefactError as e:
            logger.error(f"Unable to validate model artifacts, clearing cache: {e}")
            if DISK_CACHE_CLEANUP:
                self.clear_cache(delete_from_disk=True)
            else:
                logger.error("NOT deleting model from cache, inspect model artifacts")
            # Re-raise the caught exception so its message reaches the caller;
            # raising a fresh ModelArtefactError here would carry an empty message.
            raise

    def infer(self, image: Any, **kwargs) -> Any:
        """Runs inference on the given data, splitting large inputs into batches.

        A single input, or any input when the maximum batch size is unbounded, is
        handed to the parent `infer` in one call. Otherwise the input is split with
        `create_batches`, each batch is inferred separately and the results are
        combined with `merge_inference_results`.

        - image:
            can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        """
        batch_limit = MAX_BATCH_SIZE if self.batching_enabled else self.batch_size
        element_count = len(image) if isinstance(image, list) else 1
        is_single_input = element_count == 1
        is_unbounded = batch_limit == float("inf")
        if is_single_input or is_unbounded:
            return super().infer(image, **kwargs)

        logger.debug(
            f"Inference will be executed in batches, as there is {element_count} input elements and "
            f"maximum batch size for a model is set to: {batch_limit}"
        )
        # Kept as an explicit loop: zero-argument super() is not usable inside
        # comprehensions on older Python versions.
        per_batch_results = []
        for chunk in create_batches(sequence=image, batch_size=batch_limit):
            per_batch_results.append(super().infer(chunk, **kwargs))
        return self.merge_inference_results(inference_results=per_batch_results)

    def merge_inference_results(self, inference_results: List[Any]) -> Any:
        """Flattens per-batch result sequences into one list, preserving order.

        Args:
            inference_results (List[Any]): One iterable of results per batch.

        Returns:
            Any: A single list with the items of every batch, in order.
        """
        flattened = itertools.chain.from_iterable(inference_results)
        return list(flattened)

    def validate_model(self) -> None:
        """Validates the loaded model with a test inference and a class-count check.

        Validation is skipped when `MODEL_VALIDATION_DISABLED` is set, and ends
        early (after the failure-count check) when weights are not loaded. Each
        failure increments a per-endpoint counter in `cache` (60 second expiry);
        once the counter exceeds 3, validation is refused outright. A successful
        run resets the counter to 0 (3600 second expiry).

        Raises:
            ModelArtefactError: If validation failed too many times recently, the
                ONNX session is not initialized, the test inference fails, or the
                class validation fails.
        """
        if MODEL_VALIDATION_DISABLED:
            logger.debug("Model validation disabled.")
            return None
        logger.debug(f"Starting model validation for {self.endpoint}")
        error_count_key = self.endpoint + "_validate_model_error_count"
        validate_model_error_count = cache.get(error_count_key)
        if validate_model_error_count is None:
            validate_model_error_count = 0
        if validate_model_error_count > 3:
            raise ModelArtefactError(
                "Model validation failed multiple times, ignoring this model."
            )
        if not self.load_weights:
            return None

        def record_failure() -> None:
            # Bump the short-lived failure counter for this endpoint
            cache.set(error_count_key, validate_model_error_count + 1, expire=60)

        # Explicit check rather than `assert`: asserts are removed when Python
        # runs with -O, which would silently skip this validation step.
        if self.onnx_session is None:
            record_failure()
            raise ModelArtefactError(
                "ONNX session not initialized. Check that the model weights are available."
            )
        try:
            self.run_test_inference()
        except Exception as e:
            record_failure()
            raise ModelArtefactError(f"Unable to run test inference. Cause: {e}") from e
        try:
            self.validate_model_classes()
        except Exception as e:
            record_failure()
            raise ModelArtefactError(
                f"Unable to validate model classes. Cause: {e}"
            ) from e
        logger.debug(f"Model validation finished for {self.endpoint}")
        cache.set(error_count_key, 0, expire=3600)

    def run_test_inference(self) -> Any:
        """Runs `self.infer` on a random 1024x1024x3 uint8 image.

        The call is flagged with `usage_inference_test_run=True`.

        Returns:
            Any: Whatever `self.infer` returns for the test image.
        """
        test_image = (np.random.rand(1024, 1024, 3) * 255).astype(np.uint8)
        logger.debug(f"Running test inference. Image size: {test_image.shape}")
        result = self.infer(test_image, usage_inference_test_run=True)
        logger.debug("Test inference finished.")
        return result

    def get_model_output_shape(self) -> Tuple[int, int, int]:
        """Returns the shape of the model's first output for a random probe image.

        A random 1024x1024x3 uint8 image is preprocessed and passed to
        `self.predict`; the shape of the first returned element is reported.

        Returns:
            Tuple[int, int, int]: The shape of the first prediction output.
        """
        probe_image = (np.random.rand(1024, 1024, 3) * 255).astype(np.uint8)
        logger.debug(f"Getting model output shape. Image size: {probe_image.shape}")
        model_input, _metadata = self.preprocess(probe_image)
        first_output = self.predict(model_input)[0]
        logger.debug("Model output shape test finished.")
        return first_output.shape

    def validate_model_classes(self) -> None:
        """Hook called by `validate_model`; this implementation does nothing."""
        return None

    def get_infer_bucket_file_list(self) -> list:
        """Lists the files to download from the inference bucket for an ONNX model.

        Returns:
            list: A new list with the filenames "environment.json" and "class_names.txt".
        """
        required_files = ["environment.json"]
        required_files.append("class_names.txt")
        return required_files

    def initialize_model(self, **kwargs) -> None:
        """Initializes the ONNX model, setting up the inference session and other necessary properties.

        When weights are to be loaded, or no model metadata is available, an ONNX
        Runtime session is created; batch size and input height/width are read
        from its first input and written to memcache. Otherwise those values are
        restored from memcache and no session is created.

        Args:
            **kwargs: Forwarded to `self.get_model_artifacts`.

        Raises:
            ModelArtefactError: If the ONNX session cannot be created.
            OnnxProviderNotAvailable: If a provider listed in
                `REQUIRED_ONNX_PROVIDERS` is not among the available providers.
            ValueError: If neither weights nor model metadata are available.
        """
        logger.debug("Getting model artefacts")
        self.get_model_artifacts(**kwargs)
        logger.debug("Creating inference session")
        if self.load_weights or not self.has_model_metadata:
            t1_session = perf_counter()
            # Create an ONNX Runtime Session with a list of execution providers in priority order. ORT attempts to load providers until one is successful. This keeps the code across devices identical.
            providers = self.onnxruntime_execution_providers

            if not self.load_weights:
                # Session is only needed to read metadata, so use CPU-side providers
                providers = ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
            try:
                session_options = onnxruntime.SessionOptions()
                session_options.log_severity_level = 3
                # TensorRT does better graph optimization for its EP than onnx
                if has_trt(providers):
                    session_options.graph_optimization_level = (
                        onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
                    )
                self.onnx_session = onnxruntime.InferenceSession(
                    self.cache_file(self.weights_file),
                    providers=providers,
                    sess_options=session_options,
                )
            except Exception as e:
                self.clear_cache(delete_from_disk=DISK_CACHE_CLEANUP)
                raise ModelArtefactError(
                    f"Unable to load ONNX session. Cause: {e}"
                ) from e
            logger.debug(f"Session created in {perf_counter() - t1_session} seconds")

            if REQUIRED_ONNX_PROVIDERS:
                available_providers = onnxruntime.get_available_providers()
                for provider in REQUIRED_ONNX_PROVIDERS:
                    if provider not in available_providers:
                        raise OnnxProviderNotAvailable(
                            f"Required ONNX Execution Provider {provider} is not available. "
                            "Check that you are using the correct docker image on a supported device. "
                            "Export list of available providers as ONNXRUNTIME_EXECUTION_PROVIDERS environmental variable, "
                            "consult documentation for more details."
                        )

            inputs = self.onnx_session.get_inputs()[0]
            input_shape = inputs.shape
            self.batch_size = input_shape[0]
            self.img_size_h = input_shape[2]
            self.img_size_w = input_shape[3]
            self.input_name = inputs.name
            # A string height/width is not a usable size: fall back to the
            # configured resize, or to 640x640 when none is configured
            if isinstance(self.img_size_h, str) or isinstance(self.img_size_w, str):
                if "resize" in self.preproc:
                    self.img_size_h = int(self.preproc["resize"]["height"])
                    self.img_size_w = int(self.preproc["resize"]["width"])
                else:
                    self.img_size_h = 640
                    self.img_size_w = 640

            # A string batch size is treated as dynamic batching
            self.batching_enabled = isinstance(self.batch_size, str)
            batching_state = "enabled" if self.batching_enabled else "disabled"
            logger.debug(
                f"Model {self.endpoint} is loaded with dynamic batching {batching_state}"
            )

            model_metadata = {
                "batch_size": self.batch_size,
                "img_size_h": self.img_size_h,
                "img_size_w": self.img_size_w,
            }
            logger.debug("Writing model metadata to memcache")
            self.write_model_metadata_to_memcache(model_metadata)
            if not self.load_weights:  # had to load weights to get metadata
                del self.onnx_session
        else:
            if not self.has_model_metadata:
                raise ValueError(
                    "This should be unreachable, should get weights if we don't have model metadata"
                )
            logger.debug("Loading model metadata from memcache")
            metadata = self.model_metadata_from_memcache()
            self.batch_size = metadata["batch_size"]
            self.img_size_h = metadata["img_size_h"]
            self.img_size_w = metadata["img_size_w"]
            # A string batch size is treated as dynamic batching
            self.batching_enabled = isinstance(self.batch_size, str)
            batching_state = "enabled" if self.batching_enabled else "disabled"
            logger.debug(
                f"Model {self.endpoint} is loaded with dynamic batching {batching_state}"
            )

        logger.debug("Model initialisation finished.")

    def load_image(
        self,
        image: Any,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
    ) -> Tuple[np.ndarray, Tuple[Tuple[int, int], ...]]:
        """Preprocess a single image or a list of images into one model input.

        Args:
            image: One image, or a list of images, in any form accepted by
                ``self.preproc_image``.
            disable_preproc_auto_orient: Forwarded to ``self.preproc_image``.
            disable_preproc_contrast: Forwarded to ``self.preproc_image``.
            disable_preproc_grayscale: Forwarded to ``self.preproc_image``.
            disable_preproc_static_crop: Forwarded to ``self.preproc_image``.

        Returns:
            A pair ``(img_in, img_dims)``. ``img_in`` is the preprocessed input
            (for several images, the per-image results joined along axis 0) and
            ``img_dims`` is a tuple holding one dimensions entry per image, as
            returned by ``self.preproc_image``.

        Raises:
            ValueError: If several images were preprocessed into something that
                is neither a numpy array nor (with PyTorch preprocessing
                enabled) concatenable by ``torch.cat``.
        """
        preproc_flags = {
            "disable_preproc_auto_orient": disable_preproc_auto_orient,
            "disable_preproc_contrast": disable_preproc_contrast,
            "disable_preproc_grayscale": disable_preproc_grayscale,
            "disable_preproc_static_crop": disable_preproc_static_crop,
        }

        # Zero- or one-element lists and bare images take the direct path.
        if not (isinstance(image, list) and len(image) > 1):
            single_image = image[0] if isinstance(image, list) else image
            img_in, single_dims = self.preproc_image(single_image, **preproc_flags)
            return img_in, (single_dims,)

        # Several images: preprocess them on the loader thread pool.
        preprocess_one = partial(self.preproc_image, **preproc_flags)
        preprocessed = self.image_loader_threadpool.map(preprocess_one, image)
        imgs, img_dims = zip(*preprocessed)

        if isinstance(imgs[0], np.ndarray):
            return np.concatenate(imgs, axis=0), img_dims
        if USE_PYTORCH_FOR_PREPROCESSING:
            return torch.cat(imgs, dim=0), img_dims
        raise ValueError(
            f"Received a list of images of unknown type, {type(imgs[0])}; "
            "This is most likely a bug. Contact Roboflow team through github issues "
            "(https://github.com/roboflow/inference/issues) providing full context of the problem"
        )

    @property
    def weights_file(self) -> str:
        """Name of the file that holds the ONNX model weights.

        Returns:
            str: The weights file name, ``"weights.onnx"``.
        """
        onnx_weights_filename = "weights.onnx"
        return onnx_weights_filename

Attributes¶

weights_file `property` ¶

weights_file

Returns the file containing the ONNX model weights.

Returns:

Name	Type	Description
`str`	`str`	The file path to the weights file.

Methods:¶

__init__ ¶

__init__(
    model_id,
    onnxruntime_execution_providers=get_onnxruntime_execution_providers(
        ONNXRUNTIME_EXECUTION_PROVIDERS
    ),
    *args,
    **kwargs
)

Initializes the OnnxRoboflowInferenceModel instance.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	The identifier for the specific ONNX model.	required
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/core/models/roboflow.py

def __init__(
    self,
    model_id: str,
    onnxruntime_execution_providers: List[
        str
    ] = get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS),
    *args,
    **kwargs,
):
    """Initializes the OnnxRoboflowInferenceModel instance.

    Args:
        model_id (str): The identifier for the specific ONNX model.
        onnxruntime_execution_providers (List[str]): Execution providers to use
            for the ONNX Runtime session. Only stored when weights are loaded
            or no model metadata is cached.
        *args: Variable length argument list, forwarded to the parent class.
        **kwargs: Arbitrary keyword arguments, forwarded to the parent class
            and to ``initialize_model``.

    Raises:
        ModelArtefactError: If model initialisation or validation fails with
            that error.
    """
    super().__init__(model_id, *args, **kwargs)

    def with_provider_options(provider):
        # TensorRT is passed to ONNX Runtime as a (name, options) pair; every
        # other provider is passed through untouched. Options are built lazily
        # so the TensorRT cache path is only touched when actually needed.
        if provider != "TensorrtExecutionProvider":
            return provider
        trt_options = {
            "trt_engine_cache_enable": True,
            "trt_engine_cache_path": os.path.join(
                TENSORRT_CACHE_PATH, self.endpoint
            ),
            "trt_fp16_enable": True,
        }
        return ("TensorrtExecutionProvider", trt_options)

    if self.load_weights or not self.has_model_metadata:
        self.onnxruntime_execution_providers = onnxruntime_execution_providers
        self.onnxruntime_execution_providers = [
            with_provider_options(provider)
            for provider in self.onnxruntime_execution_providers
        ]

    self.image_loader_threadpool = ThreadPoolExecutor(max_workers=None)
    self._session_lock = Lock()
    try:
        self.initialize_model(**kwargs)
        self.validate_model()
    except ModelArtefactError as e:
        logger.error(f"Unable to validate model artifacts, clearing cache: {e}")
        if not DISK_CACHE_CLEANUP:
            logger.error("NOT deleting model from cache, inspect model artifacts")
        else:
            self.clear_cache(delete_from_disk=True)
        raise ModelArtefactError from e

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Returns the list of files to be downloaded from the inference bucket for ONNX model.

Returns:

Name	Type	Description
`list`	`list`	A list of filenames specific to ONNX models.

Source code in inference/core/models/roboflow.py

def get_infer_bucket_file_list(self) -> list:
    """List the files to download from the inference bucket for an ONNX model.

    Returns:
        list: A new list containing ``"environment.json"`` and
            ``"class_names.txt"``.
    """
    onnx_bucket_files = ("environment.json", "class_names.txt")
    return list(onnx_bucket_files)

infer ¶

infer(image, **kwargs)

Runs inference on given data. `image` can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Source code in inference/core/models/roboflow.py

def infer(self, image: Any, **kwargs) -> Any:
    """Run inference, splitting list inputs into batches when required.

    Args:
        image: A single input or a list of inputs; each can be a BGR numpy
            array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        **kwargs: Forwarded unchanged to the parent class ``infer``.

    Returns:
        The parent class result when there is one input element or the maximum
        batch size is infinite; otherwise the per-batch results combined by
        ``self.merge_inference_results``.
    """
    # Bound once up front: zero-argument super() is not usable inside a
    # comprehension on older Python versions.
    parent_infer = super().infer

    if isinstance(image, list):
        input_elements = len(image)
    else:
        input_elements = 1
    if self.batching_enabled:
        max_batch_size = MAX_BATCH_SIZE
    else:
        max_batch_size = self.batch_size

    if (input_elements == 1) or (max_batch_size == float("inf")):
        return parent_infer(image, **kwargs)

    logger.debug(
        f"Inference will be executed in batches, as there is {input_elements} input elements and "
        f"maximum batch size for a model is set to: {max_batch_size}"
    )
    per_batch_results = [
        parent_infer(batch_input, **kwargs)
        for batch_input in create_batches(sequence=image, batch_size=max_batch_size)
    ]
    return self.merge_inference_results(inference_results=per_batch_results)

initialize_model ¶

initialize_model(**kwargs)

Initializes the ONNX model, setting up the inference session and other necessary properties.

Source code in inference/core/models/roboflow.py

def initialize_model(self, **kwargs) -> None:
    """Initializes the ONNX model, setting up the inference session and other necessary properties.

    Model artifacts are fetched first. When weights must be loaded, or no model
    metadata is cached yet, an ONNX Runtime session is created; the batch size
    and input image size are read from the model's first input and written to
    memcache. Otherwise those values are restored from memcache and no session
    is created.

    Args:
        **kwargs: Forwarded unchanged to ``self.get_model_artifacts``.

    Raises:
        ModelArtefactError: If the ONNX Runtime session cannot be created.
        OnnxProviderNotAvailable: If a provider listed in
            ``REQUIRED_ONNX_PROVIDERS`` is not among the available providers.
        ValueError: If the metadata-only path is taken but no cached model
            metadata is found.
    """
    logger.debug("Getting model artefacts")
    self.get_model_artifacts(**kwargs)
    logger.debug("Creating inference session")
    if self.load_weights or not self.has_model_metadata:
        t1_session = perf_counter()
        # Create an ONNX Runtime Session with a list of execution providers in priority order. ORT attempts to load providers until one is successful. This keeps the code across devices identical.
        providers = self.onnxruntime_execution_providers

        if not self.load_weights:
            # The session is only needed to read metadata, so the configured
            # providers are replaced with this fixed pair.
            providers = ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
        try:
            session_options = onnxruntime.SessionOptions()
            session_options.log_severity_level = 3
            # TensorRT does better graph optimization for its EP than onnx
            if has_trt(providers):
                session_options.graph_optimization_level = (
                    onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
                )
            self.onnx_session = onnxruntime.InferenceSession(
                self.cache_file(self.weights_file),
                providers=providers,
                sess_options=session_options,
            )
        except Exception as e:
            self.clear_cache(delete_from_disk=DISK_CACHE_CLEANUP)
            raise ModelArtefactError(
                f"Unable to load ONNX session. Cause: {e}"
            ) from e
        logger.debug(f"Session created in {perf_counter() - t1_session} seconds")

        if REQUIRED_ONNX_PROVIDERS:
            available_providers = onnxruntime.get_available_providers()
            for provider in REQUIRED_ONNX_PROVIDERS:
                if provider not in available_providers:
                    raise OnnxProviderNotAvailable(
                        f"Required ONNX Execution Provider {provider} is not available. "
                        "Check that you are using the correct docker image on a supported device. "
                        "Export list of available providers as ONNXRUNTIME_EXECUTION_PROVIDERS environmental variable, "
                        "consult documentation for more details."
                    )

        # Read the batch and image dimensions from the model's first input;
        # indices 0, 2 and 3 are used for batch, height and width.
        inputs = self.onnx_session.get_inputs()[0]
        input_shape = inputs.shape
        self.batch_size = input_shape[0]
        self.img_size_h = input_shape[2]
        self.img_size_w = input_shape[3]
        self.input_name = inputs.name
        # A string height/width is not a usable size: fall back to the
        # configured resize, or to 640x640 when no resize is configured.
        if isinstance(self.img_size_h, str) or isinstance(self.img_size_w, str):
            if "resize" in self.preproc:
                self.img_size_h = int(self.preproc["resize"]["height"])
                self.img_size_w = int(self.preproc["resize"]["width"])
            else:
                self.img_size_h = 640
                self.img_size_w = 640

        # A string batch dimension is treated as dynamic batching.
        self.batching_enabled = isinstance(self.batch_size, str)
        logger.debug(
            f"Model {self.endpoint} is loaded with dynamic batching "
            f"{'enabled' if self.batching_enabled else 'disabled'}"
        )

        model_metadata = {
            "batch_size": self.batch_size,
            "img_size_h": self.img_size_h,
            "img_size_w": self.img_size_w,
        }
        logger.debug("Writing model metadata to memcache")
        self.write_model_metadata_to_memcache(model_metadata)
        if not self.load_weights:  # had to load weights to get metadata
            del self.onnx_session
    else:
        if not self.has_model_metadata:
            raise ValueError(
                "This should be unreachable, should get weights if we don't have model metadata"
            )
        logger.debug("Loading model metadata from memcache")
        metadata = self.model_metadata_from_memcache()
        self.batch_size = metadata["batch_size"]
        self.img_size_h = metadata["img_size_h"]
        self.img_size_w = metadata["img_size_w"]
        # Same rule as on the session path: a string batch size means dynamic batching.
        self.batching_enabled = isinstance(self.batch_size, str)
        logger.debug(
            f"Model {self.endpoint} is loaded with dynamic batching "
            f"{'enabled' if self.batching_enabled else 'disabled'}"
        )

    logger.debug("Model initialisation finished.")

RoboflowCoreModel ¶

Base Roboflow inference model (inherits from `RoboflowInferenceModel`; all Roboflow models are currently CV models).

Source code in inference/core/models/roboflow.py

class RoboflowCoreModel(RoboflowInferenceModel):
    """Base class for Roboflow core models, built on RoboflowInferenceModel.

    Constructing an instance downloads the model weights unless every file
    named by ``get_infer_bucket_file_list`` is already cached. Subclasses must
    implement ``get_infer_bucket_file_list`` and ``preprocess_image``.
    """

    def __init__(
        self,
        model_id: str,
        api_key=None,
        **kwargs,
    ):
        """Initializes the RoboflowCoreModel instance.

        Args:
            model_id (str): The identifier for the specific model.
            api_key ([type], optional): The API key for authentication. Defaults to None.
            **kwargs: Forwarded unchanged to the parent class initializer.
        """
        super().__init__(model_id, api_key=api_key, **kwargs)
        self.download_weights()

    def download_weights(self) -> None:
        """Downloads the model weights from the configured source.

        Nothing is downloaded when all required files are already cached.
        Otherwise the artifacts bucket is used when it is available, and the
        Roboflow API is used as the fallback.

        Raises:
            RoboflowAPINotAuthorizedError: If ``MODELS_CACHE_AUTH_ENABLED`` is
                set and the API key has no access to the model.
        """
        if MODELS_CACHE_AUTH_ENABLED:
            if not _check_if_api_key_has_access_to_model(
                api_key=self.api_key,
                model_id=self.endpoint,
                endpoint_type=ModelEndpointType.CORE_MODEL,
                countinference=self.countinference,
                service_secret=self.service_secret,
            ):
                raise RoboflowAPINotAuthorizedError(
                    f"API key {self.api_key} does not have access to model {self.endpoint}"
                )
        infer_bucket_files = self.get_infer_bucket_file_list()
        if are_all_files_cached(files=infer_bucket_files, model_id=self.endpoint):
            logger.debug("Model artifacts already downloaded, loading from cache")
            return None
        if is_model_artefacts_bucket_available():
            self.download_model_artefacts_from_s3()
            return None
        self.download_model_from_roboflow_api()

    def download_model_from_roboflow_api(self) -> None:
        """Downloads every weights file listed by the Roboflow API into the cache.

        The download runs under a per-model file lock. Any failure is logged
        and re-raised unchanged.

        Raises:
            ModelArtefactError: If the API response has no ``weights`` key.
        """
        # Use the same lock file pattern as in clear_cache
        lock_dir = MODEL_CACHE_DIR + "/_file_locks"  # Dedicated lock directory
        os.makedirs(lock_dir, exist_ok=True)  # Ensure lock directory exists.
        lock_file = os.path.join(lock_dir, f"{os.path.basename(self.cache_dir)}.lock")
        try:
            lock = FileLock(lock_file, timeout=120)  # 120 second timeout for downloads
            with lock:
                api_data = get_roboflow_model_data(
                    api_key=self.api_key,
                    model_id=self.endpoint,
                    endpoint_type=ModelEndpointType.CORE_MODEL,
                    device_id=self.device_id,
                    countinference=self.countinference,
                    service_secret=self.service_secret,
                )
                if "weights" not in api_data:
                    raise ModelArtefactError(
                        "`weights` key not available in Roboflow API response while downloading model weights."
                    )
                for weights_url_key in api_data["weights"]:
                    # Looked up through the current ``api_data`` binding, so a
                    # refresh below supplies the URLs for later iterations.
                    weights_url = api_data["weights"][weights_url_key]
                    t1 = perf_counter()
                    model_weights_response = get_from_url(
                        weights_url, json_response=False
                    )
                    # File name is the last path segment, without the query string.
                    filename = weights_url.split("?")[0].split("/")[-1]
                    save_bytes_in_cache(
                        content=model_weights_response.content,
                        file=filename,
                        model_id=self.endpoint,
                    )
                    if perf_counter() - t1 > 120:
                        logger.debug(
                            "Weights download took longer than 120 seconds, refreshing API request"
                        )
                        # Refresh with the same arguments as the initial
                        # request, including countinference/service_secret.
                        api_data = get_roboflow_model_data(
                            api_key=self.api_key,
                            model_id=self.endpoint,
                            endpoint_type=ModelEndpointType.CORE_MODEL,
                            device_id=self.device_id,
                            countinference=self.countinference,
                            service_secret=self.service_secret,
                        )
        except Exception as e:
            logger.error(f"Error downloading model artifacts: {e}")
            raise

    def get_device_id(self) -> str:
        """Returns the device ID associated with this model.

        Returns:
            str: The device ID.
        """
        return self.device_id

    def get_infer_bucket_file_list(self) -> List[str]:
        """Abstract method to get the list of files to be downloaded from the inference bucket.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.

        Returns:
            List[str]: A list of filenames.
        """
        raise NotImplementedError(
            "get_infer_bucket_file_list not implemented for RoboflowCoreModel"
        )

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """Abstract method to preprocess an image.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.

        Returns:
            Image.Image: The preprocessed PIL image.
        """
        raise NotImplementedError(self.__class__.__name__ + ".preprocess_image")

    @property
    def weights_file(self) -> Optional[str]:
        """Abstract property representing the file containing the model weights. For core models, all model artifacts are handled through get_infer_bucket_file_list method.

        Returns:
            Optional[str]: Always ``None`` in this base class.
        """
        return None

    @property
    def model_artifact_bucket(self):
        """Bucket holding core model artifacts (``CORE_MODEL_BUCKET``)."""
        return CORE_MODEL_BUCKET

Attributes¶

weights_file `property` ¶

weights_file

Abstract property representing the file containing the model weights. For core models, all model artifacts are handled through get_infer_bucket_file_list method.

Methods:¶

__init__ ¶

__init__(model_id, api_key=None, **kwargs)

Initializes the RoboflowCoreModel instance.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	The identifier for the specific model.	required
`api_key`	`[type]`	The API key for authentication. Defaults to None.	`None`

Source code in inference/core/models/roboflow.py

def __init__(
    self,
    model_id: str,
    api_key=None,
    **kwargs,
):
    """Set up the core model and make sure its weights are downloaded.

    Args:
        model_id (str): The identifier for the specific model.
        api_key ([type], optional): The API key for authentication. Defaults to None.
        **kwargs: Forwarded unchanged to the parent class initializer.
    """
    parent_init = super().__init__
    parent_init(model_id, api_key=api_key, **kwargs)
    # Weights are fetched as part of construction.
    self.download_weights()

download_weights ¶

download_weights()

Downloads the model weights from the configured source.

This method includes handling for AWS access keys and error handling.

Source code in inference/core/models/roboflow.py

def download_weights(self) -> None:
    """Make sure the model weights are present in the local cache.

    Nothing is downloaded when all required files are already cached.
    Otherwise the artifacts bucket is used when it is available, and the
    Roboflow API is used as the fallback.

    Raises:
        RoboflowAPINotAuthorizedError: If ``MODELS_CACHE_AUTH_ENABLED`` is set
            and the API key has no access to the model.
    """
    if MODELS_CACHE_AUTH_ENABLED and not _check_if_api_key_has_access_to_model(
        api_key=self.api_key,
        model_id=self.endpoint,
        endpoint_type=ModelEndpointType.CORE_MODEL,
        countinference=self.countinference,
        service_secret=self.service_secret,
    ):
        raise RoboflowAPINotAuthorizedError(
            f"API key {self.api_key} does not have access to model {self.endpoint}"
        )

    required_files = self.get_infer_bucket_file_list()
    if are_all_files_cached(files=required_files, model_id=self.endpoint):
        logger.debug("Model artifacts already downloaded, loading from cache")
    elif is_model_artefacts_bucket_available():
        self.download_model_artefacts_from_s3()
    else:
        self.download_model_from_roboflow_api()
    return None

get_device_id ¶

get_device_id()

Returns the device ID associated with this model.

Returns:

Name	Type	Description
`str`	`str`	The device ID.

Source code in inference/core/models/roboflow.py

def get_device_id(self) -> str:
    """Return the ``device_id`` stored on this model.

    Returns:
        str: The device ID.
    """
    device_identifier = self.device_id
    return device_identifier

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Abstract method to get the list of files to be downloaded from the inference bucket.

Raises:

Type	Description
`NotImplementedError`	This method must be implemented in subclasses.

Returns:

Type	Description
`List[str]`	A list of filenames.

Source code in inference/core/models/roboflow.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Placeholder for the list of files to download from the inference bucket.

    Subclasses are expected to override this method.

    Raises:
        NotImplementedError: Always, in this base implementation.

    Returns:
        List[str]: A list of filenames (in overriding implementations).
    """
    not_implemented_message = (
        "get_infer_bucket_file_list not implemented for RoboflowCoreModel"
    )
    raise NotImplementedError(not_implemented_message)

preprocess_image ¶

preprocess_image(image)

Abstract method to preprocess an image.

Raises:

Type	Description
`NotImplementedError`	This method must be implemented in subclasses.

Returns:

Type	Description
`Image`	The preprocessed PIL image.

Source code in inference/core/models/roboflow.py

def preprocess_image(self, image: Image.Image) -> Image.Image:
    """Placeholder for image preprocessing; subclasses are expected to override it.

    Raises:
        NotImplementedError: Always, in this base implementation. The message
            is the concrete class name followed by ``.preprocess_image``.

    Returns:
        Image.Image: The preprocessed PIL image (in overriding implementations).
    """
    missing_method = self.__class__.__name__ + ".preprocess_image"
    raise NotImplementedError(missing_method)

RoboflowInferenceModel ¶

Bases: Model

Base Roboflow inference model.

Source code in inference/core/models/roboflow.py

class RoboflowInferenceModel(Model):
    """Base Roboflow inference model."""

    def __init__(
        self,
        model_id: str,
        cache_dir_root=MODEL_CACHE_DIR,
        api_key=None,
        load_weights=True,
        **kwargs,
    ):
        """
        Set up identifiers, credentials and the cache location for a model.

        Args:
            model_id (str): The unique identifier for the model; it is passed
                through ``resolve_roboflow_model_alias`` before use.
            cache_dir_root (str, optional): The root directory for the cache. Defaults to MODEL_CACHE_DIR.
            api_key (str, optional): API key for authentication; ``API_KEY`` is
                used when this is falsy. Defaults to None.
            load_weights (bool, optional): Stored on the instance as
                ``load_weights``. Defaults to True.
            **kwargs: Only ``countinference`` and ``service_secret`` are read here.
        """
        super().__init__()
        self.load_weights = load_weights
        self.metrics = dict(num_inferences=0, avg_inference_time=0.0)
        self.api_key = api_key or API_KEY
        self.countinference: Optional[bool] = kwargs.get("countinference")
        self.service_secret: Optional[str] = kwargs.get("service_secret")

        # The resolved identifier is used everywhere below.
        resolved_model_id = resolve_roboflow_model_alias(model_id=model_id)
        dataset_id, version_id = get_model_id_chunks(model_id=resolved_model_id)
        self.dataset_id = dataset_id
        self.version_id = version_id
        self.endpoint = resolved_model_id
        self.device_id = GLOBAL_DEVICE_ID
        self.cache_dir = get_cache_dir(
            model_id=self.endpoint,
            cache_dir_root=cache_dir_root,
        )
        self.keypoints_metadata: Optional[dict] = None
        initialise_cache(model_id=self.endpoint)

    def cache_file(self, f: str) -> str:
        """Resolve a file name to its path in this model's cache.

        Args:
            f (str): Filename.

        Returns:
            str: The path returned by ``get_cache_file_path`` for this model's
                endpoint.
        """
        cached_path = get_cache_file_path(model_id=self.endpoint, file=f)
        return cached_path

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clear this model's cache via the module-level ``clear_cache`` helper.

        Args:
            delete_from_disk (bool, optional): Whether to delete cached files from disk. Defaults to True.
        """
        clear_options = {
            "model_id": self.endpoint,
            "delete_from_disk": delete_from_disk,
        }
        clear_cache(**clear_options)

    def draw_predictions(
        self,
        inference_request: InferenceRequest,
        inference_response: InferenceResponse,
    ) -> bytes:
        """Draw the predictions of an inference response onto the request's image.

        Delegates to ``draw_detection_predictions`` with this model's ``colors``.

        Args:
            inference_request (InferenceRequest): The inference request containing the image on which to draw predictions
            inference_response (InferenceResponse): The inference response containing predictions to be drawn

        Returns:
            bytes: Whatever ``draw_detection_predictions`` returns.
                NOTE(review): earlier documentation described this as a base64
                encoded image string while the annotation says ``bytes`` - confirm.
        """
        drawing_inputs = {
            "inference_request": inference_request,
            "inference_response": inference_response,
            "colors": self.colors,
        }
        return draw_detection_predictions(**drawing_inputs)

    @property
    def get_class_names(self):
        """The ``class_names`` attribute of this model, exposed as a property."""
        class_names = self.class_names
        return class_names

    def get_device_id(self) -> str:
        """
        Return the ``device_id`` stored on this model.

        Returns:
            str: Device identifier.
        """
        device_identifier = self.device_id
        return device_identifier

    def get_infer_bucket_file_list(self) -> List[str]:
        """Placeholder for the list of inference bucket files; subclasses override it.

        Raises:
            NotImplementedError: Always, in this base implementation. The
                message is the concrete class name followed by
                ``.get_infer_bucket_file_list``.

        Returns:
            List[str]: A list of inference bucket files (in overriding
                implementations).
        """
        missing_method = self.__class__.__name__ + ".get_infer_bucket_file_list"
        raise NotImplementedError(missing_method)

    @property
    def cache_key(self):
        """Memcache key for this model's metadata: ``metadata:<endpoint>``."""
        return "metadata:{}".format(self.endpoint)

    @staticmethod
    def model_metadata_from_memcache_endpoint(endpoint):
        """Look up cached model metadata for an arbitrary endpoint.

        Args:
            endpoint: Model endpoint; the cache key is ``metadata:<endpoint>``.

        Returns:
            Whatever ``cache.get`` returns for that key.
        """
        metadata_key = f"metadata:{endpoint}"
        return cache.get(metadata_key)

    def model_metadata_from_memcache(self):
        """Look up this model's cached metadata under ``self.cache_key``.

        Returns:
            Whatever ``cache.get`` returns for that key.
        """
        return cache.get(self.cache_key)

    def write_model_metadata_to_memcache(self, metadata):
        """Store model metadata in the cache under ``self.cache_key``.

        Args:
            metadata: The metadata object to store. The entry is written with
                ``MODEL_METADATA_CACHE_EXPIRATION_TIMEOUT`` as its expiry.
        """
        metadata_key = self.cache_key
        cache.set(
            metadata_key,
            metadata,
            expire=MODEL_METADATA_CACHE_EXPIRATION_TIMEOUT,
        )

    @property
    def has_model_metadata(self):
        """True when ``model_metadata_from_memcache`` returns something other than None."""
        cached_metadata = self.model_metadata_from_memcache()
        return cached_metadata is not None

    def get_model_artifacts(
        self,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Make the model artifacts available locally, then load them.

        Artifacts are cached through ``cache_model_artefacts`` and then loaded
        with ``load_model_artifacts_from_cache``.

        Args:
            countinference (Optional[bool]): Forwarded to the access check and
                to ``cache_model_artefacts``.
            service_secret (Optional[str]): Forwarded to the access check and
                to ``cache_model_artefacts``.
            **kwargs: Forwarded to ``cache_model_artefacts``.

        Raises:
            RoboflowAPINotAuthorizedError: If ``MODELS_CACHE_AUTH_ENABLED`` is
                set and the API key has no access to the model.
        """
        access_denied = (
            MODELS_CACHE_AUTH_ENABLED
            and not _check_if_api_key_has_access_to_model(
                api_key=self.api_key,
                model_id=self.endpoint,
                endpoint_type=ModelEndpointType.ORT,
                countinference=countinference,
                service_secret=service_secret,
            )
        )
        if access_denied:
            raise RoboflowAPINotAuthorizedError(
                f"API key {self.api_key} does not have access to model {self.endpoint}"
            )

        request_options = {
            "countinference": countinference,
            "service_secret": service_secret,
        }
        self.cache_model_artefacts(**request_options, **kwargs)
        self.load_model_artifacts_from_cache()

    def cache_model_artefacts(
        self,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Download the model artifacts unless they are all cached already.

        The artifacts bucket is used when it is available; otherwise the
        artifacts are downloaded through the Roboflow API.

        Args:
            countinference (Optional[bool]): Forwarded to the Roboflow API download.
            service_secret (Optional[str]): Forwarded to the Roboflow API download.
            **kwargs: Forwarded to the Roboflow API download.
        """
        required_files = self.get_all_required_infer_bucket_file()

        if are_all_files_cached(files=required_files, model_id=self.endpoint):
            pass  # everything is already in the cache
        elif is_model_artefacts_bucket_available():
            self.download_model_artefacts_from_s3()
        else:
            self.download_model_artifacts_from_roboflow_api(
                countinference=countinference,
                service_secret=service_secret,
                **kwargs,
            )
        return None

    def get_all_required_infer_bucket_file(self) -> List[str]:
        """List every file needed to load the model, including the weights file.

        Combines ``get_infer_bucket_file_list()`` with ``weights_file`` and
        drops ``None`` entries.

        Returns:
            List[str]: A new list of required file names.
        """
        # Build a fresh list rather than appending to the one returned by
        # get_infer_bucket_file_list(): an implementation that returns a shared
        # list would otherwise grow by one entry on every call.
        infer_bucket_files = [*self.get_infer_bucket_file_list(), self.weights_file]
        logger.debug(f"List of files required to load model: {infer_bucket_files}")
        return [f for f in infer_bucket_files if f is not None]

    def download_model_artefacts_from_s3(self) -> None:
        """Download all required model artifacts from S3 into the cache directory.

        The S3 keys are ``<endpoint>/<file>`` for each required file, fetched
        from ``self.model_artifact_bucket``.

        Raises:
            ModelArtefactError: If any step of the download fails; the original
                exception is attached as the cause.
        """
        with start_span(
            "model.artifacts.download",
            {"model.id": self.endpoint, "model.artifacts.source": "s3"},
        ):
            # Bound before the ``try`` so the error message below can always be
            # formatted, even when the failure happens before the keys are
            # built (previously that raised UnboundLocalError and hid the cause).
            s3_keys: List[str] = []
            try:
                logger.debug("Downloading model artifacts from S3")
                infer_bucket_files = self.get_all_required_infer_bucket_file()
                cache_directory = get_cache_dir()
                s3_keys = [f"{self.endpoint}/{file}" for file in infer_bucket_files]
                download_s3_files_to_directory(
                    bucket=self.model_artifact_bucket,
                    keys=s3_keys,
                    target_dir=cache_directory,
                    s3_client=S3_CLIENT,
                )
            except Exception as error:
                raise ModelArtefactError(
                    f"Could not obtain model artefacts from S3 with keys {s3_keys}. Cause: {error}"
                ) from error

    @property
    def model_artifact_bucket(self):
        """Bucket from which model artifacts are downloaded (``INFER_BUCKET``)."""
        artifact_bucket = INFER_BUCKET
        return artifact_bucket

    def download_model_artifacts_from_roboflow_api(
        self,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
        **kwargs,
    ) -> None:
        logger.debug("Downloading model artifacts from Roboflow API")
        with start_span(
            "model.artifacts.download",
            {"model.id": self.endpoint, "model.artifacts.source": "roboflow_api"},
        ):
            # Use the same lock file pattern as in clear_cache
            lock_dir = MODEL_CACHE_DIR + "/_file_locks"  # Dedicated lock directory
            os.makedirs(lock_dir, exist_ok=True)  # Ensure lock directory exists.
            lock_file = os.path.join(
                lock_dir, f"{os.path.basename(self.cache_dir)}.lock"
            )
            try:
                lock = FileLock(
                    lock_file, timeout=120
                )  # 120 second timeout for downloads
                with lock:
                    if self.version_id is not None:
                        api_data = get_roboflow_model_data(
                            api_key=self.api_key,
                            model_id=self.endpoint,
                            endpoint_type=ModelEndpointType.ORT,
                            device_id=self.device_id,
                            countinference=countinference,
                            service_secret=service_secret,
                        )
                        if "ort" not in api_data.keys():
                            raise ModelArtefactError(
                                "Could not find `ort` key in roboflow API model description response."
                            )
                        api_data = api_data["ort"]
                        if "classes" in api_data:
                            save_text_lines_in_cache(
                                content=api_data["classes"],
                                file="class_names.txt",
                                model_id=self.endpoint,
                            )
                        if "model" not in api_data:
                            raise ModelArtefactError(
                                "Could not find `model` key in roboflow API model description response."
                            )
                        if "environment" not in api_data:
                            raise ModelArtefactError(
                                "Could not find `environment` key in roboflow API model description response."
                            )
                        environment = get_from_url(api_data["environment"])
                        model_weights_response = get_from_url(
                            api_data["model"],
                            json_response=False,
                        )
                    else:
                        api_data = get_roboflow_instant_model_data(
                            api_key=self.api_key,
                            model_id=self.endpoint,
                            countinference=countinference,
                            service_secret=service_secret,
                        )
                        if (
                            "modelFiles" not in api_data
                            or "ort" not in api_data["modelFiles"]
                            or "model" not in api_data["modelFiles"]["ort"]
                        ):
                            raise ModelArtefactError(
                                "Could not find `modelFiles` key or `modelFiles`.`ort` or `modelFiles`.`ort`.`model` key in roboflow API model description response."
                            )
                        if "environment" not in api_data:
                            raise ModelArtefactError(
                                "Could not find `environment` key in roboflow API model description response."
                            )
                        model_weights_response = get_from_url(
                            api_data["modelFiles"]["ort"]["model"],
                            json_response=False,
                        )
                        environment = api_data["environment"]
                        if "classes" in api_data:
                            save_text_lines_in_cache(
                                content=api_data["classes"],
                                file="class_names.txt",
                                model_id=self.endpoint,
                            )

                    save_bytes_in_cache(
                        content=model_weights_response.content,
                        file=self.weights_file,
                        model_id=self.endpoint,
                    )
                    if "colors" in api_data:
                        environment["COLORS"] = api_data["colors"]
                    save_json_in_cache(
                        content=environment,
                        file="environment.json",
                        model_id=self.endpoint,
                    )
                    if "keypoints_metadata" in api_data:
                        # TODO: make sure backend provides that
                        save_json_in_cache(
                            content=api_data["keypoints_metadata"],
                            file="keypoints_metadata.json",
                            model_id=self.endpoint,
                        )
            except Exception as e:
                logger.error(f"Error downloading model artifacts: {e}")
                raise

    def load_model_artifacts_from_cache(self) -> None:
        """Load cached model artifacts and derive the preprocessing settings.

        Reads from the cache entry of ``self.endpoint`` whichever of
        ``environment.json``, ``class_names.txt`` and ``keypoints_metadata.json``
        are listed by ``get_all_required_infer_bucket_file()``, then sets
        ``class_names``, ``colors``, ``num_classes``, ``preproc``,
        ``resize_method`` and ``multiclass`` (plus ``environment`` and
        ``keypoints_metadata`` when their files are listed).

        Raises:
            ModelArtefactError: If the environment has no ``PREPROCESSING`` key.
        """
        logger.debug("Model artifacts already downloaded, loading model from cache")
        with start_span(
            "model.artifacts.load",
            {"model.id": self.endpoint, "model.artifacts.source": "local_cache"},
        ):
            infer_bucket_files = self.get_all_required_infer_bucket_file()
            if "environment.json" in infer_bucket_files:
                self.environment = load_json_from_cache(
                    file="environment.json",
                    model_id=self.endpoint,
                    object_pairs_hook=OrderedDict,
                )
            # Class names come from their own file when it is listed, otherwise
            # they are taken from the environment.
            if "class_names.txt" in infer_bucket_files:
                self.class_names = load_text_file_from_cache(
                    file="class_names.txt",
                    model_id=self.endpoint,
                    split_lines=True,
                    strip_white_chars=True,
                )
            else:
                self.class_names = get_class_names_from_environment_file(
                    environment=self.environment
                )
            self.colors = get_color_mapping_from_environment(
                environment=self.environment,
                class_names=self.class_names,
            )
            if "keypoints_metadata.json" in infer_bucket_files:
                self.keypoints_metadata = parse_keypoints_metadata(
                    load_json_from_cache(
                        file="keypoints_metadata.json",
                        model_id=self.endpoint,
                        object_pairs_hook=OrderedDict,
                    )
                )
            self.num_classes = len(self.class_names)
            if "PREPROCESSING" not in self.environment:
                raise ModelArtefactError(
                    "Could not find `PREPROCESSING` key in environment file."
                )
            # PREPROCESSING is used as-is when it is already a mapping; anything
            # else is decoded as JSON.
            preprocessing = self.environment["PREPROCESSING"]
            if isinstance(preprocessing, dict):
                self.preproc = preprocessing
            else:
                self.preproc = json.loads(preprocessing)
            if self.preproc.get("resize"):
                self.resize_method = self.preproc["resize"].get("format", "Stretch to")
                # These formats have no dedicated implementation here; they are
                # mapped to black-edge letterboxing with a warning.
                if self.resize_method in (
                    "Fit (reflect edges) in",
                    "Fit within",
                    "Fill (with center crop) in",
                ):
                    fallback_resize_method = "Fit (black edges) in"
                    logger.warning(
                        "Unsupported resize method '%s', defaulting to '%s' - this may result in degraded model performance.",
                        self.resize_method,
                        fallback_resize_method,
                    )
                    self.resize_method = fallback_resize_method
                # Anything still outside the supported set falls back to stretching.
                if self.resize_method not in (
                    "Stretch to",
                    "Fit (black edges) in",
                    "Fit (grey edges) in",
                    "Fit (white edges) in",
                ):
                    logger.error(
                        "Unsupported resize method '%s', defaulting to 'Stretch to' - this may result in degraded model performance.",
                        self.resize_method,
                    )
                    self.resize_method = "Stretch to"
            else:
                logger.error(
                    "Unknown resize method, defaulting to 'Stretch to' - this may result in degraded model performance."
                )
                self.resize_method = "Stretch to"
            logger.debug("Resize method is '%s'", self.resize_method)
            self.multiclass = self.environment.get("MULTICLASS", False)

    def initialize_model(self, **kwargs) -> None:
        """Set up the underlying model; concrete subclasses must override this.

        Args:
            **kwargs: Accepted for interface compatibility; not used by this base implementation.

        Raises:
            NotImplementedError: Always, naming the class that lacks an override.
        """
        missing_member = f"{self.__class__.__name__}.initialize_model"
        raise NotImplementedError(missing_member)

    def preproc_image(
        self,
        image: Union[Any, InferenceRequestImage],
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
    ) -> Tuple[np.ndarray, Tuple[int, int]]:
        """
        Preprocesses an inference request image by loading it, then applying any pre-processing specified by the Roboflow platform, then scaling it to the inference input dimensions.

        Args:
            image (Union[Any, InferenceRequestImage]): An object containing information necessary to load the image for inference.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
            disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

        Returns:
            Tuple[np.ndarray, Tuple[int, int]]: A tuple containing a numpy array of the preprocessed image pixel data and a tuple of the image's original size. When USE_PYTORCH_FOR_PREPROCESSING is set, the first element may be a float torch tensor instead of a numpy array.

        Raises:
            ValueError: If the intermediate image is not a numpy array and USE_PYTORCH_FOR_PREPROCESSING is not set.
        """
        # Auto-orient is skipped when the caller disables it, when the model's
        # preprocessing config has no "auto-orient" entry, or when the
        # DISABLE_PREPROC_AUTO_ORIENT flag is set.
        np_image, is_bgr = load_image(
            image,
            disable_preproc_auto_orient=disable_preproc_auto_orient
            or "auto-orient" not in self.preproc.keys()
            or DISABLE_PREPROC_AUTO_ORIENT,
        )
        preprocessed_image, img_dims = self.preprocess_image(
            np_image,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )

        if USE_PYTORCH_FOR_PREPROCESSING:
            # Torch path: wrap the array in a tensor, move it to the GPU when one
            # is available, then reorder axes to (2, 0, 1) and add a leading axis
            # (presumably HWC -> NCHW; confirm against the layout `prepare` returns).
            preprocessed_image = torch.from_numpy(
                np.ascontiguousarray(preprocessed_image)
            )
            if torch.cuda.is_available():
                preprocessed_image = preprocessed_image.cuda()
            preprocessed_image = (
                preprocessed_image.permute(2, 0, 1).unsqueeze(0).contiguous().float()
            )
        if self.resize_method == "Stretch to":
            if isinstance(preprocessed_image, np.ndarray):
                preprocessed_image = preprocessed_image.astype(np.float32)
                # cv2.resize takes the target size as (width, height).
                resized = cv2.resize(
                    preprocessed_image,
                    (self.img_size_w, self.img_size_h),
                )
            elif USE_PYTORCH_FOR_PREPROCESSING:
                # interpolate takes the target size as (height, width).
                resized = torch.nn.functional.interpolate(
                    preprocessed_image,
                    size=(self.img_size_h, self.img_size_w),
                    mode="bilinear",
                )
            else:
                raise ValueError(
                    f"Received an image of unknown type, {type(preprocessed_image)}; "
                    "This is most likely a bug. Contact Roboflow team through github issues "
                    "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                )

        # The three "Fit" variants differ only in the padding color passed to
        # letterbox_image (none given for black edges).
        elif self.resize_method == "Fit (black edges) in":
            resized = letterbox_image(
                preprocessed_image, (self.img_size_w, self.img_size_h)
            )
        elif self.resize_method == "Fit (white edges) in":
            resized = letterbox_image(
                preprocessed_image,
                (self.img_size_w, self.img_size_h),
                color=(255, 255, 255),
            )
        elif self.resize_method == "Fit (grey edges) in":
            resized = letterbox_image(
                preprocessed_image,
                (self.img_size_w, self.img_size_h),
                color=(114, 114, 114),
            )
        # NOTE(review): no branch handles any other resize method, so `resized`
        # would be unbound below; this relies on resize_method having been
        # restricted to the four values above when artifacts were loaded - confirm.

        if is_bgr:
            # Reverse the channel order: cv2 conversion for arrays, index
            # [2, 1, 0] along axis 1 for tensors.
            if isinstance(resized, np.ndarray):
                resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
            else:
                resized = resized[:, [2, 1, 0], :, :]

        if isinstance(resized, np.ndarray):
            # Numpy path: move the last axis first, cast to float32 and add a
            # leading axis, mirroring the permute/unsqueeze of the torch path.
            img_in = np.transpose(resized, (2, 0, 1))
            img_in = img_in.astype(np.float32)
            img_in = np.expand_dims(img_in, axis=0)
        elif USE_PYTORCH_FOR_PREPROCESSING:
            img_in = resized.float()
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(resized)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

        return img_in, img_dims

    def preprocess_image(
        self,
        image: np.ndarray,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
    ) -> Tuple[np.ndarray, Tuple[int, int]]:
        """
        Applies the model's configured preprocessing (``self.preproc``) to an image via ``prepare``.

        Args:
            image (np.ndarray): The image to preprocess.
            disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

        Returns:
            Tuple[np.ndarray, Tuple[int, int]]: Whatever ``prepare`` returns: the preprocessed image together with a pair of image dimensions.
        """
        # Per-call switches are forwarded to `prepare` unchanged.
        step_overrides = {
            "disable_preproc_contrast": disable_preproc_contrast,
            "disable_preproc_grayscale": disable_preproc_grayscale,
            "disable_preproc_static_crop": disable_preproc_static_crop,
        }
        return prepare(image, self.preproc, **step_overrides)

    @property
    def weights_file(self) -> str:
        """Abstract property naming the file that contains the model weights.

        Raises:
            NotImplementedError: Always; subclasses must provide this property.

        Returns:
            str: The weights file, as defined by the subclass.
        """
        owner_name = self.__class__.__name__
        raise NotImplementedError(f"{owner_name}.weights_file")

Attributes¶

weights_file `property` ¶

weights_file

Abstract property representing the file containing the model weights.

Raises:

Type	Description
`NotImplementedError`	This property must be implemented in subclasses.

Returns:

Name	Type	Description
`str`	`str`	The file path to the weights file.

Methods:¶

__init__ ¶

__init__(
    model_id,
    cache_dir_root=MODEL_CACHE_DIR,
    api_key=None,
    load_weights=True,
    **kwargs
)

Initialize the RoboflowInferenceModel object.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	The unique identifier for the model.	required
`cache_dir_root`	`str`	The root directory for the cache. Defaults to MODEL_CACHE_DIR.	`MODEL_CACHE_DIR`
`api_key`	`str`	API key for authentication. Defaults to None.	`None`

Source code in inference/core/models/roboflow.py

def __init__(
    self,
    model_id: str,
    cache_dir_root=MODEL_CACHE_DIR,
    api_key=None,
    load_weights=True,
    **kwargs,
):
    """
    Set up identifiers, credentials, metrics and the cache entry for a Roboflow model.

    Args:
        model_id (str): The unique identifier for the model; it is passed through alias resolution before use.
        cache_dir_root (str, optional): The root directory for the cache. Defaults to MODEL_CACHE_DIR.
        api_key (str, optional): API key for authentication. When falsy, the module-level API_KEY is used instead.
        load_weights (bool, optional): Stored on the instance as ``load_weights``. Defaults to True.
        **kwargs: Only ``countinference`` and ``service_secret`` are read; each is None when absent.
    """
    super().__init__()
    self.load_weights = load_weights
    self.metrics = {
        "num_inferences": 0,
        "avg_inference_time": 0.0,
    }
    self.api_key = api_key or API_KEY
    self.countinference: Optional[bool] = kwargs.get("countinference")
    self.service_secret: Optional[str] = kwargs.get("service_secret")

    # Everything below works with the alias-resolved identifier.
    resolved_model_id = resolve_roboflow_model_alias(model_id=model_id)
    dataset_id, version_id = get_model_id_chunks(model_id=resolved_model_id)
    self.dataset_id = dataset_id
    self.version_id = version_id
    self.endpoint = resolved_model_id
    self.device_id = GLOBAL_DEVICE_ID
    self.cache_dir = get_cache_dir(
        model_id=self.endpoint,
        cache_dir_root=cache_dir_root,
    )
    self.keypoints_metadata: Optional[dict] = None
    initialise_cache(model_id=self.endpoint)

cache_file ¶

cache_file(f)

Get the cache file path for a given file.

Parameters:

Name	Type	Description	Default
`f`	`str`	Filename.	required

Returns:

Name	Type	Description
`str`	`str`	Full path to the cached file.

Source code in inference/core/models/roboflow.py

def cache_file(self, f: str) -> str:
    """Resolve a file name to its location in this model's cache.

    Args:
        f (str): Filename.

    Returns:
        str: The path returned by ``get_cache_file_path`` for ``f`` under ``self.endpoint``.
    """
    cached_path = get_cache_file_path(file=f, model_id=self.endpoint)
    return cached_path

clear_cache ¶

clear_cache(delete_from_disk=True)

Clear the cache directory.

Parameters:

Name	Type	Description	Default
`delete_from_disk`	`bool`	Whether to delete cached files from disk. Defaults to True.	`True`

Source code in inference/core/models/roboflow.py

def clear_cache(self, delete_from_disk: bool = True) -> None:
    """Clear this model's cache entry.

    Args:
        delete_from_disk (bool, optional): Whether to delete cached files from disk. Defaults to True.
    """
    # Inside a method body the bare name resolves to the module-level
    # `clear_cache` helper, not to this method.
    model_id = self.endpoint
    clear_cache(model_id=model_id, delete_from_disk=delete_from_disk)

draw_predictions ¶

draw_predictions(inference_request, inference_response)

Draw predictions from an inference response onto the original image provided by an inference request

Parameters:

Name	Type	Description	Default
`inference_request`	`ObjectDetectionInferenceRequest`	The inference request containing the image on which to draw predictions	required
`inference_response`	`ObjectDetectionInferenceResponse`	The inference response containing predictions to be drawn	required

Returns:

Name	Type	Description
`str`	`bytes`	A base64 encoded image string

Source code in inference/core/models/roboflow.py

def draw_predictions(
    self,
    inference_request: InferenceRequest,
    inference_response: InferenceResponse,
) -> bytes:
    """Render the predictions of a response onto the image supplied by the originating request.

    Args:
        inference_request (InferenceRequest): The inference request containing the image on which to draw predictions
        inference_response (InferenceResponse): The inference response containing predictions to be drawn

    Returns:
        bytes: The rendered image as produced by ``draw_detection_predictions``, drawn with ``self.colors``.
    """
    drawing_arguments = {
        "inference_request": inference_request,
        "inference_response": inference_response,
        "colors": self.colors,
    }
    return draw_detection_predictions(**drawing_arguments)

get_device_id ¶

get_device_id()

Get the device identifier on which the model is deployed.

Returns:

Name	Type	Description
`str`	`str`	Device identifier.

Source code in inference/core/models/roboflow.py

def get_device_id(self) -> str:
    """
    Report the identifier of the device on which the model is deployed.

    Returns:
        str: The value stored in ``self.device_id``.
    """
    device_identifier = self.device_id
    return device_identifier

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get a list of inference bucket files.

Raises:

Type	Description
`NotImplementedError`	If the method is not implemented.

Returns:

Type	Description
`List[str]`	List[str]: A list of inference bucket files.

Source code in inference/core/models/roboflow.py

def get_infer_bucket_file_list(self) -> List[str]:
    """List the inference bucket files; concrete subclasses must override this.

    Raises:
        NotImplementedError: Always, naming the class that lacks an override.

    Returns:
        List[str]: A list of inference bucket files (in overriding implementations).
    """
    missing_member = f"{self.__class__.__name__}.get_infer_bucket_file_list"
    raise NotImplementedError(missing_member)

get_model_artifacts ¶

get_model_artifacts(
    countinference=None, service_secret=None, **kwargs
)

Fetch or load the model artifacts.

Downloads the model artifacts from S3 or the Roboflow API if they are not already cached.

Source code in inference/core/models/roboflow.py

def get_model_artifacts(
    self,
    countinference: Optional[bool] = None,
    service_secret: Optional[str] = None,
    **kwargs,
) -> None:
    """Fetch or load the model artifacts.

    When MODELS_CACHE_AUTH_ENABLED is set, first verifies that the API key may
    access the model. Then delegates to ``cache_model_artefacts`` followed by
    ``load_model_artifacts_from_cache``.

    Args:
        countinference (Optional[bool]): Forwarded to the access check and to ``cache_model_artefacts``.
        service_secret (Optional[str]): Forwarded to the access check and to ``cache_model_artefacts``.
        **kwargs: Forwarded unchanged to ``cache_model_artefacts``.

    Raises:
        RoboflowAPINotAuthorizedError: If the access check is enabled and fails.
    """
    if MODELS_CACHE_AUTH_ENABLED and not _check_if_api_key_has_access_to_model(
        api_key=self.api_key,
        model_id=self.endpoint,
        endpoint_type=ModelEndpointType.ORT,
        countinference=countinference,
        service_secret=service_secret,
    ):
        # The API key is a credential: keep it out of the exception text,
        # since exception messages commonly end up in logs and error responses.
        raise RoboflowAPINotAuthorizedError(
            f"The provided API key does not have access to model {self.endpoint}"
        )
    self.cache_model_artefacts(
        countinference=countinference,
        service_secret=service_secret,
        **kwargs,
    )
    self.load_model_artifacts_from_cache()

initialize_model ¶

initialize_model(**kwargs)

Initialize the model.

Raises:

Type	Description
`NotImplementedError`	If the method is not implemented.

Source code in inference/core/models/roboflow.py

def initialize_model(self, **kwargs) -> None:
    """Set up the underlying model; concrete subclasses must override this.

    Args:
        **kwargs: Accepted for interface compatibility; not used by this base implementation.

    Raises:
        NotImplementedError: Always, naming the class that lacks an override.
    """
    missing_member = f"{self.__class__.__name__}.initialize_model"
    raise NotImplementedError(missing_member)

preproc_image ¶

preproc_image(
    image,
    disable_preproc_auto_orient=False,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
)

Preprocesses an inference request image by loading it, then applying any pre-processing specified by the Roboflow platform, then scaling it to the inference input dimensions.

Parameters:

Name	Type	Description	Default
`image`	`Union[Any, InferenceRequestImage]`	An object containing information necessary to load the image for inference.	required
`disable_preproc_auto_orient`	`bool`	If true, the auto orient preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_contrast`	`bool`	If true, the contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`

Returns:

Type	Description
`Tuple[ndarray, Tuple[int, int]]`	Tuple[np.ndarray, Tuple[int, int]]: A tuple containing a numpy array of the preprocessed image pixel data and a tuple of the image's original size.

Source code in inference/core/models/roboflow.py

def preproc_image(
    self,
    image: Union[Any, InferenceRequestImage],
    disable_preproc_auto_orient: bool = False,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
) -> Tuple[np.ndarray, Tuple[int, int]]:
    """
    Preprocesses an inference request image by loading it, then applying any pre-processing specified by the Roboflow platform, then scaling it to the inference input dimensions.

    Args:
        image (Union[Any, InferenceRequestImage]): An object containing information necessary to load the image for inference.
        disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
        disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

    Returns:
        Tuple[np.ndarray, Tuple[int, int]]: A tuple containing a numpy array of the preprocessed image pixel data and a tuple of the image's original size. When USE_PYTORCH_FOR_PREPROCESSING is set, the first element may be a float torch tensor instead of a numpy array.

    Raises:
        ValueError: If the intermediate image is not a numpy array and USE_PYTORCH_FOR_PREPROCESSING is not set.
    """
    # Auto-orient is skipped when the caller disables it, when the model's
    # preprocessing config has no "auto-orient" entry, or when the
    # DISABLE_PREPROC_AUTO_ORIENT flag is set.
    np_image, is_bgr = load_image(
        image,
        disable_preproc_auto_orient=disable_preproc_auto_orient
        or "auto-orient" not in self.preproc.keys()
        or DISABLE_PREPROC_AUTO_ORIENT,
    )
    preprocessed_image, img_dims = self.preprocess_image(
        np_image,
        disable_preproc_contrast=disable_preproc_contrast,
        disable_preproc_grayscale=disable_preproc_grayscale,
        disable_preproc_static_crop=disable_preproc_static_crop,
    )

    if USE_PYTORCH_FOR_PREPROCESSING:
        # Torch path: wrap the array in a tensor, move it to the GPU when one
        # is available, then reorder axes to (2, 0, 1) and add a leading axis
        # (presumably HWC -> NCHW; confirm against the layout `prepare` returns).
        preprocessed_image = torch.from_numpy(
            np.ascontiguousarray(preprocessed_image)
        )
        if torch.cuda.is_available():
            preprocessed_image = preprocessed_image.cuda()
        preprocessed_image = (
            preprocessed_image.permute(2, 0, 1).unsqueeze(0).contiguous().float()
        )
    if self.resize_method == "Stretch to":
        if isinstance(preprocessed_image, np.ndarray):
            preprocessed_image = preprocessed_image.astype(np.float32)
            # cv2.resize takes the target size as (width, height).
            resized = cv2.resize(
                preprocessed_image,
                (self.img_size_w, self.img_size_h),
            )
        elif USE_PYTORCH_FOR_PREPROCESSING:
            # interpolate takes the target size as (height, width).
            resized = torch.nn.functional.interpolate(
                preprocessed_image,
                size=(self.img_size_h, self.img_size_w),
                mode="bilinear",
            )
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(preprocessed_image)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

    # The three "Fit" variants differ only in the padding color passed to
    # letterbox_image (none given for black edges).
    elif self.resize_method == "Fit (black edges) in":
        resized = letterbox_image(
            preprocessed_image, (self.img_size_w, self.img_size_h)
        )
    elif self.resize_method == "Fit (white edges) in":
        resized = letterbox_image(
            preprocessed_image,
            (self.img_size_w, self.img_size_h),
            color=(255, 255, 255),
        )
    elif self.resize_method == "Fit (grey edges) in":
        resized = letterbox_image(
            preprocessed_image,
            (self.img_size_w, self.img_size_h),
            color=(114, 114, 114),
        )
    # NOTE(review): no branch handles any other resize method, so `resized`
    # would be unbound below; this relies on resize_method having been
    # restricted to the four values above when artifacts were loaded - confirm.

    if is_bgr:
        # Reverse the channel order: cv2 conversion for arrays, index
        # [2, 1, 0] along axis 1 for tensors.
        if isinstance(resized, np.ndarray):
            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        else:
            resized = resized[:, [2, 1, 0], :, :]

    if isinstance(resized, np.ndarray):
        # Numpy path: move the last axis first, cast to float32 and add a
        # leading axis, mirroring the permute/unsqueeze of the torch path.
        img_in = np.transpose(resized, (2, 0, 1))
        img_in = img_in.astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
    elif USE_PYTORCH_FOR_PREPROCESSING:
        img_in = resized.float()
    else:
        raise ValueError(
            f"Received an image of unknown type, {type(resized)}; "
            "This is most likely a bug. Contact Roboflow team through github issues "
            "(https://github.com/roboflow/inference/issues) providing full context of the problem"
        )

    return img_in, img_dims

preprocess_image ¶

preprocess_image(
    image,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
)

Preprocesses the given image using specified preprocessing steps.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	The image to preprocess, as a NumPy array.	required
`disable_preproc_contrast`	`bool`	If true, the contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`

Returns:

Type	Description
`Tuple[ndarray, Tuple[int, int]]`	Tuple[np.ndarray, Tuple[int, int]]: The preprocessed image array together with a tuple of image dimensions.

Source code in inference/core/models/roboflow.py

def preprocess_image(
    self,
    image: np.ndarray,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
) -> Tuple[np.ndarray, Tuple[int, int]]:
    """
    Applies the model's configured preprocessing (``self.preproc``) to an image via ``prepare``.

    Args:
        image (np.ndarray): The image to preprocess.
        disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

    Returns:
        Tuple[np.ndarray, Tuple[int, int]]: Whatever ``prepare`` returns: the preprocessed image together with a pair of image dimensions.
    """
    # Per-call switches are forwarded to `prepare` unchanged.
    step_overrides = {
        "disable_preproc_contrast": disable_preproc_contrast,
        "disable_preproc_grayscale": disable_preproc_grayscale,
        "disable_preproc_static_crop": disable_preproc_static_crop,
    }
    return prepare(image, self.preproc, **step_overrides)

Functions:¶

`core/models/utils`¶

inference.core.models.utils.keypoints ¶

Functions:¶

superset_keypoints_count ¶

superset_keypoints_count(keypoints_metadata)

Returns the number of keypoints in the superset.

Source code in inference/core/models/utils/keypoints.py

def superset_keypoints_count(keypoints_metadata: Dict[int, Dict[int, str]]) -> int:
    """Return the size of the largest keypoint set found in ``keypoints_metadata``.

    Args:
        keypoints_metadata: Mapping whose values are the per-entry keypoint collections.

    Returns:
        int: The maximum ``len()`` over the mapping's values, or 0 when the mapping is empty.
    """
    keypoint_counts = map(len, keypoints_metadata.values())
    return max(keypoint_counts, default=0)

`core/registries`¶

Model and block registries for dynamic lookup and plugin discovery.

inference.core.registries.base ¶

Classes¶

ModelRegistry ¶

An object which is able to return model classes based on given model IDs and model types.

Attributes:

Name	Type	Description
`registry_dict`	`dict`	A dictionary mapping model types to model classes.

Source code in inference/core/registries/base.py

class ModelRegistry:
    """Looks up model classes by model type.

    Attributes:
        registry_dict (dict): A dictionary mapping model types to model classes.
    """

    def __init__(self, registry_dict) -> None:
        """Stores the mapping of registered models used for later lookups.

        Args:
            registry_dict (dict): A dictionary mapping model types to model classes.
        """
        self.registry_dict = registry_dict

    def get_model(
        self,
        model_type: str,
        model_id: str,
        **kwargs,
    ) -> Model:
        """Looks up the model class registered for the given model type.

        Args:
            model_type (str): The type of the model to be retrieved.
            model_id (str): The ID of the model to be retrieved (unused in the current implementation).
            **kwargs: Accepted but not used by this implementation.

        Returns:
            Model: The model class corresponding to the given model type.

        Raises:
            ModelNotRecognisedError: If the model_type is not found in the registry_dict.
        """
        registered_models = self.registry_dict
        if model_type in registered_models:
            return registered_models[model_type]
        raise ModelNotRecognisedError(
            f"Could not find model of type: {model_type} in configured registry."
        )

Methods:¶

__init__ ¶

__init__(registry_dict)

Initializes the ModelRegistry with the given dictionary of registered models.

Parameters:

Name	Type	Description	Default
`registry_dict`	`dict`	A dictionary mapping model types to model classes.	required

Source code in inference/core/registries/base.py

def __init__(self, registry_dict) -> None:
    """Initializes the ModelRegistry with the given dictionary of registered models.

    The dictionary is stored by reference as ``self.registry_dict``; it is not copied.

    Args:
        registry_dict (dict): A dictionary mapping model types to model classes.
    """
    self.registry_dict = registry_dict

get_model ¶

get_model(model_type, model_id, **kwargs)

Returns the model class based on the given model type.

Parameters:

Name	Type	Description	Default
`model_type`	`str`	The type of the model to be retrieved.	required
`model_id`	`str`	The ID of the model to be retrieved (unused in the current implementation).	required

Returns:

Name	Type	Description
`Model`	`Model`	The model class corresponding to the given model type.

Raises:

Type	Description
`ModelNotRecognisedError`	If the model_type is not found in the registry_dict.

Source code in inference/core/registries/base.py

def get_model(
    self,
    model_type: str,
    model_id: str,
    **kwargs,
) -> Model:
    """Look up the model class registered under ``model_type``.

    Args:
        model_type (str): Key identifying the kind of model to fetch.
        model_id (str): Identifier of the model; accepted for interface
            compatibility but not consulted by this implementation.
        **kwargs: Additional keyword arguments; not used here.

    Returns:
        Model: The entry stored in ``registry_dict`` for ``model_type``.

    Raises:
        ModelNotRecognisedError: When ``model_type`` has no entry in
            ``registry_dict``.
    """
    registered_models = self.registry_dict
    if model_type in registered_models:
        return registered_models[model_type]
    raise ModelNotRecognisedError(
        f"Could not find model of type: {model_type} in configured registry."
    )

inference.core.registries.roboflow ¶

Classes¶

RoboflowModelRegistry ¶

Bases: ModelRegistry

A Roboflow-specific model registry which gets the model type using the model id, then returns a model class based on the model type.

Source code in inference/core/registries/roboflow.py

class RoboflowModelRegistry(ModelRegistry):
    """Model registry that resolves Roboflow model ids.

    The model type is first derived from the model id via ``get_model_type``;
    that type is then used as the key into ``registry_dict`` to obtain the
    model class.
    """

    def get_model(
        self,
        model_id: ModelID,
        api_key: str,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
    ) -> Model:
        """Resolve ``model_id`` to the model class registered for its type.

        Args:
            model_id (str): The ID of the model to be retrieved.
            api_key (str): The API key used to authenticate.
            countinference (Optional[bool]): Passed through unchanged to
                ``get_model_type``.
            service_secret (Optional[str]): Passed through unchanged to
                ``get_model_type``.

        Returns:
            Model: The model class registered for the resolved model type.

        Raises:
            ModelNotRecognisedError: If the resolved model type has no entry
                in ``registry_dict``. Errors raised by ``get_model_type``
                are not caught here and propagate to the caller.
        """
        model_type = get_model_type(
            model_id,
            api_key,
            countinference=countinference,
            service_secret=service_secret,
        )
        logger.debug(f"Model type: {model_type}")

        registered_models = self.registry_dict
        if model_type in registered_models:
            return registered_models[model_type]
        raise ModelNotRecognisedError(
            f"Model type not supported, you may want to try a different inference server configuration or endpoint: {model_type}"
        )

Methods:¶

get_model ¶

get_model(
    model_id,
    api_key,
    countinference=None,
    service_secret=None,
)

Returns the model class based on the given model id and API key.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	The ID of the model to be retrieved.	required
`api_key`	`str`	The API key used to authenticate.	required

Returns:

Name	Type	Description
`Model`	`Model`	The model class corresponding to the given model ID and type.

Raises:

Type	Description
`ModelNotRecognisedError`	If the model type is not supported or found.

Source code in inference/core/registries/roboflow.py

def get_model(
    self,
    model_id: ModelID,
    api_key: str,
    countinference: Optional[bool] = None,
    service_secret: Optional[str] = None,
) -> Model:
    """Resolve ``model_id`` to the model class registered for its type.

    Args:
        model_id (str): The ID of the model to be retrieved.
        api_key (str): The API key used to authenticate.
        countinference (Optional[bool]): Passed through unchanged to
            ``get_model_type``.
        service_secret (Optional[str]): Passed through unchanged to
            ``get_model_type``.

    Returns:
        Model: The model class registered for the resolved model type.

    Raises:
        ModelNotRecognisedError: If the resolved model type has no entry
            in ``registry_dict``. Errors raised by ``get_model_type``
            are not caught here and propagate to the caller.
    """
    model_type = get_model_type(
        model_id,
        api_key,
        countinference=countinference,
        service_secret=service_secret,
    )
    logger.debug(f"Model type: {model_type}")

    registered_models = self.registry_dict
    if model_type in registered_models:
        return registered_models[model_type]
    raise ModelNotRecognisedError(
        f"Model type not supported, you may want to try a different inference server configuration or endpoint: {model_type}"
    )

Functions:¶

get_model_type ¶

get_model_type(
    model_id,
    api_key=None,
    countinference=None,
    service_secret=None,
)

Retrieves the model type based on the given model ID and API key.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	The ID of the model.	required
`api_key`	`str`	The API key used to authenticate.	`None`

Returns:

Name	Type	Description
`tuple`	`Tuple[TaskType, ModelType]`	The project task type and the model type.

Raises:

Type	Description
`WorkspaceLoadError`	If the workspace could not be loaded or if the API key is invalid.
`DatasetLoadError`	If the dataset could not be loaded due to invalid ID, workspace ID or version ID.
`MissingDefaultModelError`	If default model is not configured and API does not provide this info
`MalformedRoboflowAPIResponseError`	Roboflow API responds in invalid format.

Source code in inference/core/registries/roboflow.py

def get_model_type(
    model_id: ModelID,
    api_key: Optional[str] = None,
    countinference: Optional[bool] = None,
    service_secret: Optional[str] = None,
) -> Tuple[TaskType, ModelType]:
    """Retrieves the model type based on the given model ID and API key.

    Sources are consulted in this order, returning at the first hit:
    locally resolved model type, ``GENERIC_MODELS`` (by full model id, then by
    dataset id), cached model metadata, the stub-version path, and finally
    the Roboflow API.

    Args:
        model_id (str): The ID of the model.
        api_key (str): The API key used to authenticate.
        countinference (Optional[bool]): Forwarded unchanged to the access
            check and to the Roboflow API helpers.
        service_secret (Optional[str]): Forwarded unchanged to the access
            check and to the Roboflow API helpers.

    Returns:
        tuple: The project task type and the model type.

    Raises:
        RoboflowAPINotAuthorizedError: If ``MODELS_CACHE_AUTH_ENABLED`` is set
            and the API key has no access to the model.
        MissingApiKeyError: If a stub model version is requested without an
            API key.
        ModelArtefactError: If the API returned no model metadata, or the
            task type / model type could not be determined from it.
        WorkspaceLoadError: If the workspace could not be loaded or if the API key is invalid.
        DatasetLoadError: If the dataset could not be loaded due to invalid ID, workspace ID or version ID.
        MissingDefaultModelError: If default model is not configured and API does not provide this info
        MalformedRoboflowAPIResponseError: Roboflow API responds in invalid format.
    """

    model_id = resolve_roboflow_model_alias(model_id=model_id)
    local_model_type = _get_local_model_type(model_id=model_id)
    if local_model_type is not None:
        return local_model_type
    dataset_id, version_id = get_model_id_chunks(model_id=model_id)
    # first check if the model id as a whole is in the GENERIC_MODELS dictionary
    if model_id in GENERIC_MODELS:
        logger.debug(f"Loading generic model: {model_id}.")
        return GENERIC_MODELS[model_id]

    # then check if the dataset id is in the GENERIC_MODELS dictionary
    if dataset_id in GENERIC_MODELS:
        logger.debug(f"Loading generic model: {dataset_id}.")
        return GENERIC_MODELS[dataset_id]

    if MODELS_CACHE_AUTH_ENABLED:
        if not _check_if_api_key_has_access_to_model(
            api_key=api_key,
            model_id=model_id,
            countinference=countinference,
            service_secret=service_secret,
        ):
            # NOTE(review): this message embeds the raw API key; consider
            # redacting it if the exception text can reach logs or clients.
            raise RoboflowAPINotAuthorizedError(
                f"API key {api_key} does not have access to model {model_id}"
            )

    cached_metadata = get_model_metadata_from_cache(
        dataset_id=dataset_id, version_id=version_id
    )

    if cached_metadata is not None:
        _ensure_model_supported_on_this_deployment(
            model_id=model_id,
            project_task_type=cached_metadata[0],
            model_type=cached_metadata[1],
        )
        return cached_metadata[0], cached_metadata[1]
    if version_id == STUB_VERSION_ID:
        if api_key is None:
            raise MissingApiKeyError(
                "Stub model version provided but no API key was provided. API key is required to load stub models."
            )
        workspace_id = get_roboflow_workspace(api_key=api_key)
        project_task_type = get_roboflow_dataset_type(
            api_key=api_key, workspace_id=workspace_id, dataset_id=dataset_id
        )
        model_type = "stub"
        save_model_metadata_in_cache(
            dataset_id=dataset_id,
            version_id=version_id,
            project_task_type=project_task_type,
            model_type=model_type,
        )
        return project_task_type, model_type

    # Each source reports the task type under a different key; remember which
    # one applies and read it only after api_data is known to be present.
    if USE_INFERENCE_MODELS:
        api_data = get_model_metadata_from_inference_models_registry(
            api_key=api_key,
            model_id=model_id,
            countinference=countinference,
            service_secret=service_secret,
        )
        task_type_key = "taskType"
    elif version_id is not None:
        api_data = get_roboflow_model_data(
            api_key=api_key,
            model_id=model_id,
            countinference=countinference,
            service_secret=service_secret,
            endpoint_type=ModelEndpointType.ORT,
            device_id=GLOBAL_DEVICE_ID,
        ).get("ort")
        task_type_key = "type"
    else:
        api_data = get_roboflow_instant_model_data(
            api_key=api_key,
            model_id=model_id,
            countinference=countinference,
            service_secret=service_secret,
        )
        task_type_key = "taskType"
    # Must run before any api_data.get(...): a missing payload (e.g. no "ort"
    # entry) has to surface as ModelArtefactError, not AttributeError.
    if api_data is None:
        raise ModelArtefactError("Error loading model artifacts from Roboflow API.")

    # some older projects do not have type field - hence defaulting
    project_task_type = api_data.get(task_type_key, "object-detection")
    model_type = api_data.get("modelType")
    if model_type is None or model_type == "ort":
        # some very old model versions do not have modelType reported - and API respond in a generic way -
        # then we shall attempt using default model for given task type
        model_type = MODEL_TYPE_DEFAULTS.get(project_task_type)

    if model_type is None or project_task_type is None:
        raise ModelArtefactError("Error loading model artifacts from Roboflow API.")
    _ensure_model_supported_on_this_deployment(
        model_id=model_id,
        project_task_type=project_task_type,
        model_type=model_type,
    )
    save_model_metadata_in_cache(
        dataset_id=dataset_id,
        version_id=version_id,
        project_task_type=project_task_type,
        model_type=model_type,
    )

    return project_task_type, model_type

`core/utils`¶

General-purpose utilities: image encoding, file I/O, hashing, URL handling, and more.

inference.core.utils.container ¶

Functions:¶

is_docker_socket_mounted ¶

is_docker_socket_mounted(docker_socket_path)

Check if the given path is a mounted Docker socket.

Parameters:

Name	Type	Description	Default
`docker_socket_path`	`str`	The path to the socket file.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if the path is a Unix socket, False otherwise.

Source code in inference/core/utils/container.py

def is_docker_socket_mounted(docker_socket_path: str) -> bool:
    """
    Check if the given path is a mounted Docker socket.

    The path is inspected with a single ``os.stat`` call, so there is no gap
    between an existence check and the type check in which the path could
    disappear.

    Args:
        docker_socket_path (str): The path to the socket file.

    Returns:
        bool: True if the path is a Unix socket, False otherwise (including
            when the path does not exist or cannot be inspected).
    """
    try:
        socket_mode = os.stat(docker_socket_path).st_mode
    except (OSError, ValueError):
        # OSError: path missing or not accessible. ValueError: path cannot be
        # represented at the OS level (e.g. embedded NUL). Both cases were
        # reported as False by the former os.path.exists() guard.
        return False
    return stat.S_ISSOCK(socket_mode)

inference.core.utils.cuda_health ¶

CUDA health checking utilities.

Provides a fast, cached health check for GPU/CUDA state. Once CUDA fails, the context is permanently corrupted and cannot recover without process restart. The failure state is cached to avoid repeatedly calling into a broken CUDA runtime.

Classes¶

CudaHealthChecker ¶

Thread-safe CUDA health checker with failure caching.

Once a CUDA failure is detected, the result is cached permanently (CUDA context corruption is unrecoverable). Subsequent calls return the cached failure immediately without touching CUDA.

Source code in inference/core/utils/cuda_health.py

class CudaHealthChecker:
    """Thread-safe CUDA health checker with failure caching.

    Once a CUDA failure is detected, the result is cached permanently
    (CUDA context corruption is unrecoverable). Subsequent calls return
    the cached failure immediately without touching CUDA.
    """

    def __init__(self):
        # Serializes the actual CUDA probing done in check_health().
        self._lock = threading.Lock()
        # Sticky failure flag; never reset once set to True.
        self._cuda_failed: bool = False
        # Message and timestamp (time.time()) of the recorded failure.
        self._failure_error: Optional[str] = None
        self._failure_time: Optional[float] = None
        self._gpu_available: Optional[bool] = None  # None = not yet checked

    def _is_gpu_environment(self) -> bool:
        """Check if we're running in a GPU environment. Cached after first call."""
        if self._gpu_available is not None:
            return self._gpu_available
        try:
            import torch

            self._gpu_available = torch.cuda.is_available()
        except Exception:
            # Covers a missing torch install (ImportError) as well as any
            # error raised while probing; either way treat as "no GPU".
            self._gpu_available = False
        return self._gpu_available

    def check_health(self) -> Tuple[bool, Optional[str]]:
        """Check CUDA health. Returns (is_healthy, error_message).

        - If not a GPU environment: returns (True, None) immediately
        - If CUDA previously failed: returns cached failure immediately
        - Otherwise: runs synchronize + mem_get_info check

        Thread-safe. The actual CUDA check is serialized by the lock to
        prevent concurrent CUDA calls during health checking.
        """
        # Fast path: not a GPU environment
        if not self._is_gpu_environment():
            return True, None

        # Fast path: already known to be failed (unrecoverable)
        if self._cuda_failed:
            return False, self._failure_error

        # Slow path: actually check CUDA
        with self._lock:
            # Re-check after acquiring lock: another thread may have recorded
            # a failure while this one was waiting.
            if self._cuda_failed:
                return False, self._failure_error

            try:
                import torch

                # Synchronize to surface any pending async CUDA errors
                torch.cuda.synchronize()
                # Query runtime to verify it's still functional
                torch.cuda.mem_get_info()
                return True, None
            except Exception as e:
                error_msg = f"CUDA health check failed: {e}"
                logger.error(error_msg)
                # Record the details BEFORE raising the flag: the lock-free
                # fast path reads `_cuda_failed` and then `_failure_error`,
                # so the flag must be the last thing published.
                self._failure_error = error_msg
                self._failure_time = time.time()
                self._cuda_failed = True
                return False, error_msg

    @property
    def is_failed(self) -> bool:
        """Whether a CUDA failure has been recorded."""
        return self._cuda_failed

    @property
    def failure_info(self) -> Optional[dict]:
        """Details of the recorded failure, or None if no failure was recorded.

        Returns:
            Optional[dict]: ``{"error": <message>, "failed_at": <timestamp>}``
                when a failure is cached, otherwise None.
        """
        if not self._cuda_failed:
            return None
        return {
            "error": self._failure_error,
            "failed_at": self._failure_time,
        }

Methods:¶

check_health ¶

check_health()

Check CUDA health. Returns (is_healthy, error_message).

If not a GPU environment: returns (True, None) immediately
If CUDA previously failed: returns cached failure immediately
Otherwise: runs synchronize + mem_get_info check

Thread-safe. The actual CUDA check is serialized by the lock to prevent concurrent CUDA calls during health checking.

Source code in inference/core/utils/cuda_health.py

def check_health(self) -> Tuple[bool, Optional[str]]:
    """Check CUDA health. Returns (is_healthy, error_message).

    - If not a GPU environment: returns (True, None) immediately
    - If CUDA previously failed: returns cached failure immediately
    - Otherwise: runs synchronize + mem_get_info check

    Thread-safe. The actual CUDA check is serialized by the lock to
    prevent concurrent CUDA calls during health checking.
    """
    # Fast path: not a GPU environment
    if not self._is_gpu_environment():
        return True, None

    # Fast path: already known to be failed (unrecoverable)
    if self._cuda_failed:
        return False, self._failure_error

    # Slow path: actually check CUDA
    with self._lock:
        # Re-check after acquiring lock: another thread may have recorded
        # a failure while this one was waiting.
        if self._cuda_failed:
            return False, self._failure_error

        try:
            import torch

            # Synchronize to surface any pending async CUDA errors
            torch.cuda.synchronize()
            # Query runtime to verify it's still functional
            torch.cuda.mem_get_info()
            return True, None
        except Exception as e:
            error_msg = f"CUDA health check failed: {e}"
            logger.error(error_msg)
            # Record the details BEFORE raising the flag: the lock-free
            # fast path reads `_cuda_failed` and then `_failure_error`,
            # so the flag must be the last thing published.
            self._failure_error = error_msg
            self._failure_time = time.time()
            self._cuda_failed = True
            return False, error_msg

Functions:¶

check_cuda_health ¶

check_cuda_health()

Module-level convenience function.

Source code in inference/core/utils/cuda_health.py

def check_cuda_health() -> Tuple[bool, Optional[str]]:
    """Run the health check on the module-level checker.

    Returns:
        Tuple[bool, Optional[str]]: Whatever ``_checker.check_health()``
            returns, i.e. ``(is_healthy, error_message)``.
    """
    health_status = _checker.check_health()
    return health_status

get_cuda_health_checker ¶

get_cuda_health_checker()

Return the singleton for dependency injection / testing.

Source code in inference/core/utils/cuda_health.py

def get_cuda_health_checker() -> CudaHealthChecker:
    """Expose the module-level checker instance.

    Intended for dependency injection and tests.

    Returns:
        CudaHealthChecker: The module-level ``_checker`` object.
    """
    shared_checker = _checker
    return shared_checker

inference.core.utils.environment ¶

Classes¶

Functions:¶

safe_env_to_type ¶

safe_env_to_type(
    variable_name, default_value=None, type_constructor=None
)

Converts env variable to specified type, but only if variable is set - otherwise default is returned. If type_constructor is not given - value of type str will be returned.

Source code in inference/core/utils/environment.py

def safe_env_to_type(
    variable_name: str,
    default_value: Optional[T] = None,
    type_constructor: Optional[Union[Type[T], Callable[[str], T]]] = None,
) -> Optional[T]:
    """
    Read an environment variable and optionally convert it.

    Args:
        variable_name (str): Name of the environment variable to read.
        default_value: Returned as-is (no conversion) when the variable is unset.
        type_constructor: Callable applied to the raw string value when the
            variable is set. When omitted, the raw string is returned.

    Returns:
        The converted value, the raw string, or `default_value` as described above.
    """
    try:
        raw_value = os.environ[variable_name]
    except KeyError:
        # Variable not set: the default is handed back without conversion.
        return default_value
    return raw_value if type_constructor is None else type_constructor(raw_value)

safe_split_value ¶

safe_split_value(value, delimiter=',', strip=False)

Splits a separated environment variable into a list.

Parameters:

Name	Type	Description	Default
`value`	`str`	The environment variable value to be split.	required
`delimiter`	`str`	Delimiter to be used	`','`
`strip`	`bool`	Strip leading and trailing whitespace	`False`

Returns:

Type	Description
`Optional[List[str]]`	list or None: The split values as a list, or None if the input is None.

Source code in inference/core/utils/environment.py

def safe_split_value(
    value: Optional[str],
    delimiter: str = ",",
    strip: bool = False,
) -> Optional[List[str]]:
    """
    Break a delimiter-separated environment variable value into its parts.

    Args:
        value (str): The environment variable value to be split.
        delimiter (str): Separator between the parts.
        strip (bool): When True, surrounding whitespace is removed from every
            part and parts that end up empty are dropped. When False, parts
            are returned exactly as produced by the split.

    Returns:
        list or None: The parts as a list, or None if the input is None.
    """
    if value is None:
        return None
    parts = value.split(delimiter)
    if not strip:
        return parts
    trimmed_parts = (part.strip() for part in parts)
    return [part for part in trimmed_parts if part]

str2bool ¶

str2bool(value)

Converts an environment variable to a boolean value.

Parameters:

Name	Type	Description	Default
`value`	`str or bool`	The environment variable value to be converted.	required

Returns:

Name	Type	Description
`bool`	`bool`	The converted boolean value.

Raises:

Type	Description
`InvalidEnvironmentVariableError`	If the value is not 'true', 'false', or a boolean.

Source code in inference/core/utils/environment.py

def str2bool(value: Any) -> bool:
    """
    Interpret an environment variable value as a boolean.

    Args:
        value (str or bool): The value to interpret. Booleans are returned
            unchanged; strings are compared case-insensitively against
            'true' and 'false'.

    Returns:
        bool: The resulting boolean.

    Raises:
        InvalidEnvironmentVariableError: If the value is neither a boolean nor
            a string spelling 'true' or 'false'.
    """
    if isinstance(value, bool):
        return value
    if issubclass(type(value), str):
        lowered = value.lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
    # Reached for non-string inputs and for strings other than true/false.
    raise InvalidEnvironmentVariableError(
        f"Expected a boolean environment variable (true or false) but got '{value}'"
    )

inference.core.utils.file_system ¶

Classes¶

AtomicPath ¶

Context manager for atomic file writes.

Ensures that files are either written completely or not at all, preventing partial/corrupted files from power failures or crashes.

Usage:

    with AtomicPath(target_path, allow_override=False) as temp_path:
        # Write to temp_path
        with open(temp_path, 'w') as f:
            f.write(data)
    # File is atomically moved to target_path on successful exit

Source code in inference/core/utils/file_system.py

class AtomicPath:
    """Context manager for atomic file writes.

    The caller writes to a temporary file created next to the target; on a
    clean exit the temporary file is moved over the target in one step, so
    the target is either fully written or left untouched.

    Usage:
        with AtomicPath(target_path, allow_override=False) as temp_path:
            # Write to temp_path
            with open(temp_path, 'w') as f:
                f.write(data)
        # File is atomically moved to target_path on successful exit
    """

    def __init__(self, target_path: str, allow_override: bool = False):
        """Remember the destination; no file is touched until ``__enter__``.

        Args:
            target_path (str): Final location of the file.
            allow_override (bool): Passed to ``ensure_write_is_allowed`` when
                the context is entered.
        """
        self.target_path = target_path
        self.allow_override = allow_override
        self.temp_path: Optional[str] = None
        self.temp_file = None

    def __enter__(self) -> str:
        """Create the temporary file and return its path.

        Returns:
            str: Path of an empty, closed temporary file located in the same
                directory as ``target_path``.
        """
        ensure_write_is_allowed(
            path=self.target_path, allow_override=self.allow_override
        )
        ensure_parent_dir_exists(path=self.target_path)

        # The temp file lives in the target's directory so the final move
        # stays within one directory.
        dir_name = os.path.dirname(os.path.abspath(self.target_path))
        base_name = os.path.basename(self.target_path)
        self.temp_file = tempfile.NamedTemporaryFile(
            dir=dir_name, prefix=".tmp_", suffix="_" + base_name, delete=False
        )
        self.temp_path = self.temp_file.name
        # Close immediately: the caller reopens the path itself.
        self.temp_file.close()
        return self.temp_path

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Publish the temporary file on success, discard it on failure.

        Returns:
            bool: Always False, so exceptions from the ``with`` body propagate.
        """
        if exc_type is None:
            try:
                # os.replace overwrites an existing target on every platform,
                # including Windows, so no remove-then-rename step (which
                # would briefly leave no target file at all) is needed.
                os.replace(self.temp_path, self.target_path)
            except Exception:
                self._discard_temp_file()
                raise
        else:
            # Error occurred - clean up temp file
            self._discard_temp_file()
        return False  # Don't suppress exceptions

    def _discard_temp_file(self) -> None:
        """Best-effort removal of the temporary file."""
        try:
            os.unlink(self.temp_path)
        except OSError:
            pass

inference.core.utils.image_utils ¶

Classes¶

Functions:¶

attempt_loading_image_from_string ¶

attempt_loading_image_from_string(
    value, cv_imread_flags=cv2.IMREAD_COLOR
)

Attempt to load an image from a string.

Parameters:

Name	Type	Description	Default
`value`	`Union[str, bytes, bytearray, _IOBase]`	The image data in string format.	required
`cv_imread_flags`	`int`	OpenCV flags used for image reading.	`IMREAD_COLOR`

Returns:

Type	Description
`Tuple[ndarray, bool]`	Tuple[np.ndarray, bool]: A tuple of the loaded image in numpy array format and a boolean flag indicating if the image is in BGR format.

Source code in inference/core/utils/image_utils.py

def attempt_loading_image_from_string(
    value: Union[str, bytes, bytearray, _IOBase],
    cv_imread_flags: int = cv2.IMREAD_COLOR,
) -> Tuple[np.ndarray, bool]:
    """
    Attempt to load an image from a string.

    Decoders are tried in order: base64, encoded bytes, buffer, and finally
    the numpy-string loader. The first one that succeeds wins.

    Args:
        value (Union[str, bytes, bytearray, _IOBase]): The image data in string format.
        cv_imread_flags (int): OpenCV flags used for image reading.

    Returns:
        Tuple[np.ndarray, bool]: A tuple of the loaded image in numpy array format and a boolean flag indicating if the image is in BGR format.

    Raises:
        InvalidImageTypeDeclared: Re-raised unchanged from the numpy-string loader.
        InputFormatInferenceFailed: If no decoder accepted the input and the
            numpy-string loader raised InvalidNumpyInput.
    """
    probing_loaders = (
        load_image_base64,
        load_image_from_encoded_bytes,
        load_image_from_buffer,
    )
    for loader in probing_loaders:
        try:
            return loader(value=value, cv_imread_flags=cv_imread_flags), True
        except Exception:
            # Best-effort probing: a failure only means "not this format".
            # Catch Exception rather than everything so KeyboardInterrupt
            # and SystemExit are not swallowed.
            continue
    try:
        return load_image_from_numpy_str(value=value), True
    except InvalidImageTypeDeclared:
        # Kept ahead of the InvalidNumpyInput handler so this error is never
        # rewrapped, whatever the relationship between the two classes.
        raise
    except InvalidNumpyInput as error:
        raise InputFormatInferenceFailed(
            message="Input image format could not be inferred from string.",
            public_message="Input image format could not be inferred from string.",
        ) from error

choose_image_decoding_flags ¶

choose_image_decoding_flags(disable_preproc_auto_orient)

Choose the appropriate OpenCV image decoding flags.

Parameters:

Name	Type	Description	Default
`disable_preproc_auto_orient`	`bool`	Flag to disable preprocessing auto-orientation.	required

Returns:

Name	Type	Description
`int`	`int`	OpenCV image decoding flags.

Source code in inference/core/utils/image_utils.py

def choose_image_decoding_flags(disable_preproc_auto_orient: bool) -> int:
    """Build the OpenCV flags to use when decoding an image.

    Args:
        disable_preproc_auto_orient (bool): When truthy,
            ``cv2.IMREAD_IGNORE_ORIENTATION`` is OR-ed into the flags.

    Returns:
        int: ``cv2.IMREAD_COLOR``, optionally combined with
            ``cv2.IMREAD_IGNORE_ORIENTATION``.
    """
    color_flags = cv2.IMREAD_COLOR
    if not disable_preproc_auto_orient:
        return color_flags
    return color_flags | cv2.IMREAD_IGNORE_ORIENTATION

convert_gray_image_to_bgr ¶

convert_gray_image_to_bgr(image)

Convert a grayscale image to BGR format.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	The grayscale image.	required

Returns:

Type	Description
`ndarray`	np.ndarray: The converted BGR image.

Source code in inference/core/utils/image_utils.py

def convert_gray_image_to_bgr(image: np.ndarray) -> np.ndarray:
    """
    Expand a grayscale image to BGR; leave any other image untouched.

    An image counts as grayscale when it has exactly two dimensions, or when
    its third dimension has size 1.

    Args:
        image (np.ndarray): The image to inspect.

    Returns:
        np.ndarray: The result of ``cv2.cvtColor(..., cv2.COLOR_GRAY2BGR)``
            for grayscale input, otherwise the very same array object.
    """
    shape = image.shape
    is_grayscale = len(shape) == 2 or shape[2] == 1
    if not is_grayscale:
        return image
    return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

encode_image_to_jpeg_bytes ¶

encode_image_to_jpeg_bytes(image, jpeg_quality=90)

Encode a numpy image to JPEG format in bytes.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	The numpy array representing a BGR image.	required
`jpeg_quality`	`int`	Quality of the JPEG image.	`90`

Returns:

Name	Type	Description
`bytes`	`bytes`	The JPEG encoded image.

Source code in inference/core/utils/image_utils.py

def encode_image_to_jpeg_bytes(image: np.ndarray, jpeg_quality: int = 90) -> bytes:
    """
    Serialize a numpy image as JPEG bytes.

    Args:
        image (np.ndarray): The numpy array representing a BGR image.
        jpeg_quality (int): Value passed to OpenCV as ``IMWRITE_JPEG_QUALITY``.

    Returns:
        bytes: The JPEG encoded image.
    """
    jpeg_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
    # cv2.imencode returns (status, buffer); only the buffer is used here.
    encoded_buffer = cv2.imencode(".jpg", image, jpeg_params)[1]
    return np.array(encoded_buffer).tobytes()

extract_image_payload_and_type ¶

extract_image_payload_and_type(value)

Extract the image payload and type from the given value.

This function supports different types of image inputs (e.g., InferenceRequestImage, dict, etc.) and extracts the relevant data and image type for further processing.

Parameters:

Name	Type	Description	Default
`value`	`Any`	The input value which can be an image or information to derive the image.	required

Returns:

Type	Description
`Tuple[Any, Optional[ImageType]]`	Tuple[Any, Optional[ImageType]]: A tuple containing the extracted image data and the corresponding image type.

Source code in inference/core/utils/image_utils.py

def extract_image_payload_and_type(value: Any) -> Tuple[Any, Optional[ImageType]]:
    """Split an image input into its payload and its declared type.

    For an ``InferenceRequestImage`` the ``type`` and ``value`` attributes are
    used; for a dict the ``"type"`` and ``"value"`` keys are used. Any other
    input is treated as the payload itself, with no declared type.

    Args:
        value (Any): The input value which can be an image or information to derive the image.

    Returns:
        Tuple[Any, Optional[ImageType]]: The payload and the matching
            ``ImageType`` member, or ``None`` as the type when none was declared.

    Raises:
        InvalidImageTypeDeclared: If a type was declared but its lower-cased
            form is not the value of any ``ImageType`` member.
    """
    image_type = None
    payload = value
    if issubclass(type(value), InferenceRequestImage):
        image_type, payload = value.type, value.value
    elif issubclass(type(value), dict):
        image_type, payload = value.get("type"), value.get("value")
    if image_type is None:
        # Nothing declared: the caller has to infer the format.
        return payload, image_type
    allowed_payload_types = {e.value for e in ImageType}
    if image_type.lower() in allowed_payload_types:
        return payload, ImageType(image_type.lower())
    raise InvalidImageTypeDeclared(
        message=f"Declared image type: {image_type.lower()} which is not in allowed types: {allowed_payload_types}.",
        public_message="Image declaration contains not recognised image type.",
    )

load_image ¶

load_image(value, disable_preproc_auto_orient=False)

Loads an image based on the specified type and value.

Parameters:

Name	Type	Description	Default
`value`	`Any`	Image value which could be an instance of InferenceRequestImage, a dict with 'type' and 'value' keys, or inferred based on the value's content.	required

Returns:

Type	Description
`Tuple[ndarray, bool]`	Tuple[np.ndarray, bool]: The loaded image as a numpy array, and a boolean flag indicating if the image is in BGR format.

Raises:

Type	Description
`NotImplementedError`	If the specified image type is not supported.
`InvalidNumpyInput`	If the numpy input method is used and the input data is invalid.

Source code in inference/core/utils/image_utils.py

def load_image(
    value: Any,
    disable_preproc_auto_orient: bool = False,
) -> Tuple[np.ndarray, bool]:
    """Loads an image based on the specified type and value.

    Args:
        value (Any): Image value which could be an instance of InferenceRequestImage,
            a dict with 'type' and 'value' keys, or inferred based on the value's content.
        disable_preproc_auto_orient (bool): Forwarded to
            ``choose_image_decoding_flags`` to select the OpenCV decoding flags.

    Returns:
        Tuple[np.ndarray, bool]: The loaded image as a numpy array (grayscale
            input is expanded to BGR), and the ``is_bgr`` flag reported by
            the loader that decoded it.

    Raises:
        Exceptions raised by ``extract_image_payload_and_type`` and by the
        selected loader are not caught here and propagate to the caller.
    """
    cv_imread_flags = choose_image_decoding_flags(
        disable_preproc_auto_orient=disable_preproc_auto_orient
    )
    value, image_type = extract_image_payload_and_type(value=value)
    if image_type is None:
        # No declared type: let the loader infer the format from the payload.
        np_image, is_bgr = load_image_with_inferred_type(
            value, cv_imread_flags=cv_imread_flags
        )
    else:
        np_image, is_bgr = load_image_with_known_type(
            value=value,
            image_type=image_type,
            cv_imread_flags=cv_imread_flags,
        )
    np_image = convert_gray_image_to_bgr(image=np_image)
    logger.debug(f"Loaded inference image. Shape: {getattr(np_image, 'shape', None)}")
    return np_image, is_bgr

load_image_base64 ¶

load_image_base64(value, cv_imread_flags=cv2.IMREAD_COLOR)

Loads an image from a base64 encoded string using OpenCV.

Parameters:

Name	Type	Description	Default
`value`	`str`	Base64 encoded string representing the image.	required

Returns:

Type	Description
`ndarray`	np.ndarray: The loaded image as a numpy array.

Source code in inference/core/utils/image_utils.py

def load_image_base64(
    value: Union[str, bytes], cv_imread_flags=cv2.IMREAD_COLOR
) -> np.ndarray:
    """Loads an image from a base64 encoded string using OpenCV.

    Args:
        value (Union[str, bytes]): Base64 encoded image. Bytes are decoded as
            UTF-8 text first (legacy routes send bytes, newer routes send str).
        cv_imread_flags (int): Flags passed to `cv2.imdecode`.

    Returns:
        np.ndarray: The loaded image as a numpy array.

    Raises:
        InputImageLoadError: If the bytes are not valid UTF-8, the text is not
            valid base64, the decoded payload is empty, or OpenCV cannot decode
            the payload into an image.
    """
    # New routes accept images via json body (str), legacy routes accept bytes which need to be decoded as strings
    if not isinstance(value, str):
        try:
            value = value.decode("utf-8")
        except UnicodeDecodeError as error:
            raise InputImageLoadError(
                message="Could not decode image bytes as base64 string - the payload appears to be raw image bytes, not a base64-encoded string.",
                public_message="Invalid base64 input: the image payload contains raw bytes instead of a base64-encoded string. Please base64-encode the image before sending.",
            ) from error
    # Pattern is defined elsewhere in the module; presumably it strips a
    # `data:...;base64,` style prefix - verify against its definition.
    value = BASE64_DATA_TYPE_PATTERN.sub("", value)
    try:
        value = pybase64.b64decode(value)
    except ValueError as error:
        # binascii.Error is a ValueError subclass, so malformed padding is still
        # handled here; catching the parent also covers the plain ValueError
        # raised for text containing non-ASCII characters.
        raise InputImageLoadError(
            message="Could not load valid image from base64 string.",
            public_message="Malformed base64 input image.",
        ) from error
    if len(value) == 0:
        raise InputImageLoadError(
            message="Could not load valid image from base64 string.",
            public_message="Empty image payload.",
        )
    image_np = np.frombuffer(value, np.uint8)
    result = cv2.imdecode(image_np, cv_imread_flags)
    if result is None:
        # cv2.imdecode signals an undecodable buffer by returning None.
        raise InputImageLoadError(
            message="Could not load valid image from base64 string.",
            public_message="Malformed base64 input image.",
        )
    return result

load_image_from_buffer ¶

load_image_from_buffer(
    value, cv_imread_flags=cv2.IMREAD_COLOR
)

Loads an image from a multipart-encoded input.

Parameters:

Name	Type	Description	Default
`value`	`Any`	Multipart-encoded input representing the image.	required

Returns:

Type	Description
`ndarray`	np.ndarray: The loaded image as a numpy array.

Source code in inference/core/utils/image_utils.py

def load_image_from_buffer(
    value: _IOBase,
    cv_imread_flags: int = cv2.IMREAD_COLOR,
) -> np.ndarray:
    """Decode an image from a file-like object (e.g. a multipart upload).

    The stream is rewound to its start, read in full and handed to
    `cv2.imdecode`.

    Args:
        value (_IOBase): Seekable, readable stream holding the encoded image.
        cv_imread_flags (int): Flags passed to `cv2.imdecode`.

    Returns:
        np.ndarray: The decoded image as a numpy array.

    Raises:
        InputImageLoadError: If OpenCV cannot decode the stream content.
    """
    value.seek(0)
    raw_content = value.read()
    decoded = cv2.imdecode(np.frombuffer(raw_content, np.uint8), cv_imread_flags)
    if decoded is not None:
        return decoded
    raise InputImageLoadError(
        message="Could not load valid image from buffer.",
        public_message="Could not decode bytes into image.",
    )

load_image_from_encoded_bytes ¶

load_image_from_encoded_bytes(
    value, cv_imread_flags=cv2.IMREAD_COLOR
)

Load an image from encoded bytes.

Parameters:

Name	Type	Description	Default
`value`	`bytes`	The byte sequence representing the image.	required
`cv_imread_flags`	`int`	OpenCV flags used for image reading.	`IMREAD_COLOR`

Returns:

Type	Description
`ndarray`	np.ndarray: The loaded image as a numpy array.

Source code in inference/core/utils/image_utils.py

def load_image_from_encoded_bytes(
    value: bytes, cv_imread_flags: int = cv2.IMREAD_COLOR
) -> np.ndarray:
    """
    Decode an encoded image (bytes of an image file) with OpenCV.

    Args:
        value (bytes): The byte sequence representing the encoded image.
        cv_imread_flags (int): Flags passed to `cv2.imdecode`.

    Returns:
        np.ndarray: The decoded image as a numpy array.

    Raises:
        InputImageLoadError: If OpenCV cannot decode the bytes into an image.
    """
    byte_buffer = np.asarray(bytearray(value), dtype=np.uint8)
    decoded = cv2.imdecode(byte_buffer, cv_imread_flags)
    if decoded is not None:
        return decoded
    raise InputImageLoadError(
        message=f"Could not decode bytes as image.",
        public_message="Data is not image.",
    )

load_image_from_numpy_str ¶

load_image_from_numpy_str(value)

Loads an image from a numpy array string.

Parameters:

Name	Type	Description	Default
`value`	`Union[bytes, str]`	Base64 string or byte sequence representing the pickled numpy array of the image.	required

Returns:

Type	Description
`ndarray`	np.ndarray: The unpickled and validated image as a numpy array.

Raises:

Type	Description
`InvalidNumpyInput`	If the numpy data is invalid.

Source code in inference/core/utils/image_utils.py

def load_image_from_numpy_str(value: Union[bytes, str]) -> np.ndarray:
    """Loads an image from a pickled numpy array.

    Args:
        value (Union[bytes, str]): Pickled numpy array, either as raw bytes or
            as a base64 string wrapping those bytes.

    Returns:
        np.ndarray: The unpickled array, after it passed `validate_numpy_image`.

    Raises:
        InvalidImageTypeDeclared: If numpy input is disabled via `ALLOW_NUMPY_INPUT`.
        InvalidNumpyInput: If the payload cannot be base64-decoded / unpickled,
            or the unpickled object is not a valid numpy image.
    """
    if not ALLOW_NUMPY_INPUT:
        raise InvalidImageTypeDeclared(
            message="NumPy image type is not supported in this configuration of `inference`.",
            public_message="NumPy image type is not supported in this configuration of `inference`.",
        )
    try:
        if isinstance(value, str):
            value = pybase64.b64decode(value)
        # SECURITY NOTE(review): pickle.loads can execute arbitrary code embedded
        # in the payload. This path is only reachable when ALLOW_NUMPY_INPUT is
        # enabled - keep it disabled wherever inputs are not fully trusted.
        data = pickle.loads(value)
    except (
        AttributeError,
        EOFError,
        ImportError,
        IndexError,
        TypeError,
        ValueError,  # also covers binascii.Error raised by the base64 decoder
        pickle.UnpicklingError,
    ) as error:
        # Unpickling malformed data is documented to raise more than
        # UnpicklingError; all of these mean "payload is not a usable pickle".
        raise InvalidNumpyInput(
            message=f"Could not unpickle image data. Cause: {error}",
            public_message="Could not deserialize pickle payload.",
        ) from error
    validate_numpy_image(data=data)
    return data

load_image_from_url ¶

load_image_from_url(
    value, cv_imread_flags=cv2.IMREAD_COLOR
)

Loads an image from a given URL.

Parameters:

Name	Type	Description	Default
`value`	`str`	URL of the image.	required

Returns:

Type	Description
`ndarray`	np.ndarray: The downloaded and decoded image as a numpy array.

Source code in inference/core/utils/image_utils.py

def load_image_from_url(
    value: str, cv_imread_flags: int = cv2.IMREAD_COLOR
) -> np.ndarray:
    """Download an image from a URL and decode it.

    The URL input must be permitted (`_ensure_url_input_allowed`) and the
    destination validated (`_validate_url_destination`) before any bytes are
    fetched. The downloaded bytes are decoded by `load_image_from_encoded_bytes`.

    Args:
        value (str): URL of the image.
        cv_imread_flags (int): Flags forwarded to the image decoder.

    Returns:
        np.ndarray: The decoded image as a numpy array.

    Raises:
        InputImageLoadError: If the destination is not allowed or the request
            for the image bytes fails.
    """
    _ensure_url_input_allowed()
    prepared_url = _validate_url_destination(value=value)
    try:
        image_bytes = _fetch_image_bytes_from_url(prepared_url=prepared_url)
    except URLAddressNotAllowedError as error:
        message = "URL points to a network destination that is not allowed."
        detailed_message = f"{message} Details: {error}"
        raise InputImageLoadError(
            message=detailed_message,
            public_message=message,
        ) from error
    except (RequestException, ConnectionError) as error:
        raise InputImageLoadError(
            message=f"Could not load image from url: {value}. Details: {error}",
            public_message="Data pointed by URL could not be decoded into image.",
        )
    else:
        return load_image_from_encoded_bytes(
            value=image_bytes, cv_imread_flags=cv_imread_flags
        )

load_image_with_inferred_type ¶

load_image_with_inferred_type(
    value, cv_imread_flags=cv2.IMREAD_COLOR
)

Load an image by inferring its type.

Parameters:

Name	Type	Description	Default
`value`	`Any`	The image data.	required
`cv_imread_flags`	`int`	Flags used for OpenCV's imread function.	`IMREAD_COLOR`

Returns:

Type	Description
`Tuple[ndarray, bool]`	Tuple[np.ndarray, bool]: Loaded image as a numpy array and a boolean indicating if the image is in BGR format.

Raises:

Type	Description
`NotImplementedError`	If the image type could not be inferred.

Source code in inference/core/utils/image_utils.py

def load_image_with_inferred_type(
    value: Any,
    cv_imread_flags: int = cv2.IMREAD_COLOR,
) -> Tuple[np.ndarray, bool]:
    """Load an image by inferring its type.

    Resolution order: numpy array, PIL image, string starting with "http"
    (treated as URL), string naming an existing local file (only when
    `ALLOW_LOADING_IMAGES_FROM_LOCAL_FILESYSTEM` is enabled), and finally
    `attempt_loading_image_from_string` for everything else.

    Args:
        value (Any): The image data.
        cv_imread_flags (int): Flags used for OpenCV's decoding functions.

    Returns:
        Tuple[np.ndarray, bool]: Loaded image as a numpy array and a boolean
            indicating if the image is in BGR format (False only for PIL input,
            which is converted to RGB).

    Raises:
        InvalidNumpyInput: If a numpy input fails `validate_numpy_image`.
        InputImageLoadError: If a local file exists but cannot be decoded.
        Other errors raised by the delegated loaders propagate unchanged.
    """
    if isinstance(value, (np.ndarray, np.generic)):
        validate_numpy_image(data=value)
        return value, True
    if isinstance(value, Image.Image):
        return np.asarray(value.convert("RGB")), False
    if isinstance(value, str):
        if value.startswith("http"):
            return (
                load_image_from_url(value=value, cv_imread_flags=cv_imread_flags),
                True,
            )
        if ALLOW_LOADING_IMAGES_FROM_LOCAL_FILESYSTEM and os.path.isfile(value):
            image = cv2.imread(value, cv_imread_flags)
            if image is None:
                # cv2.imread does not raise on unreadable / non-image files, it
                # returns None - surface that as a load error instead of
                # handing None to the caller as if it were an image.
                raise InputImageLoadError(
                    message=f"Could not decode local file as image: {value}",
                    public_message="Could not decode file into image.",
                )
            return image, True
    return attempt_loading_image_from_string(
        value=value, cv_imread_flags=cv_imread_flags
    )

load_image_with_known_type ¶

load_image_with_known_type(
    value, image_type, cv_imread_flags=cv2.IMREAD_COLOR
)

Load an image using the known image type.

Supports various image types (e.g., NUMPY, PILLOW, etc.) and loads them into a numpy array format.

Parameters:

Name	Type	Description	Default
`value`	`Any`	The image data.	required
`image_type`	`ImageType`	The type of the image.	required
`cv_imread_flags`	`int`	Flags used for OpenCV's imread function.	`IMREAD_COLOR`

Returns:

Type	Description
`Tuple[ndarray, bool]`	Tuple[np.ndarray, bool]: A tuple of the loaded image as a numpy array and a boolean indicating if the image is in BGR format.

Source code in inference/core/utils/image_utils.py

def load_image_with_known_type(
    value: Any,
    image_type: ImageType,
    cv_imread_flags: int = cv2.IMREAD_COLOR,
) -> Tuple[np.ndarray, bool]:
    """Load an image with the loader registered for a declared image type.

    The loader is looked up in `IMAGE_LOADERS` and called with the payload and
    the OpenCV flags.

    Args:
        value (Any): The image data.
        image_type (ImageType): The declared type of the image.
        cv_imread_flags (int): Flags handed to the selected loader.

    Returns:
        Tuple[np.ndarray, bool]: The loader's result and a flag that is True for
            every type except `ImageType.PILLOW` (i.e. the image is BGR).

    Raises:
        InputImageLoadError: If `image_type` is `ImageType.FILE` while loading
            from the local filesystem is disabled.
    """
    filesystem_blocked = (
        image_type is ImageType.FILE and not ALLOW_LOADING_IMAGES_FROM_LOCAL_FILESYSTEM
    )
    if filesystem_blocked:
        raise InputImageLoadError(
            message="Loading images from local filesystem is disabled.",
            public_message="Loading images from local filesystem is disabled.",
        )
    selected_loader = IMAGE_LOADERS[image_type]
    loaded = selected_loader(value, cv_imread_flags)
    return loaded, image_type is not ImageType.PILLOW

np_image_to_base64 ¶

np_image_to_base64(image)

Convert a numpy image to a base64 encoded byte string.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	The numpy array representing an image.	required

Returns:

Name	Type	Description
`bytes`	`bytes`	The base64 encoded image.

Source code in inference/core/utils/image_utils.py

@deprecated(
    reason="Method replaced with inference.core.utils.image_utils.encode_image_to_jpeg_bytes"
)
def np_image_to_base64(image: np.ndarray) -> bytes:
    """
    TODO: This function is broken: https://github.com/roboflow/inference/issues/439
    Serialise a numpy image as JPEG through PIL and return the encoded bytes.

    NOTE(review): despite the function name, no base64 step is applied in this
    body - the value returned is the content of the JPEG buffer.

    Args:
        image (np.ndarray): The numpy array representing an image.

    Returns:
        bytes: Content of the in-memory buffer the JPEG was written to.
    """
    rgb_image = Image.fromarray(image).convert("RGB")
    with BytesIO() as jpeg_buffer:
        rgb_image.save(jpeg_buffer, format="JPEG")
        return jpeg_buffer.getvalue()

validate_numpy_image ¶

validate_numpy_image(data)

Validate if the provided data is a valid numpy image.

Parameters:

Name	Type	Description	Default
`data`	`ndarray`	The numpy array representing an image.	required

Raises:

Type	Description
`InvalidNumpyInput`	If the provided data is not a valid numpy image.

Source code in inference/core/utils/image_utils.py

def validate_numpy_image(data: np.ndarray) -> None:
    """
    Validate if the provided data is a valid numpy image.

    Accepted layouts are 2-D arrays (single-channel image without a channel
    axis) and 3-D arrays whose last axis holds 1 or 3 channels.

    Args:
        data (np.ndarray): The numpy array representing an image.

    Raises:
        InvalidNumpyInput: If the provided data is not a valid numpy image.
    """
    if not isinstance(data, np.ndarray):
        raise InvalidNumpyInput(
            message=f"Data provided as input could not be decoded into np.ndarray object.",
            public_message=f"Data provided as input could not be decoded into np.ndarray object.",
        )
    if len(data.shape) not in (2, 3):
        raise InvalidNumpyInput(
            message=f"For image given as np.ndarray expected 2 or 3 dimensions, got {len(data.shape)} dimensions.",
            public_message=f"For image given as np.ndarray expected 2 or 3 dimensions.",
        )
    # Only 3-D arrays carry a channel axis. For a 2-D array the last axis is the
    # image width, so checking it against the channel counts would wrongly
    # reject grayscale images.
    if len(data.shape) == 3 and data.shape[-1] not in (1, 3):
        raise InvalidNumpyInput(
            message=f"For image given as np.ndarray expected 1 or 3 channels, got {data.shape[-1]} channels.",
            public_message="For image given as np.ndarray expected 1 or 3 channels.",
        )

xyxy_to_xywh ¶

xyxy_to_xywh(xyxy)

Convert bounding box format from (xmin, ymin, xmax, ymax) to (xcenter, ycenter, width, height).

Parameters:

Name	Type	Description	Default
`xyxy`	`List[int]`	List containing the coordinates in (xmin, ymin, xmax, ymax) format.	required

Returns:

Type	Description
`List[int]`	List[int]: List containing the converted coordinates in (xcenter, ycenter, width, height) format.

Source code in inference/core/utils/image_utils.py

def xyxy_to_xywh(xyxy):
    """
    Turn a corner-format box (xmin, ymin, xmax, ymax) into centre-format
    (xcenter, ycenter, width, height).

    Args:
        xyxy (List[int]): Box coordinates in (xmin, ymin, xmax, ymax) order.

    Returns:
        List[int]: [xcenter, ycenter, width, height], each truncated to int.
            Width and height are absolute differences, so swapped corners still
            give non-negative sizes.
    """
    left, top, right, bottom = xyxy[0], xyxy[1], xyxy[2], xyxy[3]
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    box_width = abs(left - right)
    box_height = abs(top - bottom)
    return [int(v) for v in (center_x, center_y, box_width, box_height)]

inference.core.utils.onnx ¶

Functions:¶

get_onnxruntime_execution_providers ¶

get_onnxruntime_execution_providers(value)

Extracts the ONNX runtime execution providers from the given string.

The input string is expected to be a comma-separated list, possibly enclosed within square brackets and containing single quotes.

Parameters:

Name	Type	Description	Default
`value`	`str`	The string containing the list of ONNX runtime execution providers.	required

Returns:

Type	Description
`List[str]`	List[str]: A list of strings representing each execution provider.

Source code in inference/core/utils/onnx.py

def get_onnxruntime_execution_providers(value: str) -> List[str]:
    """Extracts the ONNX runtime execution providers from the given string.

    The input string is expected to be a comma-separated list, possibly enclosed
    within square brackets and containing single quotes. Square brackets, single
    quotes and spaces are stripped before splitting, and empty entries (as
    produced by "[]" or a trailing comma) are dropped so that no empty provider
    name is ever returned.

    Args:
        value (str): The string containing the list of ONNX runtime execution providers.

    Returns:
        List[str]: A list of strings representing each execution provider.
    """
    if len(value) == 0:
        return []
    # Single pass removing every character that is list syntax, not provider name.
    cleaned = value.translate(str.maketrans("", "", "[]' "))
    return [provider for provider in cleaned.split(",") if provider]

inference.core.utils.postprocess ¶

Functions:¶

bitpacked_masks2poly ¶

bitpacked_masks2poly(bitpacked_masks, width)

Convert bit-packed masks with 8 pixels per byte into polygons.

Source code in inference/core/utils/postprocess.py

def bitpacked_masks2poly(bitpacked_masks: np.ndarray, width: int) -> List[np.ndarray]:
    """Convert bit-packed masks with 8 pixels per byte into polygons.

    Each mask is unpacked along its last axis (little bit order) and truncated
    to `width` columns, dropping the padding bits of the final byte.

    Args:
        bitpacked_masks (np.ndarray): Iterable of bit-packed masks.
        width (int): Number of valid pixel columns per row after unpacking.

    Returns:
        List[np.ndarray]: One polygon per mask as produced by `mask2poly`; a mask
            with no set pixel yields an empty (0, 2) float32 array.
    """
    polygons = []
    for packed_mask in bitpacked_masks:
        if not packed_mask.flags.c_contiguous:
            packed_mask = np.ascontiguousarray(packed_mask)
        bits = np.unpackbits(packed_mask, axis=-1, bitorder="little")
        visible = bits[..., :width]
        if np.any(visible):
            polygons.append(mask2poly(visible))
        else:
            # Nothing to trace - skip contour extraction entirely.
            polygons.append(np.zeros((0, 2), dtype=np.float32))
    return polygons

cosine_similarity ¶

cosine_similarity(a, b)

Compute the cosine similarity between two vectors.

Parameters:

Name	Type	Description	Default
`a`	`ndarray`	Vector A.	required
`b`	`ndarray`	Vector B.	required

Returns:

Name	Type	Description
`float`	`Union[number, ndarray]`	Cosine similarity between vectors A and B.

Source code in inference/core/utils/postprocess.py

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> Union[np.number, np.ndarray]:
    """
    Cosine similarity of `a` and `b`: their dot product divided by the product
    of their norms.

    Args:
        a (np.ndarray): Vector A.
        b (np.ndarray): Vector B.

    Returns:
        Union[np.number, np.ndarray]: `np.dot(a, b)` scaled by the square root of
            `vdot(a, a) * vdot(b, b)`. No guard is applied for zero-norm inputs.
    """
    dot_product = np.dot(a, b)
    squared_norm_product = np.vdot(a, a) * np.vdot(b, b)
    return dot_product / np.sqrt(squared_norm_product)

crop_mask ¶

crop_mask(masks, boxes)

"Crop" predicted masks by zeroing out everything not in the predicted bbox. Vectorized by Chong (thanks Chong).

Source code in inference/core/utils/postprocess.py

def crop_mask(masks: np.ndarray, boxes: np.ndarray) -> np.ndarray:
    """
    "Crop" predicted masks by zeroing out everything not in the predicted bbox.
    Vectorized by Chong (thanks Chong).

    Args:
        - masks: array unpacked as (n, h, w) - one mask per box
        - boxes: array of (x1, y1, x2, y2) rows, split along axis 1; coordinates
          are compared against pixel column / row indices, with x2 and y2
          exclusive

    Returns:
        `masks` multiplied by the per-box inside-the-box indicator.
    """
    _, height, width = masks.shape
    # For (n, 4) boxes every coordinate array has shape (n, 1, 1).
    x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)
    column_index = np.arange(width, dtype=x1.dtype)[None, None, :]  # (1, 1, w)
    row_index = np.arange(height, dtype=x1.dtype)[None, :, None]  # (1, h, 1)

    inside_box = (
        (column_index >= x1)
        * (column_index < x2)
        * (row_index >= y1)
        * (row_index < y2)
    )
    return masks * inside_box

get_static_crop_dimensions ¶

get_static_crop_dimensions(
    orig_shape, preproc, disable_preproc_static_crop=False
)

Generates a transformation based on preprocessing configuration.

Parameters:

Name	Type	Description	Default
`orig_shape`	`tuple`	The original shape of the object (e.g., image) - (height, width).	required
`preproc`	`dict`	Preprocessing configuration dictionary, containing information such as static cropping.	required
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`

Returns:

Name	Type	Description
`tuple`	`Tuple[Tuple[int, int], Tuple[int, int]]`	A tuple containing the shift in the x and y directions, and the updated original shape after cropping.

Source code in inference/core/utils/postprocess.py

def get_static_crop_dimensions(
    orig_shape: Tuple[int, int],
    preproc: dict,
    disable_preproc_static_crop: bool = False,
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """
    Work out the pixel shift and resulting shape implied by a static crop.

    When static crop applies, the crop bounds come from `standardise_static_crop`;
    otherwise the full image (0, 0, 1, 1) is used. Bounds are multiplied by the
    image dimensions, i.e. they are treated as fractions of the image size.

    Args:
        orig_shape (tuple): The original shape of the object (e.g., image) - (height, width).
        preproc (dict): Preprocessing configuration dictionary, containing information such as static cropping.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

    Returns:
        tuple: `((shift_x, shift_y), (cropped_height, cropped_width))`, all rounded to int.

    Raises:
        PostProcessingError: If a required key is missing (any KeyError raised while computing the result).
    """
    try:
        crop_enabled = static_crop_should_be_applied(
            preprocessing_config=preproc,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )
        if crop_enabled:
            x_min, y_min, x_max, y_max = standardise_static_crop(
                static_crop_config=preproc[STATIC_CROP_KEY]
            )
        else:
            x_min, y_min, x_max, y_max = 0, 0, 1, 1
        shift = (
            round(x_min * orig_shape[1]),
            round(y_min * orig_shape[0]),
        )
        kept_fraction_x = x_max - x_min
        kept_fraction_y = y_max - y_min
        cropped_shape = (
            round(orig_shape[0] * kept_fraction_y),
            round(orig_shape[1] * kept_fraction_x),
        )
        return shift, cropped_shape
    except KeyError as error:
        raise PostProcessingError(
            f"Could not find a proper configuration key {error} in post-processing."
        )

mask2multipoly ¶

mask2multipoly(mask)

Find all contours in the mask and return them as a float32 array.

Parameters:

Name	Type	Description	Default
`mask`	`ndarray`	A binary mask.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Contours represented as a float32 array.

Source code in inference/core/utils/postprocess.py

def mask2multipoly(mask: np.ndarray) -> List[np.ndarray]:
    """
    Find all external contours in the mask and return them as float32 arrays.

    Args:
        mask (np.ndarray): A binary mask.

    Returns:
        List[np.ndarray]: One (num_points, 2) float32 array per external contour.
            When no contour is found the list holds a single empty (0, 2) array,
            so the result is never an empty list.
    """
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
    if not contours:
        return [np.zeros((0, 2), dtype=np.float32)]
    # Flatten each contour to an (N, 2) array of float32 points.
    return [contour.reshape(-1, 2).astype("float32") for contour in contours]

mask2poly ¶

mask2poly(mask)

Find contours in the mask and return them as a float32 array.

Parameters:

Name	Type	Description	Default
`mask`	`ndarray`	A binary mask.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Contours represented as a float32 array.

Source code in inference/core/utils/postprocess.py

def mask2poly(mask: np.ndarray) -> np.ndarray:
    """
    Extract the external contour with the most points from the mask.

    Args:
        mask (np.ndarray): A binary mask.

    Returns:
        np.ndarray: (num_points, 2) float32 array of the selected contour, or an
            empty (0, 2) float32 array when the mask has no contour.
    """
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
    if not contours:
        return np.zeros((0, 2)).astype("float32")
    # max() keeps the first contour among equally long ones, like argmax does.
    longest = max(contours, key=len)
    return np.array(longest).reshape(-1, 2).astype("float32")

masks2multipoly ¶

masks2multipoly(masks)

Converts binary masks to polygonal segments.

Parameters:

Name	Type	Description	Default
`masks`	`ndarray`	A set of binary masks, where masks are multiplied by 255 and converted to uint8 type.	required

Returns:

Name	Type	Description
`list`	`List[ndarray]`	A list of segments, where each segment is obtained by converting the corresponding mask.

Source code in inference/core/utils/postprocess.py

def masks2multipoly(masks: np.ndarray) -> List[np.ndarray]:
    """Convert a stack of masks into per-mask lists of polygons.

    Every mask is brought to a C-contiguous uint8 form one at a time (bool is
    reinterpreted in place, uint8 is used directly, any other dtype is
    thresholded with `> 0`) and handed to `mask2multipoly`.

    Args:
        masks (numpy.ndarray): Iterable of masks.

    Returns:
        list: One entry per mask - the polygons from `mask2multipoly`, or a list
            holding a single empty (0, 2) float32 array for a mask with no set pixel.
    """
    polygons_per_mask = []
    # Convert mask by mask so no full-size uint8 copy of the stack is needed.
    for mask in masks:
        if mask.dtype == np.uint8:
            candidate, reinterpret = mask, False
        elif mask.dtype == np.bool_:
            candidate, reinterpret = mask, True
        else:
            # Any other dtype: everything above zero counts as foreground.
            candidate, reinterpret = mask > 0, True
        if not candidate.flags.c_contiguous:
            candidate = np.ascontiguousarray(candidate)
        binary = candidate.view(np.uint8) if reinterpret else candidate

        if np.any(binary):
            polygons_per_mask.append(mask2multipoly(binary))
        else:
            polygons_per_mask.append([np.zeros((0, 2), dtype=np.float32)])
    return polygons_per_mask

masks2poly ¶

masks2poly(masks)

Converts binary masks to polygonal segments.

Parameters:

Name	Type	Description	Default
`masks`	`ndarray`	A set of binary masks, where masks are multiplied by 255 and converted to uint8 type.	required

Returns:

Name	Type	Description
`list`	`List[ndarray]`	A list of segments, where each segment is obtained by converting the corresponding mask.

Source code in inference/core/utils/postprocess.py

def masks2poly(masks: np.ndarray) -> List[np.ndarray]:
    """Convert a stack of masks into one polygon per mask.

    Every mask is brought to a C-contiguous uint8 form one at a time (bool is
    reinterpreted in place, uint8 is used directly, any other dtype is
    thresholded with `> 0`) and handed to `mask2poly`.

    Args:
        masks (numpy.ndarray): Iterable of masks.

    Returns:
        list: One polygon per mask - the result of `mask2poly`, or an empty
            (0, 2) float32 array for a mask with no set pixel.
    """
    polygons = []
    # Convert mask by mask so no full-size uint8 copy of the stack is needed.
    for mask in masks:
        if mask.dtype == np.uint8:
            candidate, reinterpret = mask, False
        elif mask.dtype == np.bool_:
            candidate, reinterpret = mask, True
        else:
            # Any other dtype: everything above zero counts as foreground.
            candidate, reinterpret = mask > 0, True
        if not candidate.flags.c_contiguous:
            candidate = np.ascontiguousarray(candidate)
        binary = candidate.view(np.uint8) if reinterpret else candidate

        if np.any(binary):
            polygons.append(mask2poly(binary))
        else:
            polygons.append(np.zeros((0, 2), dtype=np.float32))
    return polygons

post_process_bboxes ¶

post_process_bboxes(
    predictions,
    infer_shape,
    img_dims,
    preproc,
    disable_preproc_static_crop=False,
    resize_method="Stretch to",
)

Postprocesses each patch of detections by scaling them to the original image coordinates and by shifting them based on a static crop preproc (if applied).

Parameters:

Name	Type	Description	Default
`predictions`	`List[List[List[float]]]`	The predictions output from NMS, indices are: batch x prediction x [x1, y1, x2, y2, ...].	required
`infer_shape`	`Tuple[int, int]`	The shape of the inference image.	required
`img_dims`	`List[Tuple[int, int]]`	The dimensions of the original image for each batch, indices are: batch x [height, width].	required
`preproc`	`dict`	Preprocessing configuration dictionary.	required
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`
`resize_method`	`str`	Resize method for image. Defaults to "Stretch to".	`'Stretch to'`

Returns:

Type	Description
`List[List[List[float]]]`	List[List[List[float]]]: The scaled and shifted predictions, indices are: batch x prediction x [x1, y1, x2, y2, ...].

Source code in inference/core/utils/postprocess.py

def post_process_bboxes(
    predictions: List[List[List[float]]],
    infer_shape: Tuple[int, int],
    img_dims: List[Tuple[int, int]],
    preproc: dict,
    disable_preproc_static_crop: bool = False,
    resize_method: str = "Stretch to",
) -> List[List[List[float]]]:
    """
    Map detections of every batch element back onto the original image.

    Per batch element the first four prediction columns (x1, y1, x2, y2) are
    rescaled according to `resize_method`, clipped to the (possibly cropped)
    original shape and shifted by the static-crop offset. Remaining columns are
    passed through untouched. For any other `resize_method` value the rescaling
    step is skipped and only clipping and shifting are applied.

    Args:
        predictions (List[List[List[float]]]): The predictions output from NMS, indices are: batch x prediction x [x1, y1, x2, y2, ...].
        infer_shape (Tuple[int, int]): The shape of the inference image.
        img_dims (List[Tuple[int, int]]): The dimensions of the original image for each batch, indices are: batch x [height, width].
        preproc (dict): Preprocessing configuration dictionary.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
        resize_method (str, optional): Resize method for image. Defaults to "Stretch to".

    Returns:
        List[List[List[float]]]: The scaled and shifted predictions, indices are: batch x prediction x [x1, y1, x2, y2, ...].
    """
    padded_fit_methods = (
        "Fit (black edges) in",
        "Fit (white edges) in",
        "Fit (grey edges) in",
    )
    processed = []
    for batch_index, detections in enumerate(predictions):
        if len(detections) == 0:
            # Keep the batch slot, there is simply nothing to transform.
            processed.append([])
            continue
        detections_array = np.array(detections)
        boxes = detections_array[:, :4]
        (shift_x, shift_y), origin_shape = get_static_crop_dimensions(
            img_dims[batch_index],
            preproc,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )
        if resize_method == "Stretch to":
            boxes = stretch_bboxes(
                predicted_bboxes=boxes,
                infer_shape=infer_shape,
                origin_shape=origin_shape,
            )
        elif resize_method in padded_fit_methods:
            boxes = undo_image_padding_for_predicted_boxes(
                predicted_bboxes=boxes,
                infer_shape=infer_shape,
                origin_shape=origin_shape,
            )
        boxes = clip_boxes_coordinates(
            predicted_bboxes=boxes,
            origin_shape=origin_shape,
        )
        boxes = shift_bboxes(
            bboxes=boxes,
            shift_x=shift_x,
            shift_y=shift_y,
        )
        detections_array[:, :4] = boxes
        processed.append(detections_array.tolist())
    return processed

post_process_keypoints ¶

post_process_keypoints(
    predictions,
    keypoints_start_index,
    infer_shape,
    img_dims,
    preproc,
    disable_preproc_static_crop=False,
    resize_method="Stretch to",
)

Scales and shifts keypoints based on the given image shapes and preprocessing method.

This function performs polygon scaling and shifting based on the specified resizing method and pre-processing steps. The polygons are transformed according to the ratio and padding between two images.

Parameters:

Name	Type	Description	Default
`predictions`	`List[List[List[float]]]`	predictions from model	required
`keypoints_start_index`	`int`	offset in the 3rd dimension pointing where in the prediction start keypoints [(x, y, cfg), ...] for each keypoint class	required
`img_dims`	`list of tuple of int`	Shape of each source image (height, width), one entry per batch element.	required
`infer_shape`	`tuple of int`	Shape of the target image (height, width).	required
`preproc`	`object`	Preprocessing details used for generating the transformation.	required
`resize_method`	`str`	Resizing method, either "Stretch to", "Fit (black edges) in", "Fit (white edges) in", or "Fit (grey edges) in". Defaults to "Stretch to".	`'Stretch to'`
`disable_preproc_static_crop`	`bool`	flag to disable static crop	`False`

Source code in inference/core/utils/postprocess.py

def post_process_keypoints(
    predictions: List[List[List[float]]],
    keypoints_start_index: int,
    infer_shape: Tuple[int, int],
    img_dims: List[Tuple[int, int]],
    preproc: dict,
    disable_preproc_static_crop: bool = False,
    resize_method: str = "Stretch to",
) -> List[List[List[float]]]:
    """Map predicted keypoints from the inference image back onto the source images.

    For every batch element the keypoint columns of the predictions are rescaled
    according to ``resize_method``, clipped to the (static-crop adjusted) origin
    shape and finally shifted by the static-crop offset. Columns before
    ``keypoints_start_index`` are passed through untouched.

    Args:
        predictions: predictions from model, one list of prediction rows per batch element.
        keypoints_start_index: offset in the 3rd dimension pointing where in the prediction
            start keypoints [(x, y, cfg), ...] for each keypoint class.
        infer_shape (tuple of int): Shape of the inference image (height, width).
        img_dims (list of tuple of int): Shapes of the source images (height, width),
            indexed by batch position.
        preproc (dict): Preprocessing details, forwarded to ``get_static_crop_dimensions``.
        disable_preproc_static_crop: flag to disable static crop.
        resize_method (str, optional): Resizing method, either "Stretch to",
            "Fit (black edges) in", "Fit (white edges) in", or "Fit (grey edges) in".
            Any other value skips the rescaling step (clipping and shifting still run).
            Defaults to "Stretch to".

    Returns:
        list of list of list: predictions with post-processed keypoints. Batch
        elements without predictions yield an empty list.
    """
    letterbox_methods = (
        "Fit (black edges) in",
        "Fit (white edges) in",
        "Fit (grey edges) in",
    )
    results = []
    for image_index, image_predictions in enumerate(predictions):
        # Nothing to transform for images without detections.
        if len(image_predictions) == 0:
            results.append([])
            continue
        prediction_array = np.array(image_predictions)
        points = prediction_array[:, keypoints_start_index:]
        # Static crop changes both the effective origin shape and the offset
        # that has to be added back at the end.
        (crop_shift_x, crop_shift_y), origin_shape = get_static_crop_dimensions(
            img_dims[image_index],
            preproc,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )
        if resize_method == "Stretch to":
            points = stretch_keypoints(
                keypoints=points,
                infer_shape=infer_shape,
                origin_shape=origin_shape,
            )
        elif resize_method in letterbox_methods:
            points = undo_image_padding_for_predicted_keypoints(
                keypoints=points,
                infer_shape=infer_shape,
                origin_shape=origin_shape,
            )
        points = clip_keypoints_coordinates(keypoints=points, origin_shape=origin_shape)
        points = shift_keypoints(
            keypoints=points,
            shift_x=crop_shift_x,
            shift_y=crop_shift_y,
        )
        # Write the transformed keypoints back next to the untouched columns.
        prediction_array[:, keypoints_start_index:] = points
        results.append(prediction_array.tolist())
    return results

post_process_polygons ¶

post_process_polygons(
    origin_shape,
    polys,
    infer_shape,
    preproc,
    resize_method="Stretch to",
)

Scales and shifts polygons based on the given image shapes and preprocessing method.

This function performs polygon scaling and shifting based on the specified resizing method and pre-processing steps. The polygons are transformed according to the ratio and padding between two images.

Parameters:

Name	Type	Description	Default
`origin_shape`	`tuple of int`	Shape of the source image (height, width).	required
`infer_shape`	`tuple of int`	Shape of the target image (height, width).	required
`polys`	`list of list of tuple`	List of polygons, where each polygon is represented by a list of (x, y) coordinates.	required
`preproc`	`object`	Preprocessing details used for generating the transformation.	required
`resize_method`	`str`	Resizing method, either "Stretch to", "Fit (black edges) in", "Fit (white edges) in", or "Fit (grey edges) in". Defaults to "Stretch to".	`'Stretch to'`

Returns:

Type	Description
`List[List[Tuple[float, float]]]`	list of list of tuple: A list of shifted and scaled polygons.

Source code in inference/core/utils/postprocess.py

def post_process_polygons(
    origin_shape: Tuple[int, int],
    polys: List[List[Tuple[float, float]]],
    infer_shape: Tuple[int, int],
    preproc: dict,
    resize_method: str = "Stretch to",
) -> List[List[Tuple[float, float]]]:
    """Map polygons from the inference image back onto the source image.

    The polygons are first rescaled according to ``resize_method`` and then
    translated by the static-crop offset reported by ``get_static_crop_dimensions``.

    Args:
        origin_shape (tuple of int): Shape of the source image (height, width).
        polys (list of list of tuple): List of polygons, where each polygon is represented by a list of (x, y) coordinates.
        infer_shape (tuple of int): Shape of the inference image (height, width).
        preproc (dict): Preprocessing details, forwarded to ``get_static_crop_dimensions``.
        resize_method (str, optional): Resizing method, either "Stretch to", "Fit (black edges) in", "Fit (white edges) in", or "Fit (grey edges) in". Defaults to "Stretch to".

    Returns:
        list of list of tuple: A list of shifted and scaled polygons. An
        unrecognised ``resize_method`` yields an empty list.
    """
    (crop_shift_x, crop_shift_y), origin_shape = get_static_crop_dimensions(
        origin_shape, preproc
    )
    if resize_method == "Stretch to":
        # Plain stretch: independent scale factors along each axis.
        rescaled = scale_polygons(
            polygons=polys,
            x_scale=origin_shape[1] / infer_shape[1],
            y_scale=origin_shape[0] / infer_shape[0],
        )
    elif resize_method in {
        "Fit (black edges) in",
        "Fit (white edges) in",
        "Fit (grey edges) in",
    }:
        rescaled = undo_image_padding_for_predicted_polygons(
            polygons=polys,
            infer_shape=infer_shape,
            origin_shape=origin_shape,
        )
    else:
        rescaled = []
    # Move every vertex by the static-crop offset.
    return [
        [(point[0] + crop_shift_x, point[1] + crop_shift_y) for point in polygon]
        for polygon in rescaled
    ]

process_mask_accurate ¶

process_mask_accurate(protos, masks_in, bboxes, shape)

Returns masks that are the size of the original image.

Parameters:

Name	Type	Description	Default
`protos`	`ndarray`	Prototype masks.	required
`masks_in`	`ndarray`	Input masks.	required
`bboxes`	`ndarray`	Bounding boxes.	required
`shape`	`tuple`	Target shape.	required

Returns:

Type	Description
`ndarray`	numpy.ndarray: Processed masks.

Source code in inference/core/utils/postprocess.py

def process_mask_accurate(
    protos: np.ndarray,
    masks_in: np.ndarray,
    bboxes: np.ndarray,
    shape: Tuple[int, int],
) -> np.ndarray:
    """Returns masks that are the size of the original image.

    The masks produced by ``preprocess_segmentation_masks`` are resized with
    bilinear interpolation to ``shape``, cropped to their bounding boxes and
    values below 0.5 are set to 0.

    Args:
        protos (numpy.ndarray): Prototype masks.
        masks_in (numpy.ndarray): Input masks.
        bboxes (numpy.ndarray): Bounding boxes.
        shape (tuple): Target shape (height, width).

    Returns:
        numpy.ndarray: Processed masks.
    """
    masks = preprocess_segmentation_masks(
        protos=protos,
        masks_in=masks_in,
        shape=shape,
    )
    # A single 2-D mask gets a leading axis so the transposes below apply.
    if len(masks.shape) == 2:
        masks = np.expand_dims(masks, axis=0)
    # cv2.resize works on height x width x channels arrays.
    masks = masks.transpose((1, 2, 0))
    # dsize is (width, height). The interpolation flag must be passed by
    # keyword: the third positional parameter of cv2.resize is ``dst``.
    masks = cv2.resize(
        masks, (shape[1], shape[0]), interpolation=cv2.INTER_LINEAR
    )
    # Re-add the channel axis if the resize returned a 2-D array.
    if len(masks.shape) == 2:
        masks = np.expand_dims(masks, axis=2)
    masks = masks.transpose((2, 0, 1))
    masks = crop_mask(masks, bboxes)
    masks[masks < 0.5] = 0
    return masks

process_mask_fast ¶

process_mask_fast(protos, masks_in, bboxes, shape)

Returns masks in their original size.

Parameters:

Name	Type	Description	Default
`protos`	`ndarray`	Prototype masks.	required
`masks_in`	`ndarray`	Input masks.	required
`bboxes`	`ndarray`	Bounding boxes.	required
`shape`	`tuple`	Target shape.	required

Returns:

Type	Description
`ndarray`	numpy.ndarray: Processed masks.

Source code in inference/core/utils/postprocess.py

def process_mask_fast(
    protos: np.ndarray,
    masks_in: np.ndarray,
    bboxes: np.ndarray,
    shape: Tuple[int, int],
) -> np.ndarray:
    """Returns masks in their original (prototype) resolution.

    No resizing is performed: the bounding boxes are scaled down to the
    prototype resolution instead, the masks are cropped to those boxes and
    values below 0.5 are set to 0.

    Args:
        protos (numpy.ndarray): Prototype masks, channels first (CHW).
        masks_in (numpy.ndarray): Input masks.
        bboxes (numpy.ndarray): Bounding boxes.
        shape (tuple): Target shape (height, width).

    Returns:
        numpy.ndarray: Processed masks.
    """
    image_height, image_width = shape
    _, proto_height, proto_width = protos.shape  # CHW
    masks = preprocess_segmentation_masks(
        protos=protos,
        masks_in=masks_in,
        shape=shape,
    )
    # Scale a copy of the boxes so the caller's array is left untouched.
    boxes_in_mask_space = scale_bboxes(
        bboxes=deepcopy(bboxes),
        scale_x=proto_width / image_width,
        scale_y=proto_height / image_height,
    )
    masks = crop_mask(masks, boxes_in_mask_space)
    masks[masks < 0.5] = 0
    return masks

process_mask_tradeoff ¶

process_mask_tradeoff(
    protos, masks_in, bboxes, shape, tradeoff_factor
)

Returns masks that are the size of the original image with a tradeoff factor applied.

Parameters:

Name	Type	Description	Default
`protos`	`ndarray`	Prototype masks.	required
`masks_in`	`ndarray`	Input masks.	required
`bboxes`	`ndarray`	Bounding boxes.	required
`shape`	`tuple`	Target shape.	required
`tradeoff_factor`	`float`	Tradeoff factor for resizing masks.	required

Returns:

Type	Description
`ndarray`	numpy.ndarray: Processed masks.

Source code in inference/core/utils/postprocess.py

def process_mask_tradeoff(
    protos: np.ndarray,
    masks_in: np.ndarray,
    bboxes: np.ndarray,
    shape: Tuple[int, int],
    tradeoff_factor: float,
) -> np.ndarray:
    """Returns masks resized to a resolution between the prototype size and ``shape``.

    The target resolution is a linear blend of the prototype resolution and
    ``shape``: a ``tradeoff_factor`` of 0 skips the resize entirely, a factor
    of 1 resizes to ``shape``. Bounding boxes are scaled to the resulting mask
    resolution, the masks are cropped to them and values below 0.5 are set to 0.

    Args:
        protos (numpy.ndarray): Prototype masks, channels first (CHW).
        masks_in (numpy.ndarray): Input masks.
        bboxes (numpy.ndarray): Bounding boxes.
        shape (tuple): Target shape (height, width).
        tradeoff_factor (float): Tradeoff factor for resizing masks
            (presumably within [0, 1] - not validated here).

    Returns:
        numpy.ndarray: Processed masks.
    """
    _, mh, mw = protos.shape  # CHW
    masks = preprocess_segmentation_masks(
        protos=protos,
        masks_in=masks_in,
        shape=shape,
    )

    # A single 2-D mask gets a leading axis so the transposes below apply.
    if len(masks.shape) == 2:
        masks = np.expand_dims(masks, axis=0)
    # cv2.resize works on height x width x channels arrays.
    masks = masks.transpose((1, 2, 0))
    ih, iw = shape
    h = int(mh * (1 - tradeoff_factor) + ih * tradeoff_factor)
    w = int(mw * (1 - tradeoff_factor) + iw * tradeoff_factor)
    if tradeoff_factor != 0:
        # cv2.resize expects dsize as (width, height); passing (h, w) would
        # transpose non-square outputs. The interpolation flag is passed by
        # keyword because the third positional parameter is ``dst``.
        masks = cv2.resize(masks, (w, h), interpolation=cv2.INTER_LINEAR)
    # Re-add the channel axis if the resize returned a 2-D array.
    if len(masks.shape) == 2:
        masks = np.expand_dims(masks, axis=2)
    masks = masks.transpose((2, 0, 1))
    # Use the actual mask resolution to scale the boxes.
    _, mh, mw = masks.shape
    down_sampled_boxes = scale_bboxes(
        bboxes=deepcopy(bboxes),
        scale_x=mw / iw,
        scale_y=mh / ih,
    )
    masks = crop_mask(masks, down_sampled_boxes)
    masks[masks < 0.5] = 0
    return masks

sigmoid ¶

sigmoid(x)

Computes the sigmoid function for the given input.

The sigmoid function is defined as: f(x) = 1 / (1 + exp(-x))

Parameters:

Name	Type	Description	Default
`x`	`float or ndarray`	Input value or array for which the sigmoid function is to be computed.	required

Returns:

Type	Description
`Union[float, number, ndarray]`	float or numpy.ndarray: The computed sigmoid value(s).

Source code in inference/core/utils/postprocess.py

def sigmoid(x: Union[float, np.ndarray]) -> Union[float, np.number, np.ndarray]:
    """Evaluate the logistic sigmoid ``1 / (1 + exp(-x))``.

    Args:
        x (float or numpy.ndarray): Scalar or array at which the sigmoid is evaluated
            (element-wise for arrays).

    Returns:
        float or numpy.ndarray: The sigmoid of ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

inference.core.utils.preprocess ¶

Functions:¶

letterbox_image ¶

letterbox_image(image, desired_size, color=(0, 0, 0))

Resize and pad image to fit the desired size, preserving its aspect ratio.

Parameters: - image: numpy array representing the image. - desired_size: tuple (width, height) representing the target dimensions. - color: tuple (B, G, R) representing the color to pad with.

Returns: - letterboxed image.

Source code in inference/core/utils/preprocess.py

def letterbox_image(
    image: ImageMetaType,
    desired_size: Tuple[int, int],
    color: Tuple[int, int, int] = (0, 0, 0),
) -> ImageMetaType:
    """
    Resize an image to fit inside the desired size and pad the remainder.

    The image is first resized with its aspect ratio preserved, then padded
    so that it is centred in the target canvas (any odd leftover pixel goes to
    the bottom / right side).

    Parameters:
    - image: numpy array representing the image (a tensor is handled when
      USE_PYTORCH_FOR_PREPROCESSING is set).
    - desired_size: tuple (width, height) representing the target dimensions.
    - color: tuple (B, G, R) representing the color to pad with. On the tensor
      path only the first component is used as the fill value.

    Returns:
    - letterboxed image.
    """
    resized_img = resize_image_keeping_aspect_ratio(
        image=image,
        desired_size=desired_size,
    )
    is_numpy_image = isinstance(resized_img, np.ndarray)
    # numpy images keep (height, width) first; otherwise they are the last two dims.
    if is_numpy_image:
        new_height, new_width = resized_img.shape[:2]
    else:
        new_height, new_width = resized_img.shape[-2:]

    vertical_slack = desired_size[1] - new_height
    horizontal_slack = desired_size[0] - new_width
    top_padding = vertical_slack // 2
    bottom_padding = vertical_slack - top_padding
    left_padding = horizontal_slack // 2
    right_padding = horizontal_slack - left_padding

    if is_numpy_image:
        return cv2.copyMakeBorder(
            resized_img,
            top_padding,
            bottom_padding,
            left_padding,
            right_padding,
            cv2.BORDER_CONSTANT,
            value=color,
        )
    if USE_PYTORCH_FOR_PREPROCESSING:
        # torch pads the last dimension first: (left, right, top, bottom).
        return torch.nn.functional.pad(
            resized_img,
            (left_padding, right_padding, top_padding, bottom_padding),
            "constant",
            color[0],
        )
    raise ValueError(
        f"Received an image of unknown type, {type(resized_img)}; "
        "This is most likely a bug. Contact Roboflow team through github issues "
        "(https://github.com/roboflow/inference/issues) providing full context of the problem"
    )

prepare ¶

prepare(
    image,
    preproc,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
)

Prepares an image by applying a series of preprocessing steps defined in the preproc dictionary.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	The input image as a NumPy array (a tensor is also handled when `USE_PYTORCH_FOR_PREPROCESSING` is set).	required
`preproc`	`dict`	Dictionary containing preprocessing steps. Example: { "resize": {"enabled": true, "width": 416, "height": 416, "format": "Stretch to"}, "static-crop": {"y_min": 25, "x_max": 75, "y_max": 75, "enabled": true, "x_min": 25}, "auto-orient": {"enabled": true}, "grayscale": {"enabled": true}, "contrast": {"enabled": true, "type": "Adaptive Equalization"} }	required
`disable_preproc_contrast`	`bool`	If true, the contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`

Returns:

Name	Type	Description
	`ndarray`	The preprocessed image.
`tuple`	`Tuple[int, int]`	The dimensions of the image.

Note

The function uses global flags like DISABLE_PREPROC_AUTO_ORIENT, DISABLE_PREPROC_STATIC_CROP, etc. to conditionally enable or disable certain preprocessing steps.

Source code in inference/core/utils/preprocess.py

def prepare(
    image: np.ndarray,
    preproc,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
) -> Tuple[np.ndarray, Tuple[int, int]]:
    """
    Applies the preprocessing steps configured in `preproc` to an image.

    The steps run in this order, each only when its `*_should_be_applied` check
    passes: static crop, contrast adjustment, grayscale conversion. The returned
    dimensions are read from the image before any of these steps is applied.

    Args:
        image (numpy.ndarray): The input image. When `USE_PYTORCH_FOR_PREPROCESSING` is set,
            a non-numpy object whose last two `shape` entries are (height, width) is accepted as well.
        preproc (dict): Dictionary containing preprocessing steps. Example:
            {
                "resize": {"enabled": true, "width": 416, "height": 416, "format": "Stretch to"},
                "static-crop": {"y_min": 25, "x_max": 75, "y_max": 75, "enabled": true, "x_min": 25},
                "auto-orient": {"enabled": true},
                "grayscale": {"enabled": true},
                "contrast": {"enabled": true, "type": "Adaptive Equalization"}
            }
        disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

    Returns:
        tuple: The preprocessed image and the (height, width) of the input image.

    Raises:
        ValueError: If the image type is not supported.
        PreProcessingError: If a key required by a preprocessing step is missing from `preproc`.
    """
    try:
        if isinstance(image, np.ndarray):
            height, width = image.shape[0:2]
        elif USE_PYTORCH_FOR_PREPROCESSING:
            height, width = image.shape[-2:]
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(image)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )
        # Remember the input size before cropping changes it.
        original_dims = (height, width)

        crop_requested = static_crop_should_be_applied(
            preprocessing_config=preproc,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )
        if crop_requested:
            image = take_static_crop(
                image=image, crop_parameters=preproc[STATIC_CROP_KEY]
            )

        contrast_requested = contrast_adjustments_should_be_applied(
            preprocessing_config=preproc,
            disable_preproc_contrast=disable_preproc_contrast,
        )
        if contrast_requested:
            image = apply_contrast_adjustment(
                image=image,
                adjustment_type=ContrastAdjustmentType(preproc[CONTRAST_KEY][TYPE_KEY]),
            )

        grayscale_requested = grayscale_conversion_should_be_applied(
            preprocessing_config=preproc,
            disable_preproc_grayscale=disable_preproc_grayscale,
        )
        if grayscale_requested:
            image = apply_grayscale_conversion(image=image)

        return image, original_dims
    except KeyError as error:
        # A missing configuration key is reported as a misconfiguration.
        raise PreProcessingError(
            f"Pre-processing of image failed due to misconfiguration. Missing key: {error}."
        ) from error

resize_image_keeping_aspect_ratio ¶

resize_image_keeping_aspect_ratio(image, desired_size)

Resize the image, preserving its aspect ratio.

Parameters: - image: numpy array representing the image. - desired_size: tuple (width, height) representing the target dimensions.

Source code in inference/core/utils/preprocess.py

def resize_image_keeping_aspect_ratio(
    image: ImageMetaType,
    desired_size: Tuple[int, int],
) -> ImageMetaType:
    """
    Resize an image to fit inside the desired size, preserving its aspect ratio.

    The side that limits the fit is set to the corresponding target dimension;
    the other side is derived from the image's aspect ratio (truncated to an
    integer, but never smaller than one pixel).

    Parameters:
    - image: numpy array representing the image (a tensor is handled when
      USE_PYTORCH_FOR_PREPROCESSING is set).
    - desired_size: tuple (width, height) representing the target dimensions.

    Returns:
    - resized image.

    Raises:
    - ValueError: if the image type is not supported.
    """
    # Aspect ratio is width / height; the axis layout depends on the image type.
    if isinstance(image, np.ndarray):
        img_ratio = image.shape[1] / image.shape[0]
    elif USE_PYTORCH_FOR_PREPROCESSING:
        img_ratio = image.shape[-1] / image.shape[-2]
    else:
        raise ValueError(
            f"Received an image of unknown type, {type(image)}; "
            "This is most likely a bug. Contact Roboflow team through github issues "
            "(https://github.com/roboflow/inference/issues) providing full context of the problem"
        )
    desired_ratio = desired_size[0] / desired_size[1]

    # Determine the new dimensions. The derived side is clamped to 1 because
    # truncation yields 0 for extreme aspect ratios, and a zero-sized target
    # is rejected by the resize backends.
    if img_ratio >= desired_ratio:
        # Resize by width
        new_width = desired_size[0]
        new_height = max(1, int(desired_size[0] / img_ratio))
    else:
        # Resize by height
        new_height = desired_size[1]
        new_width = max(1, int(desired_size[1] * img_ratio))

    # Resize the image to new dimensions
    if isinstance(image, np.ndarray):
        # cv2 takes the target size as (width, height).
        return cv2.resize(image, (new_width, new_height))
    elif USE_PYTORCH_FOR_PREPROCESSING:
        # torch takes the target size as (height, width).
        return torch.nn.functional.interpolate(
            image, size=(new_height, new_width), mode="bilinear"
        )
    else:
        raise ValueError(
            f"Received an image of unknown type, {type(image)}; "
            "This is most likely a bug. Contact Roboflow team through github issues "
            "(https://github.com/roboflow/inference/issues) providing full context of the problem"
        )

inference.core.utils.rle_to_polygon ¶

COCO RLE to OpenCV-style polygon conversion.

Functions:¶

rle_masks_to_polygons ¶

rle_masks_to_polygons(masks)

Convert COCO RLE masks into the legacy largest external polygon.

The old adapter path decoded every RLE into a full-frame dense mask and then called cv2.findContours(mask, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE). This path keeps the RLE sparse until the final contour step, where it materializes only the foreground bounding crop needed by OpenCV.

Source code in inference/core/utils/rle_to_polygon.py

def rle_masks_to_polygons(masks: object) -> List[np.ndarray]:
    """Turn COCO RLE masks into polygons, one per mask.

    The previous adapter path decoded every RLE into a full-frame dense mask
    and then called ``cv2.findContours(mask, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE)``
    to obtain the legacy largest external polygon. This path keeps the RLE
    sparse until the final contour step, where it materializes only the
    foreground bounding crop needed by OpenCV.

    ``masks`` must expose ``image_size`` as ``(height, width)``. When
    ``_get_lazy_uncompressed_counts`` returns a ``(counts, lengths)`` pair,
    row ``i`` of ``counts`` is truncated to ``lengths[i]`` entries before
    conversion; otherwise every entry of ``masks.masks`` is converted as COCO
    counts.
    """

    height, width = masks.image_size
    sparse_counts = _get_lazy_uncompressed_counts(masks=masks)
    if sparse_counts is None:
        # No uncompressed representation available: use the COCO counts.
        return [
            polygon_from_coco_counts(counts=rle_counts, height=height, width=width)
            for rle_counts in masks.masks
        ]
    counts, lengths = sparse_counts
    polygons = []
    for row in range(lengths.shape[0]):
        # Only the first lengths[row] entries of the row are valid counts.
        valid_counts = counts[row, : int(lengths[row])]
        polygons.append(
            polygon_from_uncompressed_counts(
                counts=valid_counts,
                height=height,
                width=width,
            )
        )
    return polygons

inference.core.utils.sqlite_wrapper ¶

inference.core.utils.torchscript_guard ¶

inference.core.utils.url_input ¶

SSRF-hardened primitives for fetching caller-supplied URL image input.

This module owns the network side of URL image loading: resolving hostnames, rejecting non-global destinations, pinning the connection to the validated IP so DNS rebinding cannot swap the target after validation, and (optionally) re-validating every redirect hop instead of letting requests follow redirects blindly.

The URL-string policy (scheme / FQDN / allow-list / block-list) stays in inference.core.utils.image_utils and is injected here as a validate_redirect callback so this module has no opinion about how a URL string is judged.

Background: GHSA-hjmm-hr52-vrp2.

Classes¶

SSRFProtectedHTTPAdapter ¶

Bases: HTTPAdapter

requests adapter that validates + pins the destination IP.

For hostname targets it resolves once, validates, and connects the pool to the resolved IP while preserving the original hostname for TLS SNI, cert verification, and the Host header — so a second resolution (rebinding) cannot redirect the socket. For IP-literal targets it validates the literal directly and lets requests connect normally.

The adapter is mounted on every hop, so even when requests follows redirects itself (legacy mode) each redirect connection is still validated.

Source code in inference/core/utils/url_input.py

class SSRFProtectedHTTPAdapter(HTTPAdapter):
    """``requests`` adapter that validates + pins the destination IP.

    For hostname targets it resolves once, validates, and connects the pool to
    the resolved IP while preserving the original hostname for TLS SNI, cert
    verification, and the ``Host`` header — so a second resolution (rebinding)
    cannot redirect the socket. For IP-literal targets it validates the literal
    directly and lets ``requests`` connect normally.

    The adapter is mounted on every hop, so even when ``requests`` follows
    redirects itself (legacy mode) each redirect connection is still validated.
    """

    def __init__(self, *, allow_non_global_addresses: bool, **kwargs):
        """Store the address policy and forward remaining options to ``HTTPAdapter``.

        Args:
            allow_non_global_addresses: When False, IP-literal targets that fail
                ``address_is_global`` are rejected; the flag is also forwarded
                to ``resolve_and_validate_ips`` for hostname targets.
            **kwargs: Passed unchanged to ``HTTPAdapter.__init__``.
        """
        # Set before super().__init__ so the flag exists by the time the base
        # class initialiser runs.
        self._allow_non_global_addresses = allow_non_global_addresses
        super().__init__(**kwargs)

    def send(self, request, **kwargs):
        """Send ``request`` after forcing the ``Host`` header for hostname targets.

        For URLs whose host is not an IP literal, the ``Host`` header is set to
        the URL's hostname (plus ``:port`` when the URL carries an explicit
        port) before delegating to ``HTTPAdapter.send``.
        """
        # Preserve the vhost/Host header when the pool is pinned to a raw IP.
        parsed = urllib3.util.parse_url(request.url)
        # NOTE(review): the header is only overwritten for hostname targets. If
        # ``request`` already carries a ``Host`` header (e.g. copied from an
        # earlier hop) and the URL host is an IP literal, that value is left in
        # place — confirm whether this can happen on redirect chains.
        if parsed.host is not None and not _host_is_ip_literal(parsed.host):
            host_header = parsed.host
            if parsed.port is not None:
                host_header = f"{host_header}:{parsed.port}"
            request.headers["Host"] = host_header
        return super().send(request, **kwargs)

    def _resolve_pin_target(self, url: str) -> Optional[Tuple[str, str]]:
        """Return ``(hostname, pinned_ip)`` to pin, or ``None`` to connect
        normally. Raises :class:`URLAddressNotAllowedError` when the destination
        is a blocked non-global address.
        """
        parsed = urllib3.util.parse_url(url)
        host = parsed.host
        if host is None:
            return None
        # A missing scheme is treated as https; the port defaults per scheme.
        scheme = parsed.scheme or "https"
        port = parsed.port or (443 if scheme == "https" else 80)
        if _host_is_ip_literal(host):
            literal = _strip_ipv6_brackets(host)
            if not self._allow_non_global_addresses and not address_is_global(literal):
                raise URLAddressNotAllowedError(
                    f"URL points to non-global address '{literal}'."
                )
            # The literal already is the validated address; connect directly.
            return None
        # Validation of resolved addresses is delegated to
        # resolve_and_validate_ips (defined elsewhere in this module).
        resolved_ips = resolve_and_validate_ips(
            host=host,
            port=port,
            allow_non_global_addresses=self._allow_non_global_addresses,
        )
        # Only the first returned address is used for pinning.
        return host, resolved_ips[0]

    def _build_pinned_pool(
        self,
        host_params: Dict[str, Any],
        pool_kwargs: Dict[str, Any],
        hostname: str,
        pinned_ip: str,
    ) -> HTTPConnectionPool:
        """Build a connection pool that dials ``pinned_ip`` on behalf of ``hostname``.

        Args:
            host_params: Scheme/host/port parameters for the pool; copied, then
                ``host`` is replaced with ``pinned_ip``.
            pool_kwargs: Extra pool keyword arguments; copied before modification.
            hostname: Original hostname, used for TLS verification and SNI on
                https pools.
            pinned_ip: Validated IP address the pool connects to.

        Returns:
            The pool obtained from ``self.poolmanager.connection_from_host``.
        """
        # Reuse requests' own host params / TLS pool kwargs (so proxies, custom
        # CA bundles and mTLS client certs are preserved), but point the pool at
        # the validated IP and verify TLS against the original hostname.
        # Work on copies so the caller's dictionaries are not mutated.
        host_params = dict(host_params)
        host_params["host"] = pinned_ip
        pool_kwargs = dict(pool_kwargs)
        is_https = host_params.get("scheme") == "https"
        if is_https:
            # assert_hostname / server_hostname are HTTPS-only; forwarding them
            # to a plain HTTPConnection raises TypeError at connect time.
            pool_kwargs["assert_hostname"] = hostname
        pool = self.poolmanager.connection_from_host(
            **host_params, pool_kwargs=pool_kwargs
        )
        if is_https:
            # SNI must target the original hostname even though we dial the IP.
            pool.conn_kw["server_hostname"] = hostname
        return pool

    def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
        """Return the connection pool for ``request``, pinned to a validated IP when possible.

        Defers to the base implementation when a proxy is selected for the URL
        or when ``_resolve_pin_target`` returns ``None``; otherwise builds a
        pinned pool from the attributes reported by
        ``build_connection_pool_key_attributes``.
        """
        # This is the method on the send() path for requests >= 2.32 (the pinned
        # floor). With a forward proxy the proxy resolves the target, so
        # client-side pinning is moot -> defer and keep proxy/CA/mTLS intact.
        if select_proxy(request.url, proxies):
            if not self._allow_non_global_addresses:
                _warn_proxy_bypasses_ssrf_protection()
            return super().get_connection_with_tls_context(
                request, verify, proxies=proxies, cert=cert
            )
        pin = self._resolve_pin_target(request.url)
        if pin is None:
            return super().get_connection_with_tls_context(
                request, verify, proxies=proxies, cert=cert
            )
        hostname, pinned_ip = pin
        host_params, pool_kwargs = self.build_connection_pool_key_attributes(
            request, verify, cert
        )
        return self._build_pinned_pool(host_params, pool_kwargs, hostname, pinned_ip)

    def get_connection(self, url, proxies=None):
        """Return the connection pool for ``url``, pinned to a validated IP when possible.

        Mirrors ``get_connection_with_tls_context`` but derives scheme and port
        from the URL itself and, for https, requests certificate verification
        via ``cert_reqs``.
        """
        # Fallback for requests < 2.32 (below the pinned floor); kept so the
        # protection also holds if an older requests is somehow installed.
        if select_proxy(url, proxies):
            if not self._allow_non_global_addresses:
                _warn_proxy_bypasses_ssrf_protection()
            return super().get_connection(url, proxies)
        pin = self._resolve_pin_target(url)
        if pin is None:
            return super().get_connection(url, proxies)
        hostname, pinned_ip = pin
        parsed = urllib3.util.parse_url(url)
        # Same scheme/port defaults as _resolve_pin_target.
        scheme = parsed.scheme or "https"
        port = parsed.port or (443 if scheme == "https" else 80)
        host_params = {"scheme": scheme, "host": pinned_ip, "port": port}
        pool_kwargs = {"cert_reqs": "CERT_REQUIRED"} if scheme == "https" else {}
        return self._build_pinned_pool(host_params, pool_kwargs, hostname, pinned_ip)

URLAddressNotAllowedError ¶

Bases: Exception

Raised when a URL resolves to a destination that is not permitted.

Source code in inference/core/utils/url_input.py

class URLAddressNotAllowedError(Exception):
    """Signals that a URL's destination address is not permitted.

    Within this module it is raised when a URL points at an IP literal that is
    not a global address while non-global addresses are disallowed.
    """

Functions:¶

address_is_global ¶

address_is_global(address)

Return True only for public, routable unicast addresses.

ipaddress.is_global already excludes loopback, private (RFC1918), link-local (incl. 169.254.169.254 metadata), CGNAT (100.64/10), ULA (fc00::/7), unspecified and reserved ranges, so a single check covers the destinations the advisory asks us to block. IPv4-mapped IPv6 is unwrapped so ::ffff:127.0.0.1 cannot smuggle a loopback target past the check.

Source code in inference/core/utils/url_input.py

def address_is_global(address: str) -> bool:
    """Tell whether ``address`` is a public, routable unicast IP address.

    The decision is delegated to ``ipaddress``'s ``is_global`` property, which
    already excludes loopback, private (RFC1918), link-local (incl.
    169.254.169.254 metadata), CGNAT (100.64/10), ULA (fc00::/7), unspecified
    and reserved ranges. IPv4-mapped IPv6 addresses are judged by their
    embedded IPv4 address, so ``::ffff:127.0.0.1`` is treated as loopback.

    Strings that do not parse as an IP address yield ``False``.
    """
    try:
        candidate = ipaddress.ip_address(address)
    except ValueError:
        return False
    if candidate.version == 6:
        # Unwrap ::ffff:a.b.c.d so the IPv4 rules apply to it.
        embedded_v4 = candidate.ipv4_mapped
        if embedded_v4 is not None:
            candidate = embedded_v4
    return candidate.is_global

fetch_url_content_legacy ¶

fetch_url_content_legacy(
    url,
    allow_non_global_addresses,
    max_redirects,
    request_timeout=None,
)

Legacy fetch: requests follows redirects itself (capped at max_redirects). Behaviour matches the pre-hardening implementation; the only additions are the explicit redirect cap and — when non-global is disallowed — per-connection IP validation/pinning via the mounted adapter.

Source code in inference/core/utils/url_input.py

def fetch_url_content_legacy(
    url: str,
    allow_non_global_addresses: bool,
    max_redirects: int,
    request_timeout: Optional[float] = None,
) -> bytes:
    """Fetch ``url`` and let ``requests`` follow redirects on its own.

    This is the legacy path: redirects are handled by the session itself,
    capped at ``max_redirects``. Compared with the pre-hardening behaviour the
    only additions are that explicit cap and, when non-global addresses are
    disallowed, the per-connection IP validation/pinning performed by the
    adapter mounted on the session.

    Args:
        url: Address to download.
        allow_non_global_addresses: Forwarded to the session builder.
        max_redirects: Upper bound on redirects the session may follow.
        request_timeout: Timeout passed to the GET request; ``None`` means no
            explicit timeout is passed.

    Returns:
        The raw response body.
    """
    _warn_legacy_redirect_handling()
    http_session = _build_ssrf_protected_session(allow_non_global_addresses)
    http_session.max_redirects = max_redirects
    request_options = {
        "stream": True,
        "allow_redirects": True,
        "timeout": request_timeout,
    }
    try:
        reply = http_session.get(url, **request_options)
        api_key_safe_raise_for_status(response=reply)
        payload = reply.content
    finally:
        # Close the session on both the success and the failure path.
        http_session.close()
    return payload

fetch_url_content_validating_redirects ¶

fetch_url_content_validating_redirects(
    url,
    allow_non_global_addresses,
    max_redirects,
    validate_redirect,
    request_timeout=None,
)

Hardened fetch: follow redirects one hop at a time, re-running the full URL-string policy (via validate_redirect) on every hop before the next request is issued. IP validation/pinning is applied on every hop by the mounted adapter.

Source code in inference/core/utils/url_input.py

def fetch_url_content_validating_redirects(
    url: str,
    allow_non_global_addresses: bool,
    max_redirects: int,
    validate_redirect: Callable[[str], str],
    request_timeout: Optional[float] = None,
) -> bytes:
    """Fetch ``url``, following redirects manually and vetting each hop.

    Automatic redirect handling is switched off. Every ``Location`` target is
    resolved against the current URL and passed through ``validate_redirect``
    (the full URL-string policy) before the next request is issued. IP
    validation/pinning is applied on every hop by the adapter mounted on the
    session.

    Args:
        url: Initial address to download.
        allow_non_global_addresses: Forwarded to the session builder.
        max_redirects: Number of redirects that may be followed; at most
            ``max_redirects + 1`` requests are issued.
        validate_redirect: Callable that receives the absolute redirect target
            and returns the URL to request next (expected to raise when the
            target violates policy).
        request_timeout: Timeout passed to each GET request.

    Returns:
        The raw body of the first non-redirect response.

    Raises:
        requests.exceptions.RequestException: A redirect response carried no
            usable ``Location`` header.
        requests.exceptions.TooManyRedirects: The redirect budget ran out.
    """
    http_session = _build_ssrf_protected_session(allow_non_global_addresses)
    target_url = url
    try:
        for _ in range(max_redirects + 1):
            reply = http_session.get(
                target_url,
                stream=True,
                allow_redirects=False,
                timeout=request_timeout,
            )
            if not reply.is_redirect:
                api_key_safe_raise_for_status(response=reply)
                return reply.content

            redirect_target = reply.headers.get("Location")
            # The redirect body is never read, so release the connection now.
            reply.close()
            if not redirect_target:
                raise requests.exceptions.RequestException(
                    "Redirect response did not contain a Location header."
                )
            # Resolve relative targets, then re-run scheme / FQDN / allow-list /
            # block-list checks on the hop before requesting it.
            target_url = validate_redirect(
                urllib.parse.urljoin(target_url, redirect_target)
            )
        raise requests.exceptions.TooManyRedirects(
            f"Exceeded maximum of {max_redirects} redirects."
        )
    finally:
        http_session.close()

resolve_and_validate_ips ¶

resolve_and_validate_ips(
    host, port, allow_non_global_addresses
)

Resolve host and, unless non-global addresses are allowed, require every resolved IP to be global. Returns all resolved IPs; when validation is enforced, every returned address has already passed the global-address check.

Rejecting when any resolved address is non-global is deliberately conservative: it prevents a rebinding-style response that mixes a global and a non-global A-record from later steering the pinned connection to the non-global one.

Source code in inference/core/utils/url_input.py

def resolve_and_validate_ips(
    host: str,
    port: int,
    allow_non_global_addresses: bool,
) -> List[str]:
    """Resolve ``host`` and optionally insist that every address is global.

    When ``allow_non_global_addresses`` is False, a single non-global address
    among the results rejects the whole host. This is deliberately
    conservative: it prevents a rebinding-style response that mixes a global
    and a non-global A-record from later steering the pinned connection to the
    non-global one.

    Args:
        host: Host name (or literal IP) to resolve.
        port: Port passed to ``socket.getaddrinfo``.
        allow_non_global_addresses: Skip the global-address check when True.

    Returns:
        All resolved IP addresses, in the order returned by the resolver.

    Raises:
        requests.exceptions.ConnectionError: The host could not be resolved or
            resolved to no addresses.
        URLAddressNotAllowedError: Validation is enabled and at least one
            resolved address is not global.
    """
    try:
        lookup_results = socket.getaddrinfo(host, port, proto=socket.IPPROTO_TCP)
    except socket.gaierror as error:
        # Unresolvable host is a normal connection failure, not an SSRF block.
        raise requests.exceptions.ConnectionError(
            f"Could not resolve host: {host}"
        ) from error

    # Each getaddrinfo entry ends with the sockaddr tuple; its first item is the IP.
    resolved_ips = [sockaddr[0] for *_, sockaddr in lookup_results]
    if not resolved_ips:
        raise requests.exceptions.ConnectionError(f"Could not resolve host: {host}")
    if allow_non_global_addresses:
        return resolved_ips

    ip = next(
        (candidate for candidate in resolved_ips if not address_is_global(candidate)),
        None,
    )
    if ip is not None:
        raise URLAddressNotAllowedError(
            f"Host '{host}' resolves to non-global address '{ip}'."
        )
    return resolved_ips

`core/workflows/core_steps/analytics/detection_event_log`¶

inference.core.workflows.core_steps.analytics.detection_event_log.v1 ¶

Classes¶

DetectionEvent `dataclass` ¶

Stores event data for a tracked detection.

Source code in inference/core/workflows/core_steps/analytics/detection_event_log/v1.py

@dataclass
class DetectionEvent:
    """Stores event data for a tracked detection.

    Attributes:
        tracker_id: Identifier of the tracked object.
        class_name: Class label recorded for the object.
        first_seen_frame: Frame index at which the object was first seen.
        first_seen_timestamp: Unix wall-clock time of the first sighting
            (frame_timestamp or time.time()).
        last_seen_frame: Frame index at which the object was last seen.
        last_seen_timestamp: Unix wall-clock time of the latest sighting
            (frame_timestamp or time.time()).
        first_seen_relative: First sighting in seconds since video start.
        last_seen_relative: Latest sighting in seconds since video start.
        frame_count: Number of frames in which the object has been seen.
        logged: Whether the object has been marked as logged.
    """

    # Identity
    tracker_id: int
    class_name: str

    # First / last sighting (frame index and absolute timestamp)
    first_seen_frame: int
    first_seen_timestamp: float
    last_seen_frame: int
    last_seen_timestamp: float

    # Sightings relative to the start of the video, in seconds
    first_seen_relative: float = 0.0
    last_seen_relative: float = 0.0

    # Progress towards being logged
    frame_count: int = 1
    logged: bool = False

DetectionEventLogBlockV1 ¶

Bases: WorkflowBlock

Block that tracks detection events over time.

Maintains a dictionary of tracked objects with:

- First seen timestamp and frame
- Last seen timestamp and frame
- Class name
- Frame count (number of frames the object has been seen)

Only logs objects that have been seen for at least frame_threshold frames. Runs cleanup every flush_interval frames, removing events not seen for stale_frames.

Source code in inference/core/workflows/core_steps/analytics/detection_event_log/v1.py

class DetectionEventLogBlockV1(WorkflowBlock):
    """
    Workflow block that accumulates detection events per tracked object.

    For every video stream a mapping of tracker id -> ``DetectionEvent`` is
    kept, recording:
    - First seen timestamp and frame
    - Last seen timestamp and frame
    - Class name
    - Frame count (number of frames the object has been seen)

    Objects are reported as logged once they have been seen for at least
    frame_threshold frames. A cleanup pass runs every flush_interval frames and
    removes events that have not been seen for more than stale_frames frames.
    """

    def __init__(self):
        # All per-video state below is keyed by video_id.
        # video_id -> {tracker_id -> DetectionEvent}
        self._event_logs: Dict[str, Dict[int, DetectionEvent]] = {}
        # video_id -> internal frame number at which cleanup last ran
        self._last_flush_frame: Dict[str, int] = {}
        # video_id -> internal frame counter (incremented on every run)
        self._frame_count: Dict[str, int] = {}
        # video_id -> value of the global frame counter at the last access
        self._last_access: Dict[str, int] = {}
        # video_id -> wall-clock timestamp of the first frame; anchor for the
        # frame_timestamp-based relative time calculation
        self._first_frame_timestamps: Dict[str, float] = {}
        # Counter shared by all videos, used to order video accesses
        self._global_frame: int = 0
        # Min-heap of (last_access_frame, video_id) used to find the oldest video
        self._access_heap: List[Tuple[int, str]] = []

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block."""
        return BlockManifest

    def _get_relative_time(
        self,
        video_id: str,
        metadata: VideoMetadata,
        fallback_fps: float,
    ) -> float:
        """Return the time in seconds elapsed since the video started.

        When ``metadata.frame_timestamp`` is set, the result is the difference
        between that timestamp and the first timestamp recorded for this
        video, which stays accurate even if inference does not keep up with
        the camera's reported FPS (dropped frames, processing lag). Otherwise
        the value is derived from ``metadata.frame_number`` and the FPS.

        Args:
            video_id: Identifier of the video stream.
            metadata: Video metadata of the current frame.
            fallback_fps: FPS used when ``metadata.fps`` is missing or zero.

        Returns:
            Relative time in seconds.
        """
        frame_timestamp = metadata.frame_timestamp
        if frame_timestamp is None:
            # Use the real video frame number (not the internal counter) so
            # dropped/skipped frames are accounted for. frame_number=0 is a
            # sentinel for static/non-video images and is treated as the first
            # frame, hence the clamp at zero.
            fps = metadata.fps or fallback_fps
            return max(metadata.frame_number - 1, 0) / fps

        frame_ts = frame_timestamp.timestamp()
        # The first timestamp seen for a video becomes its anchor.
        anchor_ts = self._first_frame_timestamps.setdefault(video_id, frame_ts)
        return frame_ts - anchor_ts

    def _evict_oldest_video(self) -> None:
        """Drop all state of the least recently accessed video.

        Does nothing unless more than MAX_VIDEOS videos are being tracked.
        """
        if len(self._event_logs) <= MAX_VIDEOS:
            return

        heap = self._access_heap
        # The heap is rebuilt from scratch whenever it holds fewer entries
        # than there are tracked videos.
        if len(heap) < len(self._last_access):
            heap[:] = [
                (access_frame, video) for video, access_frame in self._last_access.items()
            ]
            heapq.heapify(heap)

        # Discard outdated heap entries until one matches the current state.
        while heap:
            access_frame, video = heapq.heappop(heap)
            if self._last_access.get(video) == access_frame and video in self._event_logs:
                oldest_video_id = video
                break
        else:
            # Heap exhausted without a valid entry: scan the access map directly.
            oldest_video_id = min(self._last_access, key=self._last_access.get)

        # Forget everything recorded for the evicted video.
        for per_video_state in (
            self._event_logs,
            self._last_flush_frame,
            self._frame_count,
            self._last_access,
            self._first_frame_timestamps,
        ):
            per_video_state.pop(oldest_video_id, None)

    def _remove_stale_events(
        self,
        event_log: Dict[int, DetectionEvent],
        current_frame: int,
        stale_frames: int,
        frame_threshold: int,
    ) -> List[DetectionEvent]:
        """Delete events not seen for more than ``stale_frames`` frames.

        Args:
            event_log: Event log of one video; modified in place.
            current_frame: Current internal frame number of the video.
            stale_frames: Maximum number of frames an event may go unseen.
            frame_threshold: Minimum frame count for an event to count as logged.

        Returns:
            The removed events whose frame count reached ``frame_threshold``,
            i.e. "complete" events for objects that were tracked long enough
            to be logged and have now left the scene. Removed events below the
            threshold are discarded.
        """
        stale_tracker_ids = [
            tracker_id
            for tracker_id, event in event_log.items()
            if current_frame - event.last_seen_frame > stale_frames
        ]
        complete_events = [
            event_log[tracker_id]
            for tracker_id in stale_tracker_ids
            if event_log[tracker_id].frame_count >= frame_threshold
        ]
        # Delete only after the scan so the dict is not mutated while iterated.
        for tracker_id in stale_tracker_ids:
            del event_log[tracker_id]

        return complete_events

    def run(
        self,
        image: WorkflowImageData,
        detections: sv.Detections,
        frame_threshold: int,
        flush_interval: int,
        stale_frames: int,
        fallback_fps: float = 1.0,
        reference_timestamp: Optional[float] = None,
    ) -> BlockResult:
        """Update the event log of the image's video with new detections.

        Args:
            image: Workflow image data containing video metadata.
            detections: Tracked detections with tracker_id from ByteTracker.
            frame_threshold: Minimum frames an object must be seen before logging.
            flush_interval: How often (in frames) to run stale event cleanup.
            stale_frames: Remove events not seen for this many frames.
            fallback_fps: FPS to use when video metadata doesn't provide FPS.
            reference_timestamp: Unused, kept for backward compatibility.

        Returns:
            Dictionary containing the formatted event log, the input
            detections, total_logged, total_pending and complete_events.
        """
        metadata = image.video_metadata
        video_id = metadata.video_identifier

        # Record this access on the global counter; eviction order depends on it.
        self._global_frame += 1
        self._last_access[video_id] = self._global_frame

        # Advance the per-video frame counter.
        current_frame = self._frame_count.get(video_id, 0) + 1
        self._frame_count[video_id] = current_frame

        current_time = self._get_relative_time(video_id, metadata, fallback_fps)

        # Prefer the capture time of the frame over the processing time.
        if metadata.frame_timestamp is None:
            current_absolute_time = time.time()
        else:
            current_absolute_time = metadata.frame_timestamp.timestamp()

        # Register the video first, then evict if that exceeded MAX_VIDEOS.
        event_log = self._event_logs.setdefault(video_id, {})
        self._evict_oldest_video()

        # A video seen for the first time starts its flush window now.
        last_flush = self._last_flush_frame.setdefault(video_id, current_frame)

        stale_complete_events: List[DetectionEvent] = []
        if current_frame - last_flush >= flush_interval:
            stale_complete_events = self._remove_stale_events(
                event_log, current_frame, stale_frames, frame_threshold
            )
            self._last_flush_frame[video_id] = current_frame
        complete_events = self._format_complete_events(stale_complete_events)

        tracker_ids = detections.tracker_id
        if tracker_ids is not None and len(tracker_ids) > 0:
            # Prefer explicit class names; otherwise synthesize them from class ids.
            class_names = detections.data.get("class_name", [])
            if (
                len(class_names) == 0
                and hasattr(detections, "class_id")
                and detections.class_id is not None
            ):
                class_names = [f"class_{cid}" for cid in detections.class_id]
            has_class_names = len(class_names) > 0

            for index, raw_tracker_id in enumerate(tracker_ids):
                tracker_id = int(raw_tracker_id)
                class_name = str(class_names[index]) if has_class_names else "unknown"

                event = event_log.get(tracker_id)
                if event is None:
                    # First sighting of this tracker id.
                    event_log[tracker_id] = DetectionEvent(
                        tracker_id=tracker_id,
                        class_name=class_name,
                        first_seen_frame=current_frame,
                        first_seen_timestamp=current_absolute_time,
                        last_seen_frame=current_frame,
                        last_seen_timestamp=current_absolute_time,
                        first_seen_relative=current_time,
                        last_seen_relative=current_time,
                        frame_count=1,
                        logged=False,
                    )
                    continue

                # Known tracker id: refresh its last-seen data.
                event.last_seen_frame = current_frame
                event.last_seen_timestamp = current_absolute_time
                event.last_seen_relative = current_time
                event.frame_count += 1

                # Flip the logged flag the first time the threshold is reached.
                if not event.logged and event.frame_count >= frame_threshold:
                    event.logged = True
                    logger.debug(
                        f"Object {tracker_id} ({event.class_name}) logged after {event.frame_count} frames"
                    )

        event_log_dict, total_logged, total_pending = self._format_event_log(
            event_log, frame_threshold
        )
        return {
            OUTPUT_KEY: event_log_dict,
            DETECTIONS_OUTPUT_KEY: detections,
            "total_logged": total_logged,
            "total_pending": total_pending,
            "complete_events": complete_events,
        }

    def _format_complete_events(
        self,
        complete_events: List[DetectionEvent],
    ) -> Dict[str, Any]:
        """Convert complete events into plain dictionaries for output.

        Args:
            complete_events: DetectionEvent objects that have completed (gone stale).

        Returns:
            Dictionary keyed by the stringified tracker_id; each value holds
            the event's fields without the internal ``logged`` flag.
        """
        return {
            str(event.tracker_id): {
                field: value
                for field, value in event.__dict__.items()
                if field != "logged"
            }
            for event in complete_events
        }

    def _format_event_log(
        self,
        event_log: Dict[int, DetectionEvent],
        frame_threshold: int,
    ) -> tuple:
        """Split the event log into logged and pending events for output.

        Args:
            event_log: Event log of one video.
            frame_threshold: Minimum frame count for an event to count as logged.

        Returns:
            Tuple of (event_log_dict, total_logged, total_pending), where
            event_log_dict has the keys "logged" and "pending".
        """
        logged_events = {}
        pending_events = {}

        for tracker_id, event in event_log.items():
            bucket = (
                logged_events
                if event.frame_count >= frame_threshold
                else pending_events
            )
            # The internal "logged" flag is not part of the output.
            bucket[str(tracker_id)] = {
                field: value
                for field, value in event.__dict__.items()
                if field != "logged"
            }

        event_log_dict = {"logged": logged_events, "pending": pending_events}
        return event_log_dict, len(logged_events), len(pending_events)

Methods:¶

run ¶

run(
    image,
    detections,
    frame_threshold,
    flush_interval,
    stale_frames,
    fallback_fps=1.0,
    reference_timestamp=None,
)

Process detections and update the event log.

Parameters:

Name	Type	Description	Default
`image`	`WorkflowImageData`	Workflow image data containing video metadata.	required
`detections`	`Detections`	Tracked detections with tracker_id from ByteTracker.	required
`frame_threshold`	`int`	Minimum frames an object must be seen before logging.	required
`flush_interval`	`int`	How often to run stale event cleanup.	required
`stale_frames`	`int`	Remove events not seen for this many frames.	required
`fallback_fps`	`float`	FPS to use when video metadata doesn't provide FPS.	`1.0`
`reference_timestamp`	`Optional[float]`	Unused, kept for backward compatibility.	`None`

Returns:

Type	Description
`BlockResult`	Dictionary containing event_log, detections, total_logged, total_pending, and complete_events.

Source code in inference/core/workflows/core_steps/analytics/detection_event_log/v1.py

def run(
    self,
    image: WorkflowImageData,
    detections: sv.Detections,
    frame_threshold: int,
    flush_interval: int,
    stale_frames: int,
    fallback_fps: float = 1.0,
    reference_timestamp: Optional[float] = None,
) -> BlockResult:
    """Update the event log of the image's video with new detections.

    Args:
        image: Workflow image data containing video metadata.
        detections: Tracked detections with tracker_id from ByteTracker.
        frame_threshold: Minimum frames an object must be seen before logging.
        flush_interval: How often (in frames) to run stale event cleanup.
        stale_frames: Remove events not seen for this many frames.
        fallback_fps: FPS to use when video metadata doesn't provide FPS.
        reference_timestamp: Unused, kept for backward compatibility.

    Returns:
        Dictionary containing the formatted event log, the input detections,
        total_logged, total_pending and complete_events.
    """
    metadata = image.video_metadata
    video_id = metadata.video_identifier

    # Record this access on the global counter; eviction order depends on it.
    self._global_frame += 1
    self._last_access[video_id] = self._global_frame

    # Advance the per-video frame counter.
    current_frame = self._frame_count.get(video_id, 0) + 1
    self._frame_count[video_id] = current_frame

    current_time = self._get_relative_time(video_id, metadata, fallback_fps)

    # Prefer the capture time of the frame over the processing time.
    if metadata.frame_timestamp is None:
        current_absolute_time = time.time()
    else:
        current_absolute_time = metadata.frame_timestamp.timestamp()

    # Register the video first, then evict if that exceeded MAX_VIDEOS.
    event_log = self._event_logs.setdefault(video_id, {})
    self._evict_oldest_video()

    # A video seen for the first time starts its flush window now.
    last_flush = self._last_flush_frame.setdefault(video_id, current_frame)

    stale_complete_events = []
    if current_frame - last_flush >= flush_interval:
        stale_complete_events = self._remove_stale_events(
            event_log, current_frame, stale_frames, frame_threshold
        )
        self._last_flush_frame[video_id] = current_frame
    complete_events = self._format_complete_events(stale_complete_events)

    tracker_ids = detections.tracker_id
    if tracker_ids is not None and len(tracker_ids) > 0:
        # Prefer explicit class names; otherwise synthesize them from class ids.
        class_names = detections.data.get("class_name", [])
        if (
            len(class_names) == 0
            and hasattr(detections, "class_id")
            and detections.class_id is not None
        ):
            class_names = [f"class_{cid}" for cid in detections.class_id]
        has_class_names = len(class_names) > 0

        for index, raw_tracker_id in enumerate(tracker_ids):
            tracker_id = int(raw_tracker_id)
            class_name = str(class_names[index]) if has_class_names else "unknown"

            event = event_log.get(tracker_id)
            if event is None:
                # First sighting of this tracker id.
                event_log[tracker_id] = DetectionEvent(
                    tracker_id=tracker_id,
                    class_name=class_name,
                    first_seen_frame=current_frame,
                    first_seen_timestamp=current_absolute_time,
                    last_seen_frame=current_frame,
                    last_seen_timestamp=current_absolute_time,
                    first_seen_relative=current_time,
                    last_seen_relative=current_time,
                    frame_count=1,
                    logged=False,
                )
                continue

            # Known tracker id: refresh its last-seen data.
            event.last_seen_frame = current_frame
            event.last_seen_timestamp = current_absolute_time
            event.last_seen_relative = current_time
            event.frame_count += 1

            # Flip the logged flag the first time the threshold is reached.
            if not event.logged and event.frame_count >= frame_threshold:
                event.logged = True
                logger.debug(
                    f"Object {tracker_id} ({event.class_name}) logged after {event.frame_count} frames"
                )

    event_log_dict, total_logged, total_pending = self._format_event_log(
        event_log, frame_threshold
    )
    return {
        OUTPUT_KEY: event_log_dict,
        DETECTIONS_OUTPUT_KEY: detections,
        "total_logged": total_logged,
        "total_pending": total_pending,
        "complete_events": complete_events,
    }

`core/workflows/core_steps/classical_cv/auto_rotate_on_edges`¶

inference.core.workflows.core_steps.classical_cv.auto_rotate_on_edges.v1 ¶

Classes¶

Functions:¶

build_auto_rotate_matrix ¶

build_auto_rotate_matrix(width, height, angle_degrees)

Build the canvas-expanding rotation matrix used to deskew an image.

This mirrors apply_rotate_image in inference/core/workflows/core_steps/classical_cv/image_preprocessing/v1.py EXACTLY (same center computed via integer // division, same cv2.getRotationMatrix2D call, same canvas-expansion math using int() truncation). This function is the single source of truth for that math: downstream consumers rebuild this matrix from (width, height, angle) and invert it to map detections made on the rotated image back into the original image's coordinate space. If you change this function, any such consumer MUST be re-verified against it.

Parameters¶

- `width` (int): Width (in pixels) of the ORIGINAL (pre-rotation) image.
- `height` (int): Height (in pixels) of the ORIGINAL (pre-rotation) image.
- `angle_degrees` (float): Rotation angle in degrees (OpenCV convention: positive is counter-clockwise).

Returns¶

`np.ndarray`: The adjusted 2x3 affine rotation matrix, already translated to rotate about the original image's center and re-centered into the expanded output canvas.

Source code in inference/core/workflows/core_steps/classical_cv/auto_rotate_on_edges/v1.py

def build_auto_rotate_matrix(
    width: int, height: int, angle_degrees: float
) -> np.ndarray:
    """
    Build the canvas-expanding rotation matrix used to deskew an image.

    The computation mirrors `apply_rotate_image` in
    `inference/core/workflows/core_steps/classical_cv/image_preprocessing/v1.py`
    EXACTLY: the center comes from integer `//` division, the base matrix from
    the same `cv2.getRotationMatrix2D` call, and the expanded canvas size from
    the same `int()`-truncated math. This function is the single source of
    truth for that math: downstream consumers rebuild this matrix from
    (width, height, angle) and invert it to map detections made on the rotated
    image back into the original image's coordinate space. If you change this
    function, any such consumer MUST be re-verified against it.

    Parameters
    ----------
    width : int
        Width (in pixels) of the ORIGINAL (pre-rotation) image.
    height : int
        Height (in pixels) of the ORIGINAL (pre-rotation) image.
    angle_degrees : float
        Rotation angle in degrees (OpenCV convention: positive is
        counter-clockwise).

    Returns
    -------
    np.ndarray
        The adjusted 2x3 affine rotation matrix, already translated to rotate
        about the original image's center and re-centered into the expanded
        output canvas.
    """
    center_x, center_y = width // 2, height // 2
    matrix = cv2.getRotationMatrix2D((center_x, center_y), angle_degrees, 1.0)

    # Magnitudes of the first row's rotation terms drive the canvas size.
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])

    # int() truncates rather than rounds; this is part of the mirrored contract.
    canvas_width = int((height * abs_sin) + (width * abs_cos))
    canvas_height = int((height * abs_cos) + (width * abs_sin))

    # Shift the translation so the original center lands on the canvas center.
    matrix[0, 2] += (canvas_width / 2) - center_x
    matrix[1, 2] += (canvas_height / 2) - center_y

    return matrix

`core/workflows/core_steps/classical_cv/camera_focus`¶

inference.core.workflows.core_steps.classical_cv.camera_focus.v1 ¶

Classes¶

Functions:¶

calculate_brenner_measure ¶

calculate_brenner_measure(
    input_image,
    text_color=(255, 255, 255),
    text_thickness=2,
)

Brenner's focus measure.

Parameters¶

- `input_image` (np.ndarray): Image as HxW grayscale or HxWxC (1/2/3/4 channels); normalized to grayscale uint8 internally.
- `text_color` (Tuple[int, int, int], optional): The color of the text displaying the Brenner value, in BGR format. Default is white (255, 255, 255).
- `text_thickness` (int, optional): The thickness of the text displaying the Brenner value. Default is 2.

Returns¶

`Tuple[np.ndarray, float]`: The Brenner image and the Brenner value.

Source code in inference/core/workflows/core_steps/classical_cv/camera_focus/v1.py

def calculate_brenner_measure(
    input_image: np.ndarray,
    text_color: Tuple[int, int, int] = (255, 255, 255),
    text_thickness: int = 2,
) -> Tuple[np.ndarray, float]:
    """
    Compute Brenner's focus measure and a visualization of it.

    For every pixel, the positive intensity difference to the pixel two
    columns to the right and two rows below is taken; the larger of the two
    is squared. The mean of that map is the focus value.

    Parameters
    ----------
    input_image : np.ndarray
        Image as HxW grayscale or HxWxC (1/2/3/4 channels); normalized to grayscale uint8 internally.
    text_color : Tuple[int, int, int], optional
        The color of the text displaying the Brenner value, in BGR format. Default is white (255, 255, 255).
    text_thickness : int, optional
        The thickness of the text displaying the Brenner value. Default is 2.

    Returns
    -------
    Tuple[np.ndarray, float]
        The Brenner image (focus map scaled to uint8 with the value drawn on
        it) and the Brenner value.
    """
    gray_uint8 = _to_grayscale_uint8_for_brenner(input_image)

    # Work in int16 so pixel differences can be negative before clipping
    # (presumably the helper returns uint8, as its name suggests).
    signed_gray = gray_uint8.astype(np.int16)
    rows, cols = signed_gray.shape

    # Positive differences between pixels two columns apart; the last two
    # columns have no partner and stay at zero.
    horizontal_diff = np.zeros((rows, cols))
    horizontal_diff[:, : cols - 2] = np.clip(
        signed_gray[:, 2:] - signed_gray[:, :-2], 0, None
    )

    # Same two rows apart; the last two rows stay at zero.
    vertical_diff = np.zeros((rows, cols))
    vertical_diff[: rows - 2, :] = np.clip(
        signed_gray[2:, :] - signed_gray[:-2, :], 0, None
    )

    # Per-pixel maximum of both directions, squared.
    focus_measure = np.maximum(horizontal_diff, vertical_diff) ** 2
    focus_value = focus_measure.mean()

    # Scale to 0..255 for display; an all-zero map cannot be normalized.
    peak = float(focus_measure.max())
    if peak > 0:
        focus_measure_image = ((focus_measure / peak) * 255).astype(np.uint8)
    else:
        focus_measure_image = np.zeros(focus_measure.shape, dtype=np.uint8)

    # Draw the Brenner value near the top-left corner of the visualization.
    cv2.putText(
        focus_measure_image,
        f"Focus value: {focus_value:.2f}",
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        text_color,
        text_thickness,
    )

    return focus_measure_image, focus_value

inference.core.workflows.core_steps.classical_cv.camera_focus.v2 ¶

Classes¶

Functions:¶

visualize_tenengrad_measure ¶

visualize_tenengrad_measure(
    input_image,
    underexposed_threshold=16,
    overexposed_threshold=239,
    show_zebra_warnings=True,
    grid_overlay="3x3",
    show_hud=True,
    show_focus_peaking=True,
    show_center_marker=True,
    detections=None,
)

Tenengrad focus measure with visualization overlay.

Uses Sobel operators to compute gradient magnitudes as a focus metric. Higher values indicate sharper/more in-focus images.

Returns the input image unchanged if no visualizations are enabled.

Source code in inference/core/workflows/core_steps/classical_cv/camera_focus/v2.py

def visualize_tenengrad_measure(
    input_image: np.ndarray,
    underexposed_threshold: int = 16,
    overexposed_threshold: int = 239,
    show_zebra_warnings: bool = True,
    grid_overlay: str = "3x3",
    show_hud: bool = True,
    show_focus_peaking: bool = True,
    show_center_marker: bool = True,
    detections: Optional[sv.Detections] = None,
) -> Tuple[np.ndarray, float, List[Optional[float]]]:
    """
    Compute the Tenengrad focus measure and render the requested overlays.

    The focus metric (Sobel-based gradient magnitude; higher means sharper)
    is always computed. The ``show_*`` flags and ``grid_overlay`` only decide
    which overlays are drawn on top of a copy of the input.

    Args:
        input_image: Image to analyse; 2-D inputs are converted to BGR before drawing.
        underexposed_threshold: Forwarded to the zebra-warning overlay.
        overexposed_threshold: Forwarded to the zebra-warning overlay.
        show_zebra_warnings: Draw the exposure zebra overlay.
        grid_overlay: Key looked up in ``GRID_DIVISIONS``; unknown keys disable the grid.
        show_hud: Draw the HUD overlay.
        show_focus_peaking: Draw the focus-peaking overlay.
        show_center_marker: Draw the center marker.
        detections: Optional detections forwarded to the focus computation.

    Returns:
        Tuple of (output image, focus value, per-detection focus measures).
        When every visualization is disabled the input image object itself is
        returned, not a copy.
    """
    divisions = GRID_DIVISIONS.get(grid_overlay, 0)
    nothing_to_draw = not (
        show_zebra_warnings
        or show_hud
        or show_focus_peaking
        or show_center_marker
        or divisions > 0
    )

    # The metric is needed regardless of whether anything gets drawn.
    gray, focus_measure, focus_value, bbox_focus_measures = _compute_tenengrad(
        input_image, detections
    )
    if nothing_to_draw:
        return input_image, focus_value, bbox_focus_measures

    # Overlays are drawn on a separate canvas with three dimensions.
    canvas = (
        input_image.copy()
        if len(input_image.shape) == 3
        else cv2.cvtColor(input_image, cv2.COLOR_GRAY2BGR)
    )

    # Layer order matters: the HUD is drawn last so it stays on top.
    if show_zebra_warnings:
        canvas = _apply_zebra_warnings(
            canvas, gray, underexposed_threshold, overexposed_threshold
        )
    if show_focus_peaking:
        canvas = _apply_focus_peaking(canvas, focus_measure)
    if show_center_marker:
        canvas = _draw_center_marker(canvas)
    if divisions > 0:
        canvas = _draw_grid(canvas, divisions)
    if show_hud:
        canvas = _draw_hud_overlay(canvas, focus_value, gray, input_image)

    return canvas, focus_value, bbox_focus_measures

`core/workflows/core_steps/classical_cv/contours`¶

inference.core.workflows.core_steps.classical_cv.contours.v1 ¶

Classes¶

Functions:¶

find_and_draw_contours ¶

find_and_draw_contours(
    image, color=(255, 0, 255), thickness=3
)

Finds and draws contours on the image.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	Input thresholded image.	required
`color`	`tuple`	Color of the contour lines in BGR. Defaults to purple (255, 0, 255).	`(255, 0, 255)`
`thickness`	`int`	Thickness of the contour lines. Defaults to 3.	`3`

Returns:

Name	Type	Description
`tuple`	`Tuple[ndarray, Sequence[ndarray], ndarray]`	Image with contours drawn, the detected contours, and the contour hierarchy.

Source code in inference/core/workflows/core_steps/classical_cv/contours/v1.py

def find_and_draw_contours(
    image: np.ndarray, color: Tuple[int, int, int] = (255, 0, 255), thickness: int = 3
) -> Tuple[np.ndarray, Tuple[np.ndarray, ...], np.ndarray]:
    """
    Finds and draws contours on the image.

    Args:
        image (np.ndarray): Input thresholded image. A 3-channel image is converted
            to grayscale (assuming BGR order) before contour detection.
        color (tuple, optional): Color of the contour lines in BGR. Defaults to purple (255, 0, 255).
        thickness (int, optional): Thickness of the contour lines. Defaults to 3.

    Returns:
        tuple: A 3-tuple ``(contour_image, contours, hierarchy)`` where
            - contour_image: BGR rendering of the grayscale input with the contours drawn on it
            - contours: the external contours returned by ``cv2.findContours``
            - hierarchy: the hierarchy array returned by ``cv2.findContours``
              (NOTE(review): may be ``None`` when no contours are found - confirm)
        The number of contours is ``len(contours)``.
    """
    # If not in grayscale, convert to grayscale
    if len(image.shape) == 3 and image.shape[2] == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Find contours (outermost only, with compressed segments)
    contours, hierarchy = cv2.findContours(
        image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Draw contours on a BGR copy of the grayscale image so the color is visible
    contour_image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    cv2.drawContours(contour_image, contours, -1, color, thickness)

    # Return the annotated image together with the raw contours and hierarchy
    return contour_image, contours, hierarchy

`core/workflows/core_steps/classical_cv/contrast_enhancement`¶

inference.core.workflows.core_steps.classical_cv.contrast_enhancement.v1 ¶

Classes¶

Functions:¶

enhance_contrast ¶

enhance_contrast(
    image,
    clip_limit,
    contrast_multiplier=1.0,
    normalize_brightness=False,
)

Enhance image contrast using histogram normalization with optional brightness normalization.

Source code in inference/core/workflows/core_steps/classical_cv/contrast_enhancement/v1.py

def enhance_contrast(
    image: WorkflowImageData,
    clip_limit: int,
    contrast_multiplier: float = 1.0,
    normalize_brightness: bool = False,
) -> WorkflowImageData:
    """Enhance image contrast, dispatching on the channel layout of the input.

    Args:
        image: Source image; its ``numpy_image`` is copied before processing.
        clip_limit: Clip limit expressed in percent (divided by 100 internally).
        contrast_multiplier: Contrast multiplier forwarded to the channel helpers.
        normalize_brightness: When true a gamma of 1/1.3 is used (midtone
            brightening); otherwise gamma is 1.0.

    Returns:
        A new ``WorkflowImageData`` holding the enhanced image. For 4-channel
        input the fourth (alpha) channel is carried over untouched.
    """
    pixels = image.numpy_image.copy()
    clip_fraction = float(clip_limit) / 100.0
    multiplier = float(contrast_multiplier)
    gamma = 1.0 / 1.3 if normalize_brightness else 1.0

    # 2-D grayscale: the single-channel helper builds the result itself.
    if len(pixels.shape) == 2:
        return _enhance_channel_contrast(
            image, pixels, clip_fraction, multiplier, gamma
        )

    channel_count = pixels.shape[2]

    # (H, W, 1): drop the trailing axis and treat it as grayscale.
    if channel_count == 1:
        return _enhance_channel_contrast(
            image, pixels[:, :, 0], clip_fraction, multiplier, gamma
        )

    if channel_count == 4:
        # BGRA: enhance the color planes only, then glue alpha back on.
        color_planes = _enhance_multichannel_contrast(
            pixels[:, :, :3], clip_fraction, multiplier, gamma
        )
        enhanced = np.concatenate([color_planes, pixels[:, :, 3:4]], axis=2)
    else:
        # Anything else is handled as a plain multi-channel (BGR) image.
        enhanced = _enhance_multichannel_contrast(
            pixels, clip_fraction, multiplier, gamma
        )

    return WorkflowImageData.copy_and_replace(
        origin_image_data=image,
        numpy_image=enhanced.astype(np.uint8),
    )

`core/workflows/core_steps/classical_cv/distance_measurement`¶

inference.core.workflows.core_steps.classical_cv.distance_measurement.v1 ¶

Classes¶

Functions:¶

has_overlap ¶

has_overlap(bbox1, bbox2)

Check if two bounding boxes overlap.

Parameters:

Name	Type	Description	Default
`bbox1`	`Tuple[int, int, int, int]`	A tuple of (x_min, y_min, x_max, y_max) for the first bounding box.	required
`bbox2`	`Tuple[int, int, int, int]`	A tuple of (x_min, y_min, x_max, y_max) for the second bounding box.	required

Returns:

Type	Description
`bool`	True if the bounding boxes overlap, False otherwise.

Source code in inference/core/workflows/core_steps/classical_cv/distance_measurement/v1.py

def has_overlap(
    bbox1: Tuple[int, int, int, int], bbox2: Tuple[int, int, int, int]
) -> bool:
    """
    Tell whether two axis-aligned bounding boxes intersect.

    Boxes that merely touch along an edge or at a corner count as overlapping.

    Args:
        bbox1: (x_min, y_min, x_max, y_max) of the first box.
        bbox2: (x_min, y_min, x_max, y_max) of the second box.

    Returns:
        True if the boxes overlap, False otherwise.
    """
    left_a, top_a, right_a, bottom_a = bbox1
    left_b, top_b, right_b, bottom_b = bbox2

    # The boxes are disjoint exactly when a gap exists along at least one axis.
    gap_on_x = right_a < left_b or right_b < left_a
    gap_on_y = bottom_a < top_b or bottom_b < top_a

    return not (gap_on_x or gap_on_y)

`core/workflows/core_steps/classical_cv/image_blur`¶

inference.core.workflows.core_steps.classical_cv.image_blur.v1 ¶

Classes¶

Functions:¶

apply_blur ¶

apply_blur(image, blur_type, ksize=5)

Applies the specified blur to the image.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	Input image.	required
`blur_type`	`str`	Type of blur ('average', 'gaussian', 'median', 'bilateral').	required
`ksize`	`int`	Kernel size for the blur. Defaults to 5.	`5`

Returns:

Type	Description
`ndarray`	np.ndarray: Blurred image.

Source code in inference/core/workflows/core_steps/classical_cv/image_blur/v1.py

def apply_blur(image: np.ndarray, blur_type: str, ksize: int = 5) -> np.ndarray:
    """
    Blur an image with one of the supported OpenCV filters.

    Args:
        image: Input image.
        blur_type (str): One of 'average', 'gaussian', 'median', 'bilateral'.
        ksize (int, optional): Kernel size for the blur. Defaults to 5. For
            'gaussian' and 'median' it is first passed through
            ``_to_positive_odd``.

    Returns:
        np.ndarray: Blurred image.

    Raises:
        ValueError: If ``blur_type`` is not one of the supported names.
    """
    # Filters that take the kernel size as given.
    if blur_type == "average":
        return cv2.blur(image, (ksize, ksize))
    if blur_type == "bilateral":
        return cv2.bilateralFilter(image, ksize, 75, 75)

    if blur_type not in ("gaussian", "median"):
        raise ValueError(f"Unknown blur type: {blur_type}")

    # Remaining filters use the kernel size adjusted by the helper.
    odd_ksize = _to_positive_odd(ksize)
    if blur_type == "gaussian":
        return cv2.GaussianBlur(image, (odd_ksize, odd_ksize), 0)
    return cv2.medianBlur(image, odd_ksize)

`core/workflows/core_steps/classical_cv/mask_area_measurement`¶

inference.core.workflows.core_steps.classical_cv.mask_area_measurement.v1 ¶

Classes¶

Functions:¶

compute_detection_areas ¶

compute_detection_areas(detections)

Compute the area of all detections in square pixels.

For bounding-box-only detections, areas are computed in a single vectorized operation. For detections with segmentation masks, the area is the count of non-zero mask pixels (via cv2.countNonZero). This correctly handles masks with holes — hole pixels are zero and are not counted. Falls back to the bounding box area when the mask pixel count is zero.

Parameters:

Name	Type	Description	Default
`detections`	`Detections`	A supervision Detections object.	required

Returns:

Type	Description
`List[float]`	List of areas in square pixels, one per detection.

Source code in inference/core/workflows/core_steps/classical_cv/mask_area_measurement/v1.py

def compute_detection_areas(detections: sv.Detections) -> List[float]:
    """Compute the area of all detections in square pixels.

    Bounding-box areas are computed for every detection in a single vectorized
    operation. For detections with segmentation masks, the area is the count of
    non-zero mask pixels (via ``countNonZero``). This correctly handles masks
    with holes — hole pixels are zero and are not counted. Falls back to the
    bounding box area when the mask pixel count is zero.

    Args:
        detections: A supervision Detections object.

    Returns:
        List of areas in square pixels, one per detection.
    """
    if len(detections) == 0:
        return []

    # One vectorized pass over all boxes; these values are the final answer
    # when there are no masks and the per-item fallback for empty masks.
    boxes = detections.xyxy
    bbox_areas = [
        float(area)
        for area in (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    ]

    # Whether masks exist does not change per detection, so check it once.
    masks = detections.mask
    if masks is None:
        return bbox_areas

    areas = []
    for mask, bbox_area in zip(masks, bbox_areas):
        pixel_count = cv.countNonZero(mask.astype(np.uint8))
        # An all-zero mask carries no area information; use the box instead.
        areas.append(float(pixel_count) if pixel_count > 0 else bbox_area)

    return areas

`core/workflows/core_steps/classical_cv/mask_edge_snap`¶

inference.core.workflows.core_steps.classical_cv.mask_edge_snap.v1 ¶

Classes¶

Functions:¶

refine_masks ¶

refine_masks(
    image,
    segmentation,
    pixel_tolerance,
    sigma,
    min_contour_area,
    dilation_iterations,
    boundary_band_width,
    adaptive_window_size,
)

Refine instance segmentation masks by snapping edges to detected boundaries.

Source code in inference/core/workflows/core_steps/classical_cv/mask_edge_snap/v1.py

def refine_masks(
    image: WorkflowImageData,
    segmentation: sv.Detections,
    pixel_tolerance: int,
    sigma: float,
    min_contour_area: float,
    dilation_iterations: int,
    boundary_band_width: int,
    adaptive_window_size: int,
) -> tuple:
    """Refine instance segmentation masks by snapping edges to detected boundaries.

    Pipeline, in order: grayscale conversion, construction of a band around
    the existing mask boundaries, Sobel gradient magnitude with a locally
    adaptive threshold, optional morphological cleanup, removal of small edge
    components, and finally moving each mask contour point along its normal
    onto the strongest nearby edge pixel.

    Args:
        image: Source image; its ``numpy_image`` is copied and analysed.
        segmentation: Detections whose ``mask`` entries are refined.
        pixel_tolerance: Maximum number of pixels a contour point may be moved
            along its normal; also used as the distance limit in the
            snap-validity test.
        sigma: Multiplier of the local standard deviation in the adaptive
            threshold (``mean + sigma * std``).
        min_contour_area: Minimum contour area an edge component must have to
            be kept (clamped to >= 0).
        dilation_iterations: Iteration count of the morphological closing.
            When 0, both the closing and the ``_zhang_suen_one_iteration``
            step are skipped.
        boundary_band_width: Radius (at least 1) of the elliptical kernel used
            to build the band around the mask boundaries.
        adaptive_window_size: Side of the box-filter window for the local
            statistics (at least 3, bumped to the next odd number if even).

    Returns:
        Tuple ``(refined_detections, edges_detections)``. ``edges_detections``
        is a single detection covering all retained edge pixels, or an empty
        ``sv.Detections`` if none remain. When ``segmentation`` has no masks or
        is empty, the input ``segmentation`` object itself is returned as the
        first element.
    """

    np_img = image.numpy_image.copy()
    H, W = np_img.shape[:2]

    # Convert to grayscale
    if len(np_img.shape) == 2:
        gray = np_img.copy()
    elif np_img.shape[2] == 1:
        gray = np_img[:, :, 0]
    elif np_img.shape[2] == 4:
        gray = cv2.cvtColor(np_img, cv2.COLOR_BGRA2GRAY)
    elif np_img.shape[2] == 3:
        gray = cv2.cvtColor(np_img, cv2.COLOR_BGR2GRAY)
    else:
        # For any other case, try to convert assuming BGR
        try:
            gray = cv2.cvtColor(np_img, cv2.COLOR_BGR2GRAY)
        except cv2.error:
            # If conversion fails, take first channel
            gray = np_img[:, :, 0] if len(np_img.shape) >= 3 else np_img.copy()

    tol = int(pixel_tolerance)
    # Kernel side is 2 * radius + 1 so it is always odd and centred.
    band_radius = max(1, int(boundary_band_width))
    band_kernel = cv2.getStructuringElement(
        cv2.MORPH_ELLIPSE, (band_radius * 2 + 1, band_radius * 2 + 1)
    )

    # Create boundary band from segmentation masks
    if segmentation.mask is not None and len(segmentation) > 0:
        boundary_band_pre = np.zeros((H, W), dtype=np.uint8)
        for m in segmentation.mask:
            m_uint8 = m.astype(np.uint8)
            # Masks may come at a different resolution than the image.
            if m_uint8.shape[:2] != (H, W):
                m_uint8 = cv2.resize(m_uint8, (W, H), interpolation=cv2.INTER_NEAREST)
            inner_i = cv2.erode(m_uint8, band_kernel)
            outer_i = cv2.dilate(m_uint8, band_kernel)
            # Band = pixels covered by the dilated mask but not by the eroded
            # one; the union over all masks is accumulated with np.maximum.
            boundary_band_pre = np.maximum(
                boundary_band_pre, ((outer_i > 0) & (inner_i == 0)).astype(np.uint8)
            )
    else:
        # Without masks the band imposes no restriction.
        boundary_band_pre = np.ones((H, W), dtype=np.uint8)

    # Compute Sobel edges
    gx = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=3)
    gy = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=3)
    magnitude = cv2.magnitude(gx, gy)

    # Adaptive threshold on Sobel magnitude
    win = max(3, int(adaptive_window_size))
    if win % 2 == 0:
        win += 1
    # Local mean and variance via box filters: var = E[x^2] - E[x]^2,
    # clamped at 0 to absorb negative values from floating-point rounding.
    mag_mean = cv2.boxFilter(magnitude, cv2.CV_32F, (win, win))
    mag_sq_mean = cv2.boxFilter(magnitude * magnitude, cv2.CV_32F, (win, win))
    mag_var = np.maximum(mag_sq_mean - mag_mean * mag_mean, 0.0)
    mag_std = np.sqrt(mag_var)
    threshold_field = mag_mean + float(sigma) * mag_std
    edges_adaptive = (magnitude > threshold_field).astype(np.uint8) * 255

    # Morphological closing and thinning
    iterations = max(0, int(dilation_iterations))
    if iterations > 0:
        close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        edges_adaptive = cv2.morphologyEx(
            edges_adaptive, cv2.MORPH_CLOSE, close_kernel, iterations=iterations
        )
        # NOTE(review): presumably a single Zhang-Suen thinning pass; the
        # helper is defined elsewhere - confirm.
        edges_adaptive = _zhang_suen_one_iteration(edges_adaptive)

    edges = edges_adaptive

    # Apply boundary band filter
    if segmentation.mask is not None and len(segmentation) > 0:
        edges_to_filter = (edges * boundary_band_pre).astype(np.uint8)
    else:
        edges_to_filter = edges.copy()

    # Filter by contour area
    min_area = max(0.0, float(min_contour_area))
    num_labels, labels, _, _ = cv2.connectedComponentsWithStats(
        edges_to_filter, connectivity=8
    )
    edge_filtered = np.zeros((H, W), dtype=np.uint8)
    # Label 0 is skipped; iteration starts at the first component label.
    for lbl in range(1, num_labels):
        comp_mask = (labels == lbl).astype(np.uint8)
        comp_contours, _ = cv2.findContours(
            comp_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
        )
        if not comp_contours:
            continue
        # Keep the whole component when its largest external contour is big enough.
        main_contour = max(comp_contours, key=cv2.contourArea)
        if cv2.contourArea(main_contour) >= min_area:
            edge_filtered[labels == lbl] = 255

    snap_region = edge_filtered > 0

    # Build edges detection output
    if snap_region.any():
        ys, xs = np.where(snap_region)
        import uuid

        # All retained edge pixels are reported as one detection whose box is
        # the bounding box of those pixels (max + 1 on the far sides).
        edges_detections = sv.Detections(
            xyxy=np.array(
                [[xs.min(), ys.min(), xs.max() + 1, ys.max() + 1]], dtype=np.float32
            ),
            mask=snap_region[None, :, :],
            confidence=np.array([1.0], dtype=np.float32),
            class_id=np.array([0], dtype=int),
            data={
                "class_name": np.array(["edges"]),
                "detection_id": np.array([str(uuid.uuid4())]),
            },
        )
    else:
        edges_detections = sv.Detections.empty()

    # If no segmentation, return early
    if segmentation.mask is None or len(segmentation) == 0:
        return segmentation, edges_detections

    # Snap contours to edges
    refined_masks = []
    # Number of contour points on each side used to estimate the local tangent.
    TANGENT_WINDOW = 5

    for mask in segmentation.mask:
        mask_uint8 = mask.astype(np.uint8)
        if mask_uint8.shape[:2] != (H, W):
            mask_uint8 = cv2.resize(mask_uint8, (W, H), interpolation=cv2.INTER_NEAREST)

        dist = cv2.distanceTransform(mask_uint8, cv2.DIST_L2, cv2.DIST_MASK_PRECISE)
        mask_dilated = cv2.dilate(mask_uint8, np.ones((3, 3), np.uint8))
        # Snap targets: retained edge pixels whose distance-transform value is
        # at most `tol`, or that lie outside the 3x3-dilated mask.
        valid_snap = snap_region & ((dist <= tol) | (mask_dilated == 0))

        mask_contours, _ = cv2.findContours(
            mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
        )
        if not mask_contours:
            # Nothing to snap (e.g. empty mask): keep the mask as it is.
            refined_masks.append(mask_uint8.astype(bool))
            continue

        new_mask = np.zeros((H, W), dtype=np.uint8)

        for mc in mask_contours:
            mc_pts = mc.reshape(-1, 2).astype(np.float32)
            n_pts = mc_pts.shape[0]
            # Points that find no snap target keep their original position.
            refined_pts = mc_pts.copy()

            for i in range(n_pts):
                px, py = mc_pts[i]

                # Tangent from neighbours TANGENT_WINDOW steps away, wrapping
                # around the closed contour via the modulo.
                prev_pt = mc_pts[(i - TANGENT_WINDOW) % n_pts]
                next_pt = mc_pts[(i + TANGENT_WINDOW) % n_pts]
                tangent = next_pt - prev_pt
                t_len = np.linalg.norm(tangent)
                if t_len < 1e-6:
                    # Degenerate tangent (neighbours coincide): leave the point.
                    continue
                tangent /= t_len
                # Unit normal: the tangent rotated by 90 degrees.
                nx, ny = -tangent[1], tangent[0]

                # Starting at 0.0 means only strictly positive scores can win.
                best_score = 0.0
                best_pt = None

                # Search both directions along the normal, 1..tol pixels away.
                for sign in (1.0, -1.0):
                    for d in range(1, tol + 1):
                        sx = int(round(px + sign * nx * d))
                        sy = int(round(py + sign * ny * d))
                        if 0 <= sx < W and 0 <= sy < H and valid_snap[sy, sx]:
                            mag = float(magnitude[sy, sx])
                            # Closer candidates are weighted higher.
                            proximity = 1.0 - d / (tol + 1)
                            score = mag * proximity
                            if score > best_score:
                                best_score = score
                                best_pt = (sx, sy)

                if best_pt is not None:
                    refined_pts[i] = best_pt

            # Rasterize the adjusted contour back into the mask.
            refined_contour = refined_pts.reshape(-1, 1, 2).astype(np.int32)
            cv2.fillPoly(new_mask, [refined_contour], color=1)

        refined_masks.append(new_mask.astype(bool))

    refined_masks_np = np.stack(refined_masks, axis=0)
    # Only the masks change; boxes are copied from the input rather than
    # recomputed from the refined masks, and the other fields are passed through.
    refined_detections = sv.Detections(
        xyxy=segmentation.xyxy.copy(),
        mask=refined_masks_np,
        confidence=segmentation.confidence,
        class_id=segmentation.class_id,
        tracker_id=segmentation.tracker_id,
        data=segmentation.data,
    )

    return refined_detections, edges_detections

`core/workflows/core_steps/classical_cv/morphological_transformation`¶

inference.core.workflows.core_steps.classical_cv.morphological_transformation.v2 ¶

Classes¶

Functions:¶

apply_morphological_operation ¶

apply_morphological_operation(img, kernel_size, operation)

Apply morphological operation to color image, preserving color format and alpha channel.

Source code in inference/core/workflows/core_steps/classical_cv/morphological_transformation/v2.py

def apply_morphological_operation(
    img: np.ndarray, kernel_size: int, operation: str
) -> np.ndarray:
    """Run a named morphological operation on an image, keeping any alpha channel.

    Grayscale and single-channel inputs are expanded to BGR first. For
    4-channel input the fourth channel is set aside, the operation is applied
    to the color planes, and the channel is appended again afterwards.

    Args:
        img: Input image.
        kernel_size: Side length of the square all-ones structuring element.
        operation: One of 'Erosion', 'Dilation', 'Opening', 'Closing',
            'Opening then Closing', 'Gradient', 'Top Hat', 'Black Hat'.

    Returns:
        The processed image (with the alpha channel re-attached if present).

    Raises:
        ValueError: If ``operation`` is not a supported name.
    """
    alpha_channel = None
    if len(img.shape) == 2 or img.shape[2] == 1:
        # Grayscale / single channel to BGR
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    elif img.shape[2] == 4:
        # BGRA: remember alpha, continue on the color planes only
        alpha_channel = img[:, :, 3:4]
        img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

    element = np.ones((kernel_size, kernel_size), np.uint8)

    # Lazy dispatch table: nothing is computed until the handler is called.
    handlers = {
        "Dilation": lambda: cv2.dilate(img, element, iterations=1),
        "Erosion": lambda: cv2.erode(img, element, iterations=1),
        "Opening": lambda: cv2.morphologyEx(img, cv2.MORPH_OPEN, element),
        "Closing": lambda: cv2.morphologyEx(img, cv2.MORPH_CLOSE, element),
        # Open (erosion then dilation) followed by close (dilation then erosion)
        "Opening then Closing": lambda: cv2.morphologyEx(
            cv2.morphologyEx(img, cv2.MORPH_OPEN, element),
            cv2.MORPH_CLOSE,
            element,
        ),
        "Gradient": lambda: cv2.morphologyEx(img, cv2.MORPH_GRADIENT, element),
        "Top Hat": lambda: cv2.morphologyEx(img, cv2.MORPH_TOPHAT, element),
        "Black Hat": lambda: cv2.morphologyEx(img, cv2.MORPH_BLACKHAT, element),
    }

    handler = handlers.get(operation)
    if handler is None:
        raise ValueError(
            f"Invalid operation: {operation}. Supported operations are 'Erosion', 'Dilation', 'Opening', 'Closing', 'Opening then Closing', 'Gradient', 'Top Hat', 'Black Hat'."
        )
    processed = handler()

    # Re-attach alpha if it was present
    if alpha_channel is not None:
        processed = np.concatenate([processed, alpha_channel], axis=2)

    return processed

`core/workflows/core_steps/classical_cv/motion_detection`¶

inference.core.workflows.core_steps.classical_cv.motion_detection.v1 ¶

Classes¶

Functions:¶

clip_contours_to_contour ¶

clip_contours_to_contour(contours, clip_contour)

Clip OpenCV contours to another contour and return clipped OpenCV contours.

Parameters:

Name	Type	Description	Default
`contours`	`List[ndarray]`	List of OpenCV contours, each as numpy array of shape (N, 1, 2)	required
`clip_contour`	`ndarray`	Clip contour as numpy array of shape (M, 2) with xy points	required

Returns:

Type	Description
`List[ndarray]`	List of clipped OpenCV contours as numpy arrays of shape (N, 1, 2).
`List[ndarray]`	Only includes contours that overlap with the clip contour.

Source code in inference/core/workflows/core_steps/classical_cv/motion_detection/v1.py

def clip_contours_to_contour(
    contours: List[np.ndarray], clip_contour: np.ndarray
) -> List[np.ndarray]:
    """
    Intersect each OpenCV contour with a clipping polygon.

    Args:
        contours: List of OpenCV contours, each as numpy array of shape (N, 1, 2)
        clip_contour: Clip contour as numpy array of shape (M, 2) with xy points

    Returns:
        List of clipped OpenCV contours as numpy arrays of shape (N, 1, 2).
        Contours with fewer than three points, contours that do not overlap
        the clip polygon, and contours for which the geometry operations fail
        are left out. An intersection consisting of several polygons
        contributes one contour per polygon.
    """
    clip_poly = Polygon(clip_contour)
    clipped_contours = []

    for contour in contours:
        # OpenCV layout (N, 1, 2) -> plain xy rows (N, 2)
        vertices = contour.reshape(-1, 2)
        if len(vertices) < 3:
            continue

        try:
            overlap = Polygon(vertices).intersection(clip_poly)
            if overlap.is_empty:
                continue

            # Normalize the result to an iterable of polygons; other geometry
            # types (lines, points, collections) are ignored.
            if overlap.geom_type == "Polygon":
                pieces = [overlap]
            elif overlap.geom_type == "MultiPolygon":
                pieces = overlap.geoms
            else:
                pieces = []

            for piece in pieces:
                # Drop the closing vertex, which repeats the first one.
                ring = list(piece.exterior.coords[:-1])
                if len(ring) >= 3:
                    clipped_contours.append(list_to_contour(ring))

        except Exception:
            # Silently skip contours that fail shapely operations
            # (e.g., self-intersecting polygons)
            continue

    return clipped_contours

list_to_contour ¶

list_to_contour(list_of_tuples)

Convert a list of (x, y) tuples to an OpenCV contour format.

Parameters:

Name	Type	Description	Default
`list_of_tuples`	`List[Tuple]`	List of coordinate tuples [(x1, y1), (x2, y2), ...]	required

Returns:

Type	Description
`ndarray`	NumPy array of shape (N, 1, 2) suitable for OpenCV operations

Source code in inference/core/workflows/core_steps/classical_cv/motion_detection/v1.py

def list_to_contour(list_of_tuples: List[Tuple]) -> np.ndarray:
    """
    Pack (x, y) coordinate tuples into an OpenCV-style contour array.

    Only the first two entries of each tuple are used, and each is converted
    with ``int()`` (truncation toward zero for floats).

    Args:
        list_of_tuples: List of coordinate tuples [(x1, y1), (x2, y2), ...]

    Returns:
        int32 NumPy array of shape (N, 1, 2) suitable for OpenCV operations
    """
    integer_pairs = [(int(pt[0]), int(pt[1])) for pt in list_of_tuples]
    flat_points = np.asarray(integer_pairs, dtype=np.int32)
    # Insert the singleton middle axis OpenCV contours carry.
    return flat_points.reshape(-1, 1, 2)

`core/workflows/core_steps/classical_cv/pixel_color_count`¶

inference.core.workflows.core_steps.classical_cv.pixel_color_count.v1 ¶

Classes¶

Functions:¶

count_specific_color_pixels ¶

count_specific_color_pixels(image, target_color, tolerance)

Counts the number of pixels that match the target color within the given tolerance.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	Input image.	required
`target_color`	`Union[str, tuple]`	Target color in hex format (e.g., '#431112') or BGR tuple (e.g., (18, 17, 67)).	required
`tolerance`	`int`	Tolerance for color matching.	required

Returns:

Name	Type	Description
`int`	`int`	Number of pixels that match the target color.

Source code in inference/core/workflows/core_steps/classical_cv/pixel_color_count/v1.py

def count_specific_color_pixels(
    image: np.ndarray,
    target_color: Union[str, Tuple[int, int, int]],
    tolerance: int,
) -> int:
    """
    Count the pixels whose color lies within ``tolerance`` of the target color.

    Args:
        image: Input image.
        target_color (Union[str, tuple]): Target color in hex format (e.g., '#431112') or BGR tuple (e.g., (18, 17, 67)).
        tolerance (int): Per-channel tolerance; a pixel matches when every
            channel is within ``target - tolerance`` .. ``target + tolerance``.

    Returns:
        int: Number of pixels that match the target color.
    """
    target = np.array(convert_color_to_bgr_tuple(color=target_color))

    # inRange marks matching pixels as non-zero, so counting them gives the answer.
    in_range = cv2.inRange(image, target - tolerance, target + tolerance)
    matches = cv2.countNonZero(in_range)

    return int(matches)

`core/workflows/core_steps/classical_cv/sift`¶

inference.core.workflows.core_steps.classical_cv.sift.v1 ¶

Classes¶

Functions:¶

apply_sift ¶

apply_sift(image)

Applies SIFT to the image. Args: image: Input image. Returns: np.ndarray: Image with keypoints drawn. list: Keypoints detected. np.ndarray: Descriptors of the keypoints.

Source code in inference/core/workflows/core_steps/classical_cv/sift/v1.py

def apply_sift(image: np.ndarray) -> (np.ndarray, list, np.ndarray):
    """
    Applies SIFT to the image.

    The input image is only read; the visualization is drawn on a newly
    allocated image.

    Args:
        image: Input image (converted to grayscale assuming BGR order).
    Returns:
        np.ndarray: Image with keypoints drawn.
        list: Keypoints detected, each as a dict with the keys
            "pt", "size", "angle", "response", "octave" and "class_id".
        np.ndarray: Descriptors of the keypoints.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    sift = cv2.SIFT_create()
    kp, des = sift.detectAndCompute(gray, None)
    # Pass None as the output image so OpenCV allocates a fresh buffer.
    # Handing it the caller's `image` lets drawKeypoints reuse (and overwrite)
    # that array when its size and type match.
    img_with_kp = cv2.drawKeypoints(gray, kp, None)
    # Convert keypoints to the desired format
    keypoints = [
        {
            "pt": (point.pt[0], point.pt[1]),
            "size": point.size,
            "angle": point.angle,
            "response": point.response,
            "octave": point.octave,
            "class_id": point.class_id,
        }
        for point in kp
    ]
    return img_with_kp, keypoints, des

`core/workflows/core_steps/classical_cv/sift_comparison`¶

inference.core.workflows.core_steps.classical_cv.sift_comparison.v2 ¶

Classes¶

Functions:¶

apply_sift ¶

apply_sift(image, visualize=False)

Applies SIFT to the image. Args: image: Input image. visualize: Whether to visualize keypoints on the image. Returns: img_with_kp: Image with keypoints drawn (if visualize is True). kp: List of cv2.KeyPoint objects. keypoints_dicts: List of keypoints as dictionaries. des: Descriptors of the keypoints.

Source code in inference/core/workflows/core_steps/classical_cv/sift_comparison/v2.py

def apply_sift(
    image: np.ndarray, visualize=False
) -> (Optional[np.ndarray], list, list, np.ndarray):
    """
    Detect SIFT keypoints and descriptors, optionally rendering the keypoints.

    Args:
        image: Input image (converted to grayscale assuming BGR order).
        visualize: Whether to visualize keypoints on the image.
    Returns:
        img_with_kp: Image with keypoints drawn, or None when visualize is falsy.
        kp: List of cv2.KeyPoint objects.
        keypoints_dicts: List of keypoints as dictionaries.
        des: Descriptors of the keypoints.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    kp, des = cv2.SIFT_create().detectAndCompute(gray, None)

    # The rendering is skipped entirely unless it was asked for.
    img_with_kp = cv2.drawKeypoints(gray, kp, None) if visualize else None

    # Plain-dict view of each keypoint
    keypoints_dicts = []
    for keypoint in kp:
        keypoints_dicts.append(
            dict(
                pt=(keypoint.pt[0], keypoint.pt[1]),
                size=keypoint.size,
                angle=keypoint.angle,
                response=keypoint.response,
                octave=keypoint.octave,
                class_id=keypoint.class_id,
            )
        )

    return img_with_kp, kp, keypoints_dicts, des

`core/workflows/core_steps/classical_cv/size_measurement`¶

inference.core.workflows.core_steps.classical_cv.size_measurement.v1 ¶

Classes¶

Functions:¶

compute_aligned_dimensions ¶

compute_aligned_dimensions(contour)

Compute the width and height of an object based on its contour, ensuring proper orientation.

This function: 1. Finds the minimum area rectangle that encloses the contour 2. Determines which edges correspond to width and height by analyzing their angles 3. Returns dimensions where width is the more horizontal edge and height is the more vertical edge

Parameters:

Name	Type	Description	Default
`contour`	`ndarray`	Array of points representing the object's contour	required

Returns:

Type	Description
`Tuple[float, float]`	Tuple[float, float]: A tuple of (width_pixels, height_pixels) where: - width_pixels: Length of the more horizontal edge - height_pixels: Length of the more vertical edge

Note

The function uses angle analysis to ensure consistent width/height assignment regardless of the object's rotation. The edge closer to horizontal (0° or 180°) is always considered the width.

Source code in inference/core/workflows/core_steps/classical_cv/size_measurement/v1.py

def compute_aligned_dimensions(contour: np.ndarray) -> Tuple[float, float]:
    """
    Measure an object's width and height from its contour, independent of rotation.

    The minimum-area rectangle enclosing the contour is computed, and its two
    adjacent sides are classified by how close each one is to horizontal.

    Args:
        contour (np.ndarray): Array of points representing the object's contour

    Returns:
        Tuple[float, float]: A tuple of (width_pixels, height_pixels) where:
            - width_pixels: Length of the more horizontal edge
            - height_pixels: Length of the more vertical edge

    Note:
        The side whose ``horizontal_score`` is strictly lower is reported as
        the width. On a tie the second side (corner 1 to corner 2) is the width.
    """
    corners = np.array(cv.boxPoints(cv.minAreaRect(contour)), dtype=np.float32)

    # Two adjacent sides of the rectangle, as vectors.
    side_a = corners[1] - corners[0]
    side_b = corners[2] - corners[1]

    length_a = float(np.linalg.norm(side_a))
    length_b = float(np.linalg.norm(side_b))

    # Lower score means closer to horizontal.
    tilt_a = horizontal_score(np.degrees(np.arctan2(side_a[1], side_a[0])))
    tilt_b = horizontal_score(np.degrees(np.arctan2(side_b[1], side_b[0])))

    if tilt_a < tilt_b:
        return length_a, length_b
    return length_b, length_a

get_detection_dimensions ¶

get_detection_dimensions(detection, index)

Retrieve the width and height dimensions of a detected object in pixels.

Parameters:

Name	Type	Description	Default
`detection`	`Detections`	Detection object containing masks and/or bounding boxes	required
`index`	`int`	Index of the specific detection to analyze	required

Returns:

Type	Description
`Tuple[Optional[float], Optional[float]]`	Tuple[float, float]: A tuple of (width_pixels, height_pixels) where: - width_pixels: Width of the object in pixels - height_pixels: Height of the object in pixels

Notes

The function uses two methods to compute dimensions: 1. If a segmentation mask is available: - Extracts the largest contour from the mask - Uses compute_aligned_dimensions() to get orientation-aware measurements 2. If no mask is available: - Falls back to using the bounding box dimensions - Simply computes width and height as box edges

Source code in inference/core/workflows/core_steps/classical_cv/size_measurement/v1.py

def get_detection_dimensions(
    detection: sv.Detections, index: int
) -> Tuple[Optional[float], Optional[float]]:
    """
    Measure one detection's width and height in pixels.

    Args:
        detection (sv.Detections): Detections holding bounding boxes and,
            optionally, segmentation masks.
        index (int): Position of the detection to measure.

    Returns:
        Tuple[Optional[float], Optional[float]]: ``(width_pixels, height_pixels)``.
            Both entries are ``None`` when masks are present but the selected
            mask yields no contour with a positive area.

    Notes:
        - Without masks, the size is taken directly from the bounding box
          edges (``x_max - x_min`` and ``y_max - y_min``).
        - With masks, the largest external contour of the selected mask is
          handed to ``compute_aligned_dimensions()``. The bounding box is NOT
          used as a fallback in this case.
    """
    if detection.mask is None:
        # No segmentation data at all: plain box edges.
        box = detection.xyxy[index]
        return float(box[2] - box[0]), float(box[3] - box[1])

    binary_mask = detection.mask[index].astype(np.uint8)
    found_contours, _ = cv.findContours(
        binary_mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE
    )
    if not found_contours:
        return None, None

    biggest = max(found_contours, key=cv.contourArea)
    if cv.contourArea(biggest) > 0:
        return compute_aligned_dimensions(biggest)

    # Only degenerate (zero-area) contours were found.
    return None, None

horizontal_score ¶

horizontal_score(angle)

Determine how close an angle is to horizontal (0 or 180 degrees). Lower score means more horizontal.

Source code in inference/core/workflows/core_steps/classical_cv/size_measurement/v1.py

def horizontal_score(angle: float) -> float:
    """
    Score how far an angle (in degrees) is from the horizontal axis.

    The angle is folded into the [0, 180) range and the distance to the
    nearer of 0 and 180 is returned, so the result lies in [0, 90].
    A lower score means the direction is closer to horizontal.
    """
    folded = abs(angle % 180)
    distance_to_180 = 180 - folded
    return min(folded, distance_to_180)

parse_reference_dimensions ¶

parse_reference_dimensions(reference_dimensions)

Parse reference dimensions from various input formats.

Source code in inference/core/workflows/core_steps/classical_cv/size_measurement/v1.py

def parse_reference_dimensions(
    reference_dimensions: Union[str, Tuple[float, float], List[float]],
) -> Tuple[float, float]:
    """Parse reference dimensions from various input formats.

    Args:
        reference_dimensions: Either a ``"width,height"`` string or a
            two-element sequence (tuple or list) of numeric values.

    Returns:
        Tuple[float, float]: ``(width, height)``, with both values converted
            to ``float`` regardless of the input format.

    Raises:
        ValueError: If a string does not contain exactly two comma-separated
            parts, if a sequence does not hold exactly two values, or if any
            value cannot be converted to ``float``.
    """
    if isinstance(reference_dimensions, str):
        parts = reference_dimensions.split(",")
        if len(parts) != 2:
            raise ValueError(
                "reference_dimensions must be a string in the format 'width,height'"
            )
        raw_values = [part.strip() for part in parts]
    else:
        raw_values = reference_dimensions

    if len(raw_values) != 2:
        raise ValueError("reference_dimensions must have two values (width, height)")

    # Coerce every input format so the result really is Tuple[float, float];
    # previously list/tuple inputs were passed through unconverted.
    try:
        width, height = (float(value) for value in raw_values)
    except (TypeError, ValueError) as error:
        # Keep the original failure as the cause to aid debugging.
        raise ValueError("Invalid format for reference_dimensions") from error

    return width, height

`core/workflows/core_steps/classical_cv/threshold`¶

inference.core.workflows.core_steps.classical_cv.threshold.v1 ¶

Classes¶

Functions:¶

apply_thresholding ¶

apply_thresholding(
    image, threshold_type, thresh_value, max_value
)

Applies the specified thresholding to the image.

Parameters:

Name	Type	Description	Default
`image`	`ndarray`	Input image in grayscale.	required
`threshold_type`	`str`	Type of thresholding ('binary', 'binary_inv', 'trunc', 'tozero', 'tozero_inv', 'adaptive_mean', 'adaptive_gaussian', 'otsu').	required
`thresh_value`	`int`	Threshold value.	required
`max_value`	`int`	Maximum value to use with the THRESH_BINARY and THRESH_BINARY_INV thresholding types.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Image with thresholding applied.

Source code in inference/core/workflows/core_steps/classical_cv/threshold/v1.py

def apply_thresholding(
    image: np.ndarray, threshold_type: str, thresh_value: int, max_value: int
) -> np.ndarray:
    """
    Threshold a grayscale image using the requested strategy.

    Args:
        image (np.ndarray): Input image in grayscale.
        threshold_type (str): One of 'binary', 'binary_inv', 'trunc', 'tozero',
            'tozero_inv', 'adaptive_mean', 'adaptive_gaussian', 'otsu'.
        thresh_value (int): Threshold value. Not used by the adaptive modes,
            and replaced by 0 for 'otsu'.
        max_value (int): Maximum value passed to OpenCV for the chosen mode.

    Returns:
        np.ndarray: Image with thresholding applied.

    Raises:
        ValueError: If ``threshold_type`` is not one of the supported names.
    """
    # Adaptive modes use a different OpenCV entry point; both run with a
    # block size of 11 and a constant of 2.
    if threshold_type in ("adaptive_mean", "adaptive_gaussian"):
        adaptive_method = (
            cv2.ADAPTIVE_THRESH_MEAN_C
            if threshold_type == "adaptive_mean"
            else cv2.ADAPTIVE_THRESH_GAUSSIAN_C
        )
        return cv2.adaptiveThreshold(
            image, max_value, adaptive_method, cv2.THRESH_BINARY, 11, 2
        )

    # Every remaining mode is a single cv2.threshold call that differs only in
    # the threshold level and the mode flag.
    level = thresh_value
    if threshold_type == "otsu":
        level = 0
        mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    elif threshold_type == "binary":
        mode = cv2.THRESH_BINARY
    elif threshold_type == "binary_inv":
        mode = cv2.THRESH_BINARY_INV
    elif threshold_type == "trunc":
        mode = cv2.THRESH_TRUNC
    elif threshold_type == "tozero":
        mode = cv2.THRESH_TOZERO
    elif threshold_type == "tozero_inv":
        mode = cv2.THRESH_TOZERO_INV
    else:
        raise ValueError(f"Unknown threshold type: {threshold_type}")

    _, thresholded = cv2.threshold(image, level, max_value, mode)
    return thresholded

`core/workflows/core_steps/common`¶

inference.core.workflows.core_steps.common.entities ¶

Backwards-compatibility re-exports.

StepExecutionMode used to live in this module. It has since moved to inference.core.workflows.prototypes.block so the framework layer (prototypes) owns the enum and higher-level packages (core_steps, executor, compiler) can depend on prototypes instead of the other way around. This module is kept purely as a re-export shim so existing imports from inference.core.workflows.core_steps.common.entities import StepExecutionMode keep working.

Classes¶

StepExecutionMode ¶

Bases: Enum

How a workflow step is dispatched at runtime.

LOCAL: the step executes in-process inside the current Python interpreter. REMOTE: the step delegates execution to a remote inference service / HTTP runtime.

Kept in prototypes/block.py so the framework layer owns this enum and higher-level packages (core_steps, executor, compiler) depend on prototypes rather than the other way around.

Source code in inference/core/workflows/prototypes/block.py

class StepExecutionMode(Enum):
    """Where a workflow step is run.

    Members:
        LOCAL: run the step in-process, inside the current Python interpreter.
        REMOTE: hand the step over to a remote inference service / HTTP
            runtime.

    The enum is defined in ``prototypes/block.py`` so that the framework layer
    owns it, letting higher-level packages (``core_steps``, executor,
    compiler) import from ``prototypes`` instead of the reverse.
    """

    # In-process execution.
    LOCAL = "local"
    # Delegated execution via a remote service.
    REMOTE = "remote"

inference.core.workflows.core_steps.common.openrouter ¶

Shared utilities for workflow blocks that route through OpenRouter.

This module owns the OpenRouter API Key Passthrough plumbing so individual blocks (Gemma, Llama Vision, Kimi, unified Qwen) only need to declare their manifest specifics (model dropdown, search keywords, icon) and call into the shared base class for execution.

Two key paths are supported per call:

Roboflow-managed key (default for new blocks): the user's api_key starts with rf_key:account (or rf_key:user:<id>) and is sent to Roboflow's apiproxy/openrouter route, which resolves to the managed OpenRouter key, applies privacy filters, bills credits, and returns the upstream response.
Custom user key: any other api_key value (e.g. sk-or-...) is passed straight to openrouter.ai via the OpenAI SDK with no Roboflow proxy in the loop.

Both paths honor a user-selected privacy_level of allow, deny, or zdr (zero data retention). Full task-type prompt builders shared across the VLM blocks live here too so the per-block files stay small.

Classes¶

OpenRouterBlockManifestMixin ¶

Pydantic mixin contributing the OpenRouter-specific manifest fields.

Concrete block manifests inherit from this AND declare their own type, model_version, task_type, images, prompt, etc.

Source code in inference/core/workflows/core_steps/common/openrouter.py

class OpenRouterBlockManifestMixin(WorkflowBlockManifest):
    """Pydantic mixin contributing the OpenRouter-specific manifest fields.

    Concrete block manifests inherit from this AND declare their own ``type``,
    ``model_version``, ``task_type``, ``images``, ``prompt``, etc.

    Fields declared here: ``api_key``, ``privacy_level``, ``max_tokens``,
    ``temperature`` and ``max_concurrent_requests``.
    """

    # Key used for OpenRouter calls. Accepts either a literal string or a
    # selector; the default "rf_key:account" selects the Roboflow-managed key.
    # Marked private=True - presumably to keep the value out of exported
    # definitions (TODO confirm how `private` is consumed).
    api_key: Union[
        Selector(kind=[STRING_KIND, SECRET_KIND, ROBOFLOW_MANAGED_KEY]), str
    ] = Field(
        default="rf_key:account",
        description=(
            "OpenRouter API key. Defaults to Roboflow's managed key, billed in "
            "credits via Roboflow. Provide your own `sk-or-...` key to call "
            "OpenRouter directly without Roboflow billing."
        ),
        examples=["rf_key:account", "sk-or-...", "$inputs.openrouter_api_key"],
        private=True,
    )
    # Provider privacy filter; defaults to "deny". The allowed values come from
    # PRIVACY_LEVEL_LITERAL, which is defined outside this block.
    privacy_level: PRIVACY_LEVEL_LITERAL = Field(
        default="deny",
        description=(
            "Provider privacy filter. Stricter levels reduce the pool of "
            "providers and may increase per-call cost on the managed key."
        ),
        json_schema_extra={"values_metadata": PRIVACY_LEVEL_METADATA},
    )
    # Generation cap. NOTE(review): `gt=1` is a strict bound, so a value of 1
    # is rejected and the smallest accepted value is 2 - confirm this is intended.
    max_tokens: int = Field(
        default=500,
        description="Maximum number of tokens the model can generate in its response.",
        gt=1,
    )
    # Sampling temperature, literal float or selector. The 0.0-2.0 range is
    # stated only in the description; no numeric bounds are declared here.
    temperature: Union[float, Selector(kind=[FLOAT_KIND])] = Field(
        default=0.1,
        description=(
            "Temperature to sample from the model - value in range 0.0-2.0, "
            'the higher - the more random / "creative" the generations are.'
        ),
    )
    # Optional per-block concurrency limit; None defers to the globally
    # configured value, as the description states.
    max_concurrent_requests: Optional[int] = Field(
        default=None,
        description=(
            "Number of concurrent requests for batches of images. If not "
            "given - block defaults to value configured globally in Workflows "
            "Execution Engine. Restrict if you hit rate limits."
        ),
    )

OpenRouterWorkflowBlockBase ¶

Bases: WorkflowBlock

Shared base class for blocks that route through OpenRouter.

Subclasses provide manifest + prompt-building; this class owns the routing/execution machinery.

Source code in inference/core/workflows/core_steps/common/openrouter.py

class OpenRouterWorkflowBlockBase(WorkflowBlock):
    """Common base for workflow blocks whose requests go through OpenRouter.

    Concrete blocks supply the manifest and build the prompts; request
    routing and batch execution are implemented here.
    """

    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
    ):
        # ``api_key`` is kept as the Roboflow key; it is forwarded only on the
        # proxied (managed-key) route.
        self._roboflow_api_key = api_key
        self._model_manager = model_manager

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        """Return the names of the constructor arguments this block expects."""
        return ["model_manager", "api_key"]

    def execute_openrouter_batch(
        self,
        openrouter_api_key: str,
        model: str,
        prompts: List[List[dict]],
        max_tokens: int,
        temperature: float,
        privacy_level: str,
        max_concurrent_requests: Optional[int],
    ) -> List[str]:
        """Execute one OpenRouter chat-completion request per prompt, in parallel.

        Keys beginning with ``rf_key:account`` or ``rf_key:user:`` are sent
        through the proxied request helper together with the Roboflow API key;
        any other key is handed to the direct request helper as ``api_key``.

        Args:
            openrouter_api_key: Managed-key reference or a custom key.
            model: Model identifier forwarded to the request helper.
            prompts: One ``messages`` list per request.
            max_tokens: Generation cap forwarded with every request.
            temperature: Sampling temperature forwarded with every request.
            privacy_level: Privacy level forwarded with every request.
            max_concurrent_requests: Worker limit; a falsy value falls back to
                ``WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS``.

        Returns:
            Whatever ``run_in_parallel`` returns for the built tasks
            (annotated as a list of strings).
        """
        managed_prefixes = ("rf_key:account", "rf_key:user:")
        if openrouter_api_key.startswith(managed_prefixes):
            request_fn = _execute_proxied_openrouter_request
            route_kwargs = {
                "roboflow_api_key": self._roboflow_api_key,
                "openrouter_api_key": openrouter_api_key,
            }
        else:
            request_fn = _execute_direct_openrouter_request
            route_kwargs = {"api_key": openrouter_api_key}

        # One fully-bound callable per prompt; only ``messages`` varies.
        tasks = [
            partial(
                request_fn,
                **route_kwargs,
                model=model,
                privacy_level=privacy_level,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            for messages in prompts
        ]

        if max_concurrent_requests:
            worker_count = max_concurrent_requests
        else:
            worker_count = WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
        return run_in_parallel(tasks=tasks, max_workers=worker_count)

Methods:¶

execute_openrouter_batch ¶

execute_openrouter_batch(
    openrouter_api_key,
    model,
    prompts,
    max_tokens,
    temperature,
    privacy_level,
    max_concurrent_requests,
)

Run a batch of OpenRouter chat-completion calls in parallel.

Routes through the Roboflow proxy when openrouter_api_key starts with rf_key:account or rf_key:user: (managed/user-stored keys); otherwise calls the OpenRouter API directly using the OpenAI SDK with the provided key.

Source code in inference/core/workflows/core_steps/common/openrouter.py

def execute_openrouter_batch(
    self,
    openrouter_api_key: str,
    model: str,
    prompts: List[List[dict]],
    max_tokens: int,
    temperature: float,
    privacy_level: str,
    max_concurrent_requests: Optional[int],
) -> List[str]:
    """Execute one OpenRouter chat-completion request per prompt, in parallel.

    Keys beginning with ``rf_key:account`` or ``rf_key:user:`` are sent
    through the proxied request helper together with the Roboflow API key;
    any other key is handed to the direct request helper as ``api_key``.

    Args:
        openrouter_api_key: Managed-key reference or a custom key.
        model: Model identifier forwarded to the request helper.
        prompts: One ``messages`` list per request.
        max_tokens: Generation cap forwarded with every request.
        temperature: Sampling temperature forwarded with every request.
        privacy_level: Privacy level forwarded with every request.
        max_concurrent_requests: Worker limit; a falsy value falls back to
            ``WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS``.

    Returns:
        Whatever ``run_in_parallel`` returns for the built tasks
        (annotated as a list of strings).
    """
    managed_prefixes = ("rf_key:account", "rf_key:user:")
    if openrouter_api_key.startswith(managed_prefixes):
        request_fn = _execute_proxied_openrouter_request
        route_kwargs = {
            "roboflow_api_key": self._roboflow_api_key,
            "openrouter_api_key": openrouter_api_key,
        }
    else:
        request_fn = _execute_direct_openrouter_request
        route_kwargs = {"api_key": openrouter_api_key}

    # One fully-bound callable per prompt; only ``messages`` varies.
    tasks = [
        partial(
            request_fn,
            **route_kwargs,
            model=model,
            privacy_level=privacy_level,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        for messages in prompts
    ]

    if max_concurrent_requests:
        worker_count = max_concurrent_requests
    else:
        worker_count = WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
    return run_in_parallel(tasks=tasks, max_workers=worker_count)

Functions:¶

build_prompts_from_images ¶

build_prompts_from_images(
    images, task_type, prompt, output_structure, classes
)

Build a list of OpenRouter messages arrays, one per input image.

images items are inference-format image dicts as produced by WorkflowImageData.to_inference_format().

Source code in inference/core/workflows/core_steps/common/openrouter.py

def build_prompts_from_images(
    images: List[Dict[str, Any]],
    task_type: str,
    prompt: Optional[str],
    output_structure: Optional[Dict[str, str]],
    classes: Optional[List[str]],
) -> List[List[dict]]:
    """Create one OpenRouter ``messages`` array for every input image.

    Each entry of ``images`` is an inference-format image dict, as produced by
    ``WorkflowImageData.to_inference_format()``. Every image is loaded,
    re-encoded as JPEG, base64-encoded and passed to the prompt builder
    registered for ``task_type`` in ``PROMPT_BUILDERS``.

    Raises:
        ValueError: If ``task_type`` has no entry in ``PROMPT_BUILDERS``.
    """
    if task_type not in PROMPT_BUILDERS:
        raise ValueError(f"Task type: {task_type} not supported.")
    make_messages = PROMPT_BUILDERS[task_type]

    def _as_base64_jpeg(image: Dict[str, Any]) -> str:
        # load_image returns a pair; only the first element (the image) is used.
        decoded, _ = load_image(image)
        jpeg_bytes = encode_image_to_jpeg_bytes(decoded)
        return base64.b64encode(jpeg_bytes).decode("ascii")

    return [
        make_messages(
            base64_image=_as_base64_jpeg(image),
            prompt=prompt,
            output_structure=output_structure,
            classes=classes,
        )
        for image in images
    ]

build_provider_routing ¶

build_provider_routing(privacy_level)

Translate a privacy level into OpenRouter's provider payload object.

Returns None for allow (no filter), an object with data_collection: deny for deny, and an object with both data_collection and zdr set for zdr.

Source code in inference/core/workflows/core_steps/common/openrouter.py

def build_provider_routing(privacy_level: str) -> Optional[dict]:
    """Map a privacy level onto OpenRouter's ``provider`` payload object.

    Args:
        privacy_level: One of ``"allow"``, ``"deny"`` or ``"zdr"``.

    Returns:
        ``None`` for ``allow`` (no filter); ``{"data_collection": "deny"}``
        for ``deny``; the same object with ``"zdr": True`` added for ``zdr``.
        A new dict is created on every call.

    Raises:
        ValueError: If ``privacy_level`` is not one of the three known values.
    """
    if privacy_level not in ("allow", "deny", "zdr"):
        raise ValueError(f"unknown privacy_level: {privacy_level}")
    if privacy_level == "allow":
        return None

    # "deny" and "zdr" both opt out of data collection; "zdr" adds one flag.
    provider = {"data_collection": "deny"}
    if privacy_level == "zdr":
        provider["zdr"] = True
    return provider

validate_task_type_required_fields ¶

validate_task_type_required_fields(
    task_type, prompt, classes, output_structure
)

Raise ValueError if a required field for task_type is missing.

Used by block manifests' model_validator to surface a clear error before the workflow runs.

Source code in inference/core/workflows/core_steps/common/openrouter.py

def validate_task_type_required_fields(
    task_type: str,
    prompt: Optional[str],
    classes: Optional[List[str]],
    output_structure: Optional[Dict[str, str]],
) -> None:
    """Check that every field required by ``task_type`` has been provided.

    Intended for use from block manifests' ``model_validator`` so that a clear
    error is reported before the workflow runs. Fields are checked in the
    order ``prompt``, ``classes``, ``output_structure``; the first missing one
    is reported.

    Raises:
        ValueError: If a field required for ``task_type`` is ``None``.
    """
    requirements = (
        ("prompt", TASKS_REQUIRING_PROMPT, prompt),
        ("classes", TASKS_REQUIRING_CLASSES, classes),
        ("output_structure", TASKS_REQUIRING_OUTPUT_STRUCTURE, output_structure),
    )
    for field_name, tasks_needing_field, supplied in requirements:
        if task_type in tasks_needing_field and supplied is None:
            raise ValueError(
                f"`{field_name}` parameter required to be set for task `{task_type}`"
            )

inference.core.workflows.core_steps.common.utils ¶

Classes¶

Functions:¶

remove_unexpected_keys_from_dictionary ¶

remove_unexpected_keys_from_dictionary(
    dictionary, expected_keys
)

This function mutates the input dictionary in place and returns it.

Source code in inference/core/workflows/core_steps/common/utils.py

def remove_unexpected_keys_from_dictionary(
    dictionary: dict,
    expected_keys: set,
) -> dict:
    """Drop every key of `dictionary` that is not listed in `expected_keys`.

    The input `dictionary` is mutated in place and the same object is
    returned.
    """
    # The key set is computed up front, so deleting while looping is safe.
    for stale_key in dictionary.keys() - expected_keys:
        del dictionary[stale_key]
    return dictionary

`core/workflows/core_steps/common/query_language/introspection`¶

inference.core.workflows.core_steps.common.query_language.introspection.core ¶

`core/workflows/core_steps/flow_control/inner_workflow`¶

inference.core.workflows.core_steps.flow_control.inner_workflow.v1 ¶

Classes¶

InnerWorkflowBlockV1 ¶

Bases: WorkflowBlock

Placeholder block; inner workflows are expanded at compile time and never executed as a unit.

Source code in inference/core/workflows/core_steps/flow_control/inner_workflow/v1.py

class InnerWorkflowBlockV1(WorkflowBlock):
    """Stand-in block for inner workflows.

    Inner workflows are expanded at compile time and never executed as a
    unit, so this block only exposes its manifest.
    """

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class associated with this block."""
        return BlockManifest

    def run(self, *args, **kwargs) -> BlockResult:
        """Always raise ``InnerWorkflowRunNotSupportedError``; this block is never run."""
        message = "inner_workflow steps are compiled away into ordinary steps; block.run() must not be called."
        raise InnerWorkflowRunNotSupportedError(message)

`core/workflows/core_steps/fusion/detections_list_rollup`¶

inference.core.workflows.core_steps.fusion.detections_list_rollup.v1 ¶

Classes¶

Functions:¶

merge_crop_predictions ¶

merge_crop_predictions(
    parent_prediction,
    child_predictions,
    confidence_strategy="max",
    overlap_threshold=0.0,
    keypoint_merge_threshold=10.0,
)

Merge predictions from multiple crops back to parent image coordinates.

Parameters:

Name	Type	Description	Default
`parent_prediction`		Supervision Detections object that defines the crop locations. Each detection in this prediction represents one crop region.	required
`child_predictions`	`List`	List of Supervision Detections objects from crops. Order matches the detection order in parent_prediction.	required
`confidence_strategy`	`str`	How to handle confidence when merging overlaps. Options: "max", "mean", "min"	`'max'`
`overlap_threshold`	`float`	Minimum IoU/overlap ratio to merge detections (0.0 to 1.0). - 0.0: Only merge if detections touch or overlap at all (default) - >0.0: Only merge if overlap ratio exceeds this threshold - 1.0: Only merge completely overlapping detections	`0.0`
`keypoint_merge_threshold`	`float`	Maximum distance in pixels to merge keypoints (default: 10). For keypoint detections, merges detections if their average keypoint distance is below this threshold.	`10.0`

Returns:

Type	Description
`Tuple`	Tuple of (detections, crop_zones):
`Tuple`	detections: Detections object with merged predictions in parent image coordinates. Works for both instance segmentation (with masks) and object detection (without masks).
`Tuple`	crop_zones: List of lists of (x, y) tuples. Each inner list defines the rectangular zone boundary of a crop in parent image coordinates as 4 corner points.

Source code in inference/core/workflows/core_steps/fusion/detections_list_rollup/v1.py

def merge_crop_predictions(
    parent_prediction,
    child_predictions: List,
    confidence_strategy: str = "max",
    overlap_threshold: float = 0.0,
    keypoint_merge_threshold: float = 10.0,
) -> Tuple:
    """
    Merge predictions from multiple crops back to parent image coordinates.

    Args:
        parent_prediction: Supervision Detections object that defines the crop locations.
                          Each detection in this prediction represents one crop region.
        child_predictions: List of Supervision Detections objects from crops.
                          Order matches the detection order in parent_prediction.
        confidence_strategy: How to handle confidence when merging overlaps.
                           Options: "max", "mean", "min"
        overlap_threshold: Minimum IoU/overlap ratio to merge detections (0.0 to 1.0).
                         - 0.0: Only merge if detections touch or overlap at all (default)
                         - >0.0: Only merge if overlap ratio exceeds this threshold
                         - 1.0: Only merge completely overlapping detections
        keypoint_merge_threshold: Maximum distance in pixels to merge keypoints (default: 10).
                                For keypoint detections, merges detections if their average
                                keypoint distance is below this threshold.

    Returns:
        Tuple of (detections, crop_zones):
        - detections: Detections object with merged predictions in parent image coordinates.
                     Works for both instance segmentation (with masks) and object detection (without masks).
        - crop_zones: List of lists of (x, y) tuples. Each inner list defines the rectangular
                     zone boundary of a crop in parent image coordinates as 4 corner points.
    """
    if len(parent_prediction) != len(child_predictions):
        raise ValueError(
            f"Number of detections in parent_prediction ({len(parent_prediction)}) "
            f"must match number of child predictions ({len(child_predictions)})"
        )

    # Extract parent image shape from parent prediction's data
    # root_parent_dimensions is a list of tuples, one per detection (all should be the same)
    root_parent_dims = parent_prediction.data.get("root_parent_dimensions")

    if root_parent_dims is None or len(root_parent_dims) == 0:
        raise ValueError(
            "parent_prediction must have 'root_parent_dimensions' in its data attribute"
        )

    # Get the first tuple (all should be identical for the same parent image)
    parent_image_shape = root_parent_dims[0]

    # Build crop zones list - one zone per crop/child prediction
    crop_zones = []
    for i in range(len(parent_prediction)):
        crop_bbox = parent_prediction.xyxy[i]  # [x_min, y_min, x_max, y_max]
        x_min, y_min, x_max, y_max = (
            crop_bbox[0],
            crop_bbox[1],
            crop_bbox[2],
            crop_bbox[3],
        )

        # Create zone as list of 4 corner points: top-left, top-right, bottom-right, bottom-left
        zone = [
            (float(x_min), float(y_min)),  # top-left
            (float(x_max), float(y_min)),  # top-right
            (float(x_max), float(y_max)),  # bottom-right
            (float(x_min), float(y_max)),  # bottom-left
        ]
        crop_zones.append(zone)

    # Check if we have instance segmentation (with masks) or object detection (without masks)
    has_masks = False
    is_keypoint_detection = False
    for child_pred in child_predictions:
        if child_pred.mask is not None and len(child_pred.mask) > 0:
            has_masks = True
            break

    for child_pred in child_predictions:
        # Check for keypoint detection
        if "prediction_type" in child_pred.data:
            pred_type = child_pred.data["prediction_type"]
            if isinstance(pred_type, np.ndarray):
                if len(pred_type) > 0 and pred_type[0] == "keypoint-detection":
                    is_keypoint_detection = True
                    break
            elif pred_type == "keypoint-detection":
                is_keypoint_detection = True
                break

    # Group predictions by class
    class_predictions = {}

    # Iterate through each crop region and its corresponding child predictions
    for i, child_pred in enumerate(child_predictions):
        # Get crop location from parent prediction
        crop_bbox = parent_prediction.xyxy[i]  # [x_min, y_min, x_max, y_max]
        x_min, y_min = int(crop_bbox[0]), int(crop_bbox[1])

        # Process each detection in the child prediction
        for j in range(len(child_pred)):
            class_id = child_pred.class_id[j]
            confidence = child_pred.confidence[j]

            # Prepare keypoint data if present
            keypoint_data = {}
            if is_keypoint_detection and "keypoints_xy" in child_pred.data:
                # Transform keypoint coordinates from crop to parent space
                keypoints_xy = child_pred.data["keypoints_xy"][
                    j
                ]  # Shape: (num_keypoints, 2)

                # Vectorised offset — avoids a per-keypoint Python loop.
                # copy=True prevents mutating the source array; reshape handles
                # edge cases where keypoints_xy arrives as a flat/empty array.
                kp_array = np.array(keypoints_xy, dtype=np.float64, copy=True)
                if kp_array.size == 0:
                    keypoint_data["keypoints_xy"] = []
                else:
                    kp_array = kp_array.reshape(-1, 2)
                    kp_array = kp_array + np.array([x_min, y_min], dtype=np.float64)
                    keypoint_data["keypoints_xy"] = kp_array.tolist()

                # Copy other keypoint data
                if "keypoints_class_name" in child_pred.data:
                    keypoint_data["keypoints_class_name"] = child_pred.data[
                        "keypoints_class_name"
                    ][j]
                if "keypoints_class_id" in child_pred.data:
                    keypoint_data["keypoints_class_id"] = child_pred.data[
                        "keypoints_class_id"
                    ][j]
                if "keypoints_confidence" in child_pred.data:
                    keypoint_data["keypoints_confidence"] = child_pred.data[
                        "keypoints_confidence"
                    ][j]

            # Collect per-detection data fields to preserve individual detection metadata
            # This is crucial for preserving class_name and other fields when multiple
            # detections have the same class_id but different values
            detection_data = {}
            for key in child_pred.data.keys():
                if key not in [
                    "detection_id",
                    "parent_id",
                    "inference_id",
                    "keypoints_xy",
                    "keypoints_class_name",
                    "keypoints_class_id",
                    "keypoints_confidence",
                ]:
                    if j < len(child_pred.data[key]):
                        detection_data[key] = child_pred.data[key][j]

            if has_masks and child_pred.mask is not None:
                # Instance segmentation - transform mask
                mask = child_pred.mask[j]
                transformed_mask = _transform_mask_to_parent(
                    mask, x_min, y_min, parent_image_shape
                )

                # Also store the transformed bbox for cheap pre-filtering
                raw_bbox = child_pred.xyxy[j]
                transformed_bbox = np.array(
                    [
                        raw_bbox[0] + x_min,
                        raw_bbox[1] + y_min,
                        raw_bbox[2] + x_min,
                        raw_bbox[3] + y_min,
                    ]
                )

                # Store prediction with transformed mask
                if class_id not in class_predictions:
                    class_predictions[class_id] = []

                class_predictions[class_id].append(
                    {
                        "mask": transformed_mask,
                        "confidence": confidence,
                        "class_id": class_id,
                        "bbox": transformed_bbox,
                        "keypoint_data": keypoint_data,
                        "detection_data": detection_data,  # Store per-detection metadata
                    }
                )
            else:
                # Object detection - transform bounding box
                bbox = child_pred.xyxy[j]  # [x_min, y_min, x_max, y_max]
                transformed_bbox = np.array(
                    [bbox[0] + x_min, bbox[1] + y_min, bbox[2] + x_min, bbox[3] + y_min]
                )

                # Store prediction with transformed bbox
                if class_id not in class_predictions:
                    class_predictions[class_id] = []

                class_predictions[class_id].append(
                    {
                        "bbox": transformed_bbox,
                        "confidence": confidence,
                        "class_id": class_id,
                        "mask": None,
                        "keypoint_data": keypoint_data,
                        "detection_data": detection_data,  # Store per-detection metadata
                    }
                )

    # Merge overlapping predictions for each class
    merged_masks = []
    merged_bboxes = []
    merged_confidences = []
    merged_class_ids = []

    # Collect all data field names from child predictions
    all_data_keys = set()
    for child_pred in child_predictions:
        all_data_keys.update(child_pred.data.keys())

    # Initialize lists for each data field
    merged_data = {
        key: []
        for key in all_data_keys
        if key
        not in [
            "keypoints_xy",
            "keypoints_class_name",
            "keypoints_class_id",
            "keypoints_confidence",
        ]
    }

    # Collect keypoint data separately
    all_keypoints_data = {
        "keypoints_xy": [],
        "keypoints_class_name": [],
        "keypoints_class_id": [],
        "keypoints_confidence": [],
    }

    # Build mapping from class_id to typical data values
    class_id_to_data = {}
    for child_pred in child_predictions:
        for i in range(len(child_pred)):
            class_id = child_pred.class_id[i]
            if class_id not in class_id_to_data:
                class_id_to_data[class_id] = {}
                # Store sample values for this class_id (except ID fields and keypoint fields)
                for key in child_pred.data.keys():
                    if key not in [
                        "detection_id",
                        "parent_id",
                        "inference_id",
                        "keypoints_xy",
                        "keypoints_class_name",
                        "keypoints_class_id",
                        "keypoints_confidence",
                    ]:
                        if key in child_pred.data and i < len(child_pred.data[key]):
                            class_id_to_data[class_id][key] = child_pred.data[key][i]

    # Get a sample inference_id and parent_id from the first child prediction if available
    sample_inference_id = None
    sample_parent_id = None
    if len(child_predictions) > 0 and len(child_predictions[0]) > 0:
        if "inference_id" in child_predictions[0].data:
            sample_inference_id = child_predictions[0].data["inference_id"][0]
        if "parent_id" in child_predictions[0].data:
            sample_parent_id = child_predictions[0].data["parent_id"][0]

    for class_id, preds in class_predictions.items():
        if is_keypoint_detection:
            # For keypoint detection, merge based on keypoint proximity
            merged_preds = _merge_keypoint_detections(
                preds, confidence_strategy, keypoint_merge_threshold
            )
        elif has_masks:
            merged_preds = _merge_overlapping_masks(
                preds, confidence_strategy, overlap_threshold
            )
        else:
            merged_preds = _merge_overlapping_bboxes(
                preds, confidence_strategy, overlap_threshold
            )

        for pred in merged_preds:
            if has_masks:
                merged_masks.append(pred["mask"])
            else:
                # For non-mask detections, collect bboxes
                if "bbox" in pred and pred["bbox"] is not None:
                    merged_bboxes.append(pred["bbox"])
            merged_confidences.append(pred["confidence"])
            merged_class_ids.append(pred["class_id"])

            # Collect keypoint data if present
            if "keypoint_data" in pred and pred["keypoint_data"]:
                kp_data = pred["keypoint_data"]
                all_keypoints_data["keypoints_xy"].append(kp_data.get("keypoints_xy"))
                all_keypoints_data["keypoints_class_name"].append(
                    kp_data.get("keypoints_class_name")
                )
                all_keypoints_data["keypoints_class_id"].append(
                    kp_data.get("keypoints_class_id")
                )
                all_keypoints_data["keypoints_confidence"].append(
                    kp_data.get("keypoints_confidence")
                )

            # Add data fields for this detection
            for key in all_data_keys:
                # Skip keypoint fields as they're handled separately
                if key in [
                    "keypoints_xy",
                    "keypoints_class_name",
                    "keypoints_class_id",
                    "keypoints_confidence",
                ]:
                    continue

                if key == "detection_id":
                    # Generate new UUID for merged detection
                    merged_data[key].append(str(uuid.uuid4()))
                elif key == "parent_id":
                    # Use sample parent_id or generate new one
                    merged_data[key].append(
                        sample_parent_id if sample_parent_id else str(uuid.uuid4())
                    )
                elif key == "inference_id":
                    # Use the same inference_id as inputs (they're from same inference batch)
                    merged_data[key].append(
                        sample_inference_id
                        if sample_inference_id
                        else str(uuid.uuid4())
                    )
                elif key == "root_parent_dimensions":
                    # Add the parent image shape as a list [height, width]
                    merged_data[key].append(list(parent_image_shape))
                elif key == "parent_dimensions":
                    # Parent dimensions should be same as root_parent_dimensions for merged results
                    merged_data[key].append(list(parent_image_shape))
                elif key == "image_dimensions":
                    # Image dimensions for this detection
                    merged_data[key].append(list(parent_image_shape))
                elif key == "root_parent_coordinates":
                    # Root parent coordinates [y, x] - should be [0, 0] for the root
                    if (
                        pred["class_id"] in class_id_to_data
                        and key in class_id_to_data[pred["class_id"]]
                    ):
                        merged_data[key].append(class_id_to_data[pred["class_id"]][key])
                    else:
                        merged_data[key].append([0, 0])
                elif key == "parent_coordinates":
                    # Parent coordinates [y, x]
                    if (
                        pred["class_id"] in class_id_to_data
                        and key in class_id_to_data[pred["class_id"]]
                    ):
                        merged_data[key].append(class_id_to_data[pred["class_id"]][key])
                    else:
                        merged_data[key].append([0, 0])
                elif key == "root_parent_id":
                    # Root parent ID
                    if (
                        pred["class_id"] in class_id_to_data
                        and key in class_id_to_data[pred["class_id"]]
                    ):
                        merged_data[key].append(class_id_to_data[pred["class_id"]][key])
                    else:
                        merged_data[key].append("image")
                elif key == "prediction_type":
                    # Prediction type should be 'instance-segmentation'
                    merged_data[key].append("instance-segmentation")
                else:
                    # For other fields like class_name, check pred dict first (per-detection data)
                    # then fall back to class_id_to_data (class-level defaults)
                    if key in pred.get("detection_data", {}):
                        merged_data[key].append(pred["detection_data"][key])
                    elif (
                        pred["class_id"] in class_id_to_data
                        and key in class_id_to_data[pred["class_id"]]
                    ):
                        merged_data[key].append(class_id_to_data[pred["class_id"]][key])
                    else:
                        merged_data[key].append(None)

    if not merged_confidences:
        # Return empty detections if no detections
        return Detections.empty(), crop_zones

    # Convert to numpy arrays
    merged_confidences_array = np.array(merged_confidences, dtype=np.float32)
    merged_class_ids_array = np.array(merged_class_ids, dtype=int)

    if has_masks:
        # Instance segmentation - stack masks and compute bounding boxes
        merged_masks_array = np.stack(merged_masks, axis=0)

        # Compute bounding boxes from masks
        xyxy = []
        for mask in merged_masks_array:
            rows, cols = np.where(mask)
            if len(rows) > 0:
                x_min, x_max = cols.min(), cols.max()
                y_min, y_max = rows.min(), rows.max()
                xyxy.append([x_min, y_min, x_max + 1, y_max + 1])
            else:
                xyxy.append([0, 0, 0, 0])

        xyxy_array = np.array(xyxy, dtype=np.float32)

        # Create Detections object with masks
        result = Detections(
            xyxy=xyxy_array,
            mask=merged_masks_array,
            confidence=merged_confidences_array,
            class_id=merged_class_ids_array,
        )
    else:
        # Object detection - use bounding boxes directly
        if merged_bboxes:
            xyxy_array = np.array(merged_bboxes, dtype=np.float32)
        else:
            # Shouldn't happen, but handle edge case
            xyxy_array = np.zeros((len(merged_confidences), 4), dtype=np.float32)

        # Create Detections object without masks
        result = Detections(
            xyxy=xyxy_array,
            confidence=merged_confidences_array,
            class_id=merged_class_ids_array,
        )

    # Convert data fields to numpy arrays with proper dtypes
    for key, values in merged_data.items():
        if key in [
            "class_name",
            "prediction_type",
            "detection_id",
            "parent_id",
            "inference_id",
            "root_parent_id",
        ]:
            # String fields - use 'U' dtype (Unicode strings), not np.str_
            result.data[key] = np.array(values, dtype=str)
        elif key in [
            "root_parent_dimensions",
            "parent_dimensions",
            "image_dimensions",
            "root_parent_coordinates",
            "parent_coordinates",
        ]:
            # Array/coordinate fields - convert to numpy arrays of integers
            result.data[key] = np.array(values, dtype=int)
        else:
            # Other fields - store as is
            result.data[key] = np.array(values)

    # Add keypoint data if it exists
    if is_keypoint_detection:
        if all_keypoints_data["keypoints_xy"]:
            result.data["keypoints_xy"] = np.array(
                all_keypoints_data["keypoints_xy"], dtype=object
            )
        if all_keypoints_data["keypoints_class_name"]:
            result.data["keypoints_class_name"] = np.array(
                all_keypoints_data["keypoints_class_name"], dtype=object
            )
        if all_keypoints_data["keypoints_class_id"]:
            result.data["keypoints_class_id"] = np.array(
                all_keypoints_data["keypoints_class_id"], dtype=object
            )
        if all_keypoints_data["keypoints_confidence"]:
            result.data["keypoints_confidence"] = np.array(
                all_keypoints_data["keypoints_confidence"], dtype=object
            )

    return result, crop_zones

`core/workflows/core_steps/fusion/detections_stitch`¶

inference.core.workflows.core_steps.fusion.detections_stitch.v1 ¶

Classes¶

Functions:¶

move_detections ¶

move_detections(detections, offset, resolution_wh)

Shift detections by offset, keeping every geometry field consistent: axis-aligned boxes, segmentation masks, and oriented-box corners.

Mirrors supervision.detection.tools.inference_slicer.move_detections; kept local since that helper is not part of supervision's public API.

Source code in inference/core/workflows/core_steps/fusion/detections_stitch/v1.py

def move_detections(
    detections: sv.Detections,
    offset: Optional[np.ndarray],
    resolution_wh: Optional[Tuple[int, int]],
) -> sv.Detections:
    """
    Translate all geometry carried by ``detections`` by ``offset``.

    The axis-aligned boxes are always moved; oriented-box corners and
    segmentation masks are moved only when present. The passed-in object is
    updated in place and is also the return value.

    Mirrors ``supervision.detection.tools.inference_slicer.move_detections``;
    kept local since that helper is not part of supervision's public API.

    Args:
        detections: Detections to shift. Empty input is returned untouched.
        offset: Translation to apply; required whenever ``detections`` is
            non-empty.
        resolution_wh: Forwarded to ``move_masks``; required only when the
            detections carry masks.

    Returns:
        The same ``detections`` object that was passed in.

    Raises:
        ValueError: If ``offset`` is missing for non-empty detections, or
            ``resolution_wh`` is missing while masks are present.
    """
    # Nothing to translate - hand the (empty) object straight back.
    if not len(detections):
        return detections
    if offset is None:
        raise ValueError("To move non-empty detections offset is needed, but not given")

    detections.xyxy = move_boxes(xyxy=detections.xyxy, offset=offset)

    if ORIENTED_BOX_COORDINATES in detections.data:
        # OBB corners live in `data["xyxyxyxy"]` with shape (N, 4, 2); adding
        # `offset` (shape (2,)) broadcasts over the trailing axis so each
        # (x, y) corner is translated. Skipping this would leave the corners
        # in tile-local coords while `xyxy` is already in image coords, which
        # breaks downstream OBB-aware NMS/NMM.
        tile_local_corners = detections.data[ORIENTED_BOX_COORDINATES]
        detections.data[ORIENTED_BOX_COORDINATES] = tile_local_corners + offset

    if detections.mask is None:
        return detections
    if resolution_wh is None:
        raise ValueError(
            "To move non-empty detections with segmentation mask, resolution_wh is needed, but not given."
        )
    detections.mask = move_masks(
        masks=detections.mask, offset=offset, resolution_wh=resolution_wh
    )
    return detections

`core/workflows/core_steps/fusion/overlap_analysis`¶

inference.core.workflows.core_steps.fusion.overlap_analysis.v1 ¶

Classes¶

`core/workflows/core_steps`¶

inference.core.workflows.core_steps.loader ¶

Classes¶

`core/workflows/core_steps/models/foundation`¶

inference.core.workflows.core_steps.models.foundation._streaming_video_common ¶

Shared helpers for SAM2/SAM3 streaming video tracker workflow blocks.

The blocks multiplex a single inference_models-backed streaming model across many videos by keying state_dicts on video_identifier, and reset a session whenever the source stream restarts. The SAM2 block additionally re-prompts on the frames requested by prompt_mode; the SAM3 concept block prompts once per session (the model re-detects continuously on its own). Everything that is independent of the concrete model lives here so each block is just a thin wrapper around inference_models.AutoModel.

Classes¶

BoxPromptMetadata `dataclass` ¶

Class info carried from an upstream detector to the emitted mask.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

@dataclass
class BoxPromptMetadata:
    """Class info carried from an upstream detector to the emitted mask.

    One instance describes a single prompted object. Within this module the
    instances are created by ``extract_box_prompts`` (from detector output)
    and ``build_obj_id_metadata_from_text`` (synthetic defaults), and read
    back by ``masks_to_sv_detections`` when labelling masks.
    """

    # Detector class id; the builders in this module use 0 when none is known.
    class_id: int
    # Human-readable label; "foreground" is the fallback used in this module.
    class_name: str
    # Detector confidence; compared against the threshold in
    # ``masks_to_sv_detections``. Builders default it to 1.0 when unknown.
    confidence: float
    # "detection_id" of the source detection (stringified) when available,
    # otherwise None.
    parent_id: Optional[str]

VideoSessionBookkeeping `dataclass` ¶

Per-video bookkeeping that lives alongside the model's opaque state_dict.

We store the last state returned from the model so the next call can continue the same session; obj_id_metadata holds the detector-provided class name / id / parent detection id for each prompted track so the emitted masks inherit them.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

@dataclass
class VideoSessionBookkeeping:
    """Per-video bookkeeping that lives alongside the model's opaque
    ``state_dict``.

    We store the last state returned from the model so the next call
    can continue the same session; ``obj_id_metadata`` holds the
    detector-provided class name / id / parent detection id for each
    prompted track so the emitted masks inherit them.
    """

    # Last state handed back by the model. ``decide_prompt_vs_track`` treats
    # ``None`` as "no session yet".
    state_dict: Optional[dict] = None
    # Frame number recorded for this video; a negative value is treated as
    # "no frame seen yet" and a smaller incoming frame number as a stream
    # restart (see ``decide_prompt_vs_track``).
    last_frame_number: int = -1
    # Counter compared against ``prompt_interval`` in "every_n_frames" mode.
    # NOTE(review): presumably incremented/reset by the calling block - the
    # code that updates it is not in this module section; confirm.
    frames_since_prompt: int = 0
    # Per-track metadata keyed by object id.
    # NOTE(review): annotated with ``Dict[str, Any]`` values, but the builders
    # in this module return ``Dict[int, BoxPromptMetadata]`` and
    # ``masks_to_sv_detections`` reads attributes (``meta.confidence``) -
    # confirm which value type is actually stored here.
    obj_id_metadata: Dict[int, Dict[str, Any]] = field(default_factory=dict)

Functions:¶

build_obj_id_metadata_from_boxes ¶

build_obj_id_metadata_from_boxes(obj_ids, box_metas)

Align SAM-assigned object ids with the detector-provided metadata.

The model hands us object ids in the same order as the prompts we issued; we zip them together so later frames (which only have obj_ids) can still be labelled.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def build_obj_id_metadata_from_boxes(
    obj_ids: np.ndarray,
    box_metas: List[BoxPromptMetadata],
) -> Dict[int, BoxPromptMetadata]:
    """Pair each SAM-assigned object id with its detector metadata.

    Object ids come back from the model in the order the prompts were
    issued, so the i-th id is matched with the i-th entry of
    ``box_metas``. The resulting lookup lets later frames, which only
    carry ``obj_ids``, still be labelled.

    Args:
        obj_ids: Object ids returned by the model.
        box_metas: Metadata for the prompts, in issue order.

    Returns:
        Mapping from plain-``int`` object id to its metadata. Pairing stops
        at the shorter of the two inputs.
    """
    # Convert every id up front so keys are builtin ints, not numpy scalars.
    plain_ids = list(map(int, obj_ids.tolist()))
    metadata_by_id = {}
    for object_id, box_meta in zip(plain_ids, box_metas):
        metadata_by_id[object_id] = box_meta
    return metadata_by_id

build_obj_id_metadata_from_text ¶

build_obj_id_metadata_from_text(obj_ids, class_names)

For text-prompt sessions where we don't have per-object class info, fall back to a single class name (if only one was supplied) or "foreground" (if multiple or none).

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def build_obj_id_metadata_from_text(
    obj_ids: np.ndarray,
    class_names: List[str],
) -> Dict[int, BoxPromptMetadata]:
    """Build placeholder metadata for objects found via a text prompt.

    Text-prompt sessions carry no per-object class info, so every object
    gets the same label: the single supplied class name when exactly one
    non-empty name was given, otherwise "foreground". ``class_id`` is
    always 0, ``confidence`` 1.0 and ``parent_id`` ``None``.

    Args:
        obj_ids: Object ids returned by the model.
        class_names: Class names the session was prompted with.

    Returns:
        Mapping from plain-``int`` object id to its placeholder metadata.
    """
    sole_name = class_names[0] if len(class_names) == 1 else None
    label = sole_name if sole_name else "foreground"

    metadata_by_id = {}
    for raw_id in obj_ids.tolist():
        metadata_by_id[int(raw_id)] = BoxPromptMetadata(
            class_id=0, class_name=label, confidence=1.0, parent_id=None
        )
    return metadata_by_id

concept_frame_to_sv_detections ¶

concept_frame_to_sv_detections(
    masks,
    object_ids,
    scores,
    boxes,
    prompt_to_object_ids,
    class_names,
    image,
    threshold,
)

Assemble sv.Detections from one SAM3 concept-tracker frame.

Unlike the box-prompted path (where class metadata is frozen at prompt time), the concept tracker reports per frame which prompt each object belongs to and a per-object detection score — class labels and confidences are rebuilt from those every frame, so objects detected mid-stream are labelled correctly.

class_id is the prompt's position in class_names (the exact texts sent to the model), so ids are stable for a given prompt set.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def concept_frame_to_sv_detections(
    masks: np.ndarray,
    object_ids: np.ndarray,
    scores: np.ndarray,
    boxes: np.ndarray,
    prompt_to_object_ids: Dict[str, List[int]],
    class_names: List[str],
    image: WorkflowImageData,
    threshold: float,
) -> sv.Detections:
    """Turn one SAM3 concept-tracker frame into ``sv.Detections``.

    In the box-prompted path class metadata is frozen at prompt time. Here
    the tracker reports, per frame, which prompt each object belongs to
    and a per-object detection score, so labels and confidences are
    rebuilt on every call and objects that first appear mid-stream are
    labelled correctly.

    ``class_id`` is the prompt's position in ``class_names`` (the exact
    texts sent to the model), so ids are stable for a given prompt set.
    Objects whose prompt is unknown are labelled "foreground"; a label not
    present in ``class_names`` gets ``class_id`` 0.

    Objects scoring below ``threshold`` and objects with an all-zero mask
    are dropped. If nothing survives, the result of ``_empty_detections``
    for the image size is returned.
    """
    height, width = image.numpy_image.shape[:2]
    if masks.shape[0] == 0:
        return _empty_detections(height, width)

    # Invert prompt -> [object ids] into object id -> prompt.
    prompt_by_object_id: Dict[int, str] = {}
    for prompt_text, prompt_object_ids in prompt_to_object_ids.items():
        for prompt_object_id in prompt_object_ids:
            prompt_by_object_id[int(prompt_object_id)] = prompt_text

    # One tuple per kept object; unzipped into columns further down.
    kept_rows = []
    per_object = zip(masks, object_ids.tolist(), scores.tolist(), boxes.tolist())
    for mask, obj_id, score, box in per_object:
        if score < threshold or not mask.any():
            continue
        prompt = prompt_by_object_id.get(int(obj_id), "foreground")
        class_id = class_names.index(prompt) if prompt in class_names else 0
        kept_rows.append(
            (
                [float(coord) for coord in box[:4]],
                float(score),
                class_id,
                prompt,
                int(obj_id),
                str(uuid4()),
                mask.astype(bool),
            )
        )

    if not kept_rows:
        return _empty_detections(height, width)

    (
        xyxy,
        confidences,
        class_ids,
        kept_class_names,
        tracker_ids,
        detection_ids,
        kept_masks,
    ) = zip(*kept_rows)

    detections = sv.Detections(
        xyxy=np.asarray(xyxy, dtype=np.float32),
        mask=np.stack(kept_masks, axis=0),
        confidence=np.asarray(confidences, dtype=np.float32),
        class_id=np.asarray(class_ids, dtype=int),
        tracker_id=np.asarray(tracker_ids, dtype=int),
    )
    kept_count = len(detection_ids)
    detections.data[DETECTIONS_CLASS_NAME_FIELD] = np.asarray(
        kept_class_names, dtype=object
    )
    detections[DETECTION_ID_KEY] = np.asarray(detection_ids, dtype=object)
    # Every emitted detection shares the input image's parent id.
    detections[PARENT_ID_KEY] = np.asarray(
        [image.parent_metadata.parent_id] * kept_count, dtype=object
    )
    detections[IMAGE_DIMENSIONS_KEY] = np.asarray(
        [[height, width]] * len(detections), dtype=int
    )
    return detections

decide_prompt_vs_track ¶

decide_prompt_vs_track(
    session,
    frame_number,
    prompt_mode,
    prompt_interval,
    has_prompts,
)

Return (should_reset, should_prompt) for a single frame.

- A reset fires when the source stream's frame_number rolls back (or this is the first frame we've seen for this video).
- should_prompt is gated on prompt availability: there's no point issuing a prompt call with nothing to prompt on.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def decide_prompt_vs_track(
    session: VideoSessionBookkeeping,
    frame_number: int,
    prompt_mode: PromptMode,
    prompt_interval: int,
    has_prompts: bool,
) -> Tuple[bool, bool]:
    """Decide, for one frame, whether to reset the session and whether to prompt.

    Returns:
        ``(should_reset, should_prompt)``.

        - ``should_reset`` is true when no session exists yet (negative
          ``last_frame_number`` or missing ``state_dict``) or when
          ``frame_number`` is lower than the last one seen, i.e. the
          source stream rolled back.
        - ``should_prompt`` depends on ``prompt_mode`` and is always gated
          on ``has_prompts``, since prompting with nothing is pointless:
          "every_frame" prompts on each frame, "every_n_frames" prompts on
          a reset or once ``frames_since_prompt`` reaches the interval
          (clamped to at least 1), and any other mode (first-frame
          behaviour) prompts only on a reset.
    """
    no_session_yet = session.last_frame_number < 0 or session.state_dict is None
    stream_rolled_back = frame_number < session.last_frame_number
    should_reset = no_session_yet or stream_rolled_back

    if prompt_mode == "every_frame":
        should_prompt = has_prompts
    elif prompt_mode == "every_n_frames":
        # Kept as a single short-circuiting expression: the interval is only
        # looked at when no reset is pending.
        prompt_is_due = should_reset or session.frames_since_prompt >= max(
            1, prompt_interval
        )
        should_prompt = prompt_is_due and has_prompts
    else:
        # first_frame
        should_prompt = should_reset and has_prompts
    return should_reset, should_prompt

extract_box_prompts ¶

extract_box_prompts(boxes_for_image)

Flatten an sv.Detections into xyxy tuples + per-box metadata.

Empty / missing input returns two empty lists; class_name defaults to "foreground" when the detection doesn't carry one.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def extract_box_prompts(
    boxes_for_image: Optional[sv.Detections],
) -> Tuple[List[Tuple[float, float, float, float]], List[BoxPromptMetadata]]:
    """Split an ``sv.Detections`` into box tuples and matching metadata.

    Args:
        boxes_for_image: Detections to use as box prompts; may be ``None``
            or empty.

    Returns:
        ``(boxes_xyxy, metas)`` - two lists of equal length, one entry per
        detection. Both are empty when the input is ``None`` or empty.
        Missing values fall back to class id 0, class name "foreground",
        confidence 1.0 and parent id ``None``; the parent id is taken from
        the detection's "detection_id" data entry.
    """
    boxes_xyxy: List[Tuple[float, float, float, float]] = []
    metas: List[BoxPromptMetadata] = []
    if boxes_for_image is None or len(boxes_for_image) == 0:
        return boxes_xyxy, metas

    for xyxy, _mask, confidence, class_id, _tracker_id, data in boxes_for_image:
        left, top, right, bottom = xyxy
        boxes_xyxy.append((float(left), float(top), float(right), float(bottom)))

        # A non-dict payload is treated the same as an empty one.
        payload = data if isinstance(data, dict) else {}
        class_name = payload.get(DETECTIONS_CLASS_NAME_FIELD, "foreground")
        parent_id = payload.get("detection_id")

        metas.append(
            BoxPromptMetadata(
                class_id=0 if class_id is None else int(class_id),
                class_name=str(class_name),
                confidence=1.0 if confidence is None else float(confidence),
                parent_id=None if parent_id is None else str(parent_id),
            )
        )
    return boxes_xyxy, metas

masks_to_sv_detections ¶

masks_to_sv_detections(
    masks, obj_ids, image, obj_id_metadata, threshold
)

Assemble one sv.Detections of instance-seg predictions.

Emits one detection per SAM-assigned object (preserving the one-to-one mapping with tracker_id). Masks without any positive pixels are dropped.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def masks_to_sv_detections(
    masks: np.ndarray,
    obj_ids: np.ndarray,
    image: WorkflowImageData,
    obj_id_metadata: Dict[int, BoxPromptMetadata],
    threshold: float,
) -> sv.Detections:
    """Build a single ``sv.Detections`` of instance-seg predictions.

    Each SAM-assigned object yields at most one detection, with the object
    id reused as ``tracker_id`` so the mapping stays one-to-one. Class id,
    class name, confidence and parent id come from ``obj_id_metadata``;
    objects without an entry fall back to 0 / "foreground" / 1.0 / "".

    Objects whose confidence is below ``threshold`` and masks without any
    positive pixels are dropped. If nothing survives, the result of
    ``_empty_detections`` for the image size is returned.
    """
    height, width = image.numpy_image.shape[:2]
    if masks.shape[0] == 0:
        return _empty_detections(height, width)

    # One tuple per kept object; unzipped into columns further down.
    kept_rows = []
    for mask, obj_id in zip(masks, obj_ids.tolist()):
        meta = obj_id_metadata.get(int(obj_id))
        has_meta = meta is not None
        confidence = meta.confidence if has_meta else 1.0
        if confidence < threshold:
            continue
        row_idx, col_idx = np.where(mask)
        if col_idx.size == 0:
            continue
        # Box spans the extreme positive pixels of the mask.
        box = [
            float(col_idx.min()),
            float(row_idx.min()),
            float(col_idx.max()),
            float(row_idx.max()),
        ]
        parent = meta.parent_id if has_meta else None
        kept_rows.append(
            (
                box,
                float(confidence),
                meta.class_id if has_meta else 0,
                meta.class_name if has_meta else "foreground",
                int(obj_id),
                "" if parent is None else str(parent),
                str(uuid4()),
                mask.astype(bool),
            )
        )

    if not kept_rows:
        return _empty_detections(height, width)

    (
        xyxy,
        confidences,
        class_ids,
        class_names,
        tracker_ids,
        parent_ids,
        detection_ids,
        kept_masks,
    ) = zip(*kept_rows)

    detections = sv.Detections(
        xyxy=np.asarray(xyxy, dtype=np.float32),
        mask=np.stack(kept_masks, axis=0),
        confidence=np.asarray(confidences, dtype=np.float32),
        class_id=np.asarray(class_ids, dtype=int),
        tracker_id=np.asarray(tracker_ids, dtype=int),
    )
    detections.data[DETECTIONS_CLASS_NAME_FIELD] = np.asarray(class_names, dtype=object)
    detections[DETECTION_ID_KEY] = np.asarray(detection_ids, dtype=object)
    detections[PARENT_ID_KEY] = np.asarray(parent_ids, dtype=object)
    detections[IMAGE_DIMENSIONS_KEY] = np.asarray(
        [[height, width]] * len(detections), dtype=int
    )
    return detections

normalise_class_names ¶

normalise_class_names(class_names)

Accept a list, comma-separated string, or None and return a list.

Source code in inference/core/workflows/core_steps/models/foundation/_streaming_video_common.py

def normalise_class_names(
    class_names: Optional[Any],
) -> List[str]:
    """Coerce ``class_names`` into a list of names.

    Args:
        class_names: ``None``, a comma-separated string, or an iterable of
            names.

    Returns:
        ``[]`` for ``None``. For a string, the comma-separated parts with
        surrounding whitespace stripped and blank parts removed. For any
        other iterable, its truthy items in order (items are not stripped).
    """
    if class_names is None:
        return []
    if isinstance(class_names, str):
        trimmed_parts = (part.strip() for part in class_names.split(","))
        return [part for part in trimmed_parts if part]
    # filter(None, ...) keeps exactly the truthy entries.
    return list(filter(None, class_names))

`core/workflows/core_steps/models/foundation/anthropic_claude`¶

inference.core.workflows.core_steps.models.foundation.anthropic_claude.v3 ¶

Classes¶

Functions:¶

execute_claude_request ¶

execute_claude_request(
    roboflow_api_key,
    anthropic_api_key,
    system_prompt,
    messages,
    model_version,
    max_tokens,
    temperature,
    extended_thinking,
    thinking_budget_tokens,
)

Route to proxied or direct execution based on API key format.

Source code in inference/core/workflows/core_steps/models/foundation/anthropic_claude/v3.py

def execute_claude_request(
    roboflow_api_key: Optional[str],
    anthropic_api_key: str,
    system_prompt: Optional[str],
    messages: List[dict],
    model_version: str,
    max_tokens: Optional[int],
    temperature: Optional[float],
    extended_thinking: Optional[bool],
    thinking_budget_tokens: Optional[int],
) -> str:
    """Dispatch a Claude request to the proxied or the direct executor.

    The choice is made from the format of ``anthropic_api_key``: keys
    starting with ``rf_key:account`` or ``rf_key:user:`` go through
    ``_execute_proxied_claude_request`` (which additionally receives
    ``roboflow_api_key``); every other key goes to
    ``_execute_direct_claude_request``. All remaining arguments are
    forwarded unchanged, and the chosen executor's result is returned.
    """
    # Arguments both executors accept.
    shared_kwargs = dict(
        anthropic_api_key=anthropic_api_key,
        system_prompt=system_prompt,
        messages=messages,
        model_version=model_version,
        max_tokens=max_tokens,
        temperature=temperature,
        extended_thinking=extended_thinking,
        thinking_budget_tokens=thinking_budget_tokens,
    )
    goes_through_proxy = anthropic_api_key.startswith(
        ("rf_key:account", "rf_key:user:")
    )
    if not goes_through_proxy:
        return _execute_direct_claude_request(**shared_kwargs)
    return _execute_proxied_claude_request(
        roboflow_api_key=roboflow_api_key, **shared_kwargs
    )

`core/workflows/core_steps/models/foundation/glm_ocr`¶

inference.core.workflows.core_steps.models.foundation.glm_ocr.v1 ¶

Classes¶

`core/workflows/core_steps/models/foundation/google_gemini`¶

inference.core.workflows.core_steps.models.foundation.google_gemini.v3 ¶

Classes¶

Functions:¶

execute_gemini_request ¶

execute_gemini_request(
    roboflow_api_key, google_api_key, prompt, model_version
)

Route to proxied or direct execution based on API key format.

Source code in inference/core/workflows/core_steps/models/foundation/google_gemini/v3.py

def execute_gemini_request(
    roboflow_api_key: Optional[str],
    google_api_key: str,
    prompt: dict,
    model_version: str,
) -> str:
    """Dispatch a Gemini request to the proxied or the direct executor.

    The choice is made from the format of ``google_api_key``: keys
    starting with ``rf_key:account`` or ``rf_key:user:`` go through
    ``_execute_proxied_gemini_request`` (which additionally receives
    ``roboflow_api_key``); every other key goes to
    ``_execute_direct_gemini_request``. The remaining arguments are
    forwarded unchanged, and the chosen executor's result is returned.
    """
    # Arguments both executors accept.
    shared_kwargs = dict(
        google_api_key=google_api_key,
        prompt=prompt,
        model_version=model_version,
    )
    goes_through_proxy = google_api_key.startswith(("rf_key:account", "rf_key:user:"))
    if not goes_through_proxy:
        return _execute_direct_gemini_request(**shared_kwargs)
    return _execute_proxied_gemini_request(
        roboflow_api_key=roboflow_api_key, **shared_kwargs
    )

`core/workflows/core_steps/models/foundation/openai`¶

inference.core.workflows.core_steps.models.foundation.openai.v3 ¶

Classes¶

Functions:¶

inference.core.workflows.core_steps.models.foundation.openai.v4 ¶

Classes¶

Functions:¶

`core/workflows/core_steps/models/foundation/openai_compatible`¶

inference.core.workflows.core_steps.models.foundation.openai_compatible.v1 ¶

Classes¶

Functions:¶

`core/workflows/core_steps/models/foundation/openrouter`¶

inference.core.workflows.core_steps.models.foundation.openrouter.v1 ¶

Generic OpenRouter workflow block.

Like the Qwen-VL / Kimi / Gemma OpenRouter blocks, but the model is a free-form string instead of a fixed dropdown. The user pastes any OpenRouter model slug (e.g. openai/gpt-4o-mini, anthropic/claude-3.5-sonnet, qwen/qwen3.6-27b) and the block routes through Roboflow's apiproxy/openrouter proxy by default, or directly to OpenRouter when the user provides their own sk-or-... key.

The task-type surface (unconstrained, OCR, classification, detection, etc.) is the one shared via common.openrouter with the other VLM blocks.

Classes¶

Functions:¶

`core/workflows/core_steps/models/foundation/qwen_vlm`¶

inference.core.workflows.core_steps.models.foundation.qwen_vlm.v1 ¶

Unified Qwen-VL workflow block.

Subsumes the older per-version Qwen blocks (qwen25vl@v1, qwen3vl@v1, qwen3_5vl@v1, qwen3_5_openrouter@v1, qwen3_6_openrouter@v1) into a single block where the user picks:

- a backend — "Native (Roboflow)" or "OpenRouter"
- a model_version (combined version+size selector); each variant is bound to one backend
- the standard VLM task_type surface (unconstrained, OCR, classification, detection, etc.) shared with the Gemma/Llama/Kimi blocks

For the OpenRouter backend, all the API Key Passthrough plumbing (rf_key:account vs custom sk-or-..., the privacy_level filter, the proxy/billing flow) is inherited from common.openrouter.OpenRouterWorkflowBlockBase.

For the Native backend, the block dispatches via StepExecutionMode.LOCAL/REMOTE to either model_manager (local process) or InferenceHTTPClient (Roboflow-hosted inference), exactly like the v1 native qwen blocks did.

Classes¶

BlockManifest ¶

Bases: OpenRouterBlockManifestMixin

Source code in inference/core/workflows/core_steps/models/foundation/qwen_vlm/v1.py

class BlockManifest(OpenRouterBlockManifestMixin):
    model_config = ConfigDict(
        json_schema_extra={
            "name": "Qwen-VL",
            "version": "v1",
            "short_description": "Run any Qwen vision model — natively or via OpenRouter.",
            "long_description": LONG_DESCRIPTION,
            "license": "Apache-2.0",
            "block_type": "model",
            "search_keywords": [
                "Qwen",
                "qwen-vl",
                "qwen3.5",
                "qwen3.6",
                "VLM",
                "Alibaba",
                "OpenRouter",
            ],
            "is_vlm_block": True,
            "task_type_property": "task_type",
            "ui_manifest": {
                "section": "model",
                "icon": "fal fa-atom",
                "blockPriority": 5.5,
            },
        },
        protected_namespaces=(),
    )
    type: Literal["roboflow_core/qwen_vlm@v1"]

    images: Selector(kind=[IMAGE_KIND]) = ImageInputField

    backend: Backend = Field(
        default="native",
        description=(
            "Where to run inference. Native = Roboflow infrastructure. "
            "OpenRouter = large hosted Qwen models via OpenRouter."
        ),
        json_schema_extra={
            "values_metadata": {
                "native": {
                    "name": "Native (Roboflow)",
                    "description": (
                        "Runs locally on the inference server, or remotely "
                        "via Roboflow Hosted Inference. Smaller models, "
                        "lower latency."
                    ),
                },
                "openrouter": {
                    "name": "OpenRouter",
                    "description": (
                        "Routes to large hosted Qwen models via OpenRouter. "
                        "Defaults to a Roboflow-managed key (billed in "
                        "credits)."
                    ),
                },
            },
            "always_visible": True,
        },
    )

    # Native model picker: friendly-name dropdown listing the built-in
    # pre-trained variants AND a `Fine-tuned model` sentinel entry that,
    # when selected, reveals the `fine_tuned_model_id` field below.
    model_version: Union[Selector(kind=[STRING_KIND]), NativeModelVersion] = Field(
        default=DEFAULT_NATIVE_MODEL_VERSION,
        description=(
            "Native Qwen-VL variant. Pick a pre-trained model or "
            f"`{FINE_TUNED_NATIVE_LABEL}` to use a Qwen3 fine-tune from your "
            "workspace."
        ),
        examples=[DEFAULT_NATIVE_MODEL_VERSION, FINE_TUNED_NATIVE_LABEL],
        json_schema_extra={
            "relevant_for": {
                "backend": {"values": ["native"], "required": True},
            },
        },
    )

    # Fine-tuned native picker: Roboflow model-id selector so the UI
    # surfaces the user's workspace Qwen3 fine-tunes (qwen-pretrains/2
    # family). Gated solely on `model_version=FINE_TUNED_NATIVE_LABEL` —
    # the UI honors only one `relevant_for` key. The companion
    # `model_validator` below resets `model_version` back to a pre-trained
    # variant when the user switches to OpenRouter, which makes the gate
    # condition false on revalidation and hides this field as well.
    fine_tuned_model_id: Optional[
        Union[Selector(kind=[ROBOFLOW_MODEL_ID_KIND, STRING_KIND]), str]
    ] = Field(
        default=None,
        description=(
            "Fine-tuned Qwen3-VL model from your workspace, in "
            "`workspace/version` form."
        ),
        examples=["your-workspace/3", "$inputs.qwen_finetune"],
        json_schema_extra={
            "relevant_for": {
                "model_version": {
                    "values": [FINE_TUNED_NATIVE_LABEL],
                    "required": True,
                },
            },
        },
    )

    # OpenRouter model picker: friendly-name dropdown bound to OpenRouter slugs.
    openrouter_model_version: Union[
        Selector(kind=[STRING_KIND]), OpenRouterModelVersion
    ] = Field(
        default="Qwen 3.6 27B",
        description="OpenRouter-hosted Qwen variant.",
        examples=["Qwen 3.6 27B", "Qwen 3.5 27B"],
        json_schema_extra={
            "relevant_for": {
                "backend": {"values": ["openrouter"], "required": True},
            },
        },
    )

    task_type: TaskType = Field(
        default="unconstrained",
        description="Task type to be performed by model. Value determines required parameters and output response.",
        json_schema_extra={
            "values_metadata": RELEVANT_TASKS_METADATA,
            "recommended_parsers": RECOMMENDED_PARSERS,
            "always_visible": True,
        },
    )

    prompt: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Text prompt to the Qwen model",
        examples=["my prompt", "$inputs.prompt"],
        json_schema_extra={
            "relevant_for": {
                "task_type": {
                    "values": ["unconstrained", "visual-question-answering"],
                    "required": True,
                },
            },
            "multiline": True,
        },
    )
    enable_thinking: bool = Field(
        default=False,
        description=(
            "Enable Qwen3.5-VL's reasoning mode, where the model emits "
            "thinking tokens before its answer. The reasoning trace is "
            "returned in the `thinking` output. Only the Qwen 3.5 VL 2B "
            "checkpoint (and Qwen3-VL fine-tunes derived from it) supports "
            "this; ignored elsewhere."
        ),
        json_schema_extra={
            "relevant_for": {
                "model_version": {
                    "values": NATIVE_THINKING_MODEL_VERSIONS,
                    "required": False,
                },
            },
        },
    )
    output_structure: Optional[Dict[str, str]] = Field(
        default=None,
        description="Dictionary with structure of expected JSON response",
        examples=[{"my_key": "description"}, "$inputs.output_structure"],
        json_schema_extra={
            "relevant_for": {
                "task_type": {"values": ["structured-answering"], "required": True},
            },
        },
    )
    classes: Optional[Union[Selector(kind=[LIST_OF_VALUES_KIND]), List[str]]] = Field(
        default=None,
        description="List of classes to be used",
        examples=[["class-a", "class-b"], "$inputs.classes"],
        json_schema_extra={
            "relevant_for": {
                "task_type": {
                    "values": [
                        "classification",
                        "multi-label-classification",
                        "object-detection",
                    ],
                    "required": True,
                },
            },
        },
    )

    # --- Override inherited OpenRouter fields with relevant_for=openrouter ---
    api_key: Union[
        Selector(kind=[STRING_KIND, SECRET_KIND, ROBOFLOW_MANAGED_KEY]), str
    ] = Field(
        default="rf_key:account",
        description=(
            "OpenRouter API key (only used when backend=openrouter). Defaults "
            "to Roboflow's managed key. Provide your own `sk-or-...` key to "
            "call OpenRouter directly without Roboflow billing."
        ),
        examples=["rf_key:account", "sk-or-...", "$inputs.openrouter_api_key"],
        private=True,
        json_schema_extra={
            "relevant_for": {"backend": {"values": ["openrouter"], "required": False}},
        },
    )
    privacy_level: PRIVACY_LEVEL_LITERAL = Field(
        default="deny",
        description=(
            "Provider privacy filter (only used when backend=openrouter). "
            "Stricter levels reduce the pool of providers and may increase "
            "per-call cost on the managed key."
        ),
        json_schema_extra={
            "values_metadata": PRIVACY_LEVEL_METADATA,
            "relevant_for": {"backend": {"values": ["openrouter"], "required": False}},
        },
    )
    temperature: Union[float, Selector(kind=[FLOAT_KIND])] = Field(
        default=0.1,
        description=(
            "Sampling temperature (only used when backend=openrouter). "
            "The native Qwen-VL runtime doesn't accept a temperature knob. "
            'Range 0.0-2.0 — higher = more random / "creative" generations.'
        ),
        json_schema_extra={
            "relevant_for": {"backend": {"values": ["openrouter"], "required": False}},
        },
    )
    max_concurrent_requests: Optional[int] = Field(
        default=None,
        description=(
            "Maximum number of OpenRouter requests to run in parallel for a "
            "batch of images (only used when backend=openrouter). The native "
            "backend processes images sequentially. If unset, falls back to "
            "the global Workflows Execution Engine default. Restrict this if "
            "you hit OpenRouter rate limits."
        ),
        json_schema_extra={
            "relevant_for": {"backend": {"values": ["openrouter"], "required": False}},
        },
    )

    @model_validator(mode="after")
    def validate(self) -> "BlockManifest":
        """Cross-field validation executed after the individual fields are parsed.

        Steps, in order:

        1. If the backend is OpenRouter while `model_version` still holds the
           fine-tuned sentinel, reset `model_version` to the default native
           variant. The native dropdown is hidden for OpenRouter but its value
           persists; resetting it makes the `fine_tuned_model_id` gate false
           on the next revalidation pass so that field hides as well.
        2. Check that the fields required by `task_type` are present.
        3. For the native backend with the fine-tuned sentinel selected,
           require a non-empty `fine_tuned_model_id`.

        Returns:
            The (possibly adjusted) manifest instance.

        Raises:
            ValueError: when the fine-tuned sentinel is selected on the native
                backend without a `fine_tuned_model_id`.
        """
        backend = self.backend
        fine_tune_selected = self.model_version == FINE_TUNED_NATIVE_LABEL

        if fine_tune_selected and backend == "openrouter":
            # Drop the stale sentinel left over from the native flow.
            self.model_version = DEFAULT_NATIVE_MODEL_VERSION

        validate_task_type_required_fields(
            task_type=self.task_type,
            prompt=self.prompt,
            classes=self.classes,
            output_structure=self.output_structure,
        )

        if backend != "native" or not fine_tune_selected or self.fine_tuned_model_id:
            return self
        raise ValueError(
            "`fine_tuned_model_id` is required when `model_version="
            f"'{FINE_TUNED_NATIVE_LABEL}'`. Pick a fine-tuned Qwen3 model "
            "from your workspace."
        )

    @field_validator("temperature")
    @classmethod
    def validate_temperature(cls, value: Union[str, float]) -> Union[str, float]:
        """Ensure a literal temperature lies in the closed range [0.0, 2.0].

        Args:
            value: Either a numeric temperature or a string. Strings are
                returned untouched (presumably workflow selectors that are
                resolved later; verify against the Execution Engine).

        Returns:
            The unchanged `value`.

        Raises:
            ValueError: if a numeric value falls outside [0.0, 2.0] or is NaN.
        """
        if isinstance(value, str):
            return value
        # A chained comparison is False for NaN, so NaN is rejected as well;
        # `value < 0.0 or value > 2.0` would silently let it through.
        if not 0.0 <= value <= 2.0:
            raise ValueError(
                "'temperature' parameter required to be in range [0.0, 2.0]"
            )
        return value

    @classmethod
    def get_air_gapped_availability(cls) -> AirGappedAvailability:
        """Report this block as unavailable in air-gapped setups (`requires_internet`)."""
        return AirGappedAvailability(available=False, reason="requires_internet")

    @classmethod
    def get_parameters_accepting_batches(cls) -> List[str]:
        """Return the names of parameters that accept batches: only `images`."""
        return ["images"]

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Declare the three outputs of the block.

        Returns:
            Definitions for `output` (string / language-model output),
            `classes` (list of values) and `thinking` (string reasoning trace).
        """
        answer = OutputDefinition(
            name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
        )
        class_list = OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND])
        reasoning = OutputDefinition(
            name="thinking",
            kind=[STRING_KIND],
            description=(
                "Reasoning trace from Qwen3.5-VL when `enable_thinking` "
                "is on. Empty string otherwise."
            ),
        )
        return [answer, class_list, reasoning]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the Execution Engine version specifier declared by this manifest."""
        return ">=1.3.0,<2.0.0"

    @classmethod
    def get_supported_model_variants(cls) -> Optional[List[str]]:
        """Return the native model variants this block can run.

        Tells the UI which native model_ids (and fine-tune families) are
        supported, so the workspace model picker shows Qwen3 fine-tunes.
        The value is the module-level `NATIVE_SUPPORTED_VARIANTS`.
        """
        return NATIVE_SUPPORTED_VARIANTS

Methods:¶

get_supported_model_variants `classmethod` ¶

get_supported_model_variants()

Tell the UI which native model_ids (and fine-tune families) this block can run, so the workspace model picker shows Qwen3 fine-tunes.

Source code in inference/core/workflows/core_steps/models/foundation/qwen_vlm/v1.py

@classmethod
def get_supported_model_variants(cls) -> Optional[List[str]]:
    """Return the native model variants this block can run.

    Tells the UI which native model_ids (and fine-tune families) are
    supported, so the workspace model picker shows Qwen3 fine-tunes.
    The value is the module-level `NATIVE_SUPPORTED_VARIANTS`.
    """
    return NATIVE_SUPPORTED_VARIANTS

QwenVlmBlockV1 ¶

Bases: OpenRouterWorkflowBlockBase

Unified Qwen-VL block. Inherits OpenRouter routing/execution from base and adds the native local/remote dispatch on top.

Source code in inference/core/workflows/core_steps/models/foundation/qwen_vlm/v1.py

class QwenVlmBlockV1(OpenRouterWorkflowBlockBase):
    """Unified Qwen-VL block. Inherits OpenRouter routing/execution from base
    and adds the native local/remote dispatch on top.
    """

    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        """Forward `model_manager` and `api_key` to the base class and keep the execution mode.

        Args:
            model_manager: Passed through to the base-class initializer.
            api_key: Passed through to the base-class initializer.
            step_execution_mode: Stored and later used to choose between the
                local and remote native dispatch paths.
        """
        super().__init__(model_manager=model_manager, api_key=api_key)
        self._step_execution_mode = step_execution_mode

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        """Return the names of the constructor arguments of this block."""
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block (`BlockManifest`)."""
        return BlockManifest

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the Execution Engine version specifier declared by this block."""
        return ">=1.3.0,<2.0.0"

    def run(
        self,
        images: Batch[WorkflowImageData],
        backend: str,
        model_version: str,
        fine_tuned_model_id: Optional[str],
        openrouter_model_version: str,
        task_type: str,
        prompt: Optional[str],
        enable_thinking: bool,
        output_structure: Optional[Dict[str, str]],
        classes: Optional[List[str]],
        api_key: str,
        privacy_level: str,
        max_tokens: int,
        temperature: float,
        max_concurrent_requests: Optional[int],
    ) -> BlockResult:
        """Resolve the model for the chosen backend and run it on every image.

        For `backend="openrouter"` the variant is looked up by
        `openrouter_model_version`, prompts are built per image and sent via
        `execute_openrouter_batch`; `thinking` is always an empty string.

        For `backend="native"` the model id is either `fine_tuned_model_id`
        (when `model_version` is the fine-tuned sentinel) or the id of the
        native variant named by `model_version`; execution is delegated to
        `_run_native`. `enable_thinking` is only forwarded as true when
        `model_version` is listed in `NATIVE_THINKING_MODEL_VERSIONS`.

        Returns:
            One dict per image with keys `output`, `classes` and `thinking`.

        Raises:
            ValueError: unknown backend, unknown variant for the backend, or
                the fine-tuned sentinel selected without `fine_tuned_model_id`.
        """
        if backend == "openrouter":
            hosted_variant = MODEL_VARIANTS.get(openrouter_model_version)
            if hosted_variant is None or hosted_variant["backend"] != "openrouter":
                raise ValueError(
                    f"Unknown OpenRouter Qwen variant "
                    f"'{openrouter_model_version}'. Pick one of: "
                    f"{OPENROUTER_VARIANT_LABELS}"
                )
            model_id = hosted_variant["model_id"]
            request_prompts = build_prompts_from_images(
                images=[image.to_inference_format() for image in images],
                task_type=task_type,
                prompt=prompt,
                output_structure=output_structure,
                classes=classes,
            )
            completions = self.execute_openrouter_batch(
                openrouter_api_key=api_key,
                model=model_id,
                prompts=request_prompts,
                max_tokens=max_tokens,
                temperature=temperature,
                privacy_level=privacy_level,
                max_concurrent_requests=max_concurrent_requests,
            )
            return [
                {"output": completion, "classes": classes, "thinking": ""}
                for completion in completions
            ]

        if backend != "native":
            raise ValueError(f"Unknown backend: {backend}")

        if model_version != FINE_TUNED_NATIVE_LABEL:
            native_variant = MODEL_VARIANTS.get(model_version)
            if native_variant is None or native_variant["backend"] != "native":
                raise ValueError(
                    f"Unknown pre-trained Qwen variant '{model_version}'. "
                    f"Pick one of: {NATIVE_VARIANT_LABELS} or "
                    f"'{FINE_TUNED_NATIVE_LABEL}'."
                )
            model_id = native_variant["model_id"]
        elif fine_tuned_model_id:
            model_id = fine_tuned_model_id
        else:
            raise ValueError(
                "`fine_tuned_model_id` is required when "
                f"`model_version='{FINE_TUNED_NATIVE_LABEL}'`."
            )

        # `enable_thinking` is silently dropped for native checkpoints that are
        # not listed as thinking-capable, so a toggle left on after a model
        # switch stays harmless.
        thinking_capable = model_version in NATIVE_THINKING_MODEL_VERSIONS
        results = self._run_native(
            images=images,
            model_id=model_id,
            task_type=task_type,
            prompt=prompt,
            output_structure=output_structure,
            classes=classes,
            enable_thinking=enable_thinking and thinking_capable,
            max_tokens=max_tokens,
        )
        return [
            {
                "output": result["output"],
                "classes": classes,
                "thinking": result["thinking"],
            }
            for result in results
        ]

    # ----------------------- Native dispatch -----------------------

    def _run_native(
        self,
        images: Batch[WorkflowImageData],
        model_id: str,
        task_type: str,
        prompt: Optional[str],
        output_structure: Optional[Dict[str, str]],
        classes: Optional[List[str]],
        enable_thinking: bool,
        max_tokens: Optional[int],
    ) -> List[Dict[str, str]]:
        """Build the native prompt and dispatch on the configured execution mode.

        The prompt is assembled first, then the call is routed to
        `_run_native_locally` or `_run_native_remotely`; `max_tokens` is
        handed over as `max_new_tokens`.

        Raises:
            ValueError: if the stored step execution mode is neither LOCAL
                nor REMOTE.
        """
        combined_prompt = _build_native_prompt(
            task_type=task_type,
            prompt=prompt,
            output_structure=output_structure,
            classes=classes,
        )
        execution_mode = self._step_execution_mode
        if execution_mode == StepExecutionMode.LOCAL:
            dispatch = self._run_native_locally
        elif execution_mode == StepExecutionMode.REMOTE:
            dispatch = self._run_native_remotely
        else:
            raise ValueError(
                f"Unknown step_execution_mode: {self._step_execution_mode}"
            )
        return dispatch(
            images=images,
            model_id=model_id,
            combined_prompt=combined_prompt,
            enable_thinking=enable_thinking,
            max_new_tokens=max_tokens,
        )

    def _run_native_locally(
        self,
        images: Batch[WorkflowImageData],
        model_id: str,
        combined_prompt: str,
        enable_thinking: bool,
        max_new_tokens: Optional[int],
    ) -> List[Dict[str, str]]:
        """Run the native model through the model manager, one image at a time.

        All images are converted to inference format up front, the model is
        registered with the model manager, and then one `LMMInferenceRequest`
        is issued per image. `max_new_tokens` is only included in the request
        when it is not None.

        Returns:
            One `{"output": ..., "thinking": ...}` dict per image.
        """
        payloads = [
            image.to_inference_format(numpy_preferred=False) for image in images
        ]
        self._model_manager.add_model(model_id=model_id, api_key=self._roboflow_api_key)
        results: List[Dict[str, str]] = []
        for payload in payloads:
            request_fields: Dict[str, Any] = {
                "api_key": self._roboflow_api_key,
                "model_id": model_id,
                "image": payload,
                "source": "workflow-execution",
                "prompt": combined_prompt,
                "enable_thinking": enable_thinking,
            }
            if max_new_tokens is not None:
                request_fields["max_new_tokens"] = max_new_tokens
            prediction = self._model_manager.infer_from_request_sync(
                model_id=model_id,
                request=LMMInferenceRequest(**request_fields),
            )
            answer, reasoning = _coerce_native_response(prediction.response)
            results.append({"output": answer, "thinking": reasoning})
        return results

    def _run_native_remotely(
        self,
        images: Batch[WorkflowImageData],
        model_id: str,
        combined_prompt: str,
        enable_thinking: bool,
        max_new_tokens: Optional[int],
    ) -> List[Dict[str, str]]:
        """Run the native model over HTTP, one image at a time.

        The client targets `HOSTED_CORE_MODEL_URL` (and selects API v0) when
        `WORKFLOWS_REMOTE_API_TARGET` is `"hosted"`, otherwise
        `LOCAL_INFERENCE_API_URL`. `max_new_tokens` is only sent when it is
        not None. The `response` entry of each result is used when present;
        otherwise the whole result is passed to `_coerce_native_response`.

        Returns:
            One `{"output": ..., "thinking": ...}` dict per image.
        """
        targets_hosted = WORKFLOWS_REMOTE_API_TARGET == "hosted"
        endpoint = HOSTED_CORE_MODEL_URL if targets_hosted else LOCAL_INFERENCE_API_URL
        client = InferenceHTTPClient(api_url=endpoint, api_key=self._roboflow_api_key)
        if targets_hosted:
            client.select_api_v0()

        results: List[Dict[str, str]] = []
        for image in images:
            call_args: Dict[str, Any] = {
                "inference_input": image.base64_image,
                "model_id": model_id,
                "prompt": combined_prompt,
                "model_id_in_path": True,
                "enable_thinking": enable_thinking,
            }
            if max_new_tokens is not None:
                call_args["max_new_tokens"] = max_new_tokens
            reply = client.infer_lmm(**call_args)
            answer, reasoning = _coerce_native_response(reply.get("response", reply))
            results.append({"output": answer, "thinking": reasoning})
        return results

Functions:¶

`core/workflows/core_steps/models/foundation/segment_anything2_video`¶

inference.core.workflows.core_steps.models.foundation.segment_anything2_video.v1 ¶

SAM2 Video Tracker workflow block.

Wraps inference_models's SAM2Video (the HuggingFace streaming tracker) so it can be driven from a workflow powered by InferencePipeline. The pipeline delivers one frame at a time with WorkflowImageData.video_metadata; this block keeps one state_dict per video_identifier and either re-prompts or propagates existing tracks on each frame.

Prompt modes (see prompt_mode):

- `first_frame`: consume `boxes` once per session, then track silently.
- `every_n_frames`: re-seed every `prompt_interval` frames (counted from the last prompt).
- `every_frame`: re-seed on every frame — effectively turns the block into a per-frame detector→mask adapter with SAM-stable tracker ids.

boxes is ignored on frames where we only propagate.

Classes¶

SegmentAnything2VideoBlockV1 ¶

Bases: WorkflowBlock

Stateful SAM2 streaming video tracking block.

Source code in inference/core/workflows/core_steps/models/foundation/segment_anything2_video/v1.py

class SegmentAnything2VideoBlockV1(WorkflowBlock):
    """Stateful SAM2 streaming video tracking block."""

    _REMOTE_EXECUTION_NOT_SUPPORTED_MESSAGE = (
        "SAM2 Video Tracker only supports LOCAL workflow step "
        "execution.  Remote execution would ship each frame to a "
        "separate process and break the per-video SAM2 session "
        "that holds the temporal memory.  Set "
        "WORKFLOWS_STEP_EXECUTION_MODE=local (or run on a "
        "dedicated deployment) to use this block."
    )

    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        """Store the injected dependencies and start with no model and no sessions.

        Args:
            model_manager: Kept on the instance as `_model_manager`.
            api_key: Kept as `_api_key`; passed to the model loader in `_get_model`.
            step_execution_mode: Kept as `_step_execution_mode`; checked in `run`.
        """
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode
        self._model = None  # lazily loaded
        # Id of the model currently held in `_model` (None until first load).
        self._current_model_id: Optional[str] = None
        # Per-video bookkeeping, keyed by video identifier.
        self._sessions: Dict[str, VideoSessionBookkeeping] = {}

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        """Return the names of the constructor arguments of this block."""
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block (`BlockManifest`)."""
        return BlockManifest

    def _get_model(self, model_id: str):
        """Return the model for `model_id`, loading it on first use or on change.

        When a model is already loaded for the same `model_id` it is returned
        as is. Otherwise the model is loaded via `AutoModel.from_pretrained`
        and every tracked session is dropped, because sessions created with a
        previous model are no longer valid.
        """
        already_loaded = self._model is not None and self._current_model_id == model_id
        if already_loaded:
            return self._model

        from inference_models import AutoModel

        provider_headers = get_extra_weights_provider_headers()
        self._model = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self._api_key,
            weights_provider_extra_headers=provider_headers,
        )
        self._current_model_id = model_id
        # Switching model invalidates every session we held.
        self._sessions.clear()
        return self._model

    def run(
        self,
        images: Batch[WorkflowImageData],
        boxes: Optional[Batch[sv.Detections]],
        model_id: str,
        prompt_mode: PromptMode,
        prompt_interval: int,
        threshold: float,
    ) -> BlockResult:
        """Prompt or propagate the SAM2 tracker for each incoming frame.

        For every image the per-video session (keyed by the frame's
        `video_identifier`) is fetched or created, and
        `decide_prompt_vs_track` decides whether the session is reset and
        whether the frame is (re-)prompted with its boxes. Frames that are
        not prompted are tracked from the stored state, or produce empty
        masks when the session holds no state yet.

        Returns:
            One `{"predictions": ...}` dict per input image.

        Raises:
            NotImplementedError: when the step execution mode is not LOCAL.
        """
        if self._step_execution_mode is not StepExecutionMode.LOCAL:
            raise NotImplementedError(self._REMOTE_EXECUTION_NOT_SUPPORTED_MESSAGE)
        tracker = self._get_model(model_id=model_id)
        per_frame_boxes = [None] * len(images) if boxes is None else boxes

        tracked_frames: List[sv.Detections] = []
        for frame, frame_boxes in zip(images, per_frame_boxes):
            video_metadata = frame.video_metadata
            video_id = video_metadata.video_identifier
            frame_number = video_metadata.frame_number or 0

            session = self._sessions.setdefault(video_id, VideoSessionBookkeeping())
            should_reset, should_prompt = decide_prompt_vs_track(
                session=session,
                frame_number=frame_number,
                prompt_mode=prompt_mode,
                prompt_interval=prompt_interval,
                has_prompts=frame_boxes is not None and len(frame_boxes) > 0,
            )
            if should_reset:
                session.state_dict = None
                session.obj_id_metadata = {}
                session.frames_since_prompt = 0

            frame_pixels = frame.numpy_image

            if should_prompt:
                prompt_boxes, prompt_box_metas = extract_box_prompts(frame_boxes)
                masks, obj_ids, updated_state = tracker.prompt(
                    image=frame_pixels,
                    bboxes=prompt_boxes,
                    state_dict=session.state_dict,
                    # clear old points is implied by our own reset gating
                    clear_old_prompts=True,
                    frame_idx=frame_number,
                )
                session.obj_id_metadata = build_obj_id_metadata_from_boxes(
                    obj_ids=obj_ids, box_metas=prompt_box_metas
                )
                session.state_dict = updated_state
                session.frames_since_prompt = 0
            elif session.state_dict is not None:
                masks, obj_ids, updated_state = tracker.track(
                    image=frame_pixels, state_dict=session.state_dict
                )
                session.state_dict = updated_state
                session.frames_since_prompt += 1
            else:
                # Nothing to prompt with and nothing tracked yet: emit an
                # empty result shaped after the frame's height and width.
                import numpy as np

                height, width = frame_pixels.shape[0], frame_pixels.shape[1]
                masks = np.zeros((0, height, width), dtype=bool)
                obj_ids = np.zeros((0,), dtype=np.int64)

            session.last_frame_number = frame_number

            tracked_frames.append(
                masks_to_sv_detections(
                    masks=masks,
                    obj_ids=obj_ids,
                    image=frame,
                    obj_id_metadata=session.obj_id_metadata,
                    threshold=threshold,
                )
            )

        tracked_frames = attach_prediction_type_info_to_sv_detections_batch(
            predictions=tracked_frames,
            prediction_type="instance-segmentation",
        )
        tracked_frames = attach_parents_coordinates_to_batch_of_sv_detections(
            images=images,
            predictions=tracked_frames,
        )
        return [{"predictions": detections} for detections in tracked_frames]

Functions:¶

`core/workflows/core_steps/models/foundation/segment_anything3_3d`¶

inference.core.workflows.core_steps.models.foundation.segment_anything3_3d.v1 ¶

Classes¶

Functions:¶

extract_masks_from_input ¶

extract_masks_from_input(mask_input)

Extract binary masks from sv.Detections, pass through other formats.

Source code in inference/core/workflows/core_steps/models/foundation/segment_anything3_3d/v1.py

def extract_masks_from_input(mask_input: Any) -> Any:
    """Return the masks held by an `sv.Detections`, or the input unchanged.

    Args:
        mask_input: An `sv.Detections` instance, or any other value.

    Returns:
        A list of the detection masks when `mask_input` is an
        `sv.Detections`; otherwise `mask_input` itself.

    Raises:
        ValueError: if the `sv.Detections` is empty, or its `mask` attribute
            is None or empty.
    """
    if not isinstance(mask_input, sv.Detections):
        return mask_input
    if len(mask_input) == 0:
        raise ValueError("sv.Detections contains no detections.")
    detection_masks = mask_input.mask
    if detection_masks is None or len(detection_masks) == 0:
        raise ValueError("sv.Detections has no mask data.")
    return list(detection_masks)

`core/workflows/core_steps/models/foundation/segment_anything3_interactive`¶

inference.core.workflows.core_steps.models.foundation.segment_anything3_interactive.v1 ¶

Classes¶

BlockManifest ¶

Source code in inference/core/workflows/core_steps/models/foundation/segment_anything3_interactive/v1.py

class BlockManifest(WorkflowBlockManifest):
    model_config = ConfigDict(
        json_schema_extra={
            "name": "SAM 3 Interactive",
            "version": "v1",
            "short_description": SHORT_DESCRIPTION,
            "long_description": LONG_DESCRIPTION,
            "license": "Apache-2.0",
            "block_type": "model",
            "search_keywords": [
                "Sam",
                "SAM3",
                "segment anything",
                "segment anything 3",
                "point prompt",
                "interactive segmentation",
                "PVS",
            ],
            "ui_manifest": {
                "section": "model",
                "icon": "fa-solid fa-eye",
                "blockPriority": 9.47,
                "needsGPU": True,
                "inference": True,
            },
        },
        protected_namespaces=(),
    )

    type: Literal["roboflow_core/sam3_interactive@v1"]
    images: Selector(kind=[IMAGE_KIND]) = ImageInputField
    points: Optional[Union[List[Any], Selector(kind=[LABELED_POINTS_KIND])]] = Field(
        default=None,
        title="Point Prompts",
        description="Labeled points defining a single object to segment. "
        "Each point is {'x': ..., 'y': ..., 'positive': ...} in absolute pixel coordinates - "
        "positive points mark the object, negative points mark regions to exclude. "
        "Plain (x, y) or (x, y, positive) sequences are also accepted.",
        examples=[
            [{"x": 320, "y": 240, "positive": True}],
            "$inputs.points",
        ],
        json_schema_extra={"always_visible": True},
    )
    boxes: Optional[
        Selector(
            kind=[
                OBJECT_DETECTION_PREDICTION_KIND,
                INSTANCE_SEGMENTATION_PREDICTION_KIND,
                KEYPOINT_DETECTION_PREDICTION_KIND,
            ]
        )
    ] = Field(  # type: ignore
        default=None,
        description="Bounding boxes (from another model) to use as prompts - "
        "the model segments the object inside each box",
        examples=["$steps.object_detection_model.predictions"],
        json_schema_extra={"always_visible": True},
    )
    threshold: Union[Selector(kind=[FLOAT_KIND]), float] = Field(
        default=0.0,
        description="Minimum confidence threshold for predicted masks",
        examples=[0.3],
    )
    multimask_output: Union[Optional[bool], Selector(kind=[BOOLEAN_KIND])] = Field(
        default=True,
        description="Flag to determine whether to use SAM3 internal multimask or single mask mode. "
        "For ambiguous prompts (like a single point) setting to True is recommended.",
        examples=[True, "$inputs.multimask_output"],
    )

    @model_validator(mode="after")
    def _validate_points(self) -> "BlockManifest":
        """Run literal point prompts through `_as_sam2_points` during validation.

        Only list values are checked; any other value (including None) is
        left alone. The conversion result is discarded — the call is
        presumably made so that malformed points raise here; verify against
        `_as_sam2_points`.
        """
        literal_points = self.points
        if isinstance(literal_points, list):
            _as_sam2_points(literal_points)
        return self

    @classmethod
    def get_parameters_accepting_batches(cls) -> List[str]:
        """Return the names of parameters that accept batches: `images` and `boxes`."""
        return ["images", "boxes"]

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Declare the single `predictions` output of instance-segmentation kind."""
        return [
            OutputDefinition(
                name="predictions",
                kind=[INSTANCE_SEGMENTATION_PREDICTION_KIND],
            ),
        ]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the Execution Engine version specifier declared by this manifest."""
        return ">=1.3.0,<2.0.0"

    @classmethod
    def get_restrictions(cls) -> List[RuntimeRestriction]:
        """List the hard runtime restrictions that apply to this block.

        A GPU restriction for local execution on self-hosted CPU runtimes is
        always reported. When `CORE_MODEL_SAM3_ENABLED` is falsy, a second
        restriction for remote execution on hosted serverless is added.
        """
        gpu_required = RuntimeRestriction(
            severity=Severity.HARD,
            note="Requires a GPU; run_locally() loads a model that needs CUDA.",
            applies_to_runtimes=[Runtime.SELF_HOSTED_CPU],
            applies_to_step_execution_modes=[StepExecutionMode.LOCAL],
        )
        if CORE_MODEL_SAM3_ENABLED:
            return [gpu_required]

        endpoint_disabled = RuntimeRestriction(
            severity=Severity.HARD,
            note=(
                "CORE_MODEL_SAM3_ENABLED=False on Roboflow Hosted "
                "Serverless: the SAM3 endpoint is not registered, so "
                "run_remotely() returns 404."
            ),
            applies_to_runtimes=[Runtime.HOSTED_SERVERLESS],
            applies_to_step_execution_modes=[StepExecutionMode.REMOTE],
        )
        return [gpu_required, endpoint_disabled]

    @classmethod
    def get_supported_model_variants(cls) -> Optional[List[str]]:
        """Return the model_id variants that can satisfy this block.

        The list holds a single entry, `SAM3_INTERACTIVE_MODEL_ID`.
        """
        return [SAM3_INTERACTIVE_MODEL_ID]

Methods:¶

get_supported_model_variants `classmethod` ¶

get_supported_model_variants()

Return list of model_id variants that can satisfy this block.

Source code in inference/core/workflows/core_steps/models/foundation/segment_anything3_interactive/v1.py

@classmethod
def get_supported_model_variants(cls) -> Optional[List[str]]:
    """List the model_id variants that are able to satisfy this block."""
    supported_variants = [SAM3_INTERACTIVE_MODEL_ID]
    return supported_variants

`core/workflows/core_steps/models/foundation/segment_anything3_video`¶

inference.core.workflows.core_steps.models.foundation.segment_anything3_video.v1 ¶

SAM3 Video Tracker workflow block.

Wraps inference_models's SAM3Video — the HuggingFace streaming port of SAM3's open-vocabulary concept tracker — so it can be driven from a workflow powered by InferencePipeline. The pipeline delivers one frame at a time with WorkflowImageData.video_metadata; this block keeps one state_dict per video_identifier.

Unlike the SAM2 video block (which needs an upstream detector to seed box prompts, and a prompt_mode policy to decide when to re-seed), SAM3 concept prompts are registered on the session once and the model runs fused detect-and-track on every frame: objects entering the scene that match a concept are picked up automatically with fresh tracker ids. There is therefore no re-prompt scheduling — the session is only re-seeded when the stream restarts or class_names changes.

Classes¶

SegmentAnything3VideoBlockV1 ¶

Bases: WorkflowBlock

Stateful SAM3 streaming concept tracking block.

Source code in inference/core/workflows/core_steps/models/foundation/segment_anything3_video/v1.py

class SegmentAnything3VideoBlockV1(WorkflowBlock):
    """Stateful SAM3 streaming concept tracking block.

    The block keeps a single lazily loaded model plus one bookkeeping record
    per ``video_identifier``. Only LOCAL step execution is supported; ``run``
    raises ``NotImplementedError`` for any other execution mode.
    """

    # Message raised by run() when the step execution mode is not LOCAL.
    _REMOTE_EXECUTION_NOT_SUPPORTED_MESSAGE = (
        "SAM3 Video Tracker only supports LOCAL workflow step "
        "execution.  Remote execution would ship each frame to a "
        "separate process and break the per-video SAM3 session "
        "that holds the temporal memory.  Set "
        "WORKFLOWS_STEP_EXECUTION_MODE=local (or run on a "
        "dedicated deployment) to use this block."
    )

    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        """Store the injected collaborators and start with no model and no sessions."""
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode
        self._model = None  # lazily loaded
        # Id of the model currently held in self._model (None until first load).
        self._current_model_id: Optional[str] = None
        # Per-video bookkeeping, keyed by video_identifier.
        self._sessions: Dict[str, _ConceptSessionBookkeeping] = {}

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        """Return the names of the constructor arguments of this block."""
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class associated with this block."""
        return BlockManifest

    def _get_model(self, model_id: str):
        """Return the model for ``model_id``, loading it if needed.

        A load happens when no model is cached yet or when ``model_id``
        differs from the id of the cached model. Every load also clears all
        stored sessions.
        """
        if self._model is None or self._current_model_id != model_id:
            # Imported at call time, so the dependency is only needed once a
            # model is actually requested.
            from inference_models import AutoModel

            extra_weights_provider_headers = get_extra_weights_provider_headers()
            self._model = AutoModel.from_pretrained(
                model_id_or_path=model_id,
                api_key=self._api_key,
                weights_provider_extra_headers=extra_weights_provider_headers,
            )
            self._current_model_id = model_id
            # Switching model invalidates every session we held.
            self._sessions.clear()
        return self._model

    def run(
        self,
        images: Batch[WorkflowImageData],
        class_names: Union[List[str], str],
        model_id: str,
        threshold: float,
    ) -> BlockResult:
        """Process a batch of video frames and return one prediction set per frame.

        Args:
            images: Frames to process; each frame's ``video_metadata`` supplies
                the ``video_identifier`` and ``frame_number`` used for
                session bookkeeping.
            class_names: Concept prompts; normalised with
                ``normalise_class_names`` before use.
            model_id: Identifier of the model to load / reuse.
            threshold: Forwarded to ``concept_frame_to_sv_detections``.

        Returns:
            A list with one ``{"predictions": ...}`` dict per input image.

        Raises:
            NotImplementedError: If the step execution mode is not LOCAL.
        """
        if self._step_execution_mode is not StepExecutionMode.LOCAL:
            raise NotImplementedError(self._REMOTE_EXECUTION_NOT_SUPPORTED_MESSAGE)
        model = self._get_model(model_id=model_id)
        class_list = normalise_class_names(class_names)
        # Hashable snapshot of the prompts, compared against the value stored
        # on each session to detect prompt changes.
        prompt_signature = tuple(class_list)

        batch_detections: List[sv.Detections] = []
        for single_image in images:
            metadata = single_image.video_metadata
            video_id = metadata.video_identifier
            # A falsy frame number (e.g. None) is treated as frame 0.
            frame_number = metadata.frame_number or 0

            # Create the bookkeeping record the first time a video id is seen.
            session = self._sessions.setdefault(video_id, _ConceptSessionBookkeeping())
            # The stream is considered restarted when the frame number goes
            # backwards relative to the last processed frame.
            # NOTE(review): the `>= 0` guard presumably skips sessions that
            # have not processed a frame yet (negative initial value) -
            # confirm against _ConceptSessionBookkeeping.
            stream_restarted = (
                session.last_frame_number >= 0
                and frame_number < session.last_frame_number
            )
            if stream_restarted or session.prompt_signature != prompt_signature:
                # Dropping the state forces the prompt branch below.
                session.state_dict = None

            frame_np = single_image.numpy_image

            if not class_list:
                # No prompts: the model is not called and detections are built
                # from the empty placeholder arrays.
                detections = concept_frame_to_sv_detections(
                    masks=_EMPTY_MASKS,
                    object_ids=_EMPTY_IDS,
                    scores=_EMPTY_SCORES,
                    boxes=_EMPTY_BOXES,
                    prompt_to_object_ids={},
                    class_names=class_list,
                    image=single_image,
                    threshold=threshold,
                )
            else:
                if session.state_dict is None:
                    # No usable state: (re-)prompt the model with the class
                    # names and remember which prompts the session was seeded with.
                    result = model.prompt(
                        image=frame_np,
                        text=class_list,
                        clear_old_prompts=True,
                    )
                    session.prompt_signature = prompt_signature
                else:
                    # Existing state: continue tracking from it.
                    result = model.track(image=frame_np, state_dict=session.state_dict)
                # Keep the returned state for the next frame of this video.
                session.state_dict = result.state_dict
                detections = concept_frame_to_sv_detections(
                    masks=result.masks,
                    object_ids=result.object_ids,
                    scores=result.scores,
                    boxes=result.boxes,
                    prompt_to_object_ids=result.prompt_to_object_ids,
                    class_names=class_list,
                    image=single_image,
                    threshold=threshold,
                )

            # Recorded on every frame so the next call can detect a restart.
            session.last_frame_number = frame_number
            batch_detections.append(detections)

        batch_detections = attach_prediction_type_info_to_sv_detections_batch(
            predictions=batch_detections,
            prediction_type="instance-segmentation",
        )
        batch_detections = attach_parents_coordinates_to_batch_of_sv_detections(
            images=images,
            predictions=batch_detections,
        )
        return [{"predictions": pred} for pred in batch_detections]

Functions:¶

`core/workflows/core_steps/models/foundation/stability_ai/inpainting`¶

inference.core.workflows.core_steps.models.foundation.stability_ai.inpainting.v1 ¶

Credits to https://github.com/Fafruch for the original idea.

Classes¶

`core/workflows/core_steps/models/foundation/stability_ai/outpainting`¶

inference.core.workflows.core_steps.models.foundation.stability_ai.outpainting.v1 ¶

Credits to https://github.com/Fafruch for the original idea.

Classes¶

`core/workflows/core_steps/sinks/email_notification`¶

inference.core.workflows.core_steps.sinks.email_notification.v2 ¶

Classes¶

Functions:¶

apply_operations_to_message_parameters ¶

apply_operations_to_message_parameters(
    message_parameters, message_parameters_operations
)

Apply per-parameter operation chains to message parameter values.

For each parameter in message_parameters, if operations are defined in message_parameters_operations for that parameter name, the operations are applied in order (e.g. ToString, StringToUpperCase, LookupTable). Parameters with no operations are returned unchanged.

Supports all value types, including WorkflowImageData: image operations such as ExtractImageProperty, ConvertImageToBase64, and ConvertImageToJPEG can be used to transform image parameters before they are serialized or interpolated into the message.

Returns:

Type	Description
`Dict[str, Any]`	A dict with the same keys as message_parameters and values that are either the original value (no operations) or the result of the operations chain.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def apply_operations_to_message_parameters(
    message_parameters: Dict[str, Any],
    message_parameters_operations: Dict[str, List[AllOperationsType]],
) -> Dict[str, Any]:
    """
    Run each parameter's configured operations chain over its value.

    A parameter that has operations registered under its name in
    message_parameters_operations is passed through that chain, applied in
    order (e.g. ToString, StringToUpperCase, LookupTable). A parameter with
    no operations (missing entry or empty list) keeps its value untouched.

    Any value type is accepted, WorkflowImageData included: image operations
    such as ExtractImageProperty, ConvertImageToBase64 and ConvertImageToJPEG
    can transform image parameters before they are serialized or interpolated
    into the message.

    Returns:
        A new dict with the same keys as message_parameters; each value is
        either the original value or the output of its operations chain.
    """
    transformed: Dict[str, Any] = {}
    for name, raw_value in message_parameters.items():
        chain_definition = message_parameters_operations.get(name)
        if chain_definition:
            run_chain = build_operations_chain(operations=chain_definition)
            transformed[name] = run_chain(raw_value, global_parameters={})
        else:
            # Nothing configured for this parameter: pass it through as-is.
            transformed[name] = raw_value
    return transformed

format_email_message ¶

format_email_message(
    message,
    message_parameters,
    message_parameters_operations,
)

Format email message by replacing parameter placeholders with actual values.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def format_email_message(
    message: str,
    message_parameters: Dict[str, Any],
    message_parameters_operations: Dict[str, List[AllOperationsType]],
) -> str:
    """Format email message by replacing parameter placeholders with actual values.

    Only parameters that are both referenced by a placeholder in ``message``
    and present in ``message_parameters`` are processed. Operation chains are
    therefore not executed for parameters that never appear in the message,
    which matches how the HTML formatter in this module selects parameters.

    Args:
        message: Template text containing placeholders matched by
            ``PARAMETER_REGEX``.
        message_parameters: Values available for interpolation, by name.
        message_parameters_operations: Optional operation chains, by
            parameter name, applied to a value before interpolation.

    Returns:
        The message with every resolvable placeholder replaced by ``str()``
        of the (possibly transformed) parameter value. Placeholders whose
        parameter is not in ``message_parameters`` are left untouched.
    """
    matching_parameters = PARAMETER_REGEX.findall(message)
    parameters_to_get_values = {
        parameter_name
        for _, parameter_name in matching_parameters
        if parameter_name in message_parameters
    }
    if not parameters_to_get_values:
        # Nothing to interpolate, so no operation chain needs to run.
        return message

    # Restrict the work to referenced parameters: chains may be expensive
    # (e.g. image conversions) or may fail on values the message never uses.
    parameters_values = apply_operations_to_message_parameters(
        message_parameters={
            parameter_name: message_parameters[parameter_name]
            for parameter_name in parameters_to_get_values
        },
        message_parameters_operations=message_parameters_operations,
    )

    # Group placeholders per parameter, keeping first-appearance order so the
    # replacement sequence is the same as before.
    parameter_to_placeholders = defaultdict(list)
    for placeholder, parameter_name in matching_parameters:
        if parameter_name not in parameters_to_get_values:
            continue
        parameter_to_placeholders[parameter_name].append(placeholder)
    for parameter_name, placeholders in parameter_to_placeholders.items():
        for placeholder in placeholders:
            message = message.replace(
                placeholder, str(parameters_values[parameter_name])
            )
    return message

format_email_message_html_with_images ¶

format_email_message_html_with_images(
    message,
    message_parameters,
    message_parameters_operations,
)

Format email message as HTML with inline images.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def format_email_message_html_with_images(
    message: str,
    message_parameters: Dict[str, Any],
    message_parameters_operations: Dict[str, List[AllOperationsType]],
) -> Tuple[str, Dict[str, bytes]]:
    """Render the message as HTML, embedding image parameters inline.

    Image values (WorkflowImageData) become ``<img>`` tags that point at a
    ``cid:`` reference; every other value is converted to text and
    HTML-escaped. Newlines in the final message are turned into ``<br>``.

    Returns:
        A tuple ``(html_message, image_attachments)`` where
        ``image_attachments`` maps each content id to JPEG bytes.
    """
    import html

    found = PARAMETER_REGEX.findall(message)
    known_names = {name for _, name in found if name in message_parameters}

    rendered = {}
    inline_images = {}
    for name in known_names:
        value = message_parameters[name]

        # Run the configured operations chain, if there is one.
        chain_definition = message_parameters_operations.get(name)
        if chain_definition:
            run_chain = build_operations_chain(operations=chain_definition)
            value = run_chain(value, global_parameters={})

        if not isinstance(value, WorkflowImageData):
            rendered[name] = html.escape(str(value))
            continue

        # Image value: encode as JPEG and reference it through a content id.
        encoded_image = encode_image_to_jpeg_bytes(value.numpy_image)
        content_id = f"image_{name}"
        inline_images[content_id] = encoded_image
        rendered[name] = (
            f'<img src="cid:{content_id}" alt="{name}" style="max-width: 600px; height: auto;">'
        )

    # Collect the placeholders to substitute, grouped per parameter.
    placeholders_by_name = defaultdict(list)
    for placeholder, name in found:
        if name in known_names:
            placeholders_by_name[name].append(placeholder)

    html_message = message
    for name, placeholders in placeholders_by_name.items():
        for placeholder in placeholders:
            html_message = html_message.replace(placeholder, str(rendered[name]))

    # HTML ignores raw newlines, so make line breaks explicit.
    return html_message.replace("\n", "<br>\n"), inline_images

process_attachments ¶

process_attachments(attachments)

Process attachments dict to convert WorkflowImageData to JPEG bytes. Returns a dict with filename -> bytes mapping.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def process_attachments(attachments: Dict[str, Any]) -> Dict[str, bytes]:
    """
    Turn every attachment value into raw bytes, keyed by the same filename.

    WorkflowImageData is encoded as JPEG, bytes are kept untouched, strings
    are UTF-8 encoded, and anything else is stringified and UTF-8 encoded.
    """

    def _as_bytes(value: Any) -> bytes:
        if isinstance(value, WorkflowImageData):
            # Image payload: ship it as JPEG.
            return encode_image_to_jpeg_bytes(value.numpy_image)
        if isinstance(value, bytes):
            return value
        if isinstance(value, str):
            # Textual payload (e.g., CSV content).
            return value.encode("utf-8")
        # Last resort: text representation of the object.
        return str(value).encode("utf-8")

    return {filename: _as_bytes(value) for filename, value in attachments.items()}

send_email_using_smtp_server_v2 ¶

send_email_using_smtp_server_v2(
    sender_email,
    receiver_email,
    cc_receiver_email,
    bcc_receiver_email,
    subject,
    message,
    attachments,
    smtp_server,
    smtp_port,
    sender_email_password,
    inline_images,
    is_html,
)

V2-specific SMTP email sender with inline image support. This function is used only by v2 block and does not modify v1 behavior.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def send_email_using_smtp_server_v2(
    sender_email: str,
    receiver_email: List[str],
    cc_receiver_email: Optional[List[str]],
    bcc_receiver_email: Optional[List[str]],
    subject: str,
    message: str,
    attachments: Dict[str, bytes],
    smtp_server: str,
    smtp_port: int,
    sender_email_password: str,
    inline_images: Dict[str, bytes],
    is_html: bool,
) -> Tuple[bool, str]:
    """
    SMTP sender used by the v2 block, with support for inline images.

    Delegates the delivery to ``_send_email_using_smtp_server_v2`` and turns
    any exception into an error result instead of propagating it.

    Returns:
        ``(False, <success message>)`` when the delivery call returns, or
        ``(True, <failure message>)`` when it raises.
    """
    delivery_arguments = {
        "sender_email": sender_email,
        "receiver_email": receiver_email,
        "cc_receiver_email": cc_receiver_email,
        "bcc_receiver_email": bcc_receiver_email,
        "subject": subject,
        "message": message,
        "attachments": attachments,
        "smtp_server": smtp_server,
        "smtp_port": smtp_port,
        "sender_email_password": sender_email_password,
        "inline_images": inline_images,
        "is_html": is_html,
    }
    try:
        _send_email_using_smtp_server_v2(**delivery_arguments)
    except Exception as error:
        logging.warning(
            f"Could not send e-mail using custom SMTP server. Error: {str(error)}"
        )
        return True, f"Failed to send e-mail. Internal error details: {error}"
    else:
        return False, "Notification sent successfully"

send_email_via_roboflow_proxy ¶

send_email_via_roboflow_proxy(
    roboflow_api_key,
    receiver_email,
    cc_receiver_email,
    bcc_receiver_email,
    subject,
    message,
    message_parameters,
    message_parameters_operations,
    attachments,
)

Send email through Roboflow's proxy service.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def send_email_via_roboflow_proxy(
    roboflow_api_key: str,
    receiver_email: List[str],
    cc_receiver_email: Optional[List[str]],
    bcc_receiver_email: Optional[List[str]],
    subject: str,
    message: str,
    message_parameters: Dict[str, Any],
    message_parameters_operations: Dict[str, List[AllOperationsType]],
    attachments: Dict[str, Any],
) -> Tuple[bool, str]:
    """Send email through Roboflow's proxy service.

    The message template and the (operation-transformed, image-serialized)
    parameters are posted to the ``apiproxy/email`` endpoint. Attachments are
    converted to bytes and base64-encoded for JSON transmission; image
    attachments are JPEG-encoded and get a ``.jpg`` suffix when the filename
    has no JPEG extension.

    Returns:
        ``(False, <success message>)`` on success, otherwise
        ``(True, <human readable failure reason>)``. Exceptions are not
        propagated to the caller.
    """
    import base64

    from inference.core.exceptions import (
        RoboflowAPIForbiddenError,
        RoboflowAPIUnsuccessfulRequestError,
    )

    # Custom error handler that preserves the API's error message
    def handle_email_proxy_error(status_code: int, http_error: Exception) -> None:
        """Extract and preserve the actual error message from the API response."""
        try:
            response = http_error.response
            error_data = response.json()
            # API returns 'details' field with the actual message, 'error' is generic
            # Prioritize 'details' over 'error' for more specific messages
            api_error_message = (
                error_data.get("details") or error_data.get("error") or str(http_error)
            )
        except Exception:
            api_error_message = str(http_error)

        # 403 is the only status mapped to a dedicated exception type; every
        # other handled status is reported as an unsuccessful request.
        if status_code == 403:
            raise RoboflowAPIForbiddenError(api_error_message) from http_error
        raise RoboflowAPIUnsuccessfulRequestError(api_error_message) from http_error

    # Map status codes to our custom handler
    custom_error_handlers = {
        403: lambda e: handle_email_proxy_error(403, e),
        413: lambda e: handle_email_proxy_error(413, e),
        429: lambda e: handle_email_proxy_error(429, e),
    }

    try:
        message_parameters_after_operations = apply_operations_to_message_parameters(
            message_parameters=message_parameters,
            message_parameters_operations=message_parameters_operations,
        )
        # Serialize any WorkflowImageData objects to base64 strings for JSON transmission
        serialized_parameters = serialize_image_data_parameters(
            message_parameters_after_operations
        )

        payload = {
            "receiver_email": receiver_email,
            "subject": subject,
            "message": message,
            "message_parameters": serialized_parameters,
        }

        if cc_receiver_email:
            payload["cc_receiver_email"] = cc_receiver_email
        if bcc_receiver_email:
            payload["bcc_receiver_email"] = bcc_receiver_email
        if attachments:
            # Process attachments: convert each value to bytes, then base64 encode
            processed_attachments = {}
            for filename, value in attachments.items():
                if isinstance(value, WorkflowImageData):
                    # Convert image to JPEG bytes
                    raw_bytes = encode_image_to_jpeg_bytes(value.numpy_image)
                    # Ensure filename has .jpg extension
                    if not filename.lower().endswith((".jpg", ".jpeg")):
                        filename = f"{filename}.jpg"
                elif isinstance(value, bytes):
                    # Already bytes
                    raw_bytes = value
                elif isinstance(value, str):
                    # String data (e.g., CSV content)
                    raw_bytes = value.encode("utf-8")
                else:
                    # Fallback: convert to string then bytes
                    raw_bytes = str(value).encode("utf-8")
                # Base64 encode for JSON transmission
                processed_attachments[filename] = base64.b64encode(raw_bytes).decode(
                    "utf-8"
                )
            payload["attachments"] = processed_attachments

        # The response body is not used; a failure surfaces as an exception.
        post_to_roboflow_api(
            endpoint="apiproxy/email",
            api_key=roboflow_api_key,
            payload=payload,
            http_errors_handlers=custom_error_handlers,
        )

        return False, "Notification sent successfully via Roboflow proxy"
    except RoboflowAPIForbiddenError as error:
        # Handle 403 errors (whitelist violations)
        error_message = str(error)
        logging.warning(
            f"Email rejected by proxy due to access restrictions: {error_message}"
        )

        # Check if it's a workspace member restriction
        # The API returns detailed error messages about non-workspace members
        if "non-workspace members" in error_message.lower():
            return True, (
                "To prevent spam, you can only send emails to members of your Roboflow Workspace via the Roboflow Managed API Key. "
                "Add this email to your Workspace or switch to sending via your own SMTP server."
            )
        return True, f"Failed to send email: access forbidden. {error_message}"
    except RoboflowAPIUnsuccessfulRequestError as error:
        # Handle rate limiting (429) and other API errors
        error_message = str(error)
        logging.warning(f"Email proxy API error: {error_message}")
        lowered_message = error_message.lower()

        # Check for payload too large (413); "too large" also covers the
        # longer "payload too large" wording.
        if "413" in error_message or "too large" in lowered_message:
            return True, (
                "Failed to send email: attachment size exceeds the 5MB limit. "
                "For image attachments, use the Image Preprocessing block to resize images before sending. "
                "For other attachments (like CSV files), reduce the file size or send smaller data."
            )
        # Check if it's a rate limit error
        if "rate limit" in lowered_message:
            return True, (
                "Failed to send email: rate limit exceeded. "
                "The workspace has exceeded its email sending limits. "
                "Please wait before sending more emails or contact support to increase your limits."
            )
        if "credits exceeded" in lowered_message:
            return True, (
                "Failed to send email: workspace credits exceeded. "
                "Please add more credits to your workspace to continue sending emails."
            )
        return True, f"Failed to send email via proxy. {error_message}"
    except Exception as error:
        logging.warning(
            f"Could not send e-mail via Roboflow proxy. Error: {str(error)}"
        )
        return True, f"Failed to send e-mail via proxy. Internal error details: {error}"

serialize_image_data ¶

serialize_image_data(value)

Serialize WorkflowImageData objects to base64 strings for JSON transmission. Returns the value unchanged if it's not a WorkflowImageData object.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def serialize_image_data(value: Any) -> Any:
    """
    Replace WorkflowImageData with a base64 string so the value can go into JSON.

    Dicts and lists are walked recursively. Any other value is handed back
    unchanged, as is an image that has neither a base64 representation nor a
    numpy array.
    """
    if isinstance(value, WorkflowImageData):
        # Prefer the base64 representation the image already carries.
        existing_base64 = value.base64_image
        if existing_base64:
            return existing_base64
        pixels = value.numpy_image
        if pixels is None:
            return value
        # No base64 available: JPEG-encode the numpy array instead.
        import base64

        import cv2

        _, jpeg_buffer = cv2.imencode(".jpg", pixels)
        return base64.b64encode(jpeg_buffer).decode("utf-8")
    if isinstance(value, dict):
        return {key: serialize_image_data(item) for key, item in value.items()}
    if isinstance(value, list):
        return [serialize_image_data(item) for item in value]
    return value

serialize_image_data_parameters ¶

serialize_image_data_parameters(message_parameters)

Convert any WorkflowImageData objects in message_parameters to base64 strings so they can be serialized to JSON for the API call.

Source code in inference/core/workflows/core_steps/sinks/email_notification/v2.py

def serialize_image_data_parameters(
    message_parameters: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Return a copy of message_parameters in which every value has been passed
    through serialize_image_data, so WorkflowImageData objects become base64
    strings and the result can be serialized to JSON for the API call.
    """
    serialized: Dict[str, Any] = {}
    for name, value in message_parameters.items():
        serialized[name] = serialize_image_data(value)
    return serialized

`core/workflows/core_steps/sinks/roboflow/asset_library_attributes`¶

inference.core.workflows.core_steps.sinks.roboflow.asset_library_attributes.v1 ¶

Classes¶

UpdateAssetLibraryAttributesOffloader ¶

Bases: Protocol

Callable contract for offloading the Roboflow image-attributes request.

Implementations decide how to actually deliver the updates — call the Roboflow API inline, enqueue to a background worker, write to a log, etc.

Source code in inference/core/workflows/core_steps/sinks/roboflow/asset_library_attributes/v1.py

class UpdateAssetLibraryAttributesOffloader(Protocol):
    """Structural type for callables that deliver Roboflow image-attribute updates.

    How the updates are delivered is up to the implementation: it may call the
    Roboflow API inline, enqueue the work for a background worker, write to a
    log, etc.
    """

    def __call__(
        self,
        workspace_id: str,
        updates: List[Dict[str, Any]],
        api_key: str,
    ) -> Dict[str, Any]:
        """Deliver ``updates`` for ``workspace_id`` using ``api_key``."""
        ...

`core/workflows/core_steps/sinks/roboflow/dataset_upload`¶

inference.core.workflows.core_steps.sinks.roboflow.dataset_upload.v1 ¶

WARNING!

This module contains the utility functions used by RoboflowDatasetUploadBlockV2.

We do not recommend making multiple blocks dependent on the same code, but the change between v1 and v2 was basically the default value of some parameter - hence we decided not to replicate the code.

If you need to modify this module, beware that you may introduce a change to RoboflowDatasetUploadBlockV2! If that happens, it is probably time to disentangle those blocks and copy the code.

Classes¶

`core/workflows/core_steps/sinks/roboflow/vision_events`¶

inference.core.workflows.core_steps.sinks.roboflow.vision_events.v1 ¶

Classes¶

Functions:¶

`core/workflows/core_steps/sinks/twilio/sms`¶

inference.core.workflows.core_steps.sinks.twilio.sms.v2 ¶

Classes¶

Functions:¶

format_message ¶

format_message(
    message,
    message_parameters,
    message_parameters_operations,
)

Format SMS/MMS message by replacing parameter placeholders with actual values.

Returns:

Type	Description
`Tuple[str, bool]`	Tuple of (formatted_message, needs_mms) where needs_mms is True if message exceeds SMS character limit and should be sent as MMS.

Source code in inference/core/workflows/core_steps/sinks/twilio/sms/v2.py

def format_message(
    message: str,
    message_parameters: Dict[str, Any],
    message_parameters_operations: Dict[str, List[AllOperationsType]],
) -> Tuple[str, bool]:
    """Fill the SMS/MMS template with parameter values and enforce length limits.

    Placeholders whose parameter is present in message_parameters are replaced
    by the text form of the (optionally operation-transformed) value.

    Returns:
        Tuple of (formatted_message, needs_mms). needs_mms is True when the
        interpolated message is longer than the SMS character limit. A message
        longer than the MMS limit is cut and suffixed with the truncation
        marker.
    """
    found = PARAMETER_REGEX.findall(message)
    known_names = {name for _, name in found if name in message_parameters}

    # Resolve the value of every referenced parameter.
    resolved = {}
    for name in known_names:
        value = message_parameters[name]
        chain_definition = message_parameters_operations.get(name)
        if chain_definition:
            run_chain = build_operations_chain(operations=chain_definition)
            value = run_chain(value, global_parameters={})
        resolved[name] = value

    # Group placeholders per parameter, then substitute them.
    placeholders_by_name = defaultdict(list)
    for placeholder, name in found:
        if name in known_names:
            placeholders_by_name[name].append(placeholder)
    for name, placeholders in placeholders_by_name.items():
        for placeholder in placeholders:
            message = message.replace(placeholder, str(resolved[name]))

    # Measured before truncation: anything above the SMS limit goes as MMS.
    needs_mms = len(message) > SMS_CHAR_LIMIT

    if len(message) > MMS_CHAR_LIMIT:
        # Leave room for one space plus the marker.
        kept_length = MMS_CHAR_LIMIT - 1 - len(TRUNCATION_MARKER)
        message = f"{message[:kept_length]} {TRUNCATION_MARKER}"

    return message, needs_mms

process_media_urls_for_twilio ¶

process_media_urls_for_twilio(media_url)

Process media URLs for Twilio MMS. Converts WorkflowImageData to temporary public URLs.

Source code in inference/core/workflows/core_steps/sinks/twilio/sms/v2.py

def process_media_urls_for_twilio(
    media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData],
) -> Optional[List[str]]:
    """
    Normalise the media input into a list of URLs for Twilio MMS.

    WorkflowImageData items are uploaded through
    ``_upload_image_to_ephemeral_host`` and replaced by the returned URL;
    failed uploads are logged and skipped. Returns None when no URL is left
    or the input is of an unsupported type.
    """
    if isinstance(media_url, WorkflowImageData):
        uploaded_url = _upload_image_to_ephemeral_host(media_url)
        if not uploaded_url:
            logging.warning("Failed to upload WorkflowImageData to temporary storage")
            return None
        return [uploaded_url]

    if isinstance(media_url, str):
        return [media_url]

    if not isinstance(media_url, list):
        return None

    urls = []
    for entry in media_url:
        if not isinstance(entry, WorkflowImageData):
            # Non-image entries are passed along untouched.
            urls.append(entry)
            continue
        uploaded_url = _upload_image_to_ephemeral_host(entry)
        if uploaded_url:
            urls.append(uploaded_url)
        else:
            logging.warning(
                "Failed to upload WorkflowImageData to temporary storage"
            )
    return urls or None

send_sms_using_twilio_client ¶

send_sms_using_twilio_client(
    client,
    message,
    sender_number,
    receiver_number,
    media_urls,
)

Send SMS/MMS using Twilio client directly.

Source code in inference/core/workflows/core_steps/sinks/twilio/sms/v2.py

def send_sms_using_twilio_client(
    client: Client,
    message: str,
    sender_number: str,
    receiver_number: str,
    media_urls: Optional[List[str]],
) -> Tuple[bool, str]:
    """Send an SMS/MMS directly through the given Twilio client.

    Returns:
        ``(error_status, message)`` - ``error_status`` is ``False`` when the
        request was accepted and ``True`` when any exception was raised
        while building or sending it.
    """
    try:
        message_params = dict(body=message, from_=sender_number, to=receiver_number)
        # Media is attached only when at least one URL is present.
        if media_urls:
            message_params["media_url"] = media_urls
        client.messages.create(**message_params)
    except Exception as error:
        logging.warning(f"Could not send Twilio SMS notification. Error: {str(error)}")
        failure_description = (
            f"Failed to send Twilio SMS notification. Internal error details: {error}"
        )
        return True, failure_description
    else:
        return False, "Notification sent successfully"

send_sms_via_roboflow_proxy ¶

send_sms_via_roboflow_proxy(
    roboflow_api_key,
    receiver_number,
    message,
    message_parameters,
    message_parameters_operations,
    media_url,
)

Send SMS/MMS through Roboflow's proxy service.

Source code in inference/core/workflows/core_steps/sinks/twilio/sms/v2.py

def send_sms_via_roboflow_proxy(
    roboflow_api_key: str,
    receiver_number: str,
    message: str,
    message_parameters: Dict[str, Any],
    message_parameters_operations: Dict[str, List[AllOperationsType]],
    media_url: Optional[Union[str, List[str], WorkflowImageData]],
) -> Tuple[bool, str]:
    """Send an SMS/MMS through Roboflow's proxy service.

    The message is formatted locally with ``format_message``, media is split
    into URLs and base64 payloads with ``serialize_media_for_api``, and the
    result is posted to the ``apiproxy/twilio`` endpoint.

    Returns:
        ``(error_status, message)`` - ``error_status`` is ``False`` on success
        and ``True`` on any failure, in which case ``message`` describes it.
    """

    def handle_sms_proxy_error(status_code: int, http_error: Exception) -> None:
        """Re-raise an HTTP error while keeping the message the API returned."""
        try:
            error_body = http_error.response.json()
            api_error_message = (
                error_body.get("details") or error_body.get("error") or str(http_error)
            )
        except Exception:
            # Response missing or not JSON - fall back to the raw error text.
            api_error_message = str(http_error)

        if status_code == 403:
            raise RoboflowAPIForbiddenError(api_error_message) from http_error
        # Every other status (including 429) maps to the same error type.
        raise RoboflowAPIUnsuccessfulRequestError(api_error_message) from http_error

    custom_error_handlers = {
        403: lambda e: handle_sms_proxy_error(403, e),
        429: lambda e: handle_sms_proxy_error(429, e),
    }

    try:
        # Placeholders are resolved client-side before the proxy sees the text.
        formatted_message, needs_mms = format_message(
            message=message,
            message_parameters=message_parameters,
            message_parameters_operations=message_parameters_operations,
        )
        payload = {
            "receiver_number": receiver_number,
            "message": formatted_message,
        }

        # URLs and base64-encoded images travel under separate payload keys.
        attachments = {}
        if media_url is not None:
            url_items, base64_items = serialize_media_for_api(media_url)
            if url_items:
                attachments["media_urls"] = url_items
            if base64_items:
                attachments["media_base64"] = base64_items
        payload.update(attachments)

        # Text over the SMS limit with no media: ask the server to force MMS.
        if needs_mms and not attachments:
            payload["force_mms"] = True

        post_to_roboflow_api(
            endpoint="apiproxy/twilio",
            api_key=roboflow_api_key,
            payload=payload,
            http_errors_handlers=custom_error_handlers,
        )
        return False, "Notification sent successfully via Roboflow proxy"
    except RoboflowAPIForbiddenError as error:
        error_message = str(error)
        logging.warning(
            f"SMS rejected by proxy due to access restrictions: {error_message}"
        )
        return True, f"Failed to send SMS: access forbidden. {error_message}"
    except RoboflowAPIUnsuccessfulRequestError as error:
        error_message = str(error)
        logging.warning(f"SMS proxy API error: {error_message}")

        lowered = error_message.lower()
        if "rate limit" in lowered:
            return True, (
                "Failed to send SMS: rate limit exceeded. "
                "The workspace has exceeded its SMS sending limits. "
                "Please wait before sending more messages."
            )
        if "credits exceeded" in lowered:
            return True, (
                "Failed to send SMS: workspace credits exceeded. "
                "Please add more credits to your workspace to continue sending messages."
            )
        return True, f"Failed to send SMS via proxy. {error_message}"
    except Exception as error:
        logging.warning(f"Could not send SMS via Roboflow proxy. Error: {str(error)}")
        return True, f"Failed to send SMS via proxy. Internal error details: {error}"

serialize_media_for_api ¶

serialize_media_for_api(media_url)

Serialize media for API transmission. Separates URL-based media from base64 image data.

Returns:

Type	Description
`Tuple[Optional[List[str]], Optional[List[Dict[str, str]]]]`	Tuple of (media_urls, media_base64) where media_urls is a list of string URLs and media_base64 is a list of {"base64": str, "mimeType": str} objects

Source code in inference/core/workflows/core_steps/sinks/twilio/sms/v2.py

def serialize_media_for_api(
    media_url: Union[str, List[str], WorkflowImageData, None],
) -> Tuple[Optional[List[str]], Optional[List[Dict[str, str]]]]:
    """
    Split media into URL entries and base64-encoded image entries.

    String items are kept as URLs. ``WorkflowImageData`` items are encoded to
    JPEG and base64. Items of any other type are ignored.

    Returns:
        Tuple of (media_urls, media_base64) where:
        - media_urls: List of string URLs, or None when there are none
        - media_base64: List of {"base64": str, "mimeType": str} objects,
          or None when there are none
    """
    if media_url is None:
        return None, None

    # Normalise a single value to a one-element list.
    entries = media_url if isinstance(media_url, list) else [media_url]

    url_entries: List[str] = []
    encoded_entries: List[Dict[str, str]] = []
    for entry in entries:
        if isinstance(entry, str):
            url_entries.append(entry)
        elif isinstance(entry, WorkflowImageData):
            jpeg_payload = encode_image_to_jpeg_bytes(entry.numpy_image)
            encoded = base64.b64encode(jpeg_payload).decode("utf-8")
            encoded_entries.append({"base64": encoded, "mimeType": "image/jpeg"})

    return (url_entries or None, encoded_entries or None)

`core/workflows/core_steps/trackers`¶

inference.core.workflows.core_steps.trackers._base ¶

Shared base classes for tracker workflow blocks.

Each concrete tracker block (ByteTrack, BoT-SORT, SORT, OC-SORT) inherits from TrackerBlockBase and implements _create_tracker and get_manifest. Sub-classes may override _tracker_update when the underlying tracker needs extra per-frame context (e.g. a video frame for camera motion compensation).

Classes¶

InstanceCache ¶

FIFO cache that tracks which object track IDs have been seen before.

Used to categorize tracked detections as new (first appearance) or already seen (reappearance) across video frames.

Source code in inference/core/workflows/core_steps/trackers/_base.py

class InstanceCache:
    """Bounded first-in-first-out record of tracker IDs seen so far.

    Lets callers tell a detection appearing for the first time apart from one
    whose tracker ID has already been observed. Once the cache is full, the
    oldest recorded ID is forgotten to make room for a new one.
    """

    def __init__(self, size: int):
        # Sizes below one are clamped so the cache always holds one entry.
        self._cache_inserts_track = deque(maxlen=max(1, size))
        self._cache = set()

    def record_instance(self, tracker_id: int) -> bool:
        """Register ``tracker_id`` and report whether it was known already.

        Returns:
            True when the ID was present in the cache before this call,
            False when it has just been added.
        """
        if tracker_id in self._cache:
            return True
        self._cache_new_tracker_id(tracker_id=tracker_id)
        return False

    def _cache_new_tracker_id(self, tracker_id: int) -> None:
        # Evict oldest entries until there is room for the newcomer.
        capacity = self._cache_inserts_track.maxlen
        while len(self._cache) >= capacity:
            oldest = self._cache_inserts_track.popleft()
            self._cache.remove(oldest)
        self._cache_inserts_track.append(tracker_id)
        self._cache.add(tracker_id)

Methods:¶

record_instance ¶

record_instance(tracker_id)

Record a tracker ID and return whether it was previously seen.

Returns:

Type	Description
`bool`	True if the tracker_id was already in the cache (seen before), False if this is its first appearance.

Source code in inference/core/workflows/core_steps/trackers/_base.py

def record_instance(self, tracker_id: int) -> bool:
    """Register ``tracker_id`` and report whether it was known already.

    Returns:
        True when the ID was present in the cache before this call,
        False when it has just been added.
    """
    if tracker_id in self._cache:
        return True
    self._cache_new_tracker_id(tracker_id=tracker_id)
    return False

TrackerBlockBase ¶

Bases: WorkflowBlock

Common run-loop shared by every tracker block.

Sub-classes implement _create_tracker and get_manifest. Override _tracker_update only when the tracker API requires additional context beyond sv.Detections (e.g. BoT-SORT with camera motion compensation).

Source code in inference/core/workflows/core_steps/trackers/_base.py

class TrackerBlockBase(WorkflowBlock):
    """Common run-loop shared by every tracker block.

    Sub-classes implement ``_create_tracker`` and ``get_manifest``.  Override
    ``_tracker_update`` only when the tracker API requires additional context
    beyond ``sv.Detections`` (e.g. BoT-SORT with camera motion compensation).
    """

    def __init__(self) -> None:
        # Both maps are keyed by the video identifier read from frame metadata,
        # so each video stream gets its own tracker and its own seen-ID cache.
        self._trackers: Dict[str, Any] = {}
        self._per_video_cache: Dict[str, InstanceCache] = {}

    @classmethod
    @abstractmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]: ...

    @abstractmethod
    def _create_tracker(self, fps: int, **kwargs: Any) -> Any:
        """Instantiate the concrete tracker with algorithm-specific params."""
        ...

    def _tracker_update(
        self,
        tracker: Any,
        detections: sv.Detections,
        image: WorkflowImageData,
    ) -> sv.Detections:
        """Invoke the tracker for one frame.

        Must call ``tracker.update`` only with arguments that library trackers
        define for the per-frame step (typically detections, optionally a frame
        tensor).  Do **not** pass workflow/block kwargs used in ``_create_tracker``.

        The default implementation ignores ``image`` and forwards only the
        detections.
        """
        return tracker.update(detections)

    def _run_tracker(
        self,
        image: WorkflowImageData,
        detections: sv.Detections,
        instances_cache_size: int,
        **tracker_kwargs: Any,
    ) -> BlockResult:
        """Run one frame through the tracker.

        Note: tracker parameters (``tracker_kwargs``) are only used when the
        tracker is **first created** for a given ``video_identifier``.
        Changing parameter values on subsequent frames has no effect because
        the tracker instance is cached for the lifetime of the video stream.
        The same holds for ``instances_cache_size``: the per-video cache is
        created once and then reused.

        Returns:
            Mapping with the tracked detections under ``OUTPUT_KEY`` plus two
            subsets: ``new_instances`` (tracker IDs not present in the
            per-video cache) and ``already_seen_instances`` (tracker IDs that
            were present).  When the tracker result carries no ``tracker_id``
            both subsets are empty.
        """
        metadata = image.video_metadata
        fps = metadata.fps
        if not fps:
            fps = 30
            logger.warning(
                f"fps not available in VideoMetadata for {self.__class__.__name__}, "
                "defaulting to 30 fps for tracker initialisation"
            )
        video_id = metadata.video_identifier

        if video_id not in self._trackers:
            self._trackers[video_id] = self._create_tracker(fps=fps, **tracker_kwargs)

        tracker = self._trackers[video_id]
        tracked_detections = self._tracker_update(tracker, detections, image)

        # Filter out immature / unmatched tracks (tracker_id == -1)
        if tracked_detections.tracker_id is not None and len(tracked_detections) > 0:
            valid_mask = tracked_detections.tracker_id != -1
            tracked_detections = tracked_detections[valid_mask]

        if video_id not in self._per_video_cache:
            self._per_video_cache[video_id] = InstanceCache(size=instances_cache_size)
        cache = self._per_video_cache[video_id]

        # ``tracker_id`` can be None (the filter above already allows for it),
        # e.g. when the tracker hands back detections without IDs.  Treat that
        # as "nothing tracked" instead of failing on ``None.tolist()``.
        tracker_ids = tracked_detections.tracker_id
        observed_ids = tracker_ids.tolist() if tracker_ids is not None else []

        not_seen_mask, seen_mask = [], []
        for tracker_id in observed_ids:
            already_seen = cache.record_instance(tracker_id=tracker_id)
            not_seen_mask.append(not already_seen)
            seen_mask.append(already_seen)

        return {
            OUTPUT_KEY: tracked_detections,
            "new_instances": tracked_detections[not_seen_mask],
            "already_seen_instances": tracked_detections[seen_mask],
        }

Functions:¶

tracker_describe_outputs ¶

tracker_describe_outputs()

Output definitions shared by all tracker blocks.

Trackers preserve all detection fields (masks, keypoints, custom data) — they only use bounding boxes for association then index back into the original sv.Detections. The output kinds therefore mirror the input kinds accepted by every tracker manifest.

Source code in inference/core/workflows/core_steps/trackers/_base.py

def tracker_describe_outputs() -> List[OutputDefinition]:
    """Build the output definitions that every tracker block exposes.

    Three outputs are declared - the main tracked detections, the detections
    seen for the first time and the detections seen before - all with the
    same ``TRACKER_PREDICTION_KINDS`` kind.  Trackers keep every detection
    field (masks, keypoints, custom data) because they only use bounding
    boxes for association and then index back into the original
    ``sv.Detections``, so the output kinds mirror the accepted input kinds.
    """
    output_names = (OUTPUT_KEY, "new_instances", "already_seen_instances")
    return [
        OutputDefinition(name=output_name, kind=TRACKER_PREDICTION_KINDS)
        for output_name in output_names
    ]

`core/workflows/core_steps/transformations/detections_merge`¶

inference.core.workflows.core_steps.transformations.detections_merge.v1 ¶

Classes¶

Functions:¶

calculate_union_bbox ¶

calculate_union_bbox(detections)

Calculate a single bounding box that contains all input detections.

Source code in inference/core/workflows/core_steps/transformations/detections_merge/v1.py

def calculate_union_bbox(detections: sv.Detections) -> np.ndarray:
    """Return the smallest box enclosing every detection.

    Returns:
        Array of shape ``(1, 4)`` in ``[x1, y1, x2, y2]`` order, or an empty
        ``(0, 4)`` float32 array when there are no detections.
    """
    if len(detections) == 0:
        return np.array([], dtype=np.float32).reshape(0, 4)

    boxes = detections.xyxy
    # Column-wise extrema: minima give the top-left corner, maxima the
    # bottom-right one.
    lower = boxes.min(axis=0)
    upper = boxes.max(axis=0)
    return np.array([[lower[0], lower[1], upper[2], upper[3]]])

get_lowest_confidence_index ¶

get_lowest_confidence_index(detections)

Get the index of the detection with the lowest confidence.

Source code in inference/core/workflows/core_steps/transformations/detections_merge/v1.py

def get_lowest_confidence_index(detections: sv.Detections) -> int:
    """Return the position of the least confident detection.

    Falls back to index 0 when the detections carry no confidence values.
    """
    scores = detections.confidence
    return 0 if scores is None else int(np.argmin(scores))

`core/workflows/core_steps/transformations/geotag_detection`¶

inference.core.workflows.core_steps.transformations.geotag_detection.v1 ¶

Classes¶

Functions:¶

project_detections ¶

project_detections(
    predictions,
    image_w,
    image_h,
    latitude,
    longitude,
    altitude,
    horizontal_fov=73.7,
    heading=0.0,
)

Project sv.Detections to ground GPS coordinates.

Returns (geo_detections records, GeoJSON features).

Source code in inference/core/workflows/core_steps/transformations/geotag_detection/v1.py

def project_detections(
    predictions: sv.Detections,
    image_w: int,
    image_h: int,
    latitude: float,
    longitude: float,
    altitude: float,
    horizontal_fov: float = 73.7,
    heading: float = 0.0,
) -> Tuple[List[dict], List[dict]]:
    """Project the centre of every detection to GPS coordinates.

    Each box centre is converted with ``_pixel_to_gps`` using the camera
    position, altitude, horizontal field of view and heading.

    Returns:
        ``(geo_detections, features)`` - a list of flat per-detection records
        and a parallel list of GeoJSON ``Point`` features.  Both are empty
        when there are no predictions.
    """
    geo_detections, features = [], []
    detections_count = len(predictions)
    if detections_count == 0:
        return geo_detections, features

    class_names = predictions.data.get("class_name", np.array([]))
    confidences = predictions.confidence
    if confidences is None:
        # No scores available: report zero confidence for every detection.
        confidences = np.zeros(detections_count)
    named_count = len(class_names)

    for index, (x1, y1, x2, y2) in enumerate(predictions.xyxy):
        cx = (x1 + x2) / 2
        cy = (y1 + y2) / 2
        w = x2 - x1
        h = y2 - y1
        det_lat, det_lon = _pixel_to_gps(
            cx,
            cy,
            image_w,
            image_h,
            latitude,
            longitude,
            altitude,
            horizontal_fov,
            heading,
        )
        label = str(class_names[index]) if index < named_count else "unknown"
        record = {
            "class": label,
            "confidence": round(float(confidences[index]), 4),
            "lat": round(det_lat, 7),
            "lon": round(det_lon, 7),
            "pixel_x": round(float(cx), 1),
            "pixel_y": round(float(cy), 1),
            "width": round(float(w), 1),
            "height": round(float(h), 1),
        }
        geo_detections.append(record)

        # GeoJSON points are written in (longitude, latitude) order.
        geometry = {"type": "Point", "coordinates": [record["lon"], record["lat"]]}
        properties = {"class": record["class"], "confidence": record["confidence"]}
        features.append(
            {"type": "Feature", "geometry": geometry, "properties": properties}
        )
    return geo_detections, features

`core/workflows/core_steps/transformations/image_slicer`¶

inference.core.workflows.core_steps.transformations.image_slicer.v1 ¶

Classes¶

Functions:¶

generate_offsets ¶

generate_offsets(resolution_wh, slice_wh, overlap_ratio_wh)

Copied from the original code at https://github.com/roboflow/supervision/blob/5123085037ec594524fc8f9d9b71b1cd9f487e8d/supervision/detection/tools/inference_slicer.py#L204-L203 to avoid a fragile contract with supervision, as this function is not part of its public API.

Generate offset coordinates for slicing an image based on the given resolution, slice dimensions, and overlap ratios.

Parameters:

Name	Type	Description	Default
`resolution_wh`	`Tuple[int, int]`	A tuple representing the width and height of the image to be sliced.	required
`slice_wh`	`Tuple[int, int]`	Dimensions of each slice measured in pixels. The tuple should be in the format `(width, height)`.	required
`overlap_ratio_wh`	`Optional[Tuple[float, float]]`	A tuple representing the desired overlap ratio for width and height between consecutive slices. Each value should be in the range [0, 1), where 0 means no overlap and a value close to 1 means high overlap.	required

Note

The function ensures that slices do not exceed the boundaries of the original image. As a result, the final slices in the row and column dimensions might be smaller than the specified slice dimensions if the image's width or height is not a multiple of the slice's width or height minus the overlap.

Source code in inference/core/workflows/core_steps/transformations/image_slicer/v1.py

def generate_offsets(
    resolution_wh: Tuple[int, int],
    slice_wh: Tuple[int, int],
    overlap_ratio_wh: Optional[Tuple[float, float]],
) -> np.ndarray:
    """
    Original code: https://github.com/roboflow/supervision/blob/5123085037ec594524fc8f9d9b71b1cd9f487e8d/supervision/detection/tools/inference_slicer.py#L204-L203
    to avoid fragile contract with supervision, as this function is not element of public
    API.

    Generate offset coordinates for slicing an image based on the given resolution,
    slice dimensions, and overlap ratios.

    Args:
        resolution_wh (Tuple[int, int]): A tuple representing the width and height
            of the image to be sliced.
        slice_wh (Tuple[int, int]): Dimensions of each slice measured in pixels. The
            tuple should be in the format `(width, height)`.
        overlap_ratio_wh (Optional[Tuple[float, float]]): A tuple representing the
            desired overlap ratio for width and height between consecutive slices.
            Each value should be in the range [0, 1), where 0 means no overlap and
            a value close to 1 means high overlap. `None` is treated as no
            overlap, i.e. `(0.0, 0.0)`.
    Returns:
        np.ndarray: An array of shape `(n, 4)` containing coordinates for each
            slice in the format `[xmin, ymin, xmax, ymax]`.

    Note:
        The function ensures that slices do not exceed the boundaries of the
            original image. As a result, the final slices in the row and column
            dimensions might be smaller than the specified slice dimensions if the
            image's width or height is not a multiple of the slice's width or
            height minus the overlap.
    """
    # The signature allows None, so fall back to "no overlap" instead of
    # failing when the ratios are indexed below.
    if overlap_ratio_wh is None:
        overlap_ratio_wh = (0.0, 0.0)
    slice_width, slice_height = slice_wh
    image_width, image_height = resolution_wh
    overlap_width = int(overlap_ratio_wh[0] * slice_width)
    overlap_height = int(overlap_ratio_wh[1] * slice_height)
    # Consecutive slices start one stride apart; the overlap shortens the stride.
    width_stride = slice_width - overlap_width
    height_stride = slice_height - overlap_height
    ws = np.arange(0, image_width, width_stride)
    hs = np.arange(0, image_height, height_stride)
    xmin, ymin = np.meshgrid(ws, hs)
    # Clip the far edges so border slices never reach outside the image.
    xmax = np.clip(xmin + slice_width, 0, image_width)
    ymax = np.clip(ymin + slice_height, 0, image_height)
    return np.stack([xmin, ymin, xmax, ymax], axis=-1).reshape(-1, 4)

inference.core.workflows.core_steps.transformations.image_slicer.v2 ¶

Classes¶

Functions:¶

generate_offsets ¶

generate_offsets(resolution_wh, slice_wh, overlap_ratio_wh)

This is a modification of the function from block v1 which pushes the "border" crops towards the center of the image, ensuring that all crops are the same size and that duplicated crop coordinates are removed.

Source code in inference/core/workflows/core_steps/transformations/image_slicer/v2.py

def generate_offsets(
    resolution_wh: Tuple[int, int],
    slice_wh: Tuple[int, int],
    overlap_ratio_wh: Tuple[float, float],
) -> np.ndarray:
    """
    Variant of the v1 slicer that shifts "border" crops back towards the
    center of the image, so that:
        * every crop has the same size (the slice is first capped to the
          image size)
        * crops that end up with identical coordinates are emitted only once

    Returns:
        np.ndarray: rows of `[xmin, ymin, xmax, ymax]`, in first-seen order.
    """
    image_width, image_height = resolution_wh
    requested_width, requested_height = slice_wh
    slice_width = min(requested_width, image_width)
    slice_height = min(requested_height, image_height)
    width_stride = slice_width - int(overlap_ratio_wh[0] * slice_width)
    height_stride = slice_height - int(overlap_ratio_wh[1] * slice_height)

    def _anchors(image_size, window, stride):
        # Regular grid of starts, each pulled back by however far its window
        # would stick out past the image edge.
        starts = np.arange(0, image_size, stride)
        overshoot = np.clip(starts + window - image_size, 0, window)
        return starts - overshoot

    anchors_ws = _anchors(image_width, slice_width, width_stride)
    anchors_hs = _anchors(image_height, slice_height, height_stride)
    xmin, ymin = np.meshgrid(anchors_ws, anchors_hs)
    xmax = np.clip(xmin + slice_width, 0, image_width)
    ymax = np.clip(ymin + slice_height, 0, image_height)
    candidates = np.stack([xmin, ymin, xmax, ymax], axis=-1).reshape(-1, 4)

    # A dict keeps the first occurrence of each coordinate tuple and
    # preserves insertion order.
    unique_rows = {}
    for row in candidates:
        unique_rows.setdefault(tuple(row), row)
    return np.array(list(unique_rows.values()))

`core/workflows/core_steps/transformations/qr_code_generator`¶

inference.core.workflows.core_steps.transformations.qr_code_generator.v1 ¶

Classes¶

Functions:¶

generate_qr_code ¶

generate_qr_code(
    text,
    version=None,
    box_size=10,
    error_correct="M",
    border=4,
    fill_color="BLACK",
    back_color="WHITE",
)

Generate a QR code PNG image from text input.

Source code in inference/core/workflows/core_steps/transformations/qr_code_generator/v1.py

def generate_qr_code(
    text: str,
    version: Optional[int] = None,
    box_size: int = 10,
    error_correct: str = "M",
    border: int = 4,
    fill_color: str = "BLACK",
    back_color: str = "WHITE",
) -> WorkflowImageData:
    """Render ``text`` as a QR code and wrap it in ``WorkflowImageData``.

    Results are looked up in and stored to the module-level ``_QR_CACHE``
    keyed by all call arguments, so a repeated call returns the cached image.

    Raises:
        ImportError: when the ``qrcode`` library is not installed.
        ValueError: when the rendered image turns out to be empty.
    """
    global _ERROR_LEVELS, _QR_CACHE

    # All arguments together identify one rendered image.
    cache_key = (text, version, box_size, error_correct, border, fill_color, back_color)
    cached_image = _QR_CACHE.get(*cache_key)
    if cached_image is not None:
        return cached_image

    try:
        import qrcode
    except ImportError:
        raise ImportError(
            "qrcode library is required for QR code generation. "
            "Install it with: pip install qrcode"
        )
    if _ERROR_LEVELS is None:
        _ERROR_LEVELS = _get_error_levels()

    def _to_rgb_or_raw(color_spec):
        # Prefer the shared colour parser; when it rejects the value, pass
        # the original string on so the qrcode library can interpret it.
        try:
            return str_to_color(color_spec).as_rgb()
        except (ValueError, AttributeError):
            return color_spec

    fill = _to_rgb_or_raw(fill_color)
    back = _to_rgb_or_raw(back_color)

    # Unrecognised correction levels fall back to "M".
    error_level = _ERROR_LEVELS.get(
        error_correct.upper(), qrcode.constants.ERROR_CORRECT_M
    )

    qr = qrcode.QRCode(
        version=version,
        error_correction=error_level,
        box_size=box_size,
        border=border,
    )
    qr.add_data(text)
    # Let the library choose the size only when no version was requested.
    qr.make(fit=(version is None))

    rgb_image = qr.make_image(fill_color=fill, back_color=back).convert("RGB")

    # PIL yields RGB; reverse the channel axis to get the BGR layout.
    numpy_image = np.array(rgb_image)[:, :, ::-1]

    if numpy_image is None or numpy_image.size == 0:
        raise ValueError("Failed to generate QR code image")

    result = WorkflowImageData(
        parent_metadata=ImageParentMetadata(parent_id=f"qr_code.{uuid4()}"),
        numpy_image=numpy_image,
    )

    _QR_CACHE.put(*cache_key, result)
    return result

`core/workflows/core_steps/transformations/stitch_ocr_detections`¶

inference.core.workflows.core_steps.transformations.stitch_ocr_detections.v1 ¶

Classes¶

Functions:¶

get_line_separator ¶

get_line_separator(reading_direction)

Get the appropriate separator based on reading direction.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v1.py

def get_line_separator(reading_direction: str) -> str:
    """Pick the string placed between consecutive lines of stitched text.

    Horizontal reading directions use a newline; anything else uses a space.
    """
    if reading_direction in ("left_to_right", "right_to_left"):
        return "\n"
    return " "

group_detections_by_line ¶

group_detections_by_line(
    xyxy, reading_direction, tolerance
)

Group detections into lines based on primary coordinate.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v1.py

def group_detections_by_line(
    xyxy: np.ndarray,
    reading_direction: str,
    tolerance: int,
) -> Dict[float, Dict[str, List]]:
    """Bucket boxes into lines by snapping their second coordinate to a grid.

    The second column of ``xyxy`` is rounded to the nearest multiple of
    ``tolerance``; boxes sharing the rounded value form one line.
    ``reading_direction`` is accepted but not used here.

    Returns:
        Mapping from the rounded coordinate to ``{"xyxy": [...], "idx": [...]}``
        holding the member boxes and their indices in the input order.
    """
    # Callers swap the axes for vertical reading beforehand, so column 1 is
    # always the coordinate that separates lines.
    line_positions = np.round(xyxy[:, 1] / tolerance) * tolerance

    boxes_by_line = {}
    for index, (box, position) in enumerate(zip(xyxy, line_positions)):
        bucket = boxes_by_line.setdefault(position, {"xyxy": [], "idx": []})
        bucket["xyxy"].append(box)
        bucket["idx"].append(index)
    return boxes_by_line

prepare_coordinates ¶

prepare_coordinates(xyxy, reading_direction)

Prepare coordinates based on reading direction.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v1.py

def prepare_coordinates(
    xyxy: np.ndarray,
    reading_direction: str,
) -> np.ndarray:
    """Swap the x and y columns for vertical reading directions.

    For ``vertical_top_to_bottom`` / ``vertical_bottom_to_top`` each row
    ``[x1, y1, x2, y2]`` becomes ``[y1, x1, y2, x2]``; for any other
    direction the input array is returned untouched.
    """
    vertical_directions = ("vertical_top_to_bottom", "vertical_bottom_to_top")
    if reading_direction not in vertical_directions:
        return xyxy
    swapped_columns = [1, 0, 3, 2]
    return xyxy[:, swapped_columns]

sort_line_detections ¶

sort_line_detections(line_xyxy, reading_direction)

Sort detections within a line based on reading direction.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v1.py

def sort_line_detections(
    line_xyxy: np.ndarray,
    reading_direction: str,
) -> np.ndarray:
    """Return the ordering of boxes within one line.

    Boxes are ordered by their first column: ascending for ``left_to_right``
    and ``vertical_top_to_bottom``, descending for every other direction.

    Returns:
        Index array that sorts the rows of ``line_xyxy``.
    """
    # Axes are already swapped for vertical reading, so column 0 is always
    # the within-line coordinate.
    primary = line_xyxy[:, 0]
    ascending = reading_direction in ("left_to_right", "vertical_top_to_bottom")
    return primary.argsort() if ascending else (-primary).argsort()

stitch_ocr_detections ¶

stitch_ocr_detections(
    detections,
    reading_direction="left_to_right",
    tolerance=10,
    delimiter="",
)

Stitch OCR detections into coherent text based on spatial arrangement.

Parameters:

Name	Type	Description	Default
`detections`	`Detections`	Supervision Detections object containing OCR results	required
`reading_direction`	`str`	Direction to read text ("left_to_right", "right_to_left", "vertical_top_to_bottom", "vertical_bottom_to_top")	`'left_to_right'`
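`tolerance`	`int`	Vertical tolerance for grouping text into lines	`10`
`delimiter`	`str`	String used to join the ordered text elements (including line separators) into the final text	`''`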

Returns:

Type	Description
`Dict[str, str]`	Dict containing stitched OCR text under 'ocr_text' key

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v1.py

def stitch_ocr_detections(
    detections: sv.Detections,
    reading_direction: str = "left_to_right",
    tolerance: int = 10,
    delimiter: str = "",
) -> Dict[str, str]:
    """
    Assemble OCR detections into text according to their spatial layout.

    Boxes are rounded to integers, grouped into lines with
    group_detections_by_line, ordered inside each line with
    sort_line_detections, and the resulting class names are joined.

    Args:
        detections: Supervision Detections object containing OCR results;
            the text of each detection is read from data["class_name"]
        reading_direction: Direction to read text ("left_to_right", "right_to_left",
                         "vertical_top_to_bottom", "vertical_bottom_to_top")
        tolerance: Vertical tolerance for grouping text into lines
        delimiter: String placed between all collected elements (class names
            and the per-line separators) when building the final text

    Returns:
        Dict containing stitched OCR text under 'ocr_text' key
    """
    if len(detections) == 0:
        return {"ocr_text": ""}

    boxes = detections.xyxy.round().astype(dtype=int)
    labels = detections.data["class_name"]

    # Vertical directions are handled by swapping axes up front
    boxes = prepare_coordinates(boxes, reading_direction)

    grouped = group_detections_by_line(boxes, reading_direction, tolerance)

    # Line keys are visited in ascending order, except bottom-to-top reading
    descending = reading_direction in ["vertical_bottom_to_top"]
    line_keys = sorted(grouped.keys(), reverse=descending)
    last_position = len(line_keys) - 1

    pieces = []
    for position, line_key in enumerate(line_keys):
        members = grouped[line_key]
        member_boxes = np.array(members["xyxy"])
        member_indices = np.array(members["idx"])

        reading_order = sort_line_detections(member_boxes, reading_direction)
        pieces.extend(labels[member_indices[reading_order]])

        # A separator follows every line but the final one
        if position < last_position:
            pieces.append(get_line_separator(reading_direction))

    return {"ocr_text": delimiter.join(pieces)}

inference.core.workflows.core_steps.transformations.stitch_ocr_detections.v2 ¶

Classes¶

CollimateDetection ¶

Helper class for collimate algorithm to store detection properties.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

class CollimateDetection:
    """Lightweight record describing one detection for the collimate algorithm.

    Holds the box centre (``x``, ``y``), its ``width`` and ``height``, the
    detection's ``class_name`` and its original index ``idx``.
    """

    def __init__(self, xyxy: np.ndarray, class_name: str, idx: int):
        left, top, right, bottom = xyxy[0], xyxy[1], xyxy[2], xyxy[3]
        # Centre point of the box
        self.x = (left + right) / 2
        self.y = (top + bottom) / 2
        # Box extent along each axis
        self.width = right - left
        self.height = bottom - top
        self.class_name = class_name
        self.idx = idx  # Index in the original detections, kept for tracking

    def __repr__(self) -> str:
        return f"{self.class_name}"

StitchingAlgorithm ¶

Bases: str, Enum

Algorithm for grouping detections into words/lines.

TOLERANCE: Uses fixed pixel tolerance for line grouping (original algorithm). Good for consistent font sizes and line spacing.

OTSU: Uses Otsu's method on normalized gaps to find natural breaks. Resolution-invariant and works well with bimodal distributions (e.g., character-level vs word-level spacing).

COLLIMATE: Uses greedy parent-child traversal to group detections. Good for skewed or curved text where bucket-based approaches fail.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

class StitchingAlgorithm(str, Enum):
    """Algorithm for grouping detections into words/lines.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (e.g. ``StitchingAlgorithm.OTSU == "otsu"``).

    TOLERANCE: Uses fixed pixel tolerance for line grouping (original algorithm).
        Good for consistent font sizes and line spacing.

    OTSU: Uses Otsu's method on normalized gaps to find natural breaks.
        Resolution-invariant and works well with bimodal distributions
        (e.g., character-level vs word-level spacing).

    COLLIMATE: Uses greedy parent-child traversal to group detections.
        Good for skewed or curved text where bucket-based approaches fail.
    """

    # Fixed pixel tolerance for line grouping
    TOLERANCE = "tolerance"
    # Otsu thresholding on normalized gaps
    OTSU = "otsu"
    # Greedy parent-child traversal
    COLLIMATE = "collimate"

Functions:¶

adaptive_word_grouping ¶

adaptive_word_grouping(
    detections,
    reading_direction,
    delimiter="",
    threshold_multiplier=1.0,
)

Stitch OCR detections using adaptive gap analysis with Otsu thresholding.

This approach is resolution-invariant because it normalizes gaps by local character dimensions. It works well with bimodal gap distributions (e.g., character-level vs word-level spacing).

The algorithm computes a global threshold across all lines to leverage the full dataset of gaps, which provides more robust Otsu thresholding than per-line computation.

Parameters:

Name	Type	Description	Default
`detections`	`Detections`	Supervision Detections object containing OCR results	required
`reading_direction`	`str`	Direction to read text	required
`delimiter`	`str`	String to insert between text elements	`''`
`threshold_multiplier`	`float`	Multiplier applied to Otsu threshold (>1.0 = fewer word breaks, <1.0 = more word breaks)	`1.0`

Returns:

Type	Description
`Dict[str, str]`	Dict containing stitched OCR text under 'ocr_text' key

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def adaptive_word_grouping(
    detections: sv.Detections,
    reading_direction: str,
    delimiter: str = "",
    threshold_multiplier: float = 1.0,
) -> Dict[str, str]:
    """Stitch OCR detections using adaptive gap analysis with Otsu thresholding.

    This approach is resolution-invariant because it normalizes gaps by local
    character dimensions. It works well with bimodal gap distributions
    (e.g., character-level vs word-level spacing).

    The algorithm computes a global threshold across all lines to leverage
    the full dataset of gaps, which provides more robust Otsu thresholding
    than per-line computation.

    Processing steps:
    1. Detections are clustered into lines along the secondary axis, using
       half of the median secondary-axis extent as the tolerance.
    2. Within each line, detections are ordered along the primary axis and
       the edge-to-edge gap between neighbours is normalized by their mean
       primary-axis extent.
    3. A single threshold is derived from the gaps of all lines via
       find_otsu_threshold and scaled by threshold_multiplier.
    4. Gaps above the threshold start a new word. If the gap distribution is
       not bimodal, each line is emitted as a single word.

    Args:
        detections: Supervision Detections object containing OCR results;
            the text of each detection is read from data["class_name"]
        reading_direction: Direction to read text. "vertical_top_to_bottom" and
            "vertical_bottom_to_top" swap the roles of x and y; "right_to_left"
            and "vertical_bottom_to_top" reverse the order within a line; any
            other value is processed in ascending primary-axis order
        delimiter: String to insert between text elements
        threshold_multiplier: Multiplier applied to Otsu threshold (>1.0 = fewer word breaks, <1.0 = more word breaks)

    Returns:
        Dict containing stitched OCR text under 'ocr_text' key. Words within a
        line are separated by a space when delimiter is empty, otherwise by
        delimiter; lines are separated by get_line_separator(reading_direction).
    """
    if len(detections) == 0:
        return {"ocr_text": ""}

    xyxy = detections.xyxy
    class_names = detections.data["class_name"]

    # Determine if we're working with vertical text
    is_vertical = reading_direction in [
        "vertical_top_to_bottom",
        "vertical_bottom_to_top",
    ]

    # For vertical text, swap x/y for processing. From here on "x" means the
    # primary (reading) axis and "y" the secondary (line-stacking) axis.
    if is_vertical:
        # Swap coordinates: treat y as x for sorting
        x_centers = (xyxy[:, 1] + xyxy[:, 3]) / 2  # y becomes primary axis
        y_centers = (xyxy[:, 0] + xyxy[:, 2]) / 2  # x becomes secondary axis
        widths = xyxy[:, 3] - xyxy[:, 1]  # height becomes "width"
        heights = xyxy[:, 2] - xyxy[:, 0]  # width becomes "height"
    else:
        x_centers = (xyxy[:, 0] + xyxy[:, 2]) / 2
        y_centers = (xyxy[:, 1] + xyxy[:, 3]) / 2
        widths = xyxy[:, 2] - xyxy[:, 0]
        heights = xyxy[:, 3] - xyxy[:, 1]

    # First, group detections into lines based on y-coordinate clustering
    # Use adaptive threshold based on median height
    median_height = np.median(heights)
    line_tolerance = median_height * 0.5

    # Sort by y to group into lines
    y_sorted_indices = np.argsort(y_centers)

    # Sweep through detections in ascending y; a detection joins the current
    # line while its centre stays within line_tolerance of the line's mean y.
    lines = []
    current_line = [y_sorted_indices[0]]
    current_line_y = y_centers[y_sorted_indices[0]]

    for idx in y_sorted_indices[1:]:
        if abs(y_centers[idx] - current_line_y) <= line_tolerance:
            current_line.append(idx)
            # Update line y as running average
            current_line_y = np.mean([y_centers[i] for i in current_line])
        else:
            lines.append(current_line)
            current_line = [idx]
            current_line_y = y_centers[idx]
    lines.append(current_line)

    # Sort lines by y position (descending only for vertical_bottom_to_top)
    line_y_positions = [np.mean([y_centers[i] for i in line]) for line in lines]
    if reading_direction in ["vertical_bottom_to_top"]:
        sorted_line_indices = np.argsort(line_y_positions)[::-1]
    else:
        sorted_line_indices = np.argsort(line_y_positions)

    # First pass: compute normalized gaps for ALL lines to get global threshold
    all_normalized_gaps = []
    line_data = []  # Store sorted line info for second pass

    for line_idx in sorted_line_indices:
        line = lines[line_idx]

        # A single detection has no gaps; the None entries mark it for pass two
        if len(line) == 1:
            line_data.append((line, None, None, None))
            continue

        # Sort detections in line by x position
        line_x_centers = x_centers[line]
        line_widths = widths[line]

        if reading_direction in ["right_to_left", "vertical_bottom_to_top"]:
            x_sorted_order = np.argsort(line_x_centers)[::-1]
        else:
            x_sorted_order = np.argsort(line_x_centers)

        sorted_line = [line[i] for i in x_sorted_order]
        sorted_x_centers = line_x_centers[x_sorted_order]
        sorted_widths = line_widths[x_sorted_order]

        # Compute normalized gaps for this line
        normalized_gaps = []
        for i in range(1, len(sorted_line)):
            prev_idx, curr_idx = i - 1, i
            # Raw gap between detection edges: centre distance minus the two
            # half-widths. Negative when neighbouring boxes overlap.
            if reading_direction in ["right_to_left", "vertical_bottom_to_top"]:
                raw_gap = (
                    sorted_x_centers[prev_idx]
                    - sorted_x_centers[curr_idx]
                    - (sorted_widths[prev_idx] + sorted_widths[curr_idx]) / 2
                )
            else:
                raw_gap = (
                    sorted_x_centers[curr_idx]
                    - sorted_x_centers[prev_idx]
                    - (sorted_widths[prev_idx] + sorted_widths[curr_idx]) / 2
                )

            # Normalize by local character scale
            local_scale = (sorted_widths[prev_idx] + sorted_widths[curr_idx]) / 2
            if local_scale > 0:
                normalized_gaps.append(raw_gap / local_scale)
            else:
                # Degenerate (zero-width) boxes: record a zero gap rather than divide by zero
                normalized_gaps.append(0.0)

        normalized_gaps = np.array(normalized_gaps)
        all_normalized_gaps.extend(normalized_gaps.tolist())
        line_data.append(
            (sorted_line, sorted_x_centers, sorted_widths, normalized_gaps)
        )

    # Compute global threshold using all gaps, then apply multiplier
    all_normalized_gaps = np.array(all_normalized_gaps)
    global_threshold, is_bimodal = find_otsu_threshold(all_normalized_gaps)
    global_threshold *= threshold_multiplier

    # Second pass: use global threshold to group words
    all_text_parts = []

    for sorted_line, sorted_x_centers, sorted_widths, normalized_gaps in line_data:
        if normalized_gaps is None:
            # Single detection in line
            all_text_parts.append(class_names[sorted_line[0]])
            continue

        # If distribution is not bimodal (likely single word or uniform spacing),
        # treat all detections as a single word to avoid incorrect splitting
        if not is_bimodal:
            word_text = delimiter.join([class_names[idx] for idx in sorted_line])
            all_text_parts.append(word_text)
            continue

        # Group into words based on global threshold.
        # normalized_gaps[i] is the gap between sorted_line[i] and sorted_line[i + 1].
        words = [[sorted_line[0]]]
        for i, det_idx in enumerate(sorted_line[1:]):
            if normalized_gaps[i] > global_threshold:
                words.append([det_idx])
            else:
                words[-1].append(det_idx)

        # Build text for this line
        line_text_parts = []
        for word in words:
            word_text = delimiter.join([class_names[idx] for idx in word])
            line_text_parts.append(word_text)

        # Join words with space (or delimiter if specified and non-empty)
        word_separator = " " if delimiter == "" else delimiter
        all_text_parts.append(word_separator.join(line_text_parts))

    # Join lines with appropriate separator
    line_separator = get_line_separator(reading_direction)
    return {"ocr_text": line_separator.join(all_text_parts)}

collimate_word_grouping ¶

collimate_word_grouping(
    detections,
    reading_direction,
    delimiter="",
    tolerance=10,
)

Stitch OCR detections using greedy parent-child traversal (collimate algorithm).

This algorithm is good for skewed or curved text where traditional bucket-based line grouping may fail. It works by:

1. Sorting detections by primary reading coordinate
2. Starting with the first detection as a "parent"
3. Finding all detections that "follow" the parent (within tolerance)
4. Building lines/columns through greedy traversal

Parameters:

Name	Type	Description	Default
`detections`	`Detections`	Supervision Detections object containing OCR results	required
`reading_direction`	`str`	Direction to read text	required
`delimiter`	`str`	String to insert between characters within words	`''`
`tolerance`	`int`	Pixel tolerance for alignment	`10`

Returns:

Type	Description
`Dict[str, str]`	Dict containing stitched OCR text under 'ocr_text' key

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def collimate_word_grouping(
    detections: sv.Detections,
    reading_direction: str,
    delimiter: str = "",
    tolerance: int = 10,
) -> Dict[str, str]:
    """Stitch OCR detections with the greedy parent-child "collimate" traversal.

    Intended for skewed or curved text, where bucket-based line grouping may
    fail. The procedure is:
    1. Order the detections by the primary reading coordinate
    2. Seed the first line with the first detection (the initial "parent")
    3. Append every remaining detection that "follows" the current parent
       (within tolerance); each appended detection becomes the new parent
    4. When no line can be extended, open a new line with the next detection

    Args:
        detections: Supervision Detections object containing OCR results;
            the text of each detection is read from data["class_name"]
        reading_direction: Direction to read text
        delimiter: String to insert between characters within words
        tolerance: Pixel tolerance for alignment

    Returns:
        Dict containing stitched OCR text under 'ocr_text' key
    """
    if len(detections) == 0:
        return {"ocr_text": ""}

    boxes = detections.xyxy
    labels = detections.data["class_name"]

    # Wrap every detection, remembering its original position
    candidates = []
    for position in range(len(detections)):
        candidates.append(
            CollimateDetection(boxes[position], labels[position], position)
        )

    candidates = _sort_detections_for_collimate(candidates, reading_direction)
    if len(candidates) == 0:
        return {"ocr_text": ""}

    pending = list(candidates)
    grouped: List[List[CollimateDetection]] = [[pending.pop(0)]]

    while len(pending) > 0:
        extended = False

        for group in grouped:
            tail = group[-1]
            # Iterate over a snapshot because matches are removed from pending
            for candidate in pending.copy():
                if not _detection_follows(
                    tail, candidate, reading_direction, tolerance
                ):
                    continue
                extended = True
                group.append(candidate)
                tail = candidate  # the match becomes the parent for later candidates
                pending.remove(candidate)

        # Nothing could be attached anywhere, so pending is unchanged and
        # still non-empty: its first element seeds a new line.
        if not extended:
            grouped.append([pending.pop(0)])

    def average_coordinate(members):
        return _get_line_avg_coord(members, reading_direction)

    # Lines are ordered by their average coordinate; only bottom-to-top
    # vertical reading visits them in descending order.
    descending = reading_direction == "vertical_bottom_to_top"
    ordered = sorted(grouped, key=average_coordinate, reverse=descending)

    # Characters of one line are joined with the delimiter
    line_texts = [
        delimiter.join(member.class_name for member in members)
        for members in ordered
    ]

    return {"ocr_text": get_line_separator(reading_direction).join(line_texts)}

find_otsu_threshold ¶

find_otsu_threshold(gaps)

Find natural break between intra-word and inter-word gaps using Otsu's method.

This is a resolution-invariant approach that finds the optimal threshold to separate two classes of gaps (e.g., gaps within words vs gaps between words).

Also detects whether the distribution is bimodal (two distinct groups) or unimodal (single group, suggesting single word or uniform spacing).

Parameters:

Name	Type	Description	Default
`gaps`	`ndarray`	Array of normalized gap values	required

Returns:

Type	Description
`tuple[float, bool]`	Tuple of (threshold, is_bimodal): threshold is the optimal threshold value that maximizes between-class variance; is_bimodal is True if the distribution appears bimodal, False if unimodal

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def find_otsu_threshold(gaps: np.ndarray) -> tuple[float, bool]:
    """Find natural break between intra-word and inter-word gaps using Otsu's method.

    This is a resolution-invariant approach that finds the optimal threshold
    to separate two classes of gaps (e.g., gaps within words vs gaps between words).

    Also detects whether the distribution is bimodal (two distinct groups) or
    unimodal (single group, suggesting single word or uniform spacing).

    Candidate thresholds are the centres of a histogram with at most 50 bins
    over the observed gaps. The one maximizing the between-class variance
    criterion is kept; on ties the smallest candidate wins.

    Args:
        gaps: Array (or any 1-D sequence) of normalized gap values

    Returns:
        Tuple of (threshold, is_bimodal), always as built-in ``float`` and
        ``bool`` objects:
        - threshold: Optimal threshold value that maximizes between-class variance
          (0.0 when fewer than two gaps are given or no split is possible)
        - is_bimodal: True if distribution appears bimodal, False if unimodal
    """
    # Accept plain sequences as well as arrays; boolean masking needs an ndarray.
    gaps = np.asarray(gaps, dtype=float)

    if len(gaps) < 2:
        return 0.0, False

    # Only the bin edges are needed; the counts are not used.
    _, bin_edges = np.histogram(gaps, bins=min(50, len(gaps)))
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    best_thresh = 0.0
    best_variance = 0.0
    best_below_mean = 0.0
    best_above_mean = 0.0

    for t in bin_centers:
        below = gaps[gaps <= t]
        above = gaps[gaps > t]

        # A threshold that leaves one class empty is not a split
        if len(below) == 0 or len(above) == 0:
            continue

        below_mean = below.mean()
        above_mean = above.mean()

        # Between-class variance (Otsu's criterion)
        variance = len(below) * len(above) * (below_mean - above_mean) ** 2

        if variance > best_variance:
            best_variance = variance
            best_thresh = t
            best_below_mean = below_mean
            best_above_mean = above_mean

    # Check if distribution is bimodal using several heuristics:
    # 1. The gap between class means should be significant relative to overall spread
    # 2. There should be meaningful absolute separation between classes
    overall_std = gaps.std()

    # Separation ratio: how far apart are the two class means relative to overall std
    mean_separation = abs(best_above_mean - best_below_mean)
    separation_ratio = mean_separation / overall_std if overall_std > 0 else 0

    # Bimodality criteria - MUST have meaningful word gaps (not just outliers):
    # The key insight is that real word gaps are typically 0.5+ in normalized units.
    # A distribution with all gaps < 0.3 is unimodal (single word), even if there
    # are outliers (like overlapping characters with negative gaps) that inflate
    # the mean separation.
    #
    # Primary criterion: above-class mean must indicate actual word gaps exist
    has_positive_word_gaps = best_above_mean > 0.3

    # Secondary criterion: good relative separation AND absolute separation.
    # NOTE: as written this is subsumed by the plain `mean_separation > 0.3`
    # check below; it is kept so the heuristic stays easy to tune.
    has_good_relative_separation = separation_ratio > 1.5 and mean_separation > 0.3

    # Must have positive word gaps to be considered bimodal
    is_bimodal = has_positive_word_gaps and (
        mean_separation > 0.3 or has_good_relative_separation
    )

    # Convert NumPy scalars so the result matches the annotated return type
    # (np.bool_ is not a bool: `is True` checks and json.dumps would fail).
    return float(best_thresh), bool(is_bimodal)

get_line_separator ¶

get_line_separator(reading_direction)

Get the appropriate separator based on reading direction.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def get_line_separator(reading_direction: str) -> str:
    """Return the string placed between lines for the given reading direction.

    Horizontal directions ("left_to_right", "right_to_left") use a newline;
    every other direction uses a single space.
    """
    if reading_direction in ("left_to_right", "right_to_left"):
        return "\n"
    return " "

group_detections_by_line ¶

group_detections_by_line(
    xyxy, reading_direction, tolerance
)

Group detections into lines based on primary coordinate.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def group_detections_by_line(
    xyxy: np.ndarray,
    reading_direction: str,
    tolerance: int,
) -> Dict[float, Dict[str, List]]:
    """Bucket detections into lines by snapping their second column to a grid.

    The second column (y1, or x1 once prepare_coordinates has swapped the axes
    for vertical text) is rounded to the nearest multiple of ``tolerance``.
    Detections that share the rounded value form one line. Each line maps to
    a dict holding the member boxes under "xyxy" and their row indices under
    "idx". ``reading_direction`` is accepted but not used here.
    """
    # Snap the line-defining coordinate to multiples of the tolerance
    line_keys = np.round(xyxy[:, 1] / tolerance) * tolerance

    grouped = {}
    for position, (box, line_key) in enumerate(zip(xyxy, line_keys)):
        bucket = grouped.setdefault(line_key, {"xyxy": [], "idx": []})
        bucket["xyxy"].append(box)
        bucket["idx"].append(position)

    return grouped

prepare_coordinates ¶

prepare_coordinates(xyxy, reading_direction)

Prepare coordinates based on reading direction.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def prepare_coordinates(
    xyxy: np.ndarray,
    reading_direction: str,
) -> np.ndarray:
    """Lay out boxes so later steps can treat every direction as horizontal.

    For the two vertical reading directions the x and y columns are exchanged
    ([x1, y1, x2, y2] becomes [y1, x1, y2, x2]). For any other direction the
    input array is returned as-is.
    """
    vertical_directions = ("vertical_top_to_bottom", "vertical_bottom_to_top")
    if reading_direction not in vertical_directions:
        return xyxy
    swapped_columns = [1, 0, 3, 2]
    return xyxy[:, swapped_columns]

sort_line_detections ¶

sort_line_detections(line_xyxy, reading_direction)

Sort detections within a line based on reading direction.

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def sort_line_detections(
    line_xyxy: np.ndarray,
    reading_direction: str,
) -> np.ndarray:
    """Return the index order that arranges one line's boxes for reading.

    The first column is always the sort key (original x1, or y1 once the
    vertical swap from prepare_coordinates has been applied). It is sorted
    ascending for "left_to_right" and "vertical_top_to_bottom", and descending
    (by negating the key) for every other direction.
    """
    sort_key = line_xyxy[:, 0]
    ascending = reading_direction in ("left_to_right", "vertical_top_to_bottom")
    if not ascending:
        # right_to_left or vertical_bottom_to_top: largest coordinate first
        sort_key = -sort_key
    return sort_key.argsort()

stitch_ocr_detections ¶

stitch_ocr_detections(
    detections,
    reading_direction="left_to_right",
    tolerance=10,
    delimiter="",
)

Stitch OCR detections into coherent text based on spatial arrangement.

Parameters:

Name	Type	Description	Default
`detections`	`Detections`	Supervision Detections object containing OCR results	required
`reading_direction`	`str`	Direction to read text ("left_to_right", "right_to_left", "vertical_top_to_bottom", "vertical_bottom_to_top")	`'left_to_right'`
`tolerance`	`int`	Vertical tolerance for grouping text into lines	`10`
`delimiter`	`str`	String used to join the stitched text elements	`''`

Returns:

Type	Description
`Dict[str, str]`	Dict containing stitched OCR text under 'ocr_text' key

Source code in inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

def stitch_ocr_detections(
    detections: sv.Detections,
    reading_direction: str = "left_to_right",
    tolerance: int = 10,
    delimiter: str = "",
) -> Dict[str, str]:
    """
    Assemble OCR detections into text according to their spatial layout.

    Boxes are rounded to integers, grouped into lines with
    group_detections_by_line, ordered inside each line with
    sort_line_detections, and the resulting class names are joined.

    Args:
        detections: Supervision Detections object containing OCR results;
            the text of each detection is read from data["class_name"]
        reading_direction: Direction to read text ("left_to_right", "right_to_left",
                         "vertical_top_to_bottom", "vertical_bottom_to_top")
        tolerance: Vertical tolerance for grouping text into lines
        delimiter: String placed between all collected elements (class names
            and the per-line separators) when building the final text

    Returns:
        Dict containing stitched OCR text under 'ocr_text' key
    """
    if len(detections) == 0:
        return {"ocr_text": ""}

    boxes = detections.xyxy.round().astype(dtype=int)
    labels = detections.data["class_name"]

    # Vertical directions are handled by swapping axes up front
    boxes = prepare_coordinates(boxes, reading_direction)

    grouped = group_detections_by_line(boxes, reading_direction, tolerance)

    # Line keys are visited in ascending order, except bottom-to-top reading
    descending = reading_direction in ["vertical_bottom_to_top"]
    line_keys = sorted(grouped.keys(), reverse=descending)
    last_position = len(line_keys) - 1

    pieces = []
    for position, line_key in enumerate(line_keys):
        members = grouped[line_key]
        member_boxes = np.array(members["xyxy"])
        member_indices = np.array(members["idx"])

        reading_order = sort_line_detections(member_boxes, reading_direction)
        pieces.extend(labels[member_indices[reading_order]])

        # A separator follows every line but the final one
        if position < last_position:
            pieces.append(get_line_separator(reading_direction))

    return {"ocr_text": delimiter.join(pieces)}

`core/workflows/core_steps/visualizations/classification_label`¶

inference.core.workflows.core_steps.visualizations.classification_label.v1 ¶

Classes¶

Functions:¶

create_label_visualization ¶

create_label_visualization(
    sorted_predictions,
    text_position,
    text,
    w,
    h,
    initial_offset,
    total_spacing,
    text_scale,
    text_padding,
)

Create visualization layout for classification labels.

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def create_label_visualization(
    sorted_predictions: List[dict],
    text_position: str,
    text: str,
    w: int,
    h: int,
    initial_offset: float,
    total_spacing: float,
    text_scale: float,
    text_padding: int,
) -> Tuple[np.ndarray, List[str], List[dict]]:
    """Dispatch to the layout helper matching ``text_position``.

    The three CENTER* positions go to handle_center_position, the three
    BOTTOM_* positions to handle_bottom_position, and every other value is
    laid out by handle_top_position. The chosen helper's result is returned
    unchanged.
    """
    center_positions = ("CENTER", "CENTER_LEFT", "CENTER_RIGHT")
    bottom_positions = ("BOTTOM_LEFT", "BOTTOM_CENTER", "BOTTOM_RIGHT")

    if text_position in center_positions:
        return handle_center_position(
            sorted_predictions,
            text,
            text_position,
            w,
            h,
            total_spacing,
            text_scale,
            text_padding,
        )

    # Bottom and top layouts share one signature; anything not bottom is top
    if text_position in bottom_positions:
        layout = handle_bottom_position
    else:
        layout = handle_top_position
    return layout(sorted_predictions, text, w, h, initial_offset, total_spacing)

detect_prediction_type ¶

detect_prediction_type(predictions)

Detect whether predictions are single-label or multi-label based on structure.

Parameters:

Name	Type	Description	Default
`predictions`	`dict`	The predictions dictionary	required

Returns:

Name	Type	Description
`str`	`str`	'single-label' or 'multi-label'

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def detect_prediction_type(predictions: dict) -> str:
    """
    Classify a predictions payload as single-label or multi-label.

    The decision rests only on the type of the value stored under the
    "predictions" key: a list means single-label; anything else (including
    a missing key) means multi-label.

    Args:
        predictions (dict): The predictions dictionary

    Returns:
        str: 'single-label' or 'multi-label'
    """
    inner = predictions.get("predictions")
    return "single-label" if isinstance(inner, list) else "multi-label"

format_labels ¶

format_labels(predictions, text='Class and Confidence')

Format labels based on specified text option.

Parameters:

Name	Type	Description	Default
`predictions`	`list`	List of prediction dictionaries containing 'class' and 'confidence'	required
`text`	`str`	One of "Class", "Confidence", or "Class and Confidence" (case-sensitive)	`'Class and Confidence'`

Returns:

Name	Type	Description
`list`		Formatted label strings

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def format_labels(predictions, text="Class and Confidence"):
    """
    Format one label string per prediction based on the specified text option.

    Args:
        predictions (list): List of prediction dictionaries containing 'class' and 'confidence'
        text (str): Which fields to render. Must be exactly one of "Class",
            "Confidence", or "Class and Confidence" (matching is case-sensitive)

    Returns:
        list: Formatted label strings, in the same order as ``predictions``.
            Confidences are rendered with two decimal places.

    Raises:
        ValueError: If ``text`` is not one of the supported options
    """
    if text == "Class":
        return [f"{p['class']}" for p in predictions]
    if text == "Confidence":
        return [f"{p['confidence']:.2f}" for p in predictions]
    if text == "Class and Confidence":
        return [f"{p['class']} {p['confidence']:.2f}" for p in predictions]

    # The message spells the options exactly as they are accepted; the
    # previous lower-case wording named values that would be rejected.
    raise ValueError(
        "text must be one of: 'Class', 'Confidence', or 'Class and Confidence'"
        f" (got {text!r})"
    )

format_multi_label_predictions ¶

format_multi_label_predictions(predictions)

Transform multi-label predictions from predicted_classes into standard format.

Parameters:

Name	Type	Description	Default
`predictions`	`dict`	The predictions dictionary	required

Returns:

Type	Description
`List[dict]`	Formatted predictions list

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def format_multi_label_predictions(predictions: dict) -> List[dict]:
    """
    Flatten a multi-label result into a list of per-class prediction dicts.

    For every name in predictions["predicted_classes"], the matching entry of
    predictions["predictions"] supplies the "class_id" and "confidence".

    Args:
        predictions (dict): The predictions dictionary

    Returns:
        List[dict]: One dict per predicted class with the keys "class",
            "class_id" and "confidence", in the order of "predicted_classes"
    """
    return [
        {
            "class": label,
            "class_id": details["class_id"],
            "confidence": details["confidence"],
        }
        for label in predictions["predicted_classes"]
        # single-element loop binds the per-class entry once per label
        for details in (predictions["predictions"][label],)
    ]

handle_bottom_position ¶

handle_bottom_position(
    sorted_predictions,
    text,
    w,
    h,
    initial_offset,
    total_spacing,
)

Handle visualization layout for bottom positions.

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def handle_bottom_position(
    sorted_predictions: List[dict],
    text: str,
    w: int,
    h: int,
    initial_offset: float,
    total_spacing: float,
) -> Tuple[np.ndarray, List[str], List[dict]]:
    """Build the label layout for bottom-anchored positions.

    Predictions are processed in reverse order. Each one gets a box that
    starts at the origin, spans the full width ``w``, and ends
    ``initial_offset + i * total_spacing`` above the bottom edge ``h``.

    Returns:
        Tuple of (boxes array, formatted labels, reversed predictions).
    """
    flipped = sorted_predictions[::-1]

    rows = []
    for rank, _ in enumerate(flipped):
        lower_edge = h - (initial_offset + rank * total_spacing)
        rows.append([0, 0, w, lower_edge])
    xyxy = np.array(rows)

    return xyxy, format_labels(flipped, text), flipped

handle_center_position ¶

handle_center_position(
    sorted_predictions,
    text,
    text_position,
    w,
    h,
    total_spacing,
    text_scale,
    text_padding,
)

Handle visualization layout for center positions.

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def handle_center_position(
    sorted_predictions: List[dict],
    text: str,
    text_position: str,
    w: int,
    h: int,
    total_spacing: float,
    text_scale: float,
    text_padding: int,
) -> Tuple[np.ndarray, List[str], List[dict]]:
    """Handle visualization layout for center positions.

    Every prediction gets one row of height ``total_spacing``; the stack of
    rows is vertically centred in ``h`` (and pinned to ``y = 0`` when it is
    taller than ``h``). Horizontally the rows span the full width ``w``, except
    that ``"CENTER_LEFT"`` moves the left edge right and ``"CENTER_RIGHT"``
    moves the right edge left by an estimated label width plus padding.

    Args:
        sorted_predictions: Predictions to lay out, in display order.
        text: Passed through to ``format_labels``.
        text_position: ``"CENTER_LEFT"``, ``"CENTER_RIGHT"``; any other value
            is treated as plain ``"CENTER"``.
        w: Width used for the right edge of the rows.
        h: Height used to centre the rows vertically.
        total_spacing: Height of one row.
        text_scale: Multiplier applied to the estimated label width.
        text_padding: Padding added on both sides of the estimated label width.

    Returns:
        The ``xyxy`` box array, the labels produced by ``format_labels`` and
        ``sorted_predictions`` unchanged. An empty prediction list yields an
        empty array instead of raising.
    """
    labels = format_labels(sorted_predictions, text)
    n_predictions = len(sorted_predictions)
    total_height = total_spacing * n_predictions
    start_y = max(0, min((h - total_height) / 2, h - total_height))

    # `default=0` keeps an empty label list from raising ValueError, matching
    # the top/bottom handlers which simply return empty results in that case.
    max_label_length = max((len(label) for label in labels), default=0)
    # NOTE(review): presumably an approximate per-character width in pixels at
    # text_scale == 1 — confirm against the label renderer.
    char_width = 15
    label_width = (max_label_length * char_width * text_scale) + (text_padding * 2)
    extra_padding = 20 + max(0, 10 - text_padding) * 3

    # Only the horizontal extent depends on the requested position.
    if text_position == "CENTER_LEFT":
        x_min, x_max = label_width + extra_padding, w
    elif text_position == "CENTER_RIGHT":
        x_min, x_max = 0, w - (label_width + extra_padding)
    else:  # CENTER
        x_min, x_max = 0, w

    xyxy = np.array(
        [
            [
                x_min,
                start_y + i * total_spacing,
                x_max,
                start_y + (i + 1) * total_spacing,
            ]
            for i in range(n_predictions)
        ]
    )

    return xyxy, labels, sorted_predictions

handle_top_position ¶

handle_top_position(
    sorted_predictions,
    text,
    w,
    h,
    initial_offset,
    total_spacing,
)

Handle visualization layout for top positions.

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def handle_top_position(
    sorted_predictions: List[dict],
    text: str,
    w: int,
    h: int,
    initial_offset: float,
    total_spacing: float,
) -> Tuple[np.ndarray, List[str], List[dict]]:
    """Build the boxes and labels used when labels are placed at the top.

    The i-th prediction gets the box
    ``[0, initial_offset + i * total_spacing, w, h]``; the prediction order is
    kept as given.

    Returns:
        The ``xyxy`` box array, the labels produced by ``format_labels`` and
        ``sorted_predictions`` unchanged.
    """
    boxes = []
    for slot, _ in enumerate(sorted_predictions):
        upper_edge = initial_offset + slot * total_spacing
        boxes.append([0, upper_edge, w, h])

    return np.array(boxes), format_labels(sorted_predictions, text), sorted_predictions

validate_prediction_format ¶

validate_prediction_format(predictions, task_type)

Validate that the predictions format matches the specified task type.

Parameters:

Name	Type	Description	Default
`predictions`	`dict`	The predictions dictionary	required
`task_type`	`str`	The specified task type ('single-label' or 'multi-label')	required

Raises:

Type	Description
`ValueError`	If prediction format doesn't match task type

Source code in inference/core/workflows/core_steps/visualizations/classification_label/v1.py

def validate_prediction_format(predictions: dict, task_type: str) -> None:
    """
    Check that the detected prediction format agrees with the declared task type.

    The format is determined by ``detect_prediction_type``; nothing happens
    when it equals ``task_type``.

    Args:
        predictions (dict): The predictions dictionary
        task_type (str): The specified task type ('single-label' or 'multi-label')

    Raises:
        ValueError: If prediction format doesn't match task type
    """
    detected = detect_prediction_type(predictions)

    # Matching formats need no further checks.
    if detected == task_type:
        return

    if detected == "single-label":
        raise ValueError(
            "Received single-label predictions but task_type is set to 'multi-label'. Please correct the task_type setting."
        )

    raise ValueError(
        "Received multi-label predictions but task_type is set to 'single-label'. Please correct the task_type setting."
    )

`core/workflows/core_steps/visualizations/common/annotators`¶

inference.core.workflows.core_steps.visualizations.common.annotators.background_color ¶

Classes¶

BackgroundColorAnnotator ¶

Bases: BaseAnnotator

A class for drawing background colors outside of detected box or mask regions.

Warning

This annotator uses sv.Detections.mask.

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/background_color.py

class BackgroundColorAnnotator(BaseAnnotator):
    """
    A class for drawing background colors outside of detected box or mask regions.
    !!! warning
        This annotator uses `sv.Detections.mask`.
    """

    def __init__(
        self,
        color: Color = Color.BLACK,
        opacity: float = 0.5,
        force_box: bool = False,
    ):
        """
        Args:
            color (Color): The color to use for annotating detections.
            opacity (float): Opacity of the overlay mask. Must be between `0` and `1`.
            force_box (bool): When `True`, bounding boxes decide which regions are
                left untouched even if the detections carry masks.
        """
        self.color: Color = color
        self.opacity = opacity
        self.force_box = force_box

    def annotate(self, scene: np.ndarray, detections: Detections) -> np.ndarray:
        """
        Tints everything outside the detected regions with the background color.

        The whole image is blended with `color` at `opacity`, then the original
        pixels are copied back inside every detection (its mask, or its bounding
        box when no masks are present or `force_box` is set).

        Args:
            scene (np.ndarray): The image to annotate. It is only read; the
                result is written to a new array.
            detections (Detections): Object detections to annotate.
        Returns:
            np.ndarray: A new annotated image with the same shape as `scene`.
        Example:
            ```python
            import supervision as sv
            image = ...
            detections = sv.Detections(...)
            background_color_annotator = sv.BackgroundColorAnnotator()
            annotated_frame = background_color_annotator.annotate(
                scene=image.copy(),
                detections=detections
            )
            ```
        ![background-color-annotator-example](https://media.roboflow.com/
        supervision-annotator-examples/background-color-annotator-example-purple.png)
        """

        colored_mask = np.full_like(scene, self.color.as_bgr(), dtype=np.uint8)

        # Blend in place: colored_mask becomes the tinted version of the scene.
        cv2.addWeighted(
            scene, 1 - self.opacity, colored_mask, self.opacity, 0, dst=colored_mask
        )

        if detections.mask is None or self.force_box:
            for detection_idx in range(len(detections)):
                # Clamp at 0: a negative slice bound would be read by NumPy as an
                # offset from the end and restore the wrong (or an empty) region.
                x1, y1, x2, y2 = np.clip(
                    detections.xyxy[detection_idx].astype(int), 0, None
                )
                colored_mask[y1:y2, x1:x2] = scene[y1:y2, x1:x2]
        else:
            for mask in detections.mask:
                colored_mask[mask] = scene[mask]

        return colored_mask

Methods:¶

init ¶

__init__(color=Color.BLACK, opacity=0.5, force_box=False)

Parameters:

Name	Type	Description	Default
`color`	`Color`	The color to use for annotating detections.	`BLACK`
`opacity`	`float`	Opacity of the overlay mask. Must be between `0` and `1`.	`0.5`

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/background_color.py

def __init__(
    self,
    color: Color = Color.BLACK,
    opacity: float = 0.5,
    force_box: bool = False,
):
    """
    Store the annotator settings on the instance.

    Args:
        color (Color): The color to use for annotating detections.
        opacity (float): Opacity of the overlay mask. Must be between `0` and `1`.
        force_box (bool): Flag kept as `self.force_box`.
    """
    self.force_box = force_box
    self.opacity = opacity
    self.color: Color = color

annotate ¶

annotate(scene, detections)

Annotates the given scene with masks based on the provided detections.

Args:

- `scene` (ImageType): The image where masks will be drawn. `ImageType` is a flexible type, accepting either `numpy.ndarray` or `PIL.Image.Image`.
- `detections` (Detections): Object detections to annotate.

Returns: The annotated image, matching the type of `scene` (`numpy.ndarray` or `PIL.Image.Image`).

Example:

import supervision as sv
image = ...
detections = sv.Detections(...)
background_color_annotator = sv.BackgroundColorAnnotator()
annotated_frame = background_color_annotator.annotate(
    scene=image.copy(),
    detections=detections
)

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/background_color.py

def annotate(self, scene: np.ndarray, detections: Detections) -> np.ndarray:
    """
    Tints everything outside the detected regions with the background color.

    The whole image is blended with `self.color` at `self.opacity`, then the
    original pixels are copied back inside every detection (its mask, or its
    bounding box when no masks are present or `self.force_box` is set).

    Args:
        scene (np.ndarray): The image to annotate. It is only read; the
            result is written to a new array.
        detections (Detections): Object detections to annotate.
    Returns:
        np.ndarray: A new annotated image with the same shape as `scene`.
    Example:
        ```python
        import supervision as sv
        image = ...
        detections = sv.Detections(...)
        background_color_annotator = sv.BackgroundColorAnnotator()
        annotated_frame = background_color_annotator.annotate(
            scene=image.copy(),
            detections=detections
        )
        ```
    ![background-color-annotator-example](https://media.roboflow.com/
    supervision-annotator-examples/background-color-annotator-example-purple.png)
    """

    colored_mask = np.full_like(scene, self.color.as_bgr(), dtype=np.uint8)

    # Blend in place: colored_mask becomes the tinted version of the scene.
    cv2.addWeighted(
        scene, 1 - self.opacity, colored_mask, self.opacity, 0, dst=colored_mask
    )

    if detections.mask is None or self.force_box:
        for detection_idx in range(len(detections)):
            # Clamp at 0: a negative slice bound would be read by NumPy as an
            # offset from the end and restore the wrong (or an empty) region.
            x1, y1, x2, y2 = np.clip(
                detections.xyxy[detection_idx].astype(int), 0, None
            )
            colored_mask[y1:y2, x1:x2] = scene[y1:y2, x1:x2]
    else:
        for mask in detections.mask:
            colored_mask[mask] = scene[mask]

    return colored_mask

inference.core.workflows.core_steps.visualizations.common.annotators.halo ¶

Classes¶

HaloAnnotator ¶

Bases: BaseAnnotator

A class for drawing Halos on an image using provided detections.

Warning

This annotator uses sv.Detections.mask.

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/halo.py

class HaloAnnotator(BaseAnnotator):
    """
    A class for drawing Halos on an image using provided detections.

    !!! warning

        This annotator uses `sv.Detections.mask`.
    """

    def __init__(
        self,
        color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
        opacity: float = 0.8,
        kernel_size: int = 40,
        color_lookup: ColorLookup = ColorLookup.CLASS,
    ):
        """
        Args:
            color (Union[Color, ColorPalette]): The color or color palette to use for
                annotating detections.
            opacity (float): Opacity of the overlay mask. Must be between `0` and `1`.
            kernel_size (int): The size of the average pooling kernel used for creating
                the halo.
            color_lookup (ColorLookup): Strategy for mapping colors to annotations.
                Options are `INDEX`, `CLASS`, `TRACK`.
        """
        self.color: Union[Color, ColorPalette] = color
        self.opacity = opacity
        self.color_lookup: ColorLookup = color_lookup
        self.kernel_size: int = kernel_size

    @ensure_cv2_image_for_annotation
    def annotate(
        self,
        scene: ImageType,
        detections: Detections,
        custom_color_lookup: Optional[np.ndarray] = None,
    ) -> ImageType:
        """
        Annotates the given scene with halos based on the provided detections.

        When the detections carry no masks, each bounding box is used as the
        region of its detection. If no halo remains to be drawn (for example
        when there are no detections) the scene is returned unchanged.

        Args:
            scene (ImageType): The image where masks will be drawn.
                `ImageType` is a flexible type, accepting either `numpy.ndarray`
                or `PIL.Image.Image`.
            detections (Detections): Object detections to annotate.
            custom_color_lookup (Optional[np.ndarray]): Custom color lookup array.
                Allows to override the default color mapping strategy.

        Returns:
            The annotated image, matching the type of `scene` (`numpy.ndarray`
                or `PIL.Image.Image`)

        Example:
            ```python
            import supervision as sv

            image = ...
            detections = sv.Detections(...)

            halo_annotator = sv.HaloAnnotator()
            annotated_frame = halo_annotator.annotate(
                scene=image.copy(),
                detections=detections
            )
            ```

        ![halo-annotator-example](https://media.roboflow.com/
        supervision-annotator-examples/halo-annotator-example-purple.png)
        """
        assert isinstance(scene, np.ndarray)
        colored_mask = np.zeros_like(scene, dtype=np.uint8)
        # Union of all detection regions; the halo is erased inside it below.
        fmask = np.zeros(scene.shape[:2], dtype=bool)

        # Largest detections are painted first so smaller ones end up on top.
        for detection_idx in np.flip(np.argsort(detections.area)):
            color = resolve_color(
                color=self.color,
                detections=detections,
                detection_idx=detection_idx,
                color_lookup=(
                    self.color_lookup
                    if custom_color_lookup is None
                    else custom_color_lookup
                ),
            )
            if detections.mask is None:
                # Clamp at 0: a negative slice bound would be read by NumPy as
                # an offset from the end and mark the wrong region.
                x1, y1, x2, y2 = np.clip(
                    detections.xyxy[detection_idx].astype(int), 0, None
                )
                mask = np.zeros(scene.shape[:2], dtype=bool)
                mask[y1:y2, x1:x2] = True
            else:
                mask = detections.mask[detection_idx]
            fmask = np.logical_or(fmask, mask)
            color_bgr = color.as_bgr()
            colored_mask[mask] = color_bgr

        # Blurring spreads the colors past the region borders; clearing the
        # regions themselves leaves only the surrounding glow.
        colored_mask = cv2.blur(colored_mask, (self.kernel_size, self.kernel_size))
        colored_mask[fmask] = [0, 0, 0]
        gray = cv2.cvtColor(colored_mask, cv2.COLOR_BGR2GRAY)
        peak = gray.max()
        if peak == 0:
            # No halo pixels at all: dividing by zero would fill alpha with NaN
            # and corrupt the scene when cast back to uint8.
            return scene
        alpha = self.opacity * gray / peak
        alpha_mask = alpha[:, :, np.newaxis]
        blended_scene = np.uint8(scene * (1 - alpha_mask) + colored_mask * self.opacity)
        # The result is written back into the caller's array.
        np.copyto(scene, blended_scene)
        return scene

Methods:¶

init ¶

__init__(
    color=ColorPalette.DEFAULT,
    opacity=0.8,
    kernel_size=40,
    color_lookup=ColorLookup.CLASS,
)

Parameters:

Name	Type	Description	Default
`color`	`Union[Color, ColorPalette]`	The color or color palette to use for annotating detections.	`DEFAULT`
`opacity`	`float`	Opacity of the overlay mask. Must be between `0` and `1`.	`0.8`
`kernel_size`	`int`	The size of the average pooling kernel used for creating the halo.	`40`
`color_lookup`	`ColorLookup`	Strategy for mapping colors to annotations. Options are `INDEX`, `CLASS`, `TRACK`.	`CLASS`

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/halo.py

def __init__(
    self,
    color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
    opacity: float = 0.8,
    kernel_size: int = 40,
    color_lookup: ColorLookup = ColorLookup.CLASS,
):
    """
    Store the halo settings on the instance.

    Args:
        color (Union[Color, ColorPalette]): The color or color palette to use for
            annotating detections.
        opacity (float): Opacity of the overlay mask. Must be between `0` and `1`.
        kernel_size (int): The size of the average pooling kernel used for creating
            the halo.
        color_lookup (ColorLookup): Strategy for mapping colors to annotations.
            Options are `INDEX`, `CLASS`, `TRACK`.
    """
    self.kernel_size: int = kernel_size
    self.color_lookup: ColorLookup = color_lookup
    self.opacity = opacity
    self.color: Union[Color, ColorPalette] = color

annotate ¶

annotate(scene, detections, custom_color_lookup=None)

Annotates the given scene with halos based on the provided detections.

Parameters:

Name	Type	Description	Default
`scene`	`ImageType`	The image where masks will be drawn. `ImageType` is a flexible type, accepting either `numpy.ndarray` or `PIL.Image.Image`.	required
`detections`	`Detections`	Object detections to annotate.	required
`custom_color_lookup`	`Optional[ndarray]`	Custom color lookup array. Allows to override the default color mapping strategy.	`None`

Returns:

Type	Description
`ImageType`	The annotated image, matching the type of `scene` (`numpy.ndarray` or `PIL.Image.Image`)

Example

import supervision as sv

image = ...
detections = sv.Detections(...)

halo_annotator = sv.HaloAnnotator()
annotated_frame = halo_annotator.annotate(
    scene=image.copy(),
    detections=detections
)

halo-annotator-example

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/halo.py

@ensure_cv2_image_for_annotation
def annotate(
    self,
    scene: ImageType,
    detections: Detections,
    custom_color_lookup: Optional[np.ndarray] = None,
) -> ImageType:
    """
    Annotates the given scene with halos based on the provided detections.

    When the detections carry no masks, each bounding box is used as the
    region of its detection. If no halo remains to be drawn (for example
    when there are no detections) the scene is returned unchanged.

    Args:
        scene (ImageType): The image where masks will be drawn.
            `ImageType` is a flexible type, accepting either `numpy.ndarray`
            or `PIL.Image.Image`.
        detections (Detections): Object detections to annotate.
        custom_color_lookup (Optional[np.ndarray]): Custom color lookup array.
            Allows to override the default color mapping strategy.

    Returns:
        The annotated image, matching the type of `scene` (`numpy.ndarray`
            or `PIL.Image.Image`)

    Example:
        ```python
        import supervision as sv

        image = ...
        detections = sv.Detections(...)

        halo_annotator = sv.HaloAnnotator()
        annotated_frame = halo_annotator.annotate(
            scene=image.copy(),
            detections=detections
        )
        ```

    ![halo-annotator-example](https://media.roboflow.com/
    supervision-annotator-examples/halo-annotator-example-purple.png)
    """
    assert isinstance(scene, np.ndarray)
    colored_mask = np.zeros_like(scene, dtype=np.uint8)
    # Union of all detection regions; the halo is erased inside it below.
    fmask = np.zeros(scene.shape[:2], dtype=bool)

    # Largest detections are painted first so smaller ones end up on top.
    for detection_idx in np.flip(np.argsort(detections.area)):
        color = resolve_color(
            color=self.color,
            detections=detections,
            detection_idx=detection_idx,
            color_lookup=(
                self.color_lookup
                if custom_color_lookup is None
                else custom_color_lookup
            ),
        )
        if detections.mask is None:
            # Clamp at 0: a negative slice bound would be read by NumPy as
            # an offset from the end and mark the wrong region.
            x1, y1, x2, y2 = np.clip(
                detections.xyxy[detection_idx].astype(int), 0, None
            )
            mask = np.zeros(scene.shape[:2], dtype=bool)
            mask[y1:y2, x1:x2] = True
        else:
            mask = detections.mask[detection_idx]
        fmask = np.logical_or(fmask, mask)
        color_bgr = color.as_bgr()
        colored_mask[mask] = color_bgr

    # Blurring spreads the colors past the region borders; clearing the
    # regions themselves leaves only the surrounding glow.
    colored_mask = cv2.blur(colored_mask, (self.kernel_size, self.kernel_size))
    colored_mask[fmask] = [0, 0, 0]
    gray = cv2.cvtColor(colored_mask, cv2.COLOR_BGR2GRAY)
    peak = gray.max()
    if peak == 0:
        # No halo pixels at all: dividing by zero would fill alpha with NaN
        # and corrupt the scene when cast back to uint8.
        return scene
    alpha = self.opacity * gray / peak
    alpha_mask = alpha[:, :, np.newaxis]
    blended_scene = np.uint8(scene * (1 - alpha_mask) + colored_mask * self.opacity)
    # The result is written back into the caller's array.
    np.copyto(scene, blended_scene)
    return scene

inference.core.workflows.core_steps.visualizations.common.annotators.model_comparison ¶

Classes¶

ModelComparisonAnnotator ¶

Bases: BaseAnnotator

A class for annotating images by highlighting regions predicted by two different models. This annotator visually distinguishes areas uniquely predicted by each model as well as the background where neither model made a prediction.

Attributes:

Name	Type	Description
`color_a`	`Color`	Color used to highlight predictions made only by Model A.
`color_b`	`Color`	Color used to highlight predictions made only by Model B.
`background_color`	`Color`	Color used for parts of the image where neither model made a prediction.
`opacity`	`float`	Opacity level of the overlays, ranging between 0 and 1.
`force_box`	`bool`	If True, forces the use of bounding boxes for predictions even if masks are available.

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/model_comparison.py

class ModelComparisonAnnotator(BaseAnnotator):
    """
    A class for annotating images by highlighting regions predicted by two different models.
    This annotator visually distinguishes areas uniquely predicted by each model as well as
    the background where neither model made a prediction.

    Attributes:
        color_a (Color): Color used to highlight predictions made only by Model A.
        color_b (Color): Color used to highlight predictions made only by Model B.
        background_color (Color): Color used for parts of the image where neither model made a prediction.
        opacity (float): Opacity level of the overlays, ranging between 0 and 1.
        force_box (bool): If True, forces the use of bounding boxes for predictions even if masks are available.
    """

    def __init__(
        self,
        color_a: Color = Color.GREEN,
        color_b: Color = Color.RED,
        background_color: Color = Color.BLACK,
        opacity: float = 0.7,
        force_box: bool = False,
    ):
        """
        Initializes the ModelComparisonAnnotator with the specified colors, opacity, and behavior.

        Args:
            color_a (Color): Color used to highlight predictions made only by Model A.
            color_b (Color): Color used to highlight predictions made only by Model B.
            background_color (Color): Color for parts of the image not covered by any prediction.
            opacity (float): Opacity of the overlay mask, must be between 0 and 1.
            force_box (bool): If True, use bounding boxes even when masks are available.
        """
        self.color_a: Color = color_a
        self.color_b: Color = color_b
        self.background_color: Color = background_color
        self.opacity = opacity
        self.force_box = force_box

    def annotate(
        self, scene: np.ndarray, detections_a: Detections, detections_b: Detections
    ) -> np.ndarray:
        """
        Annotates the given scene with highlights representing predictions from two models.

        Pixels covered only by Model A, only by Model B, or by neither model are
        blended with the matching color; pixels covered by both are left as they
        are. The blending is written into `scene` itself.

        Args:
            scene (np.ndarray): Original image as a NumPy array (H x W x C).
                It is modified in place.
            detections_a (Detections): Predictions from Model A.
            detections_b (Detections): Predictions from Model B.

        Returns:
            np.ndarray: The same `scene` array with the overlays applied.
        """
        frame_shape = scene.shape[:2]

        def predicted_region(detections):
            """Return a boolean H x W mask of the pixels covered by `detections`."""
            region = np.zeros(frame_shape, dtype=bool)
            if detections.mask is None or self.force_box:
                for detection_idx in range(len(detections)):
                    # Clamp at 0: a negative slice bound would be read by NumPy
                    # as an offset from the end and mark the wrong region.
                    x1, y1, x2, y2 = np.clip(
                        detections.xyxy[detection_idx].astype(int), 0, None
                    )
                    region[y1:y2, x1:x2] = True
            else:
                for mask in detections.mask:
                    region[mask.astype(bool)] = True
            return region

        def apply_overlay(base_img, overlay_img, mask, opacity):
            """Blend `overlay_img` into `base_img` (in place) where `mask` is True."""
            blended = cv2.addWeighted(base_img, 1 - opacity, overlay_img, opacity, 0)
            # A 2-D boolean mask selects whole pixels, i.e. all channels at once.
            base_img[mask] = blended[mask]
            return base_img

        a_predicted = predicted_region(detections_a)
        b_predicted = predicted_region(detections_b)

        # The three regions below are mutually exclusive.
        only_a_predicted = a_predicted & ~b_predicted
        only_b_predicted = b_predicted & ~a_predicted
        neither_predicted = ~(a_predicted | b_predicted)

        # Create full-color overlay images (colors are BGR tuples)
        overlay_background = np.full_like(
            scene, self.background_color.as_bgr(), dtype=np.uint8
        )
        overlay_a = np.full_like(scene, self.color_a.as_bgr(), dtype=np.uint8)
        overlay_b = np.full_like(scene, self.color_b.as_bgr(), dtype=np.uint8)

        # Apply background overlay where neither model predicted
        scene = apply_overlay(
            scene, overlay_background, neither_predicted, self.opacity
        )

        # Apply overlay for only Model A predictions
        scene = apply_overlay(scene, overlay_a, only_a_predicted, self.opacity)

        # Apply overlay for only Model B predictions
        scene = apply_overlay(scene, overlay_b, only_b_predicted, self.opacity)

        # Areas where both models predicted remain unchanged (no overlay)

        return scene

Methods:¶

init ¶

__init__(
    color_a=Color.GREEN,
    color_b=Color.RED,
    background_color=Color.BLACK,
    opacity=0.7,
    force_box=False,
)

Initializes the ModelComparisonAnnotator with the specified colors, opacity, and behavior.

Parameters:

Name	Type	Description	Default
`color_a`	`Color`	Color used to highlight predictions made only by Model A.	`GREEN`
`color_b`	`Color`	Color used to highlight predictions made only by Model B.	`RED`
`background_color`	`Color`	Color for parts of the image not covered by any prediction.	`BLACK`
`opacity`	`float`	Opacity of the overlay mask, must be between 0 and 1.	`0.7`
`force_box`	`bool`	Whether to use bounding boxes instead of masks if masks are available.	`False`

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/model_comparison.py

def __init__(
    self,
    color_a: Color = Color.GREEN,
    color_b: Color = Color.RED,
    background_color: Color = Color.BLACK,
    opacity: float = 0.7,
    force_box: bool = False,
):
    """
    Store the comparison colors, overlay opacity and box/mask preference.

    Args:
        color_a (Color): Color used to highlight predictions made only by Model A.
        color_b (Color): Color used to highlight predictions made only by Model B.
        background_color (Color): Color for parts of the image not covered by any prediction.
        opacity (float): Opacity of the overlay mask, must be between 0 and 1.
        force_box (bool): Flag kept as `self.force_box`.
    """
    self.force_box = force_box
    self.opacity = opacity
    self.background_color: Color = background_color
    self.color_b: Color = color_b
    self.color_a: Color = color_a

annotate ¶

annotate(scene, detections_a, detections_b)

Annotates the given scene with highlights representing predictions from two models.

Parameters:

Name	Type	Description	Default
`scene`	`ndarray`	Original image as a NumPy array (H x W x C).	required
`detections_a`	`Detections`	Predictions from Model A.	required
`detections_b`	`Detections`	Predictions from Model B.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Annotated image as a NumPy array.

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/model_comparison.py

def annotate(
    self, scene: np.ndarray, detections_a: Detections, detections_b: Detections
) -> np.ndarray:
    """
    Annotates the given scene with highlights representing predictions from two models.

    Pixels covered only by Model A, only by Model B, or by neither model are
    blended with the matching color; pixels covered by both are left as they
    are. The blending is written into `scene` itself.

    Args:
        scene (np.ndarray): Original image as a NumPy array (H x W x C).
            It is modified in place.
        detections_a (Detections): Predictions from Model A.
        detections_b (Detections): Predictions from Model B.

    Returns:
        np.ndarray: The same `scene` array with the overlays applied.
    """
    frame_shape = scene.shape[:2]

    def predicted_region(detections):
        """Return a boolean H x W mask of the pixels covered by `detections`."""
        region = np.zeros(frame_shape, dtype=bool)
        if detections.mask is None or self.force_box:
            for detection_idx in range(len(detections)):
                # Clamp at 0: a negative slice bound would be read by NumPy
                # as an offset from the end and mark the wrong region.
                x1, y1, x2, y2 = np.clip(
                    detections.xyxy[detection_idx].astype(int), 0, None
                )
                region[y1:y2, x1:x2] = True
        else:
            for mask in detections.mask:
                region[mask.astype(bool)] = True
        return region

    def apply_overlay(base_img, overlay_img, mask, opacity):
        """Blend `overlay_img` into `base_img` (in place) where `mask` is True."""
        blended = cv2.addWeighted(base_img, 1 - opacity, overlay_img, opacity, 0)
        # A 2-D boolean mask selects whole pixels, i.e. all channels at once.
        base_img[mask] = blended[mask]
        return base_img

    a_predicted = predicted_region(detections_a)
    b_predicted = predicted_region(detections_b)

    # The three regions below are mutually exclusive.
    only_a_predicted = a_predicted & ~b_predicted
    only_b_predicted = b_predicted & ~a_predicted
    neither_predicted = ~(a_predicted | b_predicted)

    # Create full-color overlay images (colors are BGR tuples)
    overlay_background = np.full_like(
        scene, self.background_color.as_bgr(), dtype=np.uint8
    )
    overlay_a = np.full_like(scene, self.color_a.as_bgr(), dtype=np.uint8)
    overlay_b = np.full_like(scene, self.color_b.as_bgr(), dtype=np.uint8)

    # Apply background overlay where neither model predicted
    scene = apply_overlay(
        scene, overlay_background, neither_predicted, self.opacity
    )

    # Apply overlay for only Model A predictions
    scene = apply_overlay(scene, overlay_a, only_a_predicted, self.opacity)

    # Apply overlay for only Model B predictions
    scene = apply_overlay(scene, overlay_b, only_b_predicted, self.opacity)

    # Areas where both models predicted remain unchanged (no overlay)

    return scene

inference.core.workflows.core_steps.visualizations.common.annotators.polygon ¶

Classes¶

PolygonAnnotator ¶

Bases: BaseAnnotator

A class for drawing polygons on an image using provided detections.

Warning

This annotator uses sv.Detections.mask.

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/polygon.py

class PolygonAnnotator(BaseAnnotator):
    """
    A class for drawing polygons on an image using provided detections.

    !!! warning

        This annotator uses `sv.Detections.mask`.
    """

    def __init__(
        self,
        color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
        thickness: int = 2,
        color_lookup: ColorLookup = ColorLookup.CLASS,
    ):
        """
        Args:
            color (Union[Color, ColorPalette]): The color or color palette to use for
                annotating detections.
            thickness (int): Thickness of the polygon lines.
            color_lookup (ColorLookup): Strategy for mapping colors to annotations.
                Options are `INDEX`, `CLASS`, `TRACK`.
        """
        self.color: Union[Color, ColorPalette] = color
        self.thickness: int = thickness
        self.color_lookup: ColorLookup = color_lookup

    @ensure_cv2_image_for_annotation
    def annotate(
        self,
        scene: ImageType,
        detections: Detections,
        custom_color_lookup: Optional[np.ndarray] = None,
    ) -> ImageType:
        """
        Annotates the given scene with polygons based on the provided detections.

        When `detections.mask` is `None`, the bounding box of each detection is
        drawn as a rectangle instead. Otherwise the mask is cropped to the
        detection's bounding box (clamped to the mask bounds) before contours
        are extracted, and the resulting polygons are shifted back to
        full-frame coordinates.

        Args:
            scene (ImageType): The image where polygons will be drawn.
                `ImageType` is a flexible type, accepting either `numpy.ndarray`
                or `PIL.Image.Image`.
            detections (Detections): Object detections to annotate.
            custom_color_lookup (Optional[np.ndarray]): Custom color lookup array.
                Allows to override the default color mapping strategy.

        Returns:
            The annotated image, matching the type of `scene` (`numpy.ndarray`
                or `PIL.Image.Image`)

        Example:
            ```python
            import supervision as sv

            image = ...
            detections = sv.Detections(...)

            polygon_annotator = sv.PolygonAnnotator()
            annotated_frame = polygon_annotator.annotate(
                scene=image.copy(),
                detections=detections
            )
            ```

        ![polygon-annotator-example](https://media.roboflow.com/
        supervision-annotator-examples/polygon-annotator-example-purple.png)
        """
        assert isinstance(scene, np.ndarray)

        # The lookup strategy does not depend on the detection - resolve it once
        color_lookup = (
            self.color_lookup if custom_color_lookup is None else custom_color_lookup
        )

        for detection_idx in range(len(detections)):
            color = resolve_color(
                color=self.color,
                detections=detections,
                detection_idx=detection_idx,
                color_lookup=color_lookup,
            )
            x1, y1, x2, y2 = detections.xyxy[detection_idx].astype(int)

            if detections.mask is None:
                cv2.rectangle(
                    img=scene,
                    pt1=(x1, y1),
                    pt2=(x2, y2),
                    color=color.as_bgr(),
                    thickness=self.thickness,
                )
                continue

            mask = detections.mask[detection_idx]
            mask_h, mask_w = mask.shape[:2]

            # Crop mask to bounding box so contour extraction only scans the
            # detection area instead of the full frame. The bounds are clamped:
            # a negative start would otherwise be treated as a from-the-end
            # index, and the far edges are widened by one pixel because the
            # slice end is exclusive and `astype(int)` truncates.
            left, top = max(x1, 0), max(y1, 0)
            right, bottom = min(x2 + 1, mask_w), min(y2 + 1, mask_h)
            if right <= left or bottom <= top:
                # Degenerate or fully off-frame box: use the whole mask
                left, top = 0, 0
                mask_crop = mask
            else:
                mask_crop = mask[top:bottom, left:right]

            # Offset polygon points back to full-frame coordinates
            offset = np.array([[left, top]])
            for polygon in mask_to_polygons(mask=mask_crop):
                full_polygon = (polygon + offset).astype(np.int32)
                scene = draw_polygon(
                    scene=scene,
                    polygon=full_polygon,
                    color=color,
                    thickness=self.thickness,
                )

        return scene

Methods:¶

`__init__` ¶

__init__(
    color=ColorPalette.DEFAULT,
    thickness=2,
    color_lookup=ColorLookup.CLASS,
)

Parameters:

Name	Type	Description	Default
`color`	`Union[Color, ColorPalette]`	The color or color palette to use for annotating detections.	`DEFAULT`
`thickness`	`int`	Thickness of the polygon lines.	`2`
`color_lookup`	`ColorLookup`	Strategy for mapping colors to annotations. Options are `INDEX`, `CLASS`, `TRACK`.	`CLASS`

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/polygon.py

def __init__(
    self,
    color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
    thickness: int = 2,
    color_lookup: ColorLookup = ColorLookup.CLASS,
):
    """
    Store the drawing configuration on the instance.

    Args:
        color (Union[Color, ColorPalette]): The color or color palette to use for
            annotating detections.
        thickness (int): Thickness of the polygon lines.
        color_lookup (ColorLookup): Strategy for mapping colors to annotations.
            Options are `INDEX`, `CLASS`, `TRACK`.
    """
    # Values are kept as given; no validation or conversion happens here
    self.color: Union[Color, ColorPalette] = color
    self.thickness: int = thickness
    self.color_lookup: ColorLookup = color_lookup

annotate ¶

annotate(scene, detections, custom_color_lookup=None)

Annotates the given scene with polygons based on the provided detections.

Parameters:

Name	Type	Description	Default
`scene`	`ImageType`	The image where polygons will be drawn. `ImageType` is a flexible type, accepting either `numpy.ndarray` or `PIL.Image.Image`.	required
`detections`	`Detections`	Object detections to annotate.	required
`custom_color_lookup`	`Optional[ndarray]`	Custom color lookup array. Allows to override the default color mapping strategy.	`None`

Returns:

Type	Description
`ImageType`	The annotated image, matching the type of `scene` (`numpy.ndarray` or `PIL.Image.Image`)

Example

import supervision as sv

image = ...
detections = sv.Detections(...)

polygon_annotator = sv.PolygonAnnotator()
annotated_frame = polygon_annotator.annotate(
    scene=image.copy(),
    detections=detections
)

polygon-annotator-example

Source code in inference/core/workflows/core_steps/visualizations/common/annotators/polygon.py

@ensure_cv2_image_for_annotation
def annotate(
    self,
    scene: ImageType,
    detections: Detections,
    custom_color_lookup: Optional[np.ndarray] = None,
) -> ImageType:
    """
    Annotates the given scene with polygons based on the provided detections.

    When `detections.mask` is `None`, the bounding box of each detection is
    drawn as a rectangle instead. Otherwise the mask is cropped to the
    detection's bounding box (clamped to the mask bounds) before contours
    are extracted, and the resulting polygons are shifted back to
    full-frame coordinates.

    Args:
        scene (ImageType): The image where polygons will be drawn.
            `ImageType` is a flexible type, accepting either `numpy.ndarray`
            or `PIL.Image.Image`.
        detections (Detections): Object detections to annotate.
        custom_color_lookup (Optional[np.ndarray]): Custom color lookup array.
            Allows to override the default color mapping strategy.

    Returns:
        The annotated image, matching the type of `scene` (`numpy.ndarray`
            or `PIL.Image.Image`)

    Example:
        ```python
        import supervision as sv

        image = ...
        detections = sv.Detections(...)

        polygon_annotator = sv.PolygonAnnotator()
        annotated_frame = polygon_annotator.annotate(
            scene=image.copy(),
            detections=detections
        )
        ```

    ![polygon-annotator-example](https://media.roboflow.com/
    supervision-annotator-examples/polygon-annotator-example-purple.png)
    """
    assert isinstance(scene, np.ndarray)

    # The lookup strategy does not depend on the detection - resolve it once
    color_lookup = (
        self.color_lookup if custom_color_lookup is None else custom_color_lookup
    )

    for detection_idx in range(len(detections)):
        color = resolve_color(
            color=self.color,
            detections=detections,
            detection_idx=detection_idx,
            color_lookup=color_lookup,
        )
        x1, y1, x2, y2 = detections.xyxy[detection_idx].astype(int)

        if detections.mask is None:
            cv2.rectangle(
                img=scene,
                pt1=(x1, y1),
                pt2=(x2, y2),
                color=color.as_bgr(),
                thickness=self.thickness,
            )
            continue

        mask = detections.mask[detection_idx]
        mask_h, mask_w = mask.shape[:2]

        # Crop mask to bounding box so contour extraction only scans the
        # detection area instead of the full frame. The bounds are clamped:
        # a negative start would otherwise be treated as a from-the-end
        # index, and the far edges are widened by one pixel because the
        # slice end is exclusive and `astype(int)` truncates.
        left, top = max(x1, 0), max(y1, 0)
        right, bottom = min(x2 + 1, mask_w), min(y2 + 1, mask_h)
        if right <= left or bottom <= top:
            # Degenerate or fully off-frame box: use the whole mask
            left, top = 0, 0
            mask_crop = mask
        else:
            mask_crop = mask[top:bottom, left:right]

        # Offset polygon points back to full-frame coordinates
        offset = np.array([[left, top]])
        for polygon in mask_to_polygons(mask=mask_crop):
            full_polygon = (polygon + offset).astype(np.int32)
            scene = draw_polygon(
                scene=scene,
                polygon=full_polygon,
                color=color,
                thickness=self.thickness,
            )

    return scene

`core/workflows/core_steps/visualizations/text_display`¶

inference.core.workflows.core_steps.visualizations.text_display.utils ¶

Functions:¶

align_offset ¶

align_offset(text_align, max_width, line_width)

Calculate horizontal offset for text alignment.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def align_offset(text_align: str, max_width: int, line_width: int) -> int:
    """Return the horizontal shift that aligns a line inside `max_width`.

    Args:
        text_align: `"center"` or `"right"`; any other value is treated as
            left alignment.
        max_width: Width available for the line.
        line_width: Width of the line being placed.

    Returns:
        The full leftover width for right alignment, half of it (floor
        division) for centering, and 0 otherwise.
    """
    leftover = max_width - line_width
    if text_align == "right":
        return leftover
    if text_align == "center":
        return leftover // 2
    # Anything else falls back to left alignment
    return 0

calculate_relative_position ¶

calculate_relative_position(
    anchor,
    offset_x,
    offset_y,
    box_width,
    box_height,
    img_width,
    img_height,
)

Calculate the top-left corner position for a box positioned relative to an image anchor.

Parameters:

Name	Type	Description	Default
`anchor`	`str`	Anchor point name (e.g., "top_left", "center", "bottom_right")	required
`offset_x`	`int`	Horizontal offset from anchor point (positive = right)	required
`offset_y`	`int`	Vertical offset from anchor point (positive = down)	required
`box_width`	`int`	Width of the box to position	required
`box_height`	`int`	Height of the box to position	required
`img_width`	`int`	Width of the image	required
`img_height`	`int`	Height of the image	required

Returns:

Type	Description
`Tuple[int, int]`	Tuple of (x, y) coordinates for the top-left corner of the box

Raises:

Type	Description
`ValueError`	If anchor is not recognized

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def calculate_relative_position(
    anchor: str,
    offset_x: int,
    offset_y: int,
    box_width: int,
    box_height: int,
    img_width: int,
    img_height: int,
) -> Tuple[int, int]:
    """Locate the top-left corner of a box placed relative to an image anchor.

    The anchor name is looked up case-insensitively in `ANCHORS`, which
    yields a pair of multipliers (presumably fractions in [0, 1] - confirm
    against the `ANCHORS` definition). Each multiplier is applied to both
    the image size and the box size, so the matching point of the box lands
    on the anchor point of the image before the offsets are added.

    Args:
        anchor: Anchor point name (e.g., "top_left", "center", "bottom_right")
        offset_x: Horizontal offset from anchor point (positive = right)
        offset_y: Vertical offset from anchor point (positive = down)
        box_width: Width of the box to position
        box_height: Height of the box to position
        img_width: Width of the image
        img_height: Height of the image

    Returns:
        Tuple of (x, y) coordinates for the top-left corner of the box

    Raises:
        ValueError: If anchor is not recognized
    """
    lookup_name = anchor.lower()
    try:
        factor_x, factor_y = ANCHORS[lookup_name]
    except KeyError as e:
        raise ValueError(
            f"Unknown anchor: {anchor!r}. Must be one of {sorted(ANCHORS.keys())}"
        ) from e

    def _scaled(factor, extent: int) -> int:
        # Round to the nearest integer pixel
        return int(round(factor * extent))

    box_x = _scaled(factor_x, img_width) - _scaled(factor_x, box_width) + offset_x
    box_y = _scaled(factor_y, img_height) - _scaled(factor_y, box_height) + offset_y
    return box_x, box_y

clamp_box ¶

clamp_box(box_x, box_y, box_w, box_h, img_w, img_h)

Clamp box position to image bounds.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def clamp_box(
    box_x: int, box_y: int, box_w: int, box_h: int, img_w: int, img_h: int
) -> Tuple[int, int]:
    """Clamp a box's top-left corner so the box stays inside the image.

    Each axis is handled independently: when the box is larger than the
    image along an axis, the coordinate is pinned to 0; otherwise it is
    limited to the range `[0, image size - box size]`.

    Returns:
        Tuple of the clamped (x, y) coordinates.
    """

    def _clamp_axis(origin: int, size: int, limit: int) -> int:
        if size > limit:
            # Box cannot fit along this axis - pin it to the start
            return 0
        return max(0, min(origin, limit - size))

    return _clamp_axis(box_x, box_w, img_w), _clamp_axis(box_y, box_h, img_h)

compute_layout ¶

compute_layout(
    *,
    formatted_text,
    font,
    font_scale,
    font_thickness,
    padding,
    position_mode,
    position_x,
    position_y,
    anchor,
    offset_x,
    offset_y,
    img_w,
    img_h
)

Compute text layout including dimensions and position.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def compute_layout(
    *,
    formatted_text: str,
    font,
    font_scale: float,
    font_thickness: int,
    padding: int,
    position_mode: str,
    position_x: int,
    position_y: int,
    anchor: str,
    offset_x: int,
    offset_y: int,
    img_w: int,
    img_h: int,
) -> TextLayout:
    """Measure the text and work out the size and position of its box.

    The text is split on newlines (empty text yields a single empty line).
    Vertical metrics come from measuring the reference string "Ag"; line
    spacing is a quarter of the line advance, at least one pixel. Lines that
    are blank after stripping count as zero width. The box is the text
    extent plus `padding` on every side. It is placed at
    (`position_x`, `position_y`) when `position_mode` is "absolute", and
    relative to `anchor` with the given offsets otherwise, then clamped to
    the image.

    Returns:
        TextLayout holding the lines, their widths, the metrics and the box.
    """
    text_lines = formatted_text.split("\n") if formatted_text else [""]

    # Reference vertical metrics, shared by every line
    (_, ref_height), baseline = cv2.getTextSize(
        "Ag", font, font_scale, font_thickness
    )
    line_advance = ref_height + baseline
    line_spacing = max(1, int(round(0.25 * line_advance)))

    # Per-line widths; whitespace-only lines are not measured
    widths = []
    for text_line in text_lines:
        if text_line.strip():
            (measured, _), _ = cv2.getTextSize(
                text_line, font, font_scale, font_thickness
            )
        else:
            measured = 0
        widths.append(measured)
    widest = max(widths, default=0)

    line_count = len(text_lines)
    text_height = line_count * line_advance + max(0, line_count - 1) * line_spacing

    box_w = widest + 2 * padding
    box_h = text_height + 2 * padding

    if position_mode != "absolute":
        box_x, box_y = calculate_relative_position(
            anchor=anchor,
            offset_x=offset_x,
            offset_y=offset_y,
            box_width=box_w,
            box_height=box_h,
            img_width=img_w,
            img_height=img_h,
        )
    else:
        box_x, box_y = position_x, position_y

    box_x, box_y = clamp_box(box_x, box_y, box_w, box_h, img_w, img_h)

    return TextLayout(
        lines=text_lines,
        line_widths=widths,
        max_width=widest,
        ref_height=ref_height,
        line_advance=line_advance,
        line_spacing=line_spacing,
        box_x=box_x,
        box_y=box_y,
        box_w=box_w,
        box_h=box_h,
    )

draw_background ¶

draw_background(
    img,
    x1,
    y1,
    x2,
    y2,
    bg_color_bgr,
    background_opacity,
    border_radius,
)

Draw background rectangle with optional transparency and rounded corners.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def draw_background(
    img: np.ndarray,
    x1: int,
    y1: int,
    x2: int,
    y2: int,
    bg_color_bgr: Optional[Tuple[int, int, int]],
    background_opacity: float,
    border_radius: int,
) -> None:
    """Paint the background box onto `img` in place.

    Nothing is drawn when no colour is given, when the box has no area, or
    when the opacity is not above zero. Opacity below 1.0 goes through alpha
    blending; otherwise the shape is drawn directly, rounded when
    `border_radius` is positive.

    Args:
        img: Image to draw on.
        x1: Left edge of the box.
        y1: Top edge of the box.
        x2: Right edge of the box (exclusive).
        y2: Bottom edge of the box (exclusive).
        bg_color_bgr: Fill colour, or None to skip drawing.
        background_opacity: Opacity of the fill.
        border_radius: Corner radius; values <= 0 give square corners.
    """
    if bg_color_bgr is None:
        return
    if x2 <= x1 or y2 <= y1:
        return
    if not background_opacity > 0.0:
        return

    top_left = (x1, y1)

    if background_opacity < 1.0:
        # Alpha blending required
        draw_background_with_alpha(
            img=img,
            pt1=top_left,
            pt2=(x2, y2),
            color=bg_color_bgr,
            alpha=background_opacity,
            border_radius=border_radius,
        )
        return

    # Fully opaque - draw directly. OpenCV treats the end corner as
    # inclusive, so the exclusive end coordinates are reduced by one.
    bottom_right = (x2 - 1, y2 - 1)
    if border_radius > 0:
        draw_rounded_rectangle(
            img=img,
            pt1=top_left,
            pt2=bottom_right,
            color=bg_color_bgr,
            radius=border_radius,
        )
    else:
        cv2.rectangle(img, top_left, bottom_right, bg_color_bgr, -1)

draw_background_with_alpha ¶

draw_background_with_alpha(
    img, pt1, pt2, color, alpha, border_radius
)

Draw a filled rectangle with alpha blending using overlay compositing.

Uses proper overlay-based alpha blending for smooth antialiased edges, especially important for rounded rectangles.

Process: (1) extract the affected region; (2) create an overlay and draw the shape on it; (3) alpha-blend the overlay with the original region; (4) write the blended result back.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def draw_background_with_alpha(
    img: np.ndarray,
    pt1: Tuple[int, int],
    pt2: Tuple[int, int],
    color: Tuple[int, int, int],
    alpha: float,
    border_radius: int,
) -> None:
    """Blend a filled (optionally rounded) rectangle into `img` in place.

    Only the part of the rectangle inside the image is processed. A copy of
    that region receives the solid shape, the copy is mixed with the
    untouched region using weight `alpha`, and the mix is written back.
    Nothing happens when the clamped rectangle has no area.

    Args:
        img: Image to modify.
        pt1: (x, y) of the top-left corner.
        pt2: (x, y) of the bottom-right corner (exclusive).
        color: Fill colour.
        alpha: Weight of the drawn shape in the blend.
        border_radius: Corner radius; values <= 0 give square corners.
    """
    x1, y1 = pt1
    x2, y2 = pt2

    # Restrict the rectangle to the image bounds
    frame_h, frame_w = img.shape[:2]
    left, top = max(0, x1), max(0, y1)
    right, bottom = min(frame_w, x2), min(frame_h, y2)
    if right <= left or bottom <= top:
        return

    window = (slice(top, bottom), slice(left, right))
    untouched = img[window]
    overlay = untouched.copy()

    # Shape coordinates are relative to the region; OpenCV end points are
    # inclusive, so the far corner is size - 1.
    far_corner = (right - left - 1, bottom - top - 1)
    if border_radius > 0:
        draw_rounded_rectangle(
            img=overlay,
            pt1=(0, 0),
            pt2=far_corner,
            color=color,
            radius=border_radius,
        )
    else:
        cv2.rectangle(overlay, (0, 0), far_corner, color, -1)

    # result = overlay * alpha + original * (1 - alpha), written back in place
    img[window] = cv2.addWeighted(overlay, alpha, untouched, 1 - alpha, 0)

draw_rounded_rectangle ¶

draw_rounded_rectangle(img, pt1, pt2, color, radius)

Draw a filled rounded rectangle on an image.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def draw_rounded_rectangle(
    img: np.ndarray,
    pt1: Tuple[int, int],
    pt2: Tuple[int, int],
    color: Tuple[int, int, int],
    radius: int,
) -> None:
    """Fill a rectangle with rounded corners on `img` in place.

    The radius is capped at half of the shorter side. When the effective
    radius is not positive, a plain filled rectangle is drawn. Otherwise the
    shape is built from two overlapping rectangles plus four filled
    quarter-ellipses at the corners. Nothing is drawn when `pt2` is not
    strictly right of and below `pt1`.

    Args:
        img: Image to draw on.
        pt1: (x, y) of the top-left corner.
        pt2: (x, y) of the bottom-right corner.
        color: Fill colour.
        radius: Requested corner radius.
    """
    (x1, y1), (x2, y2) = pt1, pt2

    # Reject boxes without positive extent on both axes
    if x2 <= x1 or y2 <= y1:
        return

    radius = min(radius, (x2 - x1) // 2, (y2 - y1) // 2)
    if radius <= 0:
        cv2.rectangle(img, pt1, pt2, color, -1)
        return

    inner_left, inner_right = x1 + radius, x2 - radius
    inner_top, inner_bottom = y1 + radius, y2 - radius

    # Cross-shaped body: one full-height and one full-width rectangle
    cv2.rectangle(img, (inner_left, y1), (inner_right, y2), color, -1)
    cv2.rectangle(img, (x1, inner_top), (x2, inner_bottom), color, -1)

    # Quarter-ellipses: (centre, rotation angle), each sweeping 0..90 degrees
    corner_arcs = (
        ((inner_left, inner_top), 180),
        ((inner_right, inner_top), 270),
        ((inner_left, inner_bottom), 90),
        ((inner_right, inner_bottom), 0),
    )
    for centre, rotation in corner_arcs:
        cv2.ellipse(img, centre, (radius, radius), rotation, 0, 90, color, -1)

draw_text_lines ¶

draw_text_lines(
    img,
    *,
    layout,
    padding,
    text_align,
    font,
    font_scale,
    font_thickness,
    color_bgr
)

Draw text lines on the image.

Source code in inference/core/workflows/core_steps/visualizations/text_display/utils.py

def draw_text_lines(
    img: np.ndarray,
    *,
    layout: TextLayout,
    padding: int,
    text_align: str,
    font,
    font_scale: float,
    font_thickness: int,
    color_bgr: Tuple[int, int, int],
) -> None:
    """Render every line of `layout` onto `img` in place.

    Lines start `padding` pixels inside the layout box. Each non-blank line
    is shifted horizontally according to `text_align` and drawn with its
    baseline `layout.ref_height` below the top of the line. Blank lines draw
    nothing but still take up vertical space. A line is skipped when its
    baseline is not below the top of the image, its top is at or below the
    bottom of the image, or its start is at or beyond the right edge.
    """
    img_h, img_w = img.shape[:2]
    left_edge = layout.box_x + padding
    line_top = layout.box_y + padding
    last_index = len(layout.lines) - 1

    for index, text_line in enumerate(layout.lines):
        if text_line.strip():
            shift = align_offset(
                text_align, layout.max_width, layout.line_widths[index]
            )
            origin_x = left_edge + shift
            baseline_y = line_top + layout.ref_height

            in_view = baseline_y > 0 and line_top < img_h and origin_x < img_w
            if in_view:
                cv2.putText(
                    img,
                    text_line,
                    (origin_x, baseline_y),
                    font,
                    font_scale,
                    color_bgr,
                    font_thickness,
                    cv2.LINE_AA,
                )

        # Advance to the next line; spacing is only added between lines
        line_top += layout.line_advance
        if index < last_index:
            line_top += layout.line_spacing

inference.core.workflows.core_steps.visualizations.text_display.v1 ¶

Classes¶

Functions:¶

format_text_with_parameters ¶

format_text_with_parameters(
    text, text_parameters, text_parameters_operations
)

Format text by replacing parameter placeholders with actual values.

Uses a single-pass regex substitution for efficiency and correctness.

Source code in inference/core/workflows/core_steps/visualizations/text_display/v1.py

def format_text_with_parameters(
    text: str,
    text_parameters: Dict[str, Any],
    text_parameters_operations: Dict[str, List[AllOperationsType]],
) -> str:
    """Substitute parameter placeholders in `text` with their values.

    All placeholders are handled in a single `PARAMETER_REGEX.sub` pass.
    A placeholder whose name is missing from `text_parameters` is left
    exactly as written. For known names, the value is run through the
    operations registered for it (if any), converted with `str`, and reused
    for every further occurrence of the same name.

    Args:
        text: Template text containing placeholders.
        text_parameters: Values keyed by parameter name.
        text_parameters_operations: Optional operations keyed by parameter
            name, applied to the value before it is stringified.

    Returns:
        The text with known placeholders replaced.
    """
    # Parameter name -> final string, filled in on first use
    rendered: Dict[str, str] = {}

    def _render(name: str) -> str:
        value = text_parameters[name]
        operations = text_parameters_operations.get(name)
        if operations:
            chain = build_operations_chain(operations=operations)
            value = chain(value, global_parameters={})
        return str(value)

    def _substitute(match: re.Match) -> str:
        name = match.group(2)
        if name not in text_parameters:
            # Unknown parameter - keep the placeholder untouched
            return match.group(0)
        if name not in rendered:
            rendered[name] = _render(name)
        return rendered[name]

    return PARAMETER_REGEX.sub(_substitute, text)

`core/workflows`¶

Workflow execution engine entry points and helpers.

inference.core.workflows.errors ¶

Classes¶

DynamicBlockCodeError ¶

Bases: WorkflowExecutionEngineError

Exception for dynamic block code execution errors (errors provoked by user's code).

Source code in inference/core/workflows/errors.py

class DynamicBlockCodeError(WorkflowExecutionEngineError):
    """Error raised for failures in dynamic block code (errors provoked by user's code).

    Besides the fields handled by the base class, the instance keeps
    optional diagnostics about the failing block: its type name, the
    failing line, a code snippet, the traceback text and captured
    stdout/stderr.
    """

    def __init__(
        self,
        public_message: str,
        context: str = "dynamic_block_code_execution",
        inner_error: Optional[Exception] = None,
        block_type_name: Optional[str] = None,
        error_line: Optional[int] = None,
        code_snippet: Optional[str] = None,
        traceback_str: Optional[str] = None,
        stdout: Optional[str] = None,
        stderr: Optional[str] = None,
    ):
        super().__init__(
            public_message=public_message,
            context=context,
            inner_error=inner_error,
        )
        # Diagnostics are stored as given
        self.block_type_name = block_type_name
        self.error_line = error_line
        self.code_snippet = code_snippet
        self.traceback_str = traceback_str
        self.stdout = stdout
        self.stderr = stderr

    @property
    def block_traceback(self) -> Optional[BlockTraceback]:
        """Build a BlockTraceback from the stored diagnostics, if there are any.

        Returns None when error_line, traceback_str, stdout and stderr are
        all falsy. `code_snippet` is passed along but does not take part in
        that check.
        """
        diagnostics = (self.error_line, self.traceback_str, self.stdout, self.stderr)
        if any(diagnostics):
            return BlockTraceback(
                error_line=self.error_line,
                code_snippet=self.code_snippet,
                traceback=self.traceback_str,
                stdout=self.stdout,
                stderr=self.stderr,
            )
        return None

Attributes¶

block_traceback `property` ¶

block_traceback

Construct BlockTraceback from error fields if any are present.

`core/workflows/execution_engine/introspection`¶

inference.core.workflows.execution_engine.introspection.blocks_loader ¶

Functions:¶

clear_caches ¶

clear_caches()

Clear all LRU caches in this module. Useful for testing or when environment configuration changes.

Source code in inference/core/workflows/execution_engine/introspection/blocks_loader.py

def clear_caches() -> None:
    """
    Reset every LRU cache owned by this module.

    Handy in tests, or after the environment configuration has changed and
    previously cached results may be stale.
    """
    cached_callables = (
        _cached_describe_available_blocks,
        load_core_workflow_blocks,
        _cached_load_all_defined_kinds,
        _cached_model_json_schema,
        _cached_describe_outputs,
    )
    for cached_callable in cached_callables:
        cached_callable.cache_clear()

inference.core.workflows.execution_engine.introspection.schema_parser ¶

Classes¶

Functions:¶

clear_cache ¶

clear_cache()

Clear the parse_block_manifest cache.

Source code in inference/core/workflows/execution_engine/introspection/schema_parser.py

def clear_cache() -> None:
    """Clear the parse_block_manifest cache.

    Delegates to `parse_block_manifest.cache_clear()`; returns nothing.
    """
    parse_block_manifest.cache_clear()

`core/workflows/execution_engine/v1/compiler`¶

inference.core.workflows.execution_engine.v1.compiler.cache ¶

Classes¶

BasicWorkflowsCache ¶

Bases: Generic[V]

Base cache which is capable of hashing compound payloads based on list of injected hash functions. Hash functions are to produce stable hashing strings. Each function is invoked on get_hash_key(...) kwarg (use named args only!), output string is concatenated and md5 value is calculated.

Cache is size bounded, each entry lives until cache_size new entries appear.

Raises WorkflowEnvironmentConfigurationError when get_hash_key(...) is not provided with params corresponding to all hash functions.

Thread safe thanks to thread lock on get(...) and cache(...).

Source code in inference/core/workflows/execution_engine/v1/compiler/cache.py

class BasicWorkflowsCache(Generic[V]):
    """
    Base cache which is capable of hashing compound payloads based on
    list of injected hash functions. Hash functions are to produce stable hashing strings.
    Each function is invoked on the `get_hash_key(...)` kwarg it is registered for
    (use named args only!), the output strings are joined with `<|>` and the md5
    hex digest of the result is returned.

    Cache is size bounded, each entry lives until `cache_size` new entries appear.
    Storing a value under a key that is already cached replaces the value and
    leaves the entry's position in the eviction order unchanged.

    Raises `WorkflowEnvironmentConfigurationError` when `get_hash_key(...)` is not
    provided with params corresponding to all hash functions.

    Thread safe thanks to thread lock on `get(...)` and `cache(...)`.
    """

    def __init__(
        self,
        cache_size: int,
        hash_functions: List[Tuple[str, Callable[[Any], str]]],
    ):
        """
        Args:
            cache_size: Maximum number of entries; values below 1 are treated as 1.
            hash_functions: Pairs of (kwarg name, hashing function) used by
                `get_hash_key(...)`, applied in the given order.
        """
        # Keys in insertion order; the leftmost key is evicted first
        self._keys_buffer = deque(maxlen=max(cache_size, 1))
        self._cache: Dict[str, V] = {}
        self._hash_functions = hash_functions
        self._cache_lock = Lock()

    def get_hash_key(self, **kwargs) -> str:
        """
        Build the cache key for the given named payloads.

        Raises:
            WorkflowEnvironmentConfigurationError: If a kwarg required by one of
                the registered hash functions is missing.
        """
        hash_chunks = []
        for key_name, hashing_function in self._hash_functions:
            if key_name not in kwargs:
                raise WorkflowEnvironmentConfigurationError(
                    public_message="Cache is miss configured.",
                    context="workflows_cache | hash_key_generation",
                )
            hash_chunks.append(hashing_function(kwargs[key_name]))
        return hashlib.md5("<|>".join(hash_chunks).encode("utf-8")).hexdigest()

    def get(self, key: str) -> Optional[V]:
        """Return the value cached under `key`, or None when it is absent."""
        with self._cache_lock:
            return self._cache.get(key)

    def cache(self, key: str, value: V) -> None:
        """Store `value` under `key`, evicting the oldest entry when full."""
        with self._cache_lock:
            if key in self._cache:
                # Already tracked: only refresh the value. Appending the key
                # again would leave a duplicate in the buffer, and a later
                # eviction would then try to delete an entry that is gone.
                self._cache[key] = value
                return
            if len(self._keys_buffer) == self._keys_buffer.maxlen:
                to_pop = self._keys_buffer.popleft()
                self._cache.pop(to_pop, None)
            self._keys_buffer.append(key)
            self._cache[key] = value

inference.core.workflows.execution_engine.v1.compiler.graph_constructor ¶

Classes¶

Functions:¶

establish_step_execution_dimensionality ¶

establish_step_execution_dimensionality(
    inputs_dimensionalities,
    control_flow_lineage_support,
    output_dimensionality_offset,
)

Determine how many batch dimensions (execution slices) a step runs with.

Used during workflow compilation in denote_data_flow_for_step. The result is stored on StepNode.step_execution_dimensionality and consumed at execution time to: (1) drive how many times the step is executed (which batch indices/slices); (2) align and expand inputs (e.g. auto-batch casting) to match this size; (3) validate that parameter dimensionalities are compatible (runtime checks in step_input_assembler and manager).

Logic: (1) if no input has non-zero dimensionality but the step is gated by control flow (control_flow_lineage_support non-empty), the dimensionality is the number of control-flow branches; (2) otherwise, the minimum non-zero input dimensionality is used, and if output_dimensionality_offset < 0 (step reduces batch dimension), one is subtracted.

Parameters:

Name	Type	Description	Default
`inputs_dimensionalities`	`Dict[str, Set[int]]`	Per-input sets of dimensionalities (from get_inputs_dimensionalities).	required
`control_flow_lineage_support`	`List[str]`	Lineage identifiers for control-flow branches that gate this step (from establish_batch_oriented_step_lineage).	required
`output_dimensionality_offset`	`int`	Block's output dimensionality offset (positive = expand, negative = reduce batch dimension).	required

Returns:

Type	Description
`int`	The number of batch dimensions (execution slices) for this step.

Source code in inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py

def establish_step_execution_dimensionality(
    inputs_dimensionalities: Dict[str, Set[int]],
    control_flow_lineage_support: List[str],
    output_dimensionality_offset: int,
) -> int:
    """
    Compute the execution dimensionality (number of batch dimensions) of a step.

    Resolution rules, in order:
    - When at least one input reports a dimensionality greater than zero, the
      smallest such value is taken. If ``output_dimensionality_offset`` is
      negative, that value is lowered by one.
    - When no input reports a positive dimensionality, the result is the
      length of ``control_flow_lineage_support`` (which is ``0`` when the
      list is empty).

    Args:
        inputs_dimensionalities: Mapping from input name to the set of
            dimensionalities registered for that input.
        control_flow_lineage_support: Lineage of the control flow gating the
            step; only its length is used here.
        output_dimensionality_offset: Output dimensionality offset of the
            block; only its sign is inspected (negative lowers the result).

    Returns:
        The execution dimensionality established for the step.
    """
    positive_dimensionalities: Set[int] = set()
    for input_dimensionalities in inputs_dimensionalities.values():
        positive_dimensionalities.update(
            value for value in input_dimensionalities if value > 0
        )

    if positive_dimensionalities:
        lowest = min(positive_dimensionalities)
        return lowest - 1 if output_dimensionality_offset < 0 else lowest

    # No batch-oriented input: fall back to the control-flow lineage length,
    # which is 0 when the step is not gated by control flow at all.
    return len(control_flow_lineage_support)

get_lineage_derived_from_control_flow ¶

get_lineage_derived_from_control_flow(
    control_flow_steps_selectors, execution_graph
)

Return unique non-empty data lineages from the given control flow steps.

Each lineage is taken from the step's data_lineage in the execution graph. Lineages are deduplicated by lineage id (see identify_lineage); empty lineages are omitted. Used when establishing batch-oriented step lineage.

Parameters:

Name	Type	Description	Default
`control_flow_steps_selectors`	`List[str]`	Step selectors (node ids) of control flow steps whose data_lineage is to be collected.	required
`execution_graph`	`DiGraph`	The workflow execution graph containing step nodes and their data_lineage.	required

Returns:

Type	Description
`List[List[str]]`	List of distinct non-empty data lineages, one per unique lineage id.

Source code in inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py

def get_lineage_derived_from_control_flow(
    control_flow_steps_selectors: List[str],
    execution_graph: nx.DiGraph,
) -> List[List[str]]:
    """
    Collect the distinct data lineages of the given control flow steps.

    Thin wrapper around
    ``_collect_unique_control_flow_lineages_with_step_mapping``: that helper
    returns a pair of (unique lineages, lineage-id -> steps mapping) and only
    the first element is exposed here. Deduplication and the omission of
    empty lineages are performed by the helper.

    Args:
        control_flow_steps_selectors: Selectors (node ids) of the control
            flow steps to inspect.
        execution_graph: Workflow execution graph holding the step nodes.

    Returns:
        The unique lineages reported by the helper.
    """
    lineages, _steps_by_lineage_id = (
        _collect_unique_control_flow_lineages_with_step_mapping(
            control_flow_steps_selectors=control_flow_steps_selectors,
            execution_graph=execution_graph,
        )
    )
    return lineages

verify_compatibility_of_input_data_lineage_with_control_flow_lineage ¶

verify_compatibility_of_input_data_lineage_with_control_flow_lineage(
    step_name,
    inputs_lineage,
    control_flow_steps_selectors,
    execution_graph,
)

Ensure control flow steps' data lineage is compatible with the step's inputs.

Control flow steps that affect this step must operate on data that is compatible with the data fed to the step; otherwise the step could never execute. Compares unique control flow lineages against input lineage prefixes and raises ControlFlowDefinitionError if any control flow lineage is not covered by the inputs.

If inputs_lineage is empty, there is no point in verifying compatibility. The lineage of the step should be established based on the control flow lineages.

Parameters:

Name	Type	Description	Default
`step_name`	`str`	Name of the step being verified (used in error messages).	required
`inputs_lineage`	`List[List[str]]`	Data lineages derived from the step's input data.	required
`control_flow_steps_selectors`	`List[str]`	Step selectors of control flow steps that affect this step's execution.	required
`execution_graph`	`DiGraph`	The workflow execution graph.	required

Raises:

Type	Description
`ControlFlowDefinitionError`	When a control flow step's lineage is not compatible with the step's input lineage (step would never execute).

Source code in inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py

def verify_compatibility_of_input_data_lineage_with_control_flow_lineage(
    step_name: str,
    inputs_lineage: List[List[str]],
    control_flow_steps_selectors: List[str],
    execution_graph: DiGraph,
) -> None:
    """
    Check that control flow lineages are covered by the step's input lineages.

    The unique lineages of the control flow steps are collected first. When
    ``inputs_lineage`` is empty nothing is verified and the function returns.
    Otherwise every control flow lineage must match (by lineage id) one of
    the prefixes of the input lineages; the first one that does not match
    triggers an error naming the control flow steps registered for it.

    Args:
        step_name: Name of the verified step, embedded in the error message.
        inputs_lineage: Data lineages derived from the step's inputs.
        control_flow_steps_selectors: Selectors of the control flow steps
            affecting the step.
        execution_graph: The workflow execution graph.

    Raises:
        ControlFlowDefinitionError: When a control flow lineage is not among
            the input lineage prefixes.
    """
    collected = _collect_unique_control_flow_lineages_with_step_mapping(
        control_flow_steps_selectors=control_flow_steps_selectors,
        execution_graph=execution_graph,
    )
    control_flow_lineages, steps_by_lineage_id = collected
    if not inputs_lineage:
        return

    input_prefixes = get_all_batch_lineage_prefixes(lineages=inputs_lineage)
    known_prefix_ids = {
        identify_lineage(lineage=prefix) for prefix in input_prefixes
    }
    for lineage in control_flow_lineages:
        lineage_id = identify_lineage(lineage=lineage)
        if lineage_id in known_prefix_ids:
            continue
        # First uncovered lineage aborts the verification.
        offending_steps = steps_by_lineage_id[lineage_id]
        message = (
            f"Step {step_name} execution is impacted by control flow outcome of the following "
            f"steps {offending_steps} which make decision based on data that is "
            f"not compatible with data fed to the step {step_name} - which would cause the step "
            f"to never execute. This behaviour is invalid and prevented upfront by Workflows compiler."
        )
        raise ControlFlowDefinitionError(
            public_message=message,
            context="workflow_compilation | execution_graph_construction | verification_of_control_flow_lineage",
        )

inference.core.workflows.execution_engine.v1.compiler.graph_traversal ¶

Functions:¶

traverse_graph_ensuring_parents_are_reached_first ¶

traverse_graph_ensuring_parents_are_reached_first(
    graph, start_node
)

This function works under the assumption of a common super-input node in the graph - otherwise, there is no common entry point to use as start_node.

Source code in inference/core/workflows/execution_engine/v1/compiler/graph_traversal.py

def traverse_graph_ensuring_parents_are_reached_first(
    graph: DiGraph,
    start_node: str,
) -> List[str]:
    """
    Return the graph nodes as a flat list ordered by their "distance" groups.

    A copy of ``graph`` is annotated by ``assign_max_distances_from_start``
    (the caller's graph object is not passed to it), the nodes are grouped by
    ``group_nodes_by_sorted_key_value`` on that annotation, and the groups
    are concatenated in the order the helper yields them.

    The function assumes the graph has a common super-input node - otherwise
    there is no common entry point to pass as ``start_node``.

    Args:
        graph: Graph to traverse.
        start_node: Node from which distances are assigned.

    Returns:
        Flattened list of nodes, group after group.
    """
    distance_attribute = "distance"
    annotated_graph = assign_max_distances_from_start(
        graph=graph.copy(),
        start_node=start_node,
        distance_key=distance_attribute,
    )
    ordered_nodes: List[str] = []
    for group in group_nodes_by_sorted_key_value(
        graph=annotated_graph, key=distance_attribute
    ):
        ordered_nodes.extend(group)
    return ordered_nodes

inference.core.workflows.execution_engine.v1.compiler.syntactic_parser ¶

Classes¶

Functions:¶

clear_cache ¶

clear_cache()

Clear the workflow schema cache.

Source code in inference/core/workflows/execution_engine/v1/compiler/syntactic_parser.py

def clear_cache() -> None:
    """Clear the workflow schema cache.

    Delegates to ``_cached_workflow_schema.cache_clear()``; takes no
    arguments and returns nothing.
    """
    _cached_workflow_schema.cache_clear()

`core/workflows/execution_engine/v1/dynamic_blocks`¶

inference.core.workflows.execution_engine.v1.dynamic_blocks.block_assembler ¶

Classes¶

Functions:¶

ensure_dynamic_blocks_allowed ¶

ensure_dynamic_blocks_allowed(dynamic_blocks_definitions)

Ensure that dynamic blocks are allowed based on configuration.

Dynamic blocks are allowed if: 1. Local custom Python execution is enabled (ALLOW_CUSTOM_PYTHON_EXECUTION_IN_WORKFLOWS=True) 2. OR Modal execution mode is set (WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE=modal)

This allows secure execution via Modal sandboxes even when local execution is disabled.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/block_assembler.py

def ensure_dynamic_blocks_allowed(dynamic_blocks_definitions: List[dict]) -> None:
    """Reject dynamic blocks when the installation does not permit them.

    Nothing is checked when ``dynamic_blocks_definitions`` is empty.
    Otherwise dynamic blocks are accepted when either:
    - ``ALLOW_CUSTOM_PYTHON_EXECUTION_IN_WORKFLOWS`` is truthy (local
      execution), or
    - ``WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE`` equals ``"modal"`` (remote
      execution through Modal).

    Args:
        dynamic_blocks_definitions: Definitions of dynamic blocks requested
            by the workflow.

    Raises:
        WorkflowEnvironmentConfigurationError: When definitions are present
            and neither of the two conditions above holds.
    """
    if not dynamic_blocks_definitions:
        return
    if ALLOW_CUSTOM_PYTHON_EXECUTION_IN_WORKFLOWS:
        # Local execution of custom Python is enabled.
        return
    if WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE == "modal":
        # Modal mode is enough on its own, even with local execution disabled.
        return
    raise WorkflowEnvironmentConfigurationError(
        public_message="Cannot use dynamic blocks with custom Python code in this installation of `workflows`. "
        "This can be changed by either setting environmental variable "
        "`ALLOW_CUSTOM_PYTHON_EXECUTION_IN_WORKFLOWS=True` for local execution "
        "or `WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE=modal` for secure remote execution.",
        context="workflow_compilation | dynamic_blocks_compilation",
    )

inference.core.workflows.execution_engine.v1.dynamic_blocks.block_scaffolding ¶

Classes¶

Functions:¶

inference.core.workflows.execution_engine.v1.dynamic_blocks.debug_logs ¶

Per-run debug capture for dynamic Python blocks (stdout/stderr and structured traces).

The HTTP layer opts in via debug=True on the workflow run request. That activates a :class:DebugSession for the run through :func:register_debug_session, which publishes both:

a :class:DebugLogsCollector for stdout/stderr (returned as python_blocks_output_streams), and
a :class:WorkflowDebugTrace for debug_traces.append(...) calls (returned as python_blocks_debug_traces).

Block runners look up the active session components through ContextVars after each invocation.

Propagation model: - Active session state is stored in ContextVars (current_debug_collector, current_debug_trace) using the same pattern as execution_id, remote_processing_times, and apply_duration_minimum. The engine captures values in the request thread and re-binds them inside each worker thread spawned by ThreadPoolExecutor (see safe_execute_step). - Only LOCAL execution is wired up today. Modal and OCI sandbox executors run the user code out of process and would need their own payload extension to bubble captured output back into the active session.

Classes¶

DebugLogsCollector ¶

Thread-safe collector for stdout/stderr produced by Python blocks.

Each stream of an entry is truncated to max_chars_per_stream. Once the total collected size of the run exceeds max_total_chars, a single marker entry is appended and all subsequent records are dropped.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/debug_logs.py

class DebugLogsCollector:
    """Lock-guarded accumulator of stdout/stderr entries, grouped by step name.

    Every recorded stream is first passed through ``_truncate_stream`` with
    the ``max_chars_per_stream`` limit. The collector keeps a running total
    of recorded characters; the first entry that would push that total above
    ``max_total_chars`` is replaced by a single marker entry
    (``CAPACITY_EXCEEDED_MARKER``) and every later call to ``record`` is
    ignored.
    """

    def __init__(
        self,
        max_chars_per_stream: int = MAX_CHARS_PER_STREAM,
        max_total_chars: int = MAX_TOTAL_CHARS,
    ) -> None:
        self._max_chars_per_stream = max_chars_per_stream
        self._max_total_chars = max_total_chars
        self._total_chars = 0
        self._capacity_exceeded = False
        # step name -> list of {"stdout": ..., "stderr": ...} entries
        self._entries: Dict[str, List[Dict[str, Optional[str]]]] = {}
        self._lock = threading.Lock()

    def record(
        self,
        step_name: str,
        stdout: Optional[str],
        stderr: Optional[str],
    ) -> None:
        """Store one stdout/stderr pair for ``step_name``.

        Calls where both streams are ``None`` are ignored, as are all calls
        made after the total capacity has been exceeded.
        """
        if stdout is None and stderr is None:
            return
        per_stream_limit = self._max_chars_per_stream
        stdout = _truncate_stream(stdout, per_stream_limit)
        stderr = _truncate_stream(stderr, per_stream_limit)
        entry_size = sum(len(stream) for stream in (stdout, stderr) if stream)
        with self._lock:
            if self._capacity_exceeded:
                return
            if self._total_chars + entry_size <= self._max_total_chars:
                self._total_chars += entry_size
                entry = {"stdout": stdout, "stderr": stderr}
            else:
                # Over budget: flip the flag and leave one marker entry behind.
                self._capacity_exceeded = True
                entry = {"stdout": CAPACITY_EXCEEDED_MARKER, "stderr": None}
            self._entries.setdefault(step_name, []).append(entry)

    def snapshot(self) -> Dict[str, List[Dict[str, Optional[str]]]]:
        """Return a new dict with a copied entry list per step.

        The lists are copied; the entry dicts inside them are the same
        objects held by the collector.
        """
        with self._lock:
            copied: Dict[str, List[Dict[str, Optional[str]]]] = {}
            for step, entries in self._entries.items():
                copied[step] = entries[:]
            return copied

DebugSession `dataclass` ¶

Per-run debug state activated when debug=True on a workflow request.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/debug_logs.py

@dataclass
class DebugSession:
    """Per-run debug state activated when ``debug=True`` on a workflow request."""

    # Collector of stdout/stderr entries recorded for the run.
    output_streams: DebugLogsCollector
    # Structured debug trace object for the run.
    debug_traces: WorkflowDebugTrace

Functions:¶

register_debug_session ¶

register_debug_session()

Activate stdout/stderr capture and structured debug traces for a run.

Both collectors are published via ContextVars for the duration of the with block and removed on exit. Worker threads re-bind the ContextVars inside their own thread (the execution engine does this in safe_execute_step).

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/debug_logs.py

@contextmanager
def register_debug_session() -> Generator[DebugSession, None, None]:
    """Open a debug session and publish its collectors through ContextVars.

    A fresh ``DebugLogsCollector`` and ``WorkflowDebugTrace`` are created,
    wrapped in a ``DebugSession`` and set on ``current_debug_collector`` and
    ``current_debug_trace`` respectively. The session is yielded to the
    ``with`` body; on exit (normal or exceptional) both ContextVars are reset
    with the tokens obtained when they were set.

    Yields:
        The ``DebugSession`` holding both collectors.
    """
    collector = DebugLogsCollector()
    trace = WorkflowDebugTrace()
    session = DebugSession(output_streams=collector, debug_traces=trace)
    collector_token = current_debug_collector.set(collector)
    trace_token = current_debug_trace.set(trace)
    try:
        yield session
    finally:
        current_debug_collector.reset(collector_token)
        current_debug_trace.reset(trace_token)

inference.core.workflows.execution_engine.v1.dynamic_blocks.error_utils ¶

Utility functions for error formatting in dynamic blocks.

Classes¶

Functions:¶

build_traceback_string ¶

build_traceback_string(
    code, line_number, function_name, error_type, error_msg
)

Build a traceback string from structured error data.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/error_utils.py

def build_traceback_string(
    code: Optional[str],
    line_number: int,
    function_name: str,
    error_type: str,
    error_msg: str,
) -> str:
    """Render structured error data as a traceback-looking string.

    The result always contains the ``Traceback`` header, a ``File "Python
    Block"`` location line and the final ``error_type: error_msg`` line. The
    stripped source line is inserted between the last two only when
    ``line_number`` (1-based) points inside ``code`` and that line is not
    blank.

    Args:
        code: Source the error refers to; ``None`` is treated as empty.
        line_number: 1-based line number of the error.
        function_name: Name reported in the location line.
        error_type: Name of the error type.
        error_msg: Error message.

    Returns:
        The traceback text, lines joined with ``"\\n"``.
    """
    source_lines = (code or "").splitlines()
    offending_line = ""
    if 0 < line_number <= len(source_lines):
        offending_line = source_lines[line_number - 1].strip()

    header = [
        "Traceback (most recent call last):",
        f'  File "Python Block", line {line_number}, in {function_name}',
    ]
    # A blank or out-of-range source line is simply left out.
    source_part = [f"    {offending_line}"] if offending_line else []
    return "\n".join([*header, *source_part, f"{error_type}: {error_msg}"])

capture_output ¶

capture_output()

Context manager to capture stdout and stderr for the current thread.

Uses per-thread buffers via threading.local so concurrent calls in different threads capture independently without any global lock.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/error_utils.py

@contextmanager
def capture_output() -> Generator[Tuple[StringIO, StringIO], None, None]:
    """Context manager to capture stdout and stderr for the current thread.

    Fresh ``StringIO`` buffers are stored on ``_thread_local`` as
    ``_capture_stdout`` / ``_capture_stderr`` for the duration of the
    ``with`` block, after ``_install_dispatchers()`` has been called.

    On exit the attributes are restored to whatever they held on entry
    (``None`` when no capture was active), so a nested ``capture_output()``
    in the same thread no longer switches off the enclosing capture.

    Yields:
        A ``(stdout_buffer, stderr_buffer)`` tuple of ``StringIO`` objects.
    """
    _install_dispatchers()
    # Remember any capture already active in this thread so it can be resumed.
    previous_stdout = getattr(_thread_local, "_capture_stdout", None)
    previous_stderr = getattr(_thread_local, "_capture_stderr", None)
    stdout_buf, stderr_buf = StringIO(), StringIO()
    _thread_local._capture_stdout = stdout_buf
    _thread_local._capture_stderr = stderr_buf
    try:
        yield stdout_buf, stderr_buf
    finally:
        _thread_local._capture_stdout = previous_stdout
        _thread_local._capture_stderr = previous_stderr
create_dynamic_block_code_error ¶

create_dynamic_block_code_error(
    error,
    user_code,
    import_lines_count,
    stdout=None,
    stderr=None,
    block_type_name=None,
)

Create a DynamicBlockCodeError with structured code context.

Parameters:

Name	Type	Description	Default
`error`	`Exception`	The exception that was raised.	required
`user_code`	`str`	The user's Python code (run_function_code).	required
`import_lines_count`	`int`	Number of import lines prepended to the code.	required
`stdout`	`Optional[str]`	Captured stdout, if any.	`None`
`stderr`	`Optional[str]`	Captured stderr, if any.	`None`
`block_type_name`	`Optional[str]`	The dynamic block's type identifier.	`None`

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/error_utils.py

def create_dynamic_block_code_error(
    error: Exception,
    user_code: str,
    import_lines_count: int,
    stdout: Optional[str] = None,
    stderr: Optional[str] = None,
    block_type_name: Optional[str] = None,
) -> DynamicBlockCodeError:
    """Wrap ``error`` in a ``DynamicBlockCodeError`` carrying code context.

    When the exception has no traceback frames, only the
    ``"<ErrorClass>: <message>"`` text is reported. Otherwise the last
    traceback frame is used: its line number, reduced by
    ``import_lines_count``, becomes the reported error line, and a code
    snippet plus a cleaned traceback are attached.

    Args:
        error: The exception that was raised.
        user_code: The user's Python code (run_function_code).
        import_lines_count: Number of import lines prepended to the code.
        stdout: Captured stdout, if any.
        stderr: Captured stderr, if any.
        block_type_name: The dynamic block's type identifier.

    Returns:
        The constructed ``DynamicBlockCodeError`` (it is returned, not raised).
    """
    frames = traceback.extract_tb(error.__traceback__)
    error_label = f"{error.__class__.__name__}: {error}"
    # Arguments shared by both construction paths.
    common = dict(
        inner_error=error,
        block_type_name=block_type_name,
        stdout=stdout,
        stderr=stderr,
    )
    if not frames:
        return DynamicBlockCodeError(public_message=error_label, **common)

    innermost = frames[-1]
    # Translate the frame's line number back by the prepended import lines.
    user_line = innermost.lineno - import_lines_count
    snippet = extract_code_snippet(user_code, user_line)
    return DynamicBlockCodeError(
        public_message=f"Error in line {user_line}, in {innermost.name}: {error_label}",
        error_line=user_line,
        code_snippet=snippet.lstrip("\n") if snippet else None,
        traceback_str=_create_clean_traceback(error, user_code, import_lines_count),
        **common,
    )

extract_code_snippet ¶

extract_code_snippet(code, error_line, context=10)

Extract a code snippet around the error line with markers.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/error_utils.py

def extract_code_snippet(
    code: Optional[str], error_line: int, context: int = 10
) -> str:
    """Extract a code snippet around the error line with markers.

    Each emitted line is prefixed with its 1-based number; the error line is
    marked with ``>>>`` and the others with three spaces.

    Args:
        code: Source code to excerpt; ``None`` or empty yields ``""``.
        error_line: 1-based line number of the error.
        context: Number of lines shown before and after the error line.

    Returns:
        The snippet preceded by a single newline, or ``""`` when there is no
        code or when the requested window contains no line of ``code`` (for
        example when ``error_line`` lies far outside the code).
    """
    if not code:
        return ""

    lines = code.splitlines()
    error_idx = error_line - 1
    start = max(0, error_idx - context)
    end = min(len(lines), error_idx + context + 1)
    if start >= end:
        # Nothing to show: return "" rather than a bare newline, which is
        # truthy and would be mistaken for a real snippet by callers.
        return ""
    snippet_lines = [
        f"{'>>>' if i == error_idx else '   '} {i + 1}: {lines[i]}"
        for i in range(start, end)
    ]
    return "\n" + "\n".join(snippet_lines)

inference.core.workflows.execution_engine.v1.dynamic_blocks.modal_executor ¶

Modal executor for Custom Python Blocks in Workflows using Web Endpoints.

This module handles the execution of untrusted user code in Modal sandboxes using web endpoints for better security and no size limitations.

Two transport modes are available, controlled by WEBEXEC_TRANSPORT:

http — JSON POST with gzip compression and persistent requests.Session.
websocket (default) — persistent WebSocket connections with msgpack binary frames. Eliminates per-request HTTP overhead and base64 image encoding.

Classes¶

ModalExecutor ¶

Manages execution of Custom Python Blocks in Modal sandboxes via web endpoints.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

class ModalExecutor:
    """Manages execution of Custom Python Blocks in Modal sandboxes via web endpoints.

    The executor lazily creates one ``requests.Session`` (carrying the
    ``Modal-Key`` / ``Modal-Secret`` headers), resolves the endpoint base URL
    once, and remembers the hashes of code that executed successfully so that
    later requests for the same code can omit the code body.
    """

    def __init__(self, workspace_id: Optional[str] = None):
        self.workspace_id = workspace_id or MODAL_ANONYMOUS_WORKSPACE_NAME
        # Resolved on first use by _get_endpoint_url and reused afterwards.
        self._base_url: Optional[str] = None
        # Created on first use by _get_session.
        self._session: Optional[requests.Session] = None
        # Hashes for which a request already succeeded (see execute_remote).
        self._known_code_hashes: set = set()

    def _get_session(self) -> requests.Session:
        """Return the shared HTTP session, creating it on first call."""
        if self._session is None:
            self._session = requests.Session()
            self._session.headers.update(
                {
                    "Modal-Key": MODAL_TOKEN_ID,
                    "Modal-Secret": MODAL_TOKEN_SECRET,
                }
            )
        return self._session

    def _get_endpoint_url(self, workspace_id: str) -> str:
        """Return the endpoint URL with ``workspace_id`` as a query parameter.

        The base URL is resolved once, in this order: the
        ``MODAL_WEB_ENDPOINT_URL`` environment variable, the value returned by
        ``_build_webexec_endpoint_base``, and finally a URL constructed from
        the workspace and app names. ``workspace_id`` is percent-encoded so
        that characters such as ``&``, ``=``, ``#`` or spaces cannot corrupt
        the query string.
        """
        if self._base_url is None:
            env_url = os.environ.get("MODAL_WEB_ENDPOINT_URL")
            if env_url:
                self._base_url = env_url
            else:
                self._base_url = _build_webexec_endpoint_base(
                    method_label=_WEBEXEC_HTTP_METHOD_LABEL
                )

                # If we couldn't get it dynamically, construct it based on expected pattern
                if not self._base_url:
                    # URL pattern: https://{workspace}--{app}-{class}-{method_truncated}.modal.run
                    workspace = MODAL_WORKSPACE_NAME
                    app_name = WEBEXEC_MODAL_APP_NAME
                    class_name = "executor"
                    method_name = "execute-block"

                    label = f"{app_name}-{class_name}-{method_name}"
                    if len(label) > 56:
                        # Long labels are cut to their first 56 characters and
                        # suffixed with "-" plus 6 hex digits of the label's
                        # SHA-256 (63 characters in total).
                        import hashlib

                        hash_str = hashlib.sha256(label.encode()).hexdigest()[:6]
                        label = f"{label[:56]}-{hash_str}"

                    self._base_url = f"https://{workspace}--{label}.modal.run"

        # Add workspace_id as a (percent-encoded) query parameter
        from urllib.parse import quote

        encoded_workspace_id = quote(str(workspace_id), safe="")
        return f"{self._base_url}?workspace_id={encoded_workspace_id}"

    def execute_remote(
        self,
        block_type_name: str,
        python_code: PythonCode,
        inputs: Dict[str, Any],
        workspace_id: Optional[str] = None,
        workflow_context: Optional[Dict[str, Any]] = None,
    ) -> BlockResult:
        """Execute a custom Python block through the Modal web endpoint.

        Args:
            block_type_name: Type identifier of the block, attached to errors.
            python_code: Code definition (run function code, name, imports).
            inputs: Inputs for the block; serialized before sending.
            workspace_id: Overrides the executor's workspace when truthy.
            workflow_context: Extra context forwarded in the request payload;
                ``None`` is sent as an empty dict.

        Returns:
            The deserialized ``"result"`` field of the endpoint response.

        Raises:
            DynamicBlockError: When Modal is not available, when anonymous
                execution is requested but disabled, when the endpoint
                answers with a non-200 status, or when the HTTP request fails
                (the original ``requests`` exception is chained as the cause).
            DynamicBlockCodeError: When the endpoint reports an unsuccessful
                execution of the code.
        """
        if not MODAL_AVAILABLE:
            raise DynamicBlockError(
                public_message="Modal credentials not configured. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables.",
                context="modal_executor | credentials_check",
            )

        workspace = workspace_id if workspace_id else self.workspace_id

        try:
            endpoint_url = self._get_endpoint_url(workspace)

            inputs_json = serialize_for_modal_remote_execution(inputs)

            code_hash = _compute_code_hash(
                python_code.run_function_code or "",
                python_code.imports,
            )

            if (
                not workspace
                or workspace == "anonymous"
                or workspace == "unauthorized"
                or workspace == MODAL_ANONYMOUS_WORKSPACE_NAME
            ):
                from inference.core.env import MODAL_ALLOW_ANONYMOUS_EXECUTION

                if not MODAL_ALLOW_ANONYMOUS_EXECUTION:
                    raise DynamicBlockError(
                        public_message="Modal validation requires an API key when anonymous execution is disabled. "
                        "Please provide an API key or enable anonymous execution by setting "
                        "MODAL_ALLOW_ANONYMOUS_EXECUTION=True",
                        context="modal_executor | validation_authentication",
                    )

            # Hash-only path: skip shipping ``code_str`` and ``imports`` when
            # we believe the server already has this hash cached. On a miss
            # the server returns ``UnknownCodeHash`` and we resend full code.
            send_full_code = code_hash not in self._known_code_hashes
            result = self._post_execute(
                endpoint_url=endpoint_url,
                python_code=python_code,
                inputs_json=inputs_json,
                code_hash=code_hash,
                send_full_code=send_full_code,
                workflow_context=workflow_context or {},
            )

            if (
                not send_full_code
                and not result.get("success", False)
                and result.get("error_type") == "UnknownCodeHash"
            ):
                # Server replica doesn't have this hash cached; retry once.
                self._known_code_hashes.discard(code_hash)
                result = self._post_execute(
                    endpoint_url=endpoint_url,
                    python_code=python_code,
                    inputs_json=inputs_json,
                    code_hash=code_hash,
                    send_full_code=True,
                    workflow_context=workflow_context or {},
                )

            if result.get("success", False):
                self._known_code_hashes.add(code_hash)

            if not result.get("success", False):
                error_msg = result.get("error", "Unknown error")
                error_type = result.get("error_type", "RuntimeError")
                line_number = result.get("line_number")
                function_name = result.get("function_name") or "run"
                code = python_code.run_function_code

                message = (
                    f"Error in line {line_number}, in {function_name}: {error_type}: {error_msg}"
                    if line_number
                    else f"{error_type}: {error_msg}"
                )

                # Code context is only attached when the server reported a
                # line number and the source is available locally.
                code_snippet = None
                traceback_str = None
                if line_number and code:
                    snippet = extract_code_snippet(code, line_number)
                    code_snippet = snippet.lstrip("\n") if snippet else None
                    traceback_str = build_traceback_string(
                        code, line_number, function_name, error_type, error_msg
                    )

                raise DynamicBlockCodeError(
                    public_message=message,
                    block_type_name=block_type_name,
                    error_line=line_number,
                    code_snippet=code_snippet,
                    traceback_str=traceback_str,
                    stdout=result.get("stdout"),
                    stderr=result.get("stderr"),
                )

            # Replay the remote output on the local streams.
            stdout = result.get("stdout")
            stderr = result.get("stderr")
            if stdout:
                sys.stdout.write(stdout)
                sys.stdout.flush()
            if stderr:
                sys.stderr.write(stderr)
                sys.stderr.flush()

            # Get the result and deserialize from JSON
            json_result = result.get("result", "{}")
            return deserialize_for_modal_remote_execution(json_result)

        except requests.exceptions.RequestException as e:
            # Chain explicitly so the transport error is kept as __cause__.
            raise DynamicBlockError(
                public_message=f"Failed to connect to Modal endpoint: {str(e)}",
                context="modal_executor | http_connection",
            ) from e

    def _post_execute(
        self,
        endpoint_url: str,
        python_code: PythonCode,
        inputs_json: str,
        code_hash: str,
        send_full_code: bool,
        workflow_context: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Build the gzip-JSON request and POST it. Returns the parsed JSON.

        When ``send_full_code`` is False we omit ``code_str`` and ``imports``;
        the server uses ``code_hash`` to locate its cached compiled namespace.

        Raises:
            DynamicBlockError: When the response status code is not 200.
        """
        request_payload: Dict[str, Any] = {
            "code_hash": code_hash,
            "run_function_name": python_code.run_function_name,
            "inputs_json": inputs_json,
            "workflow_context": workflow_context,
        }
        if send_full_code:
            request_payload["code_str"] = python_code.run_function_code
            request_payload["imports"] = python_code.imports or []

        body_bytes = json.dumps(request_payload).encode("utf-8")
        compressed = gzip.compress(body_bytes, compresslevel=1)

        session = self._get_session()
        response = session.post(
            endpoint_url,
            data=compressed,
            timeout=30,
            headers={
                "Content-Type": "application/json",
                "Content-Encoding": "gzip",
            },
        )

        if response.status_code != 200:
            raise DynamicBlockError(
                public_message=f"Modal endpoint returned status {response.status_code}: {response.text}",
                context="modal_executor | http_request",
            )

        return response.json()

PooledWebSocketModalExecutor ¶

Per-workspace pool of websocket executors.

A single websocket supports only one in-flight request because responses are ordered on the connection. The workspace-level executor cache can therefore keep this pool hot without funneling every same-workspace execution through one socket.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

class PooledWebSocketModalExecutor:
    """Fixed-size pool of websocket executors bound to one workspace.

    Responses are ordered on a websocket connection, so one socket can carry
    only a single in-flight request. Keeping several executors behind one
    object lets the workspace-level executor cache hold the pool hot while
    concurrent executions for the same workspace avoid queueing on one socket.
    """

    def __init__(self, workspace_id: Optional[str] = None):
        """Create the executors and their per-slot in-flight counters.

        Args:
            workspace_id: Workspace the pool serves; falls back to
                ``MODAL_ANONYMOUS_WORKSPACE_NAME`` when falsy.
        """
        self.workspace_id = workspace_id or MODAL_ANONYMOUS_WORKSPACE_NAME
        # The configured size is clamped so the pool always has one slot.
        slot_count = max(1, WEBEXEC_WS_CONNECTION_POOL_SIZE)
        self._executors = [
            WebSocketModalExecutor(workspace_id=self.workspace_id)
            for _ in range(slot_count)
        ]
        self._active_counts = [0] * slot_count
        self._pool_lock = threading.Lock()

    def _acquire_executor(self) -> tuple[int, WebSocketModalExecutor]:
        """Reserve the least-busy executor and return ``(index, executor)``.

        Ties resolve to the lowest index (``min`` returns the first minimum),
        so serial workloads such as video streams keep reusing one connection
        and further executors are only picked when calls actually overlap.
        """
        with self._pool_lock:
            in_flight = self._active_counts
            slot = min(range(len(self._executors)), key=in_flight.__getitem__)
            in_flight[slot] += 1
            return slot, self._executors[slot]

    def _release_executor(self, index: int) -> None:
        """Give back the slot previously reserved by ``_acquire_executor``."""
        with self._pool_lock:
            self._active_counts[index] -= 1

    def close(self) -> None:
        """Close every executor held by the pool."""
        for pooled in self._executors:
            pooled.close()

    def execute_remote(
        self,
        block_type_name: str,
        python_code: PythonCode,
        inputs: Dict[str, Any],
        workspace_id: Optional[str] = None,
        workflow_context: Optional[Dict[str, Any]] = None,
    ) -> BlockResult:
        """Run the block on a pooled executor and return its result.

        The slot is released in a ``finally`` clause, so it is returned to the
        pool whether the delegated call returns or raises.
        """
        slot, chosen = self._acquire_executor()
        try:
            target_workspace = workspace_id or self.workspace_id
            return chosen.execute_remote(
                block_type_name=block_type_name,
                python_code=python_code,
                inputs=inputs,
                workspace_id=target_workspace,
                workflow_context=workflow_context,
            )
        finally:
            self._release_executor(slot)

WebSocketModalExecutor ¶

Executes Custom Python Blocks via a persistent WebSocket + msgpack.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

class WebSocketModalExecutor:
    """Executes Custom Python Blocks via a persistent WebSocket + msgpack.

    One instance owns at most one socket at a time. Requests are msgpack
    frames; ``_io_lock`` is held across each send+receive pair so a request and
    its response cannot interleave with another caller's. The executor also
    remembers which code hashes were already sent on the current connection so
    later frames can omit the code itself, and runs a daemon thread that pings
    the socket after it has been idle.
    """

    # Idle time in seconds after which the keepalive thread pings the socket;
    # also used as that thread's wake-up interval.
    _KEEPALIVE_IDLE_SECONDS = 25.0

    def __init__(self, workspace_id: Optional[str] = None):
        """Set up connection state; no socket is opened here.

        Args:
            workspace_id: Default workspace for executions; falls back to
                ``MODAL_ANONYMOUS_WORKSPACE_NAME`` when falsy.
        """
        self.workspace_id = workspace_id or MODAL_ANONYMOUS_WORKSPACE_NAME
        # Current socket, or None when disconnected.
        self._ws: Any = None
        # Endpoint URL, resolved once by ``_get_ws_url`` and then reused.
        self._ws_url: Optional[str] = None
        # Code hashes recorded after a successful execution on the current
        # connection; reset whenever the connection is (re)opened or dropped.
        self._hashes_sent_on_ws: set = set()
        # Serializes connect, send+recv and keepalive pings.
        self._io_lock = threading.Lock()
        # ``_time.monotonic()`` of the last connect, round trip or ping.
        self._last_activity: float = 0.0
        self._keepalive_stop: Optional[threading.Event] = None
        self._keepalive_thread: Optional[threading.Thread] = None

    def _get_ws_url(self, workspace_id: str) -> str:
        """Resolve and cache the websocket endpoint URL.

        Resolution order: the ``MODAL_WS_ENDPOINT_URL`` environment variable
        (trailing slashes stripped), then ``MODAL_WEB_ENDPOINT_URL`` converted
        by ``_coerce_http_endpoint_to_ws_endpoint``, then the base produced by
        ``_build_webexec_endpoint_base``. The base is combined with
        ``workspace_id`` by ``_as_ws_endpoint_url``.

        Note that the cached value is returned on every later call regardless
        of the ``workspace_id`` argument passed then.
        """
        if self._ws_url is not None:
            return self._ws_url

        explicit_ws_url = os.environ.get("MODAL_WS_ENDPOINT_URL", "")
        if explicit_ws_url:
            base = explicit_ws_url.rstrip("/")
        else:
            legacy_http_url = os.environ.get("MODAL_WEB_ENDPOINT_URL", "")
            if legacy_http_url:
                base = _coerce_http_endpoint_to_ws_endpoint(legacy_http_url)
            else:
                base = _build_webexec_endpoint_base(
                    method_label=_WEBEXEC_WS_METHOD_LABEL
                )

        self._ws_url = _as_ws_endpoint_url(base, workspace_id)
        return self._ws_url

    def _connect(self, workspace_id: str) -> None:
        """Open a new socket and reset per-connection state.

        Sends ``Modal-Key`` / ``Modal-Secret`` headers, applies the connect
        timeout to ``create_connection`` and then switches the socket to the
        read timeout. Also clears the sent-hash set, stamps ``_last_activity``
        and makes sure the keepalive thread is running.
        """
        # Imported lazily; ``execute_remote`` reports a missing
        # websocket-client install before this method is reached.
        import websocket as ws_lib

        url = self._get_ws_url(workspace_id)
        headers = {
            "Modal-Key": MODAL_TOKEN_ID,
            "Modal-Secret": MODAL_TOKEN_SECRET,
        }
        logger.info("[webexec-ws] Connecting to %s", url)
        self._ws = ws_lib.create_connection(
            url,
            header=[f"{k}: {v}" for k, v in headers.items()],
            timeout=WEBEXEC_WS_CONNECT_TIMEOUT_SECONDS,
        )
        self._ws.settimeout(WEBEXEC_WS_READ_TIMEOUT_SECONDS)
        # New container -> no compiled namespaces cached yet.
        self._hashes_sent_on_ws = set()
        self._last_activity = _time.monotonic()
        self._ensure_keepalive_thread()
        logger.info("[webexec-ws] Connected")

    def _ensure_connection(self, workspace_id: str) -> None:
        """Connect if there is no socket; otherwise do nothing.

        ``_connect`` is invoked while ``_io_lock`` is held.
        """
        # Hot path: trust the cached socket. A dead connection will surface
        # as an exception on the very next ``send``/``recv`` and we drop+
        # reconnect in the caller's except block (see ``_execute_ws``).
        if self._ws is None:
            with self._io_lock:
                # Double-check inside the lock to prevent race where two
                # threads both see _ws as None and both call _connect(),
                # leaking a socket and keepalive thread.
                if self._ws is None:
                    self._connect(workspace_id)

    def _ensure_keepalive_thread(self) -> None:
        """Start the keepalive daemon thread unless one is already alive.

        Each started thread gets its own fresh stop event, which is also
        stored on the instance so ``close`` can signal it.
        """
        if self._keepalive_thread is not None and self._keepalive_thread.is_alive():
            return
        self._keepalive_stop = threading.Event()
        self._keepalive_thread = threading.Thread(
            target=self._keepalive_loop,
            args=(self._keepalive_stop,),
            name=f"webexec-ws-keepalive-{self.workspace_id}",
            daemon=True,
        )
        self._keepalive_thread.start()

    def _keepalive_loop(self, stop_event: threading.Event) -> None:
        """Ping the WS when the connection has been idle long enough.

        Skipped entirely while frames are flowing (``_last_activity`` is
        updated on every successful RTT). Uses ``acquire(blocking=False)`` so
        the keepalive never delays a real frame already in flight.

        The loop ends when ``stop_event`` is set, when no socket is present,
        or after a failed ping (in which case the socket is closed and the
        connection state cleared).
        """
        interval = self._KEEPALIVE_IDLE_SECONDS
        # ``wait`` returns True once the event is set, which ends the loop.
        while not stop_event.wait(interval):
            ws = self._ws
            if ws is None:
                return
            idle = _time.monotonic() - self._last_activity
            if idle < interval:
                continue
            if not self._io_lock.acquire(blocking=False):
                # Frame in flight -> that's keepalive enough.
                continue
            try:
                # Re-read under the lock: the socket may have been dropped or
                # replaced since the unlocked check above.
                ws = self._ws
                if ws is None:
                    return
                try:
                    ws.ping()
                    self._last_activity = _time.monotonic()
                    logger.debug("[webexec-ws] keepalive ping ok")
                except Exception as e:
                    logger.debug(
                        "[webexec-ws] keepalive ping failed (%s); dropping conn",
                        e,
                    )
                    try:
                        ws.close()
                    except Exception:
                        # Best effort: the connection is being discarded.
                        pass
                    self._ws = None
                    self._hashes_sent_on_ws = set()
                    return
            finally:
                self._io_lock.release()

    def close(self) -> None:
        """Best-effort teardown, mainly for tests.

        Signals the keepalive stop event, detaches the socket and closes it,
        ignoring errors raised by the close. ``_io_lock`` is not taken here.
        """
        if self._keepalive_stop is not None:
            self._keepalive_stop.set()
        ws = self._ws
        self._ws = None
        if ws is not None:
            try:
                ws.close()
            except Exception:
                pass

    def _drop_ws_connection(self) -> None:
        """Close the socket (ignoring errors) and clear connection state.

        Clearing ``_hashes_sent_on_ws`` makes the next execution send full
        code again.
        """
        try:
            if self._ws is not None:
                self._ws.close()
        except Exception:
            pass
        self._ws = None
        self._hashes_sent_on_ws = set()

    def execute_remote(
        self,
        block_type_name: str,
        python_code: PythonCode,
        inputs: Dict[str, Any],
        workspace_id: Optional[str] = None,
        workflow_context: Optional[Dict[str, Any]] = None,
    ) -> BlockResult:
        """Validate preconditions, then run the block over the websocket.

        Args:
            block_type_name: Block name, forwarded for error reporting.
            python_code: Code definition to execute.
            inputs: Workflow inputs for the run function.
            workspace_id: Overrides the instance workspace when truthy.
            workflow_context: Extra context sent with the frame; ``None`` is
                sent as an empty dict.

        Returns:
            The result produced by ``_execute_ws``.

        Raises:
            DynamicBlockError: If ``MODAL_AVAILABLE`` is false, if the
                workspace is anonymous while
                ``MODAL_ALLOW_ANONYMOUS_EXECUTION`` is false, or if
                ``msgpack`` / ``websocket`` cannot be imported.
        """
        if not MODAL_AVAILABLE:
            raise DynamicBlockError(
                public_message="Modal credentials not configured. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables.",
                context="modal_executor | credentials_check",
            )

        workspace = workspace_id or self.workspace_id
        if not workspace or workspace in (
            "anonymous",
            "unauthorized",
            MODAL_ANONYMOUS_WORKSPACE_NAME,
        ):
            from inference.core.env import MODAL_ALLOW_ANONYMOUS_EXECUTION

            if not MODAL_ALLOW_ANONYMOUS_EXECUTION:
                raise DynamicBlockError(
                    public_message="Modal validation requires an API key when anonymous execution is disabled.",
                    context="modal_executor | validation_authentication",
                )

        # Both transport dependencies are imported lazily so their absence is
        # reported as a DynamicBlockError instead of a bare ImportError.
        try:
            import msgpack
        except ImportError:
            raise DynamicBlockError(
                public_message="WEBEXEC_TRANSPORT is set to 'websocket' but msgpack is not installed. "
                "Install it with: pip install msgpack",
                context="modal_executor | missing_dependency",
            )

        try:
            import websocket as _ws_lib  # noqa: F401
        except ImportError:
            raise DynamicBlockError(
                public_message="WEBEXEC_TRANSPORT is set to 'websocket' but websocket-client is not installed. "
                "Install it with: pip install websocket-client",
                context="modal_executor | missing_dependency",
            )

        return self._execute_ws(
            block_type_name,
            python_code,
            inputs,
            workspace,
            msgpack,
            workflow_context or {},
        )

    def _send_recv_with_retry(
        self,
        frame_bytes: bytes,
        workspace: str,
    ) -> bytes:
        """Send frame and receive response, reconnecting once before execution.

        We retry connection/send failures because the frame has not been
        accepted by the websocket client. Once ``send_binary`` succeeds, the
        remote may already be executing user code; a later ``recv`` failure has
        an ambiguous outcome, so we do not resend the frame and risk duplicate
        side effects.

        Args:
            frame_bytes: Packed request frame; it is passed through
                ``_split_ws_frames`` before sending.
            workspace: Workspace used if a (re)connect is needed.

        Returns:
            The raw response bytes from ``_recv_reassembled``.

        Raises:
            DynamicBlockError: If receiving fails after the frame was sent, or
                if connecting/sending fails on both attempts.
        """
        import msgpack

        frames = _split_ws_frames(frame_bytes, msgpack)
        last_exc: Optional[Exception] = None
        for attempt in range(2):
            # Tracks whether every frame was handed to the socket, which is
            # what decides between "retry" and "fail without resending".
            sent_ok = False
            try:
                self._ensure_connection(workspace)
                # Hold the lock across send+recv so concurrent callers sharing
                # this executor's socket can't interleave a request/response.
                with self._io_lock:
                    for frame in frames:
                        self._ws.send_binary(frame)
                    sent_ok = True
                    resp_bytes = self._recv_reassembled(msgpack)
                self._last_activity = _time.monotonic()
                return resp_bytes
            except Exception as e:
                # Any failure invalidates the socket and its hash cache.
                self._drop_ws_connection()
                if sent_ok:
                    # recv failed after the frame was sent; the remote may have
                    # already executed user code, so we don't resend and risk
                    # duplicate side effects.
                    logger.warning(
                        "[webexec-ws] response receive failed after frame was "
                        "sent; not retrying to avoid duplicate execution: %s",
                        e,
                    )
                    raise DynamicBlockError(
                        public_message=(
                            "WebSocket connection to Modal endpoint lost after "
                            "the request was sent. The custom Python block may "
                            "have already executed, so the frame was not retried."
                        ),
                        context="modal_executor | websocket_response",
                    )
                last_exc = e
                logger.warning(
                    "[webexec-ws] connect/send failed (attempt %d/2): %s",
                    attempt + 1,
                    e,
                )
                continue

        raise DynamicBlockError(
            public_message=f"WebSocket connection to Modal endpoint failed after retry: {last_exc}",
            context="modal_executor | websocket_connection",
        )

    def _recv_reassembled(self, msgpack: Any) -> bytes:
        """Receive one logical frame, joining chunked frames if signalled.

        A ``bytes`` frame shorter than 64 bytes is decoded as a possible
        header: when it is a dict containing ``_chunked``, that value is the
        number of further frames to receive and concatenate. Anything else is
        returned as received.
        """
        resp_bytes = self._ws.recv()
        if isinstance(resp_bytes, bytes) and len(resp_bytes) < 64:
            head = msgpack.unpackb(resp_bytes, raw=False)
            if isinstance(head, dict) and "_chunked" in head:
                return b"".join(self._ws.recv() for _ in range(head["_chunked"]))
        return resp_bytes

    def _execute_ws(
        self,
        block_type_name: str,
        python_code: PythonCode,
        inputs: Dict[str, Any],
        workspace: str,
        msgpack: Any,
        workflow_context: Dict[str, Any],
    ) -> BlockResult:
        """Run one execution round trip and return the deserialized result.

        Serializes the inputs, builds a frame (hash-only when this code hash
        was already recorded for the current connection), sends it, and
        resends once with full code if the server answers
        ``UnknownCodeHash``. On success the hash is recorded and any returned
        stdout/stderr is replayed to this process's streams.

        Raises:
            DynamicBlockCodeError: Via ``_raise_code_error`` when the response
                does not report success.
            DynamicBlockError: Propagated from ``_send_recv_with_retry``.
        """
        # The t_* timestamps only feed the debug timing log below.
        t0 = _time.monotonic()

        packed_inputs = serialize_inputs_for_msgpack(inputs)
        t_ser = _time.monotonic()

        code_hash = _compute_code_hash(
            python_code.run_function_code or "",
            python_code.imports,
        )

        # Hash-only path: if we've already sent this code over the current WS
        # connection (pinned to one container), drop ``code_str`` + ``imports``
        # from every subsequent frame. The server looks up the cached
        # compiled namespace by hash.
        send_full_code = code_hash not in self._hashes_sent_on_ws

        frame_bytes = self._build_ws_frame(
            python_code=python_code,
            packed_inputs=packed_inputs,
            code_hash=code_hash,
            send_full_code=send_full_code,
            msgpack=msgpack,
            workflow_context=workflow_context,
        )
        t_pack = _time.monotonic()

        resp_bytes = self._send_recv_with_retry(frame_bytes, workspace)

        t_rtt = _time.monotonic()

        result = msgpack.unpackb(resp_bytes, raw=False)

        # Fresh replica doesn't have this hash cached (can happen after a
        # reconnect or container restart). Retry once with full code.
        if (
            not send_full_code
            and not result.get("success", False)
            and result.get("error_type") == "UnknownCodeHash"
        ):
            self._hashes_sent_on_ws.discard(code_hash)
            logger.info(
                "[webexec-ws] server missed cached hash %s, resending full code",
                code_hash,
            )
            retry_frame = self._build_ws_frame(
                python_code=python_code,
                packed_inputs=packed_inputs,
                code_hash=code_hash,
                send_full_code=True,
                msgpack=msgpack,
                workflow_context=workflow_context,
            )
            resp_bytes = self._send_recv_with_retry(retry_frame, workspace)
            result = msgpack.unpackb(resp_bytes, raw=False)

        # Only a successful run marks the hash as known for this connection.
        if result.get("success", False):
            self._hashes_sent_on_ws.add(code_hash)

        t_done = _time.monotonic()

        # ``bytes`` and ``hash_only`` describe the first frame built above,
        # even when a full-code retry frame was sent afterwards.
        logger.debug(
            "[webexec-ws-timing] serialize=%.0fms pack=%.0fms rtt=%.0fms unpack=%.0fms total=%.0fms bytes=%d hash_only=%s",
            (t_ser - t0) * 1000,
            (t_pack - t_ser) * 1000,
            (t_rtt - t_pack) * 1000,
            (t_done - t_rtt) * 1000,
            (t_done - t0) * 1000,
            len(frame_bytes),
            not send_full_code,
        )

        if not result.get("success", False):
            self._raise_code_error(result, block_type_name, python_code)

        # Replay output captured remotely onto the local streams.
        stdout = result.get("stdout")
        stderr = result.get("stderr")
        if stdout:
            sys.stdout.write(stdout)
            sys.stdout.flush()
        if stderr:
            sys.stderr.write(stderr)
            sys.stderr.flush()

        return _deserialize_msgpack_result(result.get("result", {}))

    @staticmethod
    def _build_ws_frame(
        python_code: PythonCode,
        packed_inputs: Dict[str, Any],
        code_hash: str,
        send_full_code: bool,
        msgpack: Any,
        workflow_context: Dict[str, Any],
    ) -> bytes:
        """Pack a msgpack frame, optionally omitting ``code_str``/``imports``.

        When ``send_full_code`` is False the server resolves the compiled
        namespace through its per-container cache keyed by ``code_hash``.

        Returns:
            The payload packed with ``msgpack.packb(..., use_bin_type=True)``.
        """
        payload: Dict[str, Any] = {
            "code_hash": code_hash,
            "run_function_name": python_code.run_function_name,
            "inputs": packed_inputs,
            "workflow_context": workflow_context,
        }
        if send_full_code:
            payload["code_str"] = python_code.run_function_code
            # A missing imports list is sent as an empty list.
            payload["imports"] = python_code.imports or []
        return msgpack.packb(payload, use_bin_type=True)

    @staticmethod
    def _raise_code_error(
        result: dict,
        block_type_name: str,
        python_code: PythonCode,
    ) -> None:
        """Convert a failed response into ``DynamicBlockCodeError``.

        Reads ``error``, ``error_type``, ``line_number`` and ``function_name``
        from ``result``. When a line number and the source code are both
        available, a code snippet and a traceback string are attached.

        Raises:
            DynamicBlockCodeError: Always; this method never returns.
        """
        error_msg = result.get("error", "Unknown error")
        error_type = result.get("error_type", "RuntimeError")
        line_number = result.get("line_number")
        function_name = result.get("function_name") or "run"
        code = python_code.run_function_code

        # Location details are only included when a line number is known.
        message = (
            f"Error in line {line_number}, in {function_name}: {error_type}: {error_msg}"
            if line_number
            else f"{error_type}: {error_msg}"
        )

        code_snippet = None
        traceback_str = None
        if line_number and code:
            snippet = extract_code_snippet(code, line_number)
            code_snippet = snippet.lstrip("\n") if snippet else None
            traceback_str = build_traceback_string(
                code,
                line_number,
                function_name,
                error_type,
                error_msg,
            )

        raise DynamicBlockCodeError(
            public_message=message,
            block_type_name=block_type_name,
            error_line=line_number,
            code_snippet=code_snippet,
            traceback_str=traceback_str,
            stdout=result.get("stdout"),
            stderr=result.get("stderr"),
        )

Methods:¶

close ¶

close()

Best-effort teardown, mainly for tests.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

def close(self) -> None:
    """Tear the connection down on a best-effort basis (mainly for tests).

    Sets the keepalive stop event when one exists, detaches the socket from
    the instance, and closes it while ignoring any error the close raises.
    """
    stop_signal = self._keepalive_stop
    if stop_signal is not None:
        stop_signal.set()

    # Detach first so the instance already reports "no socket" by the time
    # the (possibly failing) close call runs.
    detached, self._ws = self._ws, None
    if detached is None:
        return
    try:
        detached.close()
    except Exception:
        pass

Functions:¶

get_modal_executor ¶

get_modal_executor(workspace_id=None)

Returns the right executor based on WEBEXEC_TRANSPORT.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

def get_modal_executor(workspace_id: Optional[str] = None) -> Any:
    """Build the executor matching the configured ``WEBEXEC_TRANSPORT``.

    Args:
        workspace_id: Workspace passed to the executor constructor.

    Returns:
        A ``PooledWebSocketModalExecutor`` when the transport is
        ``"websocket"``, otherwise a ``ModalExecutor``.
    """
    from inference.core.env import WEBEXEC_TRANSPORT

    executor_cls = (
        PooledWebSocketModalExecutor
        if WEBEXEC_TRANSPORT == "websocket"
        else ModalExecutor
    )
    return executor_cls(workspace_id)

serialize_inputs_for_msgpack ¶

serialize_inputs_for_msgpack(inputs)

Convert workflow inputs to a msgpack-friendly dict.

Images become {"_type": "workflow_image", "_jpeg_bytes": <bytes>, ...}. Detections and other tagged types keep their _type markers but remain plain dicts/lists so msgpack can handle them.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

def serialize_inputs_for_msgpack(inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Translate workflow inputs into structures msgpack can encode.

    Images become ``{"_type": "workflow_image", "_jpeg_bytes": <bytes>, ...}``.
    Detections, video metadata, datetimes and numpy arrays are turned into
    plain dicts carrying a ``_type`` marker; dicts, lists and tuples are
    walked recursively (tuples come back as lists).

    Args:
        inputs: Mapping of input name to workflow value.

    Returns:
        A new dict with the same keys and converted values.
    """
    import supervision as sv

    from inference.core.workflows.core_steps.common.serializers import (
        serialise_sv_detections,
        serialize_video_metadata_kind,
    )
    from inference.core.workflows.execution_engine.entities.base import (
        VideoMetadata,
        WorkflowImageData,
    )

    def _pack_mapping(mapping: Dict[str, Any]) -> Dict[str, Any]:
        # Convert every value of a mapping, leaving the keys untouched.
        return {key: _pack(item) for key, item in mapping.items()}

    def _pack(value: Any) -> Any:
        if isinstance(value, sv.Detections):
            tagged = serialise_sv_detections(detections=value)
            tagged["_type"] = "sv_detections"
            return _pack_mapping(tagged)
        if isinstance(value, WorkflowImageData):
            return _pack_mapping(_serialize_image_for_msgpack(value))
        if isinstance(value, VideoMetadata):
            tagged = serialize_video_metadata_kind(value)
            tagged["_type"] = "video_metadata"
            return _pack_mapping(tagged)
        if isinstance(value, datetime):
            return {"_type": "datetime", "value": value.isoformat()}
        if isinstance(value, np.ndarray):
            return {
                "_type": "ndarray",
                "value": value.tolist(),
                "dtype": str(value.dtype),
                "shape": list(value.shape),
            }
        if isinstance(value, dict):
            return _pack_mapping(value)
        if isinstance(value, (list, tuple)):
            return [_pack(item) for item in value]
        # ``bytes`` and every other value are passed through unchanged.
        return value

    return _pack_mapping(inputs)

validate_code_in_modal ¶

validate_code_in_modal(python_code, workspace_id=None)

Validate Python code syntax in a Modal sandbox via web endpoint.

Validation intentionally uses the HTTP execute-block endpoint even when WEBEXEC_TRANSPORT=websocket for execution. Deployments that use websocket execution must keep both Modal methods deployed: execute-block for validation and wsapp for execution.

Parameters:

Name	Type	Description	Default
`python_code`	`PythonCode`	The Python code to validate	required
`workspace_id`	`Optional[str]`	The workspace ID for Modal App	`None`

Returns:

Type	Description
`bool`	True if code is valid, raises otherwise

Raises:

Type	Description
`DynamicBlockError`	If code validation fails

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/modal_executor.py

def validate_code_in_modal(
    python_code: PythonCode, workspace_id: Optional[str] = None
) -> bool:
    """Check the syntax of user code by compiling it remotely over HTTP.

    The HTTP ``execute-block`` endpoint is used on purpose, even when
    ``WEBEXEC_TRANSPORT=websocket`` is selected for execution. Deployments
    running websocket execution therefore need both Modal methods deployed:
    ``execute-block`` for validation and ``wsapp`` for execution.

    Args:
        python_code: The Python code to validate
        workspace_id: The workspace ID for Modal App

    Returns:
        True if code is valid, raises otherwise

    Raises:
        DynamicBlockError: If Modal is not configured, if the remote check
            reports the code as invalid, or if the remote call fails.
    """
    if not MODAL_AVAILABLE:
        raise DynamicBlockError(
            public_message="Modal credentials not configured. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables.",
            context="modal_executor | credentials_check",
        )

    workspace = workspace_id or MODAL_ANONYMOUS_WORKSPACE_NAME

    # Assemble the same text that create_dynamic_module would compile: the run
    # function followed by the optional init function.
    full_code = python_code.run_function_code
    init_code = python_code.init_function_code
    if init_code:
        full_code = full_code + "\n\n" + init_code

    # repr() yields a literal with quotes and special characters escaped, so
    # the user code can be embedded verbatim in the generated source below.
    escaped_code = repr(full_code)

    # Remote snippet: compile + AST-parse the embedded code and report back.
    validation_code = PythonCode(
        type="PythonCode",
        imports=[],
        run_function_code=f"""
import ast

def validate_syntax():
    try:
        # Try to compile the user code
        code = {escaped_code}
        compile(code, "<string>", "exec")
        # Try to parse as AST to check structure
        ast.parse(code)
        return {{"valid": True}}
    except SyntaxError as e:
        return {{"valid": False, "error": str(e), "line": e.lineno}}
    except Exception as e:
        return {{"valid": False, "error": str(e)}}
""",
        run_function_name="validate_syntax",
        init_function_code=None,
        init_function_name="init",
    )

    # Validation is a control-plane check and stays on HTTP; the websocket
    # transport is reserved for the execution fast path.
    executor = ModalExecutor(workspace_id=workspace)

    try:
        # The validation snippet takes no inputs.
        result = executor.execute_remote(
            block_type_name="validation",
            python_code=validation_code,
            inputs={},
            workspace_id=workspace,
        )

        if result.get("valid") is False:
            detail = result.get("error", "Unknown syntax error")
            line_no = result.get("line")
            if line_no:
                detail = f"Line {line_no}: {detail}"
            raise DynamicBlockError(
                public_message=f"Code validation failed: {detail}",
                context="modal_executor | code_validation",
            )

        return True

    except DynamicBlockError:
        # Already in the public error shape; let it through untouched.
        raise
    except Exception as e:
        raise DynamicBlockError(
            public_message=f"Code validation failed: {str(e)}",
            context="modal_executor | code_validation",
        )

inference.core.workflows.execution_engine.v1.dynamic_blocks.workflow_debug ¶

Workflow-scoped debug trace for custom Python blocks.

When the HTTP layer opts in to debug execution (debug=True), a WorkflowDebugTrace is activated for the run via register_debug_session(). Each custom Python block receives a module-level debug_traces proxy that appends structured values into the active trace. The collected entries are returned alongside python_blocks_output_streams in the workflow response.

Propagation model mirrors debug_logs: the active trace lives in a ContextVar and the execution engine re-binds it (and the current step name) inside every worker thread spawned by ThreadPoolExecutor.

Classes¶

WorkflowDebugTrace ¶

Thread-safe, append-only trace of intermediate state across blocks.

Source code in inference/core/workflows/execution_engine/v1/dynamic_blocks/workflow_debug.py

class WorkflowDebugTrace:
    """Append-only, lock-protected trace of intermediate state across blocks.

    Three limits apply: a serialized-size cap per entry, a maximum number of
    entries, and a cap on the total serialized size. The first append that
    would break a limit records a capacity marker instead, after which all
    further appends are ignored.
    """

    def __init__(
        self,
        max_entries: int = MAX_DEBUG_ENTRIES,
        max_entry_serialized_chars: int = MAX_ENTRY_SERIALIZED_CHARS,
        max_total_serialized_chars: int = MAX_TOTAL_SERIALIZED_CHARS,
    ) -> None:
        """Create an empty trace with the given limits."""
        self._lock = threading.Lock()
        self._entries: List[Dict[str, Any]] = []
        self._max_entries = max_entries
        self._max_entry_serialized_chars = max_entry_serialized_chars
        self._max_total_serialized_chars = max_total_serialized_chars
        self._total_serialized_chars = 0
        self._capacity_exceeded = False

    def _append_capacity_marker(self, step_name: Optional[str]) -> None:
        """Record the capacity marker entry and account for its size."""
        marker_entry = _capacity_marker_entry(
            step_name, self._max_entry_serialized_chars
        )
        self._total_serialized_chars += _entry_serialized_size(marker_entry)
        self._entries.append(marker_entry)

    def _truncate_value_to_cap(
        self, entry: Dict[str, Any], serialized_value: Any
    ) -> None:
        """Shorten ``entry["value"]`` in place so the entry fits the cap.

        The value is replaced by the longest prefix of its text (its ``repr``
        when it is not already a string) followed by a truncation marker. If
        even the empty prefix does not fit, the value ends up as the marker
        alone and the entry stays oversized.
        """
        marker = "... [entry truncated]"
        if isinstance(serialized_value, str):
            text = serialized_value
        else:
            text = repr(serialized_value)
        cap = self._max_entry_serialized_chars
        # Binary search over the prefix length. Counting characters is not a
        # reliable bound because json.dumps escaping can expand characters
        # several-fold, and shrinking one character at a time would serialize
        # the entry O(n) times. Serialized size grows monotonically with the
        # prefix length, so O(log n) probes are enough.
        low, high, keep = 0, len(text), 0
        while low <= high:
            probe = (low + high) // 2
            entry["value"] = text[:probe] + marker
            if _entry_serialized_size(entry) > cap:
                high = probe - 1
            else:
                keep = probe
                low = probe + 1
        entry["value"] = text[:keep] + marker

    def append(
        self,
        step_name: Optional[str],
        value: Any,
        *,
        add_timestamp: bool = False,
        timezone: Optional[Union[str, DatetimeTzInfo]] = None,
    ) -> None:
        """Record ``value`` for ``step_name``, subject to the trace limits.

        Args:
            step_name: Name of the step producing the value.
            value: Value to serialize and store.
            add_timestamp: When true, ``timestamp`` and ``timestamp_timezone``
                fields are added to the entry.
            timezone: Passed to the timestamp formatter.
        """
        serialized_value = _serialize_debug_value(value)
        entry: Dict[str, Any] = {"step": step_name, "value": serialized_value}
        if add_timestamp:
            entry["timestamp"], entry["timestamp_timezone"] = _format_timestamp(
                timezone
            )
        if _entry_serialized_size(entry) > self._max_entry_serialized_chars:
            self._truncate_value_to_cap(entry, serialized_value)
        entry_size = _entry_serialized_size(entry)

        with self._lock:
            if self._capacity_exceeded:
                return
            # An entry still over the per-entry cap after truncation means the
            # metadata alone (e.g. a very long, client-controlled step name)
            # does not fit; that is treated like the other two limits.
            limit_hit = (
                entry_size > self._max_entry_serialized_chars
                or len(self._entries) >= self._max_entries
                or self._total_serialized_chars + entry_size
                > self._max_total_serialized_chars
            )
            if limit_hit:
                self._capacity_exceeded = True
                self._append_capacity_marker(step_name)
                return
            self._total_serialized_chars += entry_size
            self._entries.append(entry)

    def snapshot(self) -> List[Dict[str, Any]]:
        """Return a shallow copy of the entries recorded so far."""
        with self._lock:
            return list(self._entries)

`core/workflows/execution_engine/v1/executor/execution_data_manager`¶

inference.core.workflows.execution_engine.v1.executor.execution_data_manager.step_input_assembler ¶

Functions:¶

filter_to_valid_prefix_chains ¶

filter_to_valid_prefix_chains(per_dim_sets, dimensions)

Keep only indices that form a complete parent-child chain across dimensions.

Given per-dimension sets (e.g. from intersect_masks_per_dimension), retains only indices that have a full lineage from the smallest to the largest dimension. Used for inter-level intersection.

Source code in inference/core/workflows/execution_engine/v1/executor/execution_data_manager/step_input_assembler.py

def filter_to_valid_prefix_chains(
    per_dim_sets: Dict[int, Set[DynamicBatchIndex]],
    dimensions: Set[int],
) -> Dict[int, Set[DynamicBatchIndex]]:
    """Drop indices that are not part of a complete lineage across ``dimensions``.

    An index survives only when its parent (the index without its last
    element) survives at the previous requested dimension and, unless it sits
    at the largest dimension, at least one of its children leads down to the
    largest dimension.

    Args:
        per_dim_sets: Candidate indices keyed by dimension (e.g. the output of
            ``intersect_masks_per_dimension``). Missing dimensions are treated
            as empty.
        dimensions: Dimensions to consider.

    Returns:
        A dict with one entry per requested dimension. With zero or one
        dimension the candidate sets are returned unfiltered.
    """
    ordered = sorted(dimensions)
    candidates: Dict[int, Set[DynamicBatchIndex]] = {
        level: per_dim_sets.get(level, set()) for level in ordered
    }

    # Nothing to chain when there is at most one level.
    if len(ordered) < 2:
        return dict(candidates)

    lowest, highest = ordered[0], ordered[-1]
    # Maps every level (except the lowest) to the level directly below it.
    previous_of = dict(zip(ordered[1:], ordered))

    # Pass 1 (deepest level first): collect the parents of every index that
    # reaches the deepest level, propagating upwards one level at a time.
    with_descendant: Set[DynamicBatchIndex] = set()
    for level in ordered[::-1]:
        for index in candidates[level]:
            if level != highest and index not in with_descendant:
                continue
            prefix = index[:-1]
            if prefix:
                with_descendant.add(prefix)

    # Pass 2 (shallowest level first): keep an index only when its parent was
    # kept and it still leads down to the deepest level.
    kept: Dict[int, Set[DynamicBatchIndex]] = {level: set() for level in ordered}
    for level in ordered:
        for index in candidates[level]:
            if level == lowest:
                chain_is_complete = index in with_descendant
            else:
                parent_kept = index[:-1] in kept[previous_of[level]]
                chain_is_complete = parent_kept and (
                    level == highest or index in with_descendant
                )
            if chain_is_complete:
                kept[level].add(index)

    return kept

get_masks_intersection_for_dimensions ¶

get_masks_intersection_for_dimensions(
    batch_masks, dimensions
)

Intersect masks at each dimension and filter to valid prefix chains.

Source code in inference/core/workflows/execution_engine/v1/executor/execution_data_manager/step_input_assembler.py

def get_masks_intersection_for_dimensions(
    batch_masks: List[Set[DynamicBatchIndex]],
    dimensions: Set[int],
) -> Dict[int, Optional[Set[DynamicBatchIndex]]]:
    """Intersect masks per dimension, then keep only complete prefix chains.

    Args:
        batch_masks: Masks to intersect; each mask is a set of batch indices.
        dimensions: Dimensions for which a result is requested.

    Returns:
        A dict keyed by dimension. When ``batch_masks`` is empty every value is
        ``None``. With a single dimension the plain per-dimension intersection
        is returned; with several, the intersection is additionally filtered
        by ``filter_to_valid_prefix_chains``.
    """
    if not batch_masks:
        # No masks at all: report "no mask" (None) for every dimension.
        return dict.fromkeys(dimensions)

    ordered = sorted(dimensions)
    intersected = intersect_masks_per_dimension(batch_masks, dimensions)

    if len(ordered) > 1:
        return filter_to_valid_prefix_chains(intersected, dimensions)
    return {level: intersected[level] for level in ordered}

intersect_masks_per_dimension ¶

intersect_masks_per_dimension(batch_masks, dimensions)

Intersect masks at each dimensionality level.

For each dimension d, returns the set of indices (with length d) that appear in every mask that has at least one index at that dimension. Masks with no indices at d are ignored for that dimension. Used for intra-dimensional intersection.

Source code in inference/core/workflows/execution_engine/v1/executor/execution_data_manager/step_input_assembler.py

def intersect_masks_per_dimension(
    batch_masks: List[Set[DynamicBatchIndex]],
    dimensions: Set[int],
) -> Dict[int, Set[DynamicBatchIndex]]:
    """Compute, for every dimension, the indices shared by all relevant masks.

    For a dimension ``d`` only indices of length ``d`` are considered. A mask
    that has no index of that length does not take part in the intersection
    for ``d``. If no mask has an index of length ``d`` the result for ``d`` is
    an empty set.

    Args:
        batch_masks: Masks to intersect; each mask is a set of batch indices.
        dimensions: Dimensions (index lengths) to compute the intersection for.

    Returns:
        A dict mapping each requested dimension to a newly created set.
    """
    intersections: Dict[int, Set[DynamicBatchIndex]] = {}
    for level in sorted(dimensions):
        # Per-mask subsets at this level, skipping masks with nothing here.
        populated = []
        for mask in batch_masks:
            at_level = {index for index in mask if len(index) == level}
            if at_level:
                populated.append(at_level)

        if populated:
            first, *others = populated
            intersections[level] = first.intersection(*others)
        else:
            intersections[level] = set()
    return intersections

`core/workflows/execution_engine/v1/inner_workflow`¶

inference.core.workflows.execution_engine.v1.inner_workflow.compiler_bridge ¶

Compile-time helpers for nested workflows (composition validation and parameter bindings).

inner_workflow steps are expanded into ordinary steps before parsing; see inline.py.

Classes¶

Functions:¶

inference.core.workflows.execution_engine.v1.inner_workflow.composition ¶

Compile-time validation of workflow composition (which workflow references which).

This is separate from the per-workflow execution DAG: the step graph must remain acyclic, while this module validates the meta-graph of nested workflow references (e.g. inner_workflow).

See docs/workflows/inner_workflow_design.md.

Classes¶

Functions:¶

assert_composition_acyclic ¶

assert_composition_acyclic(graph)

Raise InnerWorkflowCompositionCycleError if the composition graph is not a DAG.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/composition.py

def assert_composition_acyclic(graph: nx.DiGraph) -> None:
    """Ensure the composition graph is a DAG.

    Args:
        graph: Directed composition graph to check.

    Raises:
        InnerWorkflowCompositionCycleError: If the graph is not acyclic. The
            message lists the nodes of one cycle reported by ``nx.find_cycle``.
    """
    graph_is_empty = graph.number_of_nodes() == 0
    if graph_is_empty or nx.is_directed_acyclic_graph(graph):
        return

    edges_on_cycle = nx.find_cycle(graph)
    # Walk the cycle: the start node of each edge, closed by the final edge's end.
    cycle_nodes = [start for start, _ in edges_on_cycle]
    cycle_nodes.append(edges_on_cycle[-1][1])

    raise InnerWorkflowCompositionCycleError(
        "Inner workflow composition graph contains a cycle. "
        f"Involved nodes (partial): {cycle_nodes!r}."
    )

build_composition_digraph ¶

build_composition_digraph(containment_edges)

Build a directed graph where an edge (parent, child) means "parent's definition directly embeds or references child as an inner workflow".

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/composition.py

def build_composition_digraph(
    containment_edges: Iterable[Tuple[Hashable, Hashable]],
) -> nx.DiGraph:
    """Create the directed composition graph from ``(parent, child)`` pairs.

    An edge ``parent -> child`` means "parent's definition directly embeds or
    references child as an inner workflow".

    Args:
        containment_edges: Pairs of ``(parent, child)`` node identifiers.

    Returns:
        A new ``nx.DiGraph`` containing one edge per distinct pair.
    """
    composition = nx.DiGraph()
    for edge in containment_edges:
        source, target = edge
        composition.add_edge(source, target)
    return composition

find_composition_cycles ¶

find_composition_cycles(containment_edges)

Return a list of simple cycles in the composition graph (for diagnostics / tests).

Each cycle is a list of node ids in order. Empty if the graph is acyclic.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/composition.py

def find_composition_cycles(
    containment_edges: Collection[Tuple[Hashable, Hashable]],
) -> List[List[Hashable]]:
    """List the simple cycles of the composition graph (for diagnostics / tests).

    Args:
        containment_edges: ``(parent, child)`` pairs describing the composition.

    Returns:
        One list of node ids per simple cycle, as produced by
        ``nx.simple_cycles``. The list is empty when the graph is acyclic.
    """
    composition = build_composition_digraph(containment_edges)
    cycles = nx.simple_cycles(composition)
    return list(map(list, cycles))

max_nesting_depth_from_root ¶

max_nesting_depth_from_root(graph, root)

Maximum number of containment edges on any path starting at root.

If root has no outgoing containment edges, depth is 0.
If root references one child and that child references none, depth is 1.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/composition.py

def max_nesting_depth_from_root(graph: nx.DiGraph, root: Hashable) -> int:
    """Return the length, in edges, of the longest path that starts at ``root``.

    - ``0`` when ``root`` is not in the graph or has no outgoing edges.
    - ``1`` when ``root`` references one child and that child references none.

    The search is recursive and memoised per node. It does not detect cycles,
    so it only terminates normally on an acyclic graph.

    Args:
        graph: Directed composition graph.
        root: Node from which the nesting depth is measured.

    Returns:
        The maximum number of containment edges on any path starting at ``root``.
    """
    root_is_missing_or_leaf = root not in graph or graph.out_degree(root) == 0
    if root_is_missing_or_leaf:
        return 0

    longest_from: dict[Hashable, int] = {}

    def longest_path(node: Hashable) -> int:
        known = longest_from.get(node)
        if known is not None:
            return known

        children = list(graph.successors(node))
        # A node without successors contributes depth 0 (the ``default``).
        longest = max((longest_path(child) + 1 for child in children), default=0)
        longest_from[node] = longest
        return longest

    return longest_path(root)

validate_inner_workflow_composition ¶

validate_inner_workflow_composition(
    *,
    containment_edges,
    root_workflow_id,
    max_nesting_depth,
    max_inner_workflow_count
)

Validate that the composition graph is acyclic and within max depth from root.

Parameters:

Name	Type	Description	Default
`containment_edges`	`Collection[Tuple[Hashable, Hashable]]`	(parent_workflow_id, child_workflow_id) for each direct inner-workflow reference (one entry per `inner_workflow` step in the tree).	required
`root_workflow_id`	`Hashable`	Identity of the workflow being compiled (opaque string or tuple).	required
`max_nesting_depth`	`int`	Maximum allowed value from :func:`max_nesting_depth_from_root`.	required
`max_inner_workflow_count`	`int`	Maximum allowed number of `inner_workflow` steps in the whole nested definition (`len(containment_edges)`).	required

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/composition.py

def validate_inner_workflow_composition(
    *,
    containment_edges: Collection[Tuple[Hashable, Hashable]],
    root_workflow_id: Hashable,
    max_nesting_depth: int,
    max_inner_workflow_count: int,
) -> None:
    """Check a workflow composition against the cycle, depth and count limits.

    The checks run in this order: acyclicity, nesting depth from the root,
    total number of containment edges. The first failing check raises.

    Args:
        containment_edges: (parent_workflow_id, child_workflow_id) for each direct
            inner-workflow reference (one entry per ``inner_workflow`` step in the tree).
        root_workflow_id: Identity of the workflow being compiled (opaque string or tuple).
        max_nesting_depth: Maximum allowed value from :func:`max_nesting_depth_from_root`.
        max_inner_workflow_count: Maximum allowed number of ``inner_workflow`` steps in the
            whole nested definition (``len(containment_edges)``).

    Raises:
        InnerWorkflowCompositionCycleError: If the composition graph has a cycle
            (raised by :func:`assert_composition_acyclic`).
        InnerWorkflowNestingDepthError: If the depth from the root exceeds
            ``max_nesting_depth``.
        InnerWorkflowTotalCountError: If ``len(containment_edges)`` exceeds
            ``max_inner_workflow_count``.
    """
    composition = build_composition_digraph(containment_edges)
    assert_composition_acyclic(composition)

    depth = max_nesting_depth_from_root(composition, root_workflow_id)
    if depth > max_nesting_depth:
        depth_message = (
            f"Inner workflow nesting depth from root {root_workflow_id!r} is {depth}, "
            f"which exceeds the limit of {max_nesting_depth}."
        )
        raise InnerWorkflowNestingDepthError(depth_message)

    total = len(containment_edges)
    if total > max_inner_workflow_count:
        count_message = (
            f"Inner workflow step count is {total}, which exceeds the limit of "
            f"{max_inner_workflow_count}."
        )
        raise InnerWorkflowTotalCountError(count_message)

inference.core.workflows.execution_engine.v1.inner_workflow.dynamic_blocks_collection ¶

Collect dynamic_blocks_definitions from a workflow and nested inner workflows.

Inner workflows may declare custom Python blocks on their own definition object. The compiler must discover all of them before compile_dynamic_blocks and inlining.

Functions:¶

apply_collected_dynamic_blocks_definitions_to_workflow_root ¶

apply_collected_dynamic_blocks_definitions_to_workflow_root(
    workflow_definition,
)

Hoist collected dynamic block definitions onto the root workflow dict.

Calls :func:collect_dynamic_blocks_definitions_from_workflow_definition and, when the result is non-empty, sets workflow_definition["dynamic_blocks_definitions"] to that merged list (mutates workflow_definition in place).

Parameters:

Name	Type	Description	Default
`workflow_definition`	`Dict[str, Any]`	Raw workflow JSON to update and scan for definitions.	required

Returns:

Type	Description
`List[Any]`	The merged dynamic block definition list (possibly empty).

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/dynamic_blocks_collection.py

def apply_collected_dynamic_blocks_definitions_to_workflow_root(
    workflow_definition: Dict[str, Any],
) -> List[Any]:
    """Collect all dynamic block definitions and store them on the root workflow.

    Delegates the collection to
    :func:`collect_dynamic_blocks_definitions_from_workflow_definition`. When
    that returns a non-empty list, the list is written to
    ``workflow_definition["dynamic_blocks_definitions"]`` (the dict is mutated
    in place); when it is empty, ``workflow_definition`` is left untouched.

    Args:
        workflow_definition: Raw workflow JSON to update and scan for definitions.

    Returns:
        The merged dynamic block definition list (possibly empty).
    """
    definitions = collect_dynamic_blocks_definitions_from_workflow_definition(
        workflow_definition=workflow_definition,
    )
    if not definitions:
        return definitions

    workflow_definition["dynamic_blocks_definitions"] = definitions
    return definitions

collect_dynamic_blocks_definitions_from_workflow_definition ¶

collect_dynamic_blocks_definitions_from_workflow_definition(
    workflow_definition,
)

Collect dynamic block definitions from a workflow and nested inner workflows.

Walks workflow_definition depth-first. For each level, appends entries from dynamic_blocks_definitions; then recurses into inner_workflow steps via workflow_definition.

When the same manifest.block_type appears more than once, the first occurrence is kept (parent definitions win over nested children) and a warning is logged for each skipped duplicate. Definitions without a block_type are still included and are not deduplicated.

Malformed entries (non-list dynamic_blocks_definitions, non-dict list items) are passed through as-is so :func:compile_dynamic_blocks can validate them.

Parameters:

Name	Type	Description	Default
`workflow_definition`	`Dict[str, Any]`	Raw workflow JSON (`steps`, optional nested definitions).	required

Returns:

Type	Description
`List[Any]`	Merged list of dynamic block definition dicts in discovery order.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/dynamic_blocks_collection.py

def collect_dynamic_blocks_definitions_from_workflow_definition(
    workflow_definition: Dict[str, Any],
) -> List[Any]:
    """Gather dynamic block definitions from a workflow and its inner workflows.

    The definition is traversed depth-first. At each level the entries of
    ``dynamic_blocks_definitions`` are collected first; afterwards every
    ``inner_workflow`` step whose ``workflow_definition`` is a dict is visited
    in step order.

    Deduplication is by block type (as reported by ``_dynamic_block_type`` for
    dict entries): the first definition encountered wins, so a parent's
    definition takes precedence over a nested child's, and a warning is logged
    for every skipped duplicate. Entries for which no block type is obtained
    are always kept.

    Malformed input is forwarded unchanged so that
    :func:`compile_dynamic_blocks` can validate it: a non-empty, non-list
    ``dynamic_blocks_definitions`` value is collected as a single entry, and
    non-dict list items are collected as-is.

    Args:
        workflow_definition: Raw workflow JSON (``steps``, optional nested definitions).

    Returns:
        Merged list of dynamic block definitions in discovery order.
    """
    merged: List[Any] = []
    known_block_types: Set[str] = set()

    def register(candidate: Any) -> None:
        # Only dict entries can carry a block type; anything else is kept verbatim.
        block_type = (
            _dynamic_block_type(candidate) if isinstance(candidate, dict) else None
        )
        if block_type is None:
            merged.append(candidate)
            return

        if block_type in known_block_types:
            logger.warning(
                "Skipping duplicate dynamic block definition for block_type=%r; "
                "using the first definition collected while compiling the workflow.",
                block_type,
            )
            return

        known_block_types.add(block_type)
        merged.append(candidate)

    def walk(current: Dict[str, Any]) -> None:
        level_definitions = current.get("dynamic_blocks_definitions")
        if level_definitions:
            if isinstance(level_definitions, list):
                for entry in level_definitions:
                    register(entry)
            else:
                # Not a list: pass the raw value through as a single entry.
                register(level_definitions)

        for step in current.get("steps") or []:
            is_inner_workflow_step = (
                isinstance(step, dict)
                and step.get("type") == USE_INNER_WORKFLOW_BLOCK_TYPE
            )
            if not is_inner_workflow_step:
                continue

            nested = step.get("workflow_definition")
            if isinstance(nested, dict):
                walk(nested)

    walk(workflow_definition)

    return merged

inference.core.workflows.execution_engine.v1.inner_workflow.errors ¶

Errors raised by inner-workflow compilation and composition validation.

Classes¶

InnerWorkflowCompositionCycleError ¶

Raised when the workflow composition graph contains a cycle.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowCompositionCycleError(InnerWorkflowCompositionError):
    """Raised when the workflow composition graph contains a cycle.

    The composition graph has an edge from a workflow to every workflow it
    embeds or references as an inner workflow; this error reports that the
    graph is not a DAG.
    """

InnerWorkflowCompositionError ¶

Bases: WorkflowCompilerError

Base class for inner workflow composition failures at compile time.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowCompositionError(WorkflowCompilerError):
    """Common base for compile-time failures of inner workflow composition."""

    def __init__(
        self,
        public_message: str,
        context: Optional[str] = None,
        inner_error: Optional[Exception] = None,
    ) -> None:
        """Initialise the error, defaulting ``context`` when none is given.

        Args:
            public_message: Message forwarded to ``WorkflowCompilerError``.
            context: Error context; when ``None`` the module-level
                ``_INNER_WORKFLOW_COMPOSITION_CONTEXT`` is used instead.
            inner_error: Optional underlying exception, forwarded unchanged.
        """
        if context is None:
            context = _INNER_WORKFLOW_COMPOSITION_CONTEXT
        super().__init__(public_message, context, inner_error)

InnerWorkflowInliningStructureError ¶

Raised when inner_workflow inlining cannot make progress (unexpected nested structure).

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowInliningStructureError(InnerWorkflowCompositionError):
    """Raised when ``inner_workflow`` inlining cannot make progress (unexpected nested structure).

    This signals that ``inner_workflow`` steps are still present in the
    definition but no further step could be expanded.
    """

InnerWorkflowInvalidStepEntryError ¶

Raised when steps contains an entry that is not a JSON object (mapping).

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowInvalidStepEntryError(InnerWorkflowCompositionError):
    """Raised when ``steps`` contains an entry that is not a JSON object (mapping).

    It derives from ``InnerWorkflowCompositionError`` and is therefore reported
    as a compile-time inner workflow composition failure.
    """

InnerWorkflowNestingDepthError ¶

Raised when nesting from a root workflow exceeds the configured maximum depth.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowNestingDepthError(InnerWorkflowCompositionError):
    """Raised when nesting from a root workflow exceeds the configured maximum depth.

    Depth is counted in containment edges along the longest path that starts
    at the root workflow.
    """

InnerWorkflowParameterBindingsError ¶

Bases: WorkflowCompilerError

Raised when an inner_workflow step's parameter_bindings fail compile-time checks.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowParameterBindingsError(WorkflowCompilerError):
    """Raised when an ``inner_workflow`` step's ``parameter_bindings`` fail compile-time checks."""

    def __init__(
        self,
        public_message: str,
        context: Optional[str] = None,
        inner_error: Optional[Exception] = None,
    ) -> None:
        """Initialise the error, defaulting ``context`` when none is given.

        Args:
            public_message: Message forwarded to ``WorkflowCompilerError``.
            context: Error context; when ``None`` the module-level
                ``_INNER_WORKFLOW_PARAMETER_BINDINGS_CONTEXT`` is used instead.
            inner_error: Optional underlying exception, forwarded unchanged.
        """
        if context is None:
            context = _INNER_WORKFLOW_PARAMETER_BINDINGS_CONTEXT
        super().__init__(public_message, context, inner_error)

InnerWorkflowParameterBindingsMissingRequiredError ¶

Bases: InnerWorkflowParameterBindingsError

Raised when required child workflow inputs have no parameter_bindings entry.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowParameterBindingsMissingRequiredError(
    InnerWorkflowParameterBindingsError
):
    """Raised when required child workflow inputs have no ``parameter_bindings`` entry.

    It is a specialisation of ``InnerWorkflowParameterBindingsError``, so
    handlers for the base class also catch it.
    """

InnerWorkflowParameterBindingsUnknownInputError ¶

Bases: InnerWorkflowParameterBindingsError

Raised when parameter_bindings keys are not declared child workflow inputs.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowParameterBindingsUnknownInputError(
    InnerWorkflowParameterBindingsError
):
    """Raised when ``parameter_bindings`` keys are not declared child workflow inputs.

    It is a specialisation of ``InnerWorkflowParameterBindingsError``, so
    handlers for the base class also catch it.
    """

InnerWorkflowRunNotSupportedError ¶

Bases: WorkflowExecutionEngineError

Raised if the inner_workflow block's run() is invoked; the block is inlined at compile time.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowRunNotSupportedError(WorkflowExecutionEngineError):
    """Raised if the inner_workflow block's ``run()`` is invoked; the block is inlined at compile time."""

    def __init__(
        self,
        public_message: str,
        context: Optional[str] = None,
        inner_error: Optional[Exception] = None,
    ) -> None:
        """Initialise the error, defaulting ``context`` when none is given.

        Args:
            public_message: Message forwarded to ``WorkflowExecutionEngineError``.
            context: Error context; when ``None`` the module-level
                ``_INNER_WORKFLOW_RUN_NOT_SUPPORTED_CONTEXT`` is used instead.
            inner_error: Optional underlying exception, forwarded unchanged.
        """
        if context is None:
            context = _INNER_WORKFLOW_RUN_NOT_SUPPORTED_CONTEXT
        super().__init__(public_message, context, inner_error)

InnerWorkflowTotalCountError ¶

Raised when the number of inner_workflow steps in the composition tree exceeds the limit.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/errors.py

class InnerWorkflowTotalCountError(InnerWorkflowCompositionError):
    """Raised when the number of ``inner_workflow`` steps in the composition tree exceeds the limit.

    The count is the number of containment edges, i.e. one per
    ``inner_workflow`` step in the whole nested definition.
    """

inference.core.workflows.execution_engine.v1.inner_workflow.inline ¶

Compile-time expansion of roboflow_core/inner_workflow@v1 into ordinary steps.

Runs after reference normalization and composition validation on the pre-inline definition.

Classes¶

Functions:¶

inline_inner_workflow_steps ¶

inline_inner_workflow_steps(
    workflow_definition, *, available_blocks, profiler=None
)

Return a deep copy of workflow_definition with every inner_workflow step inlined.

Repeatedly expands innermost nested workflows until no roboflow_core/inner_workflow@v1 steps remain at any depth. The original dict is not modified.

Parameters:

Name	Type	Description	Default
`workflow_definition`	`Dict[str, Any]`	Parsed workflow JSON (root `steps` list).	required
`available_blocks`	`Any`	Block registry passed to the child workflow parser.	required
`profiler`	`Optional[WorkflowsProfiler]`	Optional compiler profiler.	`None`

Raises:

Type	Description
`InnerWorkflowInliningStructureError`	If inlining cannot make progress (unexpected graph).

Note

Composition and reference normalization must already be applied to the input.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/inline.py

def inline_inner_workflow_steps(
    workflow_definition: Dict[str, Any],
    *,
    available_blocks: Any,
    profiler: Optional[WorkflowsProfiler] = None,
) -> Dict[str, Any]:
    """Produce a deep copy of ``workflow_definition`` in which all ``inner_workflow`` steps are inlined.

    Works on a ``copy.deepcopy`` of the input, so the original dict is left
    untouched. One leaf is inlined per iteration until the ``steps`` of the
    copy no longer contain a ``roboflow_core/inner_workflow@v1`` step.

    Args:
        workflow_definition: Parsed workflow JSON (root ``steps`` list).
        available_blocks: Block registry passed to the child workflow parser.
        profiler: Optional compiler profiler.

    Returns:
        The expanded copy of the workflow definition.

    Raises:
        InnerWorkflowInliningStructureError: If inlining cannot make progress (unexpected graph).

    Note:
        Composition and reference normalization must already be applied to the input.
    """
    expanded = copy.deepcopy(workflow_definition)
    while True:
        if not _contains_inner_workflow_step(expanded.get("steps")):
            return expanded

        made_progress = _inline_one_inner_workflow_leaf(
            expanded, available_blocks=available_blocks, profiler=profiler
        )
        if made_progress:
            continue

        # Steps remain but none could be expanded: bail out instead of spinning.
        raise InnerWorkflowInliningStructureError(
            public_message=(
                "Could not inline inner_workflow steps (unexpected nested structure). "
                "Ensure inner workflow composition is valid."
            ),
        )

inference.core.workflows.execution_engine.v1.inner_workflow.reference_resolution ¶

Resolve roboflow_core/inner_workflow@v1 steps that reference a saved workflow by id into inline workflow_definition payloads before parsing / composition validation.

Functions:¶

normalize_inner_workflow_references_in_definition ¶

normalize_inner_workflow_references_in_definition(
    workflow_definition, init_parameters
)

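Return a workflow definition suitable for parsing: all inner_workflow reference fields are resolved to inline workflow_definition (recursively). The input dict is never mutated: when it contains no unresolved reference it is returned as-is (the same object), otherwise a resolved deep copy is returned.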

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/reference_resolution.py

def normalize_inner_workflow_references_in_definition(
    workflow_definition: Dict[str, Any],
    init_parameters: Dict[str, Any],
) -> Dict[str, Any]:
    """Resolve every ``inner_workflow`` reference into an inline ``workflow_definition``.

    The input dict is never mutated. When it holds no unresolved reference it
    is returned as-is (the very same object); otherwise a deep copy is made
    and the references are resolved recursively on that copy.

    Args:
        workflow_definition: Workflow definition that may contain
            ``inner_workflow`` steps referencing other workflows.
        init_parameters: Parameters used to obtain the spec resolver and passed
            on to the normalization helper.

    Returns:
        A workflow definition suitable for parsing.
    """
    has_unresolved_references = (
        workflow_definition_contains_unresolved_inner_workflow_reference(
            workflow_definition
        )
    )
    if not has_unresolved_references:
        return workflow_definition

    normalized = copy.deepcopy(workflow_definition)
    spec_resolver = get_inner_workflow_spec_resolver(init_parameters)
    # Memo handed to the helper and shared across the whole recursive resolution.
    fetch_memo: Dict[Tuple[str, str, Optional[str]], Dict[str, Any]] = {}
    _normalize_inner_workflow_refs_in_workflow_dict(
        normalized, init_parameters, spec_resolver, fetch_memo
    )
    return normalized

workflow_definition_contains_unresolved_inner_workflow_reference ¶

workflow_definition_contains_unresolved_inner_workflow_reference(
    workflow_definition,
)

True if any inner_workflow step (at any depth) still needs reference resolution.

Source code in inference/core/workflows/execution_engine/v1/inner_workflow/reference_resolution.py

def workflow_definition_contains_unresolved_inner_workflow_reference(
    workflow_definition: Dict[str, Any],
) -> bool:
    """Tell whether any ``inner_workflow`` step, at any depth, still needs reference resolution.

    Steps that are not dicts or are not ``inner_workflow`` steps are ignored.
    For an ``inner_workflow`` step that does not carry a reference itself, the
    nested ``workflow_definition`` (when it is a dict) is searched recursively.
    """

    def needs_resolution(definition: Dict[str, Any]) -> bool:
        steps = definition.get("steps", []) or []
        for step in steps:
            is_inner_workflow_step = (
                isinstance(step, dict)
                and step.get("type") == USE_INNER_WORKFLOW_BLOCK_TYPE
            )
            if not is_inner_workflow_step:
                continue

            if _inner_workflow_step_has_reference(step):
                return True

            nested = step.get("workflow_definition")
            if isinstance(nested, dict) and needs_resolution(nested):
                return True

        return False

    return needs_resolution(workflow_definition)

`core/workflows/prototypes`¶

inference.core.workflows.prototypes.block ¶

Classes¶

AirGappedAvailability `dataclass` ¶

Declares whether a block can operate without internet access.

Blocks that require cloud APIs (e.g. OpenAI, Anthropic) return AirGappedAvailability(available=False, reason="requires_internet"). Blocks that work fully offline return the default (available=True).

Source code in inference/core/workflows/prototypes/block.py

@dataclass(frozen=True)
class AirGappedAvailability:
    """Declares whether a block can operate without internet access.

    Blocks that require cloud APIs (e.g. OpenAI, Anthropic) return
    ``AirGappedAvailability(available=False, reason="requires_internet")``.
    Blocks that work fully offline return the default (available=True).

    Instances are immutable (``frozen=True``).
    """

    # Whether the block can run without internet access; defaults to True.
    available: bool = True
    # Optional explanation for the availability value, e.g. "requires_internet".
    reason: Optional[str] = None

BlockAirGappedInfo `dataclass` ¶

Full air-gapped status for a block, as returned by the describe endpoint.

Source code in inference/core/workflows/prototypes/block.py

@dataclass(frozen=True)
class BlockAirGappedInfo:
    """Complete air-gapped status of a block, as returned by the describe endpoint."""

    available: bool = True
    reason: Optional[str] = None
    model_id: Optional[str] = None
    compatible_task_types: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict, leaving out optional fields that are ``None``.

        ``available`` is always present. ``reason``, ``model_id`` and
        ``compatible_task_types`` are added, in that order, only when set.
        """
        payload: Dict[str, Any] = {"available": self.available}
        for field_name in ("reason", "model_id", "compatible_task_types"):
            field_value = getattr(self, field_name)
            if field_value is not None:
                payload[field_name] = field_value
        return payload

Runtime ¶

Bases: str, Enum

Canonical runtimes a workflow block can be executed in.

Runtimes not covered by get_restrictions() are considered OK.

Source code in inference/core/workflows/prototypes/block.py

class Runtime(str, Enum):
    """Canonical runtimes a workflow block can be executed in.

    Runtimes not covered by ``get_restrictions()`` are considered OK.

    Members also subclass ``str``, so each member compares equal to its
    string value (e.g. ``Runtime.SELF_HOSTED_CPU == "self_hosted_cpu"``).
    """

    HOSTED_SERVERLESS = "hosted_serverless"
    DEDICATED_DEPLOYMENT = "dedicated_deployment"
    SELF_HOSTED_CPU = "self_hosted_cpu"
    SELF_HOSTED_GPU = "self_hosted_gpu"
    INFERENCE_PIPELINE = "inference_pipeline"

RuntimeInputMode ¶

Bases: str, Enum

Workflow input modes for a restriction.

Source code in inference/core/workflows/prototypes/block.py

class RuntimeInputMode(str, Enum):
    """Workflow input modes for a restriction.

    Members also subclass ``str``, so each member compares equal to its
    string value (``"image"`` / ``"video"``).
    """

    IMAGE = "image"
    VIDEO = "video"

RuntimeRestriction `dataclass` ¶

A single caveat for a workflow block.

note is a one-line, human-readable explanation of the failure mode or degraded behavior. It should describe what happens (e.g. "track_ids reset between requests", "raises RuntimeError", "writes to ephemeral /tmp"), not abstract preconditions.

applies_to_runtimes narrows the restriction to specific workflow runtimes. When unset, the restriction applies to all runtimes.

applies_to_step_execution_modes narrows the restriction to specific workflow step execution modes. When unset, the restriction applies to all step execution modes.

applies_to_input_modes narrows the restriction to specific workflow input modes, such as video workflows that depend on cross-frame state. When unset, the restriction applies to all input modes.

Source code in inference/core/workflows/prototypes/block.py

@dataclass(frozen=True)
class RuntimeRestriction:
    """One caveat attached to a workflow block.

    ``note`` is a one-line, human-readable explanation of the failure mode or
    degraded behavior. It should describe what happens (e.g. "track_ids reset
    between requests", "raises RuntimeError", "writes to ephemeral /tmp"),
    not abstract preconditions.

    The three ``applies_to_*`` fields each narrow the restriction along one
    axis; leaving a field unset (``None``) means the restriction applies to
    every value on that axis:

    - ``applies_to_runtimes``: specific workflow runtimes.
    - ``applies_to_step_execution_modes``: specific step execution modes.
    - ``applies_to_input_modes``: specific workflow input modes, such as video
      workflows that depend on cross-frame state.
    """

    severity: Severity
    note: str
    applies_to_runtimes: Optional[List[Runtime]] = None
    applies_to_step_execution_modes: Optional[List[StepExecutionMode]] = None
    applies_to_input_modes: Optional[List[RuntimeInputMode]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict, replacing enum members by their ``.value``.

        ``severity`` and ``note`` are always present. Each ``applies_to_*`` key
        is included only when the field is not ``None`` (an empty list is
        kept as an empty list).
        """
        payload: Dict[str, Any] = {"severity": self.severity.value, "note": self.note}
        optional_scopes = (
            ("applies_to_runtimes", self.applies_to_runtimes),
            ("applies_to_step_execution_modes", self.applies_to_step_execution_modes),
            ("applies_to_input_modes", self.applies_to_input_modes),
        )
        for key, members in optional_scopes:
            if members is None:
                continue
            payload[key] = [member.value for member in members]
        return payload

Severity ¶

Bases: str, Enum

Severity of a runtime restriction for a workflow block in a given runtime.

SOFT: the block runs to completion and returns the right output shape, but the values are degraded or meaningless (e.g. tracker IDs reset across requests, cooldown does not throttle, file is written to ephemeral disk).

HARD: the block does not run / raises / cannot produce a usable output in this runtime. The engine should refuse to compile or fail-fast.

Source code in inference/core/workflows/prototypes/block.py

class Severity(str, Enum):
    """Severity of a runtime restriction for a workflow block in a given runtime.

    SOFT: the block runs to completion and returns the right output shape,
    but the values are degraded or meaningless (e.g. tracker IDs reset across
    requests, cooldown does not throttle, file is written to ephemeral disk).

    HARD: the block does not run / raises / cannot produce a usable output
    in this runtime. The engine should refuse to compile or fail-fast.
    """

    # The ``str`` mixin makes every member a string as well, so members
    # compare equal to their raw values ("soft" / "hard").
    SOFT = "soft"
    HARD = "hard"

StepExecutionMode ¶

Bases: Enum

How a workflow step is dispatched at runtime.

LOCAL: the step executes in-process inside the current Python interpreter. REMOTE: the step delegates execution to a remote inference service / HTTP runtime.

Kept in prototypes/block.py so the framework layer owns this enum and higher-level packages (core_steps, executor, compiler) depend on prototypes rather than the other way around.

Source code in inference/core/workflows/prototypes/block.py

class StepExecutionMode(Enum):
    """How a workflow step is dispatched at runtime.

    LOCAL: the step executes in-process inside the current Python interpreter.
    REMOTE: the step delegates execution to a remote inference service / HTTP
    runtime.

    Kept in ``prototypes/block.py`` so the framework layer owns this enum and
    higher-level packages (``core_steps``, executor, compiler) depend on
    ``prototypes`` rather than the other way around.
    """

    # Plain ``Enum`` (no ``str`` mixin): members do not compare equal to the
    # raw strings below; use ``.value`` when the string form is needed.
    LOCAL = "local"
    REMOTE = "remote"

WorkflowBlockManifest ¶

Bases: BaseModel, ABC

Source code in inference/core/workflows/prototypes/block.py

class WorkflowBlockManifest(BaseModel, ABC):
    # Abstract pydantic base for workflow block manifests. Concrete manifests
    # must implement ``describe_outputs()``; every other classmethod below is
    # an overridable hook with a conservative default.
    # (Deliberately a comment rather than a class docstring.)
    model_config = ConfigDict(validate_assignment=True)

    type: str
    name: str = Field(
        title="Step Name", description="Enter a unique identifier for this step."
    )

    @classmethod
    @abstractmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Describe the outputs produced by the block.

        Must be overridden by concrete manifests.

        Raises:
            BlockInterfaceError: Always, when this base implementation is
                reached.
        """
        manifest_type_name = get_full_type_name(selected_type=cls)
        raise BlockInterfaceError(
            public_message=f"Class method `describe_outputs()` must be implemented "
            f"for {manifest_type_name} to be valid "
            f"`WorkflowBlockManifest`.",
            context="getting_block_outputs",
        )

    def get_actual_outputs(self) -> List[OutputDefinition]:
        """Return the outputs of this manifest instance.

        The default simply delegates to ``describe_outputs()``.
        """
        return self.describe_outputs()

    @classmethod
    def get_air_gapped_availability(cls) -> AirGappedAvailability:
        """Report whether the block can run without internet access.

        The default reports the block as available offline. Subclasses that
        require cloud APIs should override this and return
        ``AirGappedAvailability(available=False, reason="requires_internet")``.
        """
        return AirGappedAvailability(available=True)

    @classmethod
    def get_restrictions(cls) -> List[RuntimeRestriction]:
        """List the caveats that apply to this block.

        Each restriction describes where the block degrades
        (``Severity.SOFT``) or fails outright (``Severity.HARD``) and may be
        scoped to runtimes, step execution modes, and/or input modes.
        The default is no restrictions.
        """
        return []

    @classmethod
    def get_supported_model_variants(cls) -> Optional[List[str]]:
        """Return the model IDs whose cached weights let this block run offline.

        Foundation-model blocks return their list of model variant IDs
        (e.g. ``["sam2/hiera_large", "sam2/hiera_small"]``); the block counts
        as available if **any** variant has cached artifacts.

        The default, ``None``, is for blocks that do not depend on
        locally-cached model weights (pure logic blocks, cloud API blocks,
        etc.).
        """
        return None

    @classmethod
    def get_compatible_task_types(cls) -> Optional[List[str]]:
        """Return the task types this block can process.

        Example: ``["object-detection"]``. The air-gapped builder uses this to
        match user-trained models to compatible workflow blocks. The default,
        ``None``, is for blocks that are not parameterised by a Roboflow
        model.
        """
        return None

    @classmethod
    def get_input_dimensionality_offsets(cls) -> Dict[str, int]:
        """Return dimensionality offsets for inputs; empty by default.

        NOTE(review): presumably keyed by input property name - confirm
        against the workflow compiler.
        """
        return {}

    @classmethod
    def get_dimensionality_reference_property(cls) -> Optional[str]:
        """Return the dimensionality reference property; ``None`` by default."""
        return None

    @classmethod
    def get_output_dimensionality_offset(cls) -> int:
        """Return the output dimensionality offset; ``0`` by default."""
        return 0

    @classmethod
    def accepts_batch_input(cls) -> bool:
        """Tell whether at least one parameter accepts batches.

        True when either ``get_parameters_accepting_batches()`` or
        ``get_parameters_accepting_batches_and_scalars()`` is non-empty.
        """
        if len(cls.get_parameters_accepting_batches()) > 0:
            return True
        return len(cls.get_parameters_accepting_batches_and_scalars()) > 0

    @classmethod
    def get_parameters_accepting_batches(cls) -> List[str]:
        """Return names of parameters accepting batches; empty by default."""
        return []

    @classmethod
    def get_parameters_accepting_batches_and_scalars(cls) -> List[str]:
        """Return names of parameters accepting batches and scalars; empty by default."""
        return []

    @classmethod
    def get_parameters_enforcing_auto_batch_casting(cls) -> List[str]:
        """Return names of parameters enforcing auto batch casting; empty by default."""
        return []

    @classmethod
    def accepts_empty_values(cls) -> bool:
        """Tell whether the block accepts empty values; ``False`` by default."""
        return False

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the Execution Engine compatibility constraint; ``None`` by default.

        NOTE(review): subclasses appear to return version specifier strings
        such as ``">=1.0.0,<2.0.0"`` - confirm the expected format.
        """
        return None

Methods:¶

get_air_gapped_availability `classmethod` ¶

get_air_gapped_availability()

Declare whether this block can operate without internet access.

Override in subclasses that require cloud APIs to return AirGappedAvailability(available=False, reason="requires_internet").

The default indicates the block works offline.

Source code in inference/core/workflows/prototypes/block.py

@classmethod
def get_air_gapped_availability(cls) -> AirGappedAvailability:
    """Report whether the block can run without internet access.

    The default reports the block as available offline. Subclasses that
    require cloud APIs should override this and return
    ``AirGappedAvailability(available=False, reason="requires_internet")``.
    """
    offline_capable = AirGappedAvailability(available=True)
    return offline_capable

get_compatible_task_types `classmethod` ¶

get_compatible_task_types()

Return task types this block can process (e.g. ["object-detection"]).

Used by the air-gapped builder to match user-trained models to compatible workflow blocks. Return None (the default) for blocks that are not parameterised by a Roboflow model.

Source code in inference/core/workflows/prototypes/block.py

@classmethod
def get_compatible_task_types(cls) -> Optional[List[str]]:
    """Return the task types this block can process.

    Example: ``["object-detection"]``. The air-gapped builder uses this to
    match user-trained models to compatible workflow blocks.

    The default, ``None``, is for blocks that are not parameterised by a
    Roboflow model.
    """
    no_task_types: Optional[List[str]] = None
    return no_task_types

get_restrictions `classmethod` ¶

get_restrictions()

Caveats for this block.

Return restrictions describing where the block degrades (Severity.SOFT) or fails outright (Severity.HARD). Each restriction can scope itself to runtimes, step execution modes, and/or input modes.

Source code in inference/core/workflows/prototypes/block.py

@classmethod
def get_restrictions(cls) -> List[RuntimeRestriction]:
    """List the caveats that apply to this block.

    Each restriction describes where the block degrades (``Severity.SOFT``)
    or fails outright (``Severity.HARD``) and may be scoped to runtimes, step
    execution modes, and/or input modes.

    The default is an empty list, i.e. no restrictions.
    """
    unrestricted: List[RuntimeRestriction] = []
    return unrestricted

get_supported_model_variants `classmethod` ¶

get_supported_model_variants()

Return model IDs whose cached weights enable this block to run offline.

For foundation-model blocks, return the list of model variant IDs (e.g. ["sam2/hiera_large", "sam2/hiera_small"]). The block is considered available if any variant has cached artifacts.

Return None (the default) for blocks that do not depend on locally-cached model weights (pure logic blocks, cloud API blocks, etc.).

Source code in inference/core/workflows/prototypes/block.py

@classmethod
def get_supported_model_variants(cls) -> Optional[List[str]]:
    """Return the model IDs whose cached weights let this block run offline.

    Foundation-model blocks return their list of model variant IDs
    (e.g. ``["sam2/hiera_large", "sam2/hiera_small"]``); the block counts as
    available if **any** variant has cached artifacts.

    The default, ``None``, is for blocks that do not depend on locally-cached
    model weights (pure logic blocks, cloud API blocks, etc.).
    """
    no_variants: Optional[List[str]] = None
    return no_variants

`enterprise/parallel`¶

Parallel HTTP inference via Celery workers for high-throughput deployments.

inference.enterprise.parallel.dispatch_manager ¶

Classes¶

ResultsChecker ¶

Class responsible for queuing asynchronous inference runs, keeping track of running requests, and awaiting their results.

Source code in inference/enterprise/parallel/dispatch_manager.py

class ResultsChecker:
    """
    Class responsible for queuing asynchronous inference runs,
    keeping track of running requests, and awaiting their results.

    State kept per instance:
        tasks: in-flight task id -> ``Event`` set once a result arrives.
        dones: task id -> payload of successfully finished tasks.
        errors: task id -> payload (error message) of failed tasks.
        semaphore: bounds the number of in-flight tasks to
            ``NUM_PARALLEL_TASKS``.
    """

    def __init__(self, redis: Redis):
        """Create an empty tracker that listens for results on ``redis``."""
        self.tasks: Dict[str, Event] = {}
        self.dones = dict()
        self.errors = dict()
        self.running = True
        self.redis = redis
        self.semaphore: BoundedSemaphore = BoundedSemaphore(NUM_PARALLEL_TASKS)

    def add_task(self, task_id: str, request: InferenceRequest):
        """
        Wait until there is an available slot to queue a task, then register
        the task id so its result can be tracked and launch the preprocess
        celery task.

        If launching the task fails, the reserved slot and the registered
        event are rolled back before the error is re-raised, so a failed
        enqueue does not permanently reduce the number of parallel slots.
        """
        self.semaphore.acquire()
        self.tasks[task_id] = Event()
        try:
            preprocess.s(request.dict()).delay()
        except Exception:
            # No result will ever be published for a task that was never
            # enqueued, so nothing else would release this slot.
            self.tasks.pop(task_id, None)
            self.semaphore.release()
            raise

    def get_result(self, task_id: str) -> Any:
        """
        Pop and return the stored result for ``task_id``.

        Raises:
            Exception: with the stored error payload when the task failed.
            RuntimeError: when no result is stored for ``task_id`` at all.
        """
        if task_id in self.dones:
            return self.dones.pop(task_id)
        elif task_id in self.errors:
            message = self.errors.pop(task_id)
            raise Exception(message)
        else:
            raise RuntimeError(
                "Task result not found in either success or error dict. Unreachable"
            )

    def loop(self):
        """
        Main loop. Listen on the redis ``results`` channel and, for every
        tracked task whose final status (failure or success) is published,
        store its payload in the matching results dictionary and wake up the
        waiter. Messages for untracked task ids are ignored.
        """
        with self.redis.pubsub() as pubsub:
            pubsub.subscribe("results")
            for message in pubsub.listen():
                # Skip subscribe confirmations and other non-data messages.
                if message["type"] != "message":
                    continue
                message = orjson.loads(message["data"])
                task_id = message.pop("task_id")
                if task_id not in self.tasks:
                    continue
                # The task is finished: free its slot for the next add_task.
                self.semaphore.release()
                status = message.pop("status")
                if status == FAILURE_STATE:
                    self.errors[task_id] = message["payload"]
                elif status == SUCCESS_STATE:
                    self.dones[task_id] = message["payload"]
                else:
                    raise RuntimeError(
                        "Task result not found in possible states. Unreachable"
                    )
                self.tasks[task_id].set()

    def wait_for_response(self, key: str):
        """
        Block until the result for task ``key`` has arrived, stop tracking
        the task, and return its result (or raise, see ``get_result``).
        """
        event = self.tasks[key]
        event.wait()
        del self.tasks[key]
        return self.get_result(key)

Methods:¶

add_task ¶

add_task(task_id, request)

Wait until there's an available cycle to queue a task. When there are cycles, add the task's id to a list to keep track of its results, launch the preprocess celery task, set the task's status to in progress in redis.

Source code in inference/enterprise/parallel/dispatch_manager.py

def add_task(self, task_id: str, request: InferenceRequest):
    """
    Wait until there is an available slot to queue a task, then register the
    task id so its result can be tracked and launch the preprocess celery
    task.

    If launching the task fails, the reserved slot and the registered event
    are rolled back before the error is re-raised, so a failed enqueue does
    not permanently reduce the number of parallel slots.
    """
    self.semaphore.acquire()
    self.tasks[task_id] = Event()
    try:
        preprocess.s(request.dict()).delay()
    except Exception:
        # No result will ever be published for a task that was never
        # enqueued, so nothing else would release this slot.
        self.tasks.pop(task_id, None)
        self.semaphore.release()
        raise

get_result ¶

get_result(task_id)

Check the done tasks and errored tasks for this task id.

Source code in inference/enterprise/parallel/dispatch_manager.py

def get_result(self, task_id: str) -> Any:
    """
    Pop and return the stored result for ``task_id``.

    Successful results are looked up first; a stored failure is removed and
    re-raised as a plain ``Exception`` carrying the stored message.

    Raises:
        Exception: when the task is recorded as failed.
        RuntimeError: when no result is stored for ``task_id`` at all.
    """
    if task_id in self.dones:
        return self.dones.pop(task_id)
    if task_id in self.errors:
        raise Exception(self.errors.pop(task_id))
    raise RuntimeError(
        "Task result not found in either success or error dict. Unreachable"
    )

loop ¶

loop()

Main loop. Check all in progress tasks for their status, and if their status is final, (either failure or success) then add their results to the appropriate results dictionary.

Source code in inference/enterprise/parallel/dispatch_manager.py

def loop(self):
    """
    Main loop. Listen on the redis ``results`` channel and, for every tracked
    task whose final status (failure or success) is published, store its
    payload in the matching results dictionary and wake up the waiter.
    Messages for untracked task ids are ignored.
    """
    with self.redis.pubsub() as pubsub:
        pubsub.subscribe("results")
        for raw in pubsub.listen():
            # Skip subscribe confirmations and other non-data messages.
            if raw["type"] != "message":
                continue
            result = orjson.loads(raw["data"])
            task_id = result.pop("task_id")
            if task_id not in self.tasks:
                continue
            # The task is finished: free its slot before recording the result.
            self.semaphore.release()
            status = result.pop("status")
            if status == FAILURE_STATE:
                sink = self.errors
            elif status == SUCCESS_STATE:
                sink = self.dones
            else:
                raise RuntimeError(
                    "Task result not found in possible states. Unreachable"
                )
            sink[task_id] = result["payload"]
            self.tasks[task_id].set()

Functions:¶

inference.enterprise.parallel.infer ¶

Classes¶

Functions:¶

get_batch ¶

get_batch(redis, model_names)

Run a heuristic to select the best batch to infer on. Parameters: redis (Redis): redis client; model_names (List[str]): list of models with a nonzero number of requests. Returns: Tuple[List[Dict], str], where List[Dict] represents a batch of request dicts and str is the model id.

Source code in inference/enterprise/parallel/infer.py

def get_batch(redis: Redis, model_names: List[str]) -> Tuple[List[Dict], str]:
    """
    Pick the most promising pending batch across models and dequeue it.

    Args:
        redis: redis client holding the per-model ``infer:<model>`` sorted
            sets and the ``requests`` counter hash.
        model_names: models with a nonzero number of pending requests.

    Returns:
        Tuple[List[Dict], str]: the decoded request dicts of the selected
        batch, and the id of the model they belong to.
    """
    # Per-model batch size; a string value in the metadata falls back to the
    # global BATCH_SIZE.
    batch_sizes = []
    for model_name in model_names:
        metadata = RoboflowInferenceModel.model_metadata_from_memcache_endpoint(
            model_name
        )
        configured = metadata["batch_size"]
        batch_sizes.append(BATCH_SIZE if isinstance(configured, str) else configured)

    # Peek at (up to) one batch worth of queued entries for every model.
    candidates = [
        redis.zrange(f"infer:{model_name}", 0, size - 1, withscores=True)
        for model_name, size in zip(model_names, batch_sizes)
    ]

    winner = select_best_inference_batch(candidates, batch_sizes)
    scored_entries = candidates[winner]
    selected_model = model_names[winner]

    # Each entry is indexed as (serialized request, score); only the
    # serialized request is needed from here on.
    serialized_requests = [entry[0] for entry in scored_entries]
    redis.zrem(f"infer:{selected_model}", *serialized_requests)
    redis.hincrby("requests", selected_model, -len(scored_entries))
    return [orjson.loads(raw) for raw in serialized_requests], selected_model

write_infer_arrays_and_launch_postprocess ¶

write_infer_arrays_and_launch_postprocess(
    arrs, request, preproc_return_metadata
)

Write inference results to shared memory and launch the postprocessing task

Source code in inference/enterprise/parallel/infer.py

def write_infer_arrays_and_launch_postprocess(
    arrs: Tuple[np.ndarray, ...],
    request: InferenceRequest,
    preproc_return_metadata: Dict,
):
    """Write inference results to shared memory and launch the postprocessing task.

    Every array is copied into its own newly created shared-memory segment.
    The postprocess task receives, per array, the segment name, shape and
    dtype name needed to load it back, together with the request dict and
    the preprocessing metadata.

    Args:
        arrs: arrays produced by inference.
        request: the inference request being served.
        preproc_return_metadata: metadata returned by preprocessing, passed
            through unchanged.
    """
    # Create the segments one by one so that a failure partway through does
    # not leak the segments that were already created.
    shms = []
    try:
        for arr in arrs:
            shms.append(shared_memory.SharedMemory(create=True, size=arr.nbytes))
    except BaseException:
        for shm in shms:
            shm.close()
            shm.unlink()
        raise

    # shm_manager closes the local handles on exit and unlinks the segments
    # only if something below fails.
    with shm_manager(*shms):
        shm_metadatas = []
        for arr, shm in zip(arrs, shms):
            shared = np.ndarray(arr.shape, dtype=arr.dtype, buffer=shm.buf)
            shared[:] = arr[:]
            shm_metadata = SharedMemoryMetadata(
                shm_name=shm.name, array_shape=arr.shape, array_dtype=arr.dtype.name
            )
            shm_metadatas.append(asdict(shm_metadata))

        postprocess.s(
            tuple(shm_metadatas), request.dict(), preproc_return_metadata
        ).delay()

inference.enterprise.parallel.utils ¶

Classes¶

SharedMemoryMetadata `dataclass` ¶

Info needed to load array from shared memory

Source code in inference/enterprise/parallel/utils.py

@dataclass
class SharedMemoryMetadata:
    """Everything required to rebuild an array stored in shared memory."""

    # Name of the shared-memory segment that holds the array bytes.
    shm_name: str
    # Shape of the stored array.
    array_shape: List[int]
    # Name of the stored array's dtype (e.g. "float32").
    array_dtype: str

Functions:¶

failure_handler ¶

failure_handler(redis, *request_ids)

Context manager that updates the status/results key in redis with exception info on failure.

Source code in inference/enterprise/parallel/utils.py

@contextmanager
def failure_handler(redis: Redis, *request_ids: str):
    """
    Context manager that reports a failure for every given request.

    When the wrapped block raises an ``Exception``, a message with the
    failure status and ``"<ExceptionType>: <text>"`` as payload is published
    on the redis ``results`` channel for each request id, and the exception
    is then re-raised.
    """
    try:
        yield
    except Exception as error:
        failure_text = f"{type(error).__name__}: {error!s}"
        for request_id in request_ids:
            notification = {
                "task_id": request_id,
                "status": FAILURE_STATE,
                "payload": failure_text,
            }
            redis.publish("results", json.dumps(notification))
        raise

shm_manager ¶

shm_manager(*shms, unlink_on_success=False)

Context manager that closes and frees shared memory objects.

Source code in inference/enterprise/parallel/utils.py

def _release_shared_memory(shms, unlink):
    """Close (and optionally unlink) every segment; return the errors raised."""
    failures = []
    for shm in shms:
        try:
            shm.close()
            if unlink:
                shm.unlink()
        except Exception as error:
            # Keep going: one bad segment must not leak the remaining ones.
            failures.append(error)
    return failures


@contextmanager
def shm_manager(
    *shms: Union[str, shared_memory.SharedMemory], unlink_on_success: bool = False
):
    """Context manager that closes and frees shared memory objects.

    Args:
        *shms: shared-memory objects, or names of existing segments to attach
            to.
        unlink_on_success: when True, segments are also unlinked after the
            managed block finishes without error.

    Yields:
        The list of ``SharedMemory`` objects, in the order given.

    On failure (while attaching or inside the managed block) every loaded
    segment is closed and unlinked on a best-effort basis and the original
    exception propagates; errors raised by the cleanup itself do not replace
    it. On success every segment is closed (and unlinked if requested); if
    that cleanup fails, all segments are still attempted and the first
    cleanup error is raised afterwards.

    Raises:
        Exception: wrapping the underlying error (as a one-element list) when
            attaching to a named segment fails.
    """
    loaded_shms = []
    try:
        for shm in shms:
            if isinstance(shm, str):
                try:
                    shm = shared_memory.SharedMemory(name=shm)
                except BaseException as error:
                    raise Exception([error]) from error
            loaded_shms.append(shm)

        yield loaded_shms
    except BaseException:
        _release_shared_memory(loaded_shms, unlink=True)
        raise
    else:
        failures = _release_shared_memory(loaded_shms, unlink=unlink_on_success)
        if failures:
            raise failures[0]

`enterprise/workflows/enterprise_blocks/sinks/PLC_modbus`¶

inference.enterprise.workflows.enterprise_blocks.sinks.PLC_modbus.v1 ¶

Classes¶

ModbusTCPBlockV1 ¶

Bases: WorkflowBlock

A Modbus TCP communication block using pymodbus.

Supports: - 'read': Reads specified registers. - 'write': Writes values to specified registers. - 'read_and_write': Reads and writes in one execution.

On failures, errors are printed and marked as "ReadFailure" or "WriteFailure".

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/PLC_modbus/v1.py

class ModbusTCPBlockV1(WorkflowBlock):
    """A Modbus TCP communication block using pymodbus.

    Supports:
    - 'read': Reads specified registers.
    - 'write': Writes values to specified registers.
    - 'read_and_write': Reads and writes in one execution.

    On failures, errors are printed and marked as "ReadFailure" or "WriteFailure".

    The Modbus client is created on the first successful connection and then
    reused by later ``run`` calls; once connected, ``plc_ip`` / ``plc_port``
    passed to later calls are not used to reconnect.
    """

    def __init__(self):
        # Created lazily by ``run`` once a connection succeeds.
        self.client: Optional[ModbusClient] = None

    def __del__(self):
        # Best-effort release of the connection when the block is collected.
        if self.client:
            try:
                self.client.close()
            except Exception as exc:
                logger.debug("Failed to release modbus client: %s", exc)

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block."""
        return ModbusTCPBlockManifest

    def run(
        self,
        plc_ip: str,
        plc_port: int,
        mode: str,
        registers_to_read: List[int],
        registers_to_write: Dict[int, int],
        depends_on: any,
        image: Optional[WorkflowImageData] = None,
        metadata: Optional[VideoMetadata] = None,
    ) -> dict:
        """Read and/or write holding registers over Modbus TCP.

        Args:
            plc_ip: address used to connect when no client exists yet.
            plc_port: port used to connect when no client exists yet.
            mode: 'read', 'write', or 'read_and_write'.
            registers_to_read: register addresses to read (read modes only).
            registers_to_write: address -> value pairs to write (write modes
                only).
            depends_on: not used by this method.
            image: not used by this method.
            metadata: not used by this method.

        Returns:
            dict: ``{"modbus_results": [output]}`` where ``output`` holds a
            'read' and/or 'write' dictionary keyed by register address (a key
            is omitted when its dictionary is empty). When the connection
            cannot be established the single entry is
            ``{"error": "ConnectionFailure"}``.
        """
        read_results = {}
        write_results = {}

        if not self.client:
            client = ModbusClient(plc_ip, port=plc_port)
            if not client.connect():
                print("Failed to connect to PLC")
                return {"modbus_results": [{"error": "ConnectionFailure"}]}
            # Cache the client only once it is connected; caching a client
            # whose connect() failed would make later runs skip the
            # connection attempt and never report ConnectionFailure again.
            self.client = client

        # If mode involves reading
        if mode in ["read", "read_and_write"]:
            for address in registers_to_read:
                try:
                    response = self.client.read_holding_registers(address)
                    if not response.isError():
                        # Only the first returned register is reported.
                        read_results[address] = (
                            response.registers[0] if response.registers else None
                        )
                    else:
                        print(f"Error reading register {address}: {response}")
                        read_results[address] = "ReadFailure"
                except Exception as e:
                    print(f"Exception reading register {address}: {e}")
                    read_results[address] = "ReadFailure"

        # If mode involves writing
        if mode in ["write", "read_and_write"]:
            for address, value in registers_to_write.items():
                try:
                    response = self.client.write_register(address, value)
                    if not response.isError():
                        write_results[address] = "WriteSuccess"
                    else:
                        print(
                            f"Error writing register {address} with value {value}: {response}"
                        )
                        write_results[address] = "WriteFailure"
                except Exception as e:
                    print(
                        f"Exception writing register {address} with value {value}: {e}"
                    )
                    write_results[address] = "WriteFailure"

        modbus_output = {}
        if read_results:
            modbus_output["read"] = read_results
        if write_results:
            modbus_output["write"] = write_results

        return {"modbus_results": [modbus_output]}

`enterprise/workflows/enterprise_blocks/sinks/PLCethernetIP`¶

inference.enterprise.workflows.enterprise_blocks.sinks.PLCethernetIP.v1 ¶

Classes¶

PLCBlockManifest ¶

Manifest for a PLC communication block using Ethernet/IP.

The block can be used in one of three modes: - 'read': Only reads specified tags. - 'write': Only writes specified tags. - 'read_and_write': Performs both reading and writing in one execution.

tags_to_read and tags_to_write are applicable depending on the mode chosen.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/PLCethernetIP/v1.py

class PLCBlockManifest(WorkflowBlockManifest):
    """Manifest for a PLC communication block using Ethernet/IP.

    The block can be used in one of three modes:
    - 'read': Only reads specified tags.
    - 'write': Only writes specified tags.
    - 'read_and_write': Performs both reading and writing in one execution.

    `tags_to_read` and `tags_to_write` are applicable depending on the mode chosen.
    """

    # Block metadata attached to the generated JSON schema. The block is
    # flagged as deprecated here; the deprecation message points users to the
    # PLC Reader / PLC Writer blocks and describes how their outputs differ.
    model_config = ConfigDict(
        json_schema_extra={
            "name": "PLC EthernetIP",
            "version": "v1",
            "short_description": "Generic PLC read/write block using pylogix over Ethernet/IP.",
            "long_description": LONG_DESCRIPTION,
            "license": "Roboflow Enterprise License",
            "block_type": "sinks",
            "deprecated": True,
            "deprecation_message": "This block is deprecated. Use the PLC Reader / PLC Writer "
            "blocks (set Connection mode to 'Direct - EtherNet/IP') instead. Note the outputs "
            "differ: instead of a single `plc_results` list, the PLC Reader returns `tag_values` "
            "(a tag->value dict) and the PLC Writer returns `write_result`, each alongside an "
            "`error_status` flag.",
            "ui_manifest": {
                "section": "industrial",
                "icon": "fal fa-microchip",
                "blockPriority": 13,
                "enterprise_only": True,
                "local_only": True,
            },
        }
    )

    # Type discriminator accepted for this manifest.
    type: Literal["roboflow_core/sinks@v1"]

    # Target PLC address: a literal string or a workflow parameter selector.
    plc_ip: Union[str, WorkflowParameterSelector(kind=[STRING_KIND])] = Field(
        description="IP address of the target PLC.", examples=["192.168.1.10"]
    )

    # Which operations to perform.
    mode: Literal["read", "write", "read_and_write"] = Field(
        description="Mode of operation: 'read', 'write', or 'read_and_write'.",
        examples=["read", "write", "read_and_write"],
    )

    # Tags to read: a literal list or a selector resolving to a list.
    # Defaults to an empty list.
    tags_to_read: Union[
        List[str],
        Selector(kind=[LIST_OF_VALUES_KIND]),
        WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
    ] = Field(
        default=[],
        description="List of PLC tag names to read. Applicable if mode='read' or mode='read_and_write'.",
        examples=[["camera_msg", "sku_number"]],
    )

    # Tag -> value pairs to write: a literal dict or a selector resolving to
    # a dictionary. Defaults to an empty dict.
    tags_to_write: Union[
        Dict[str, Union[int, float, str]],
        Selector(kind=[DICTIONARY_KIND]),
        WorkflowParameterSelector(kind=[DICTIONARY_KIND]),
    ] = Field(
        default={},
        description="Dictionary of tags and the values to write. Applicable if mode='write' or mode='read_and_write'.",
        examples=[{"camera_fault": True, "defect_count": 5}],
    )

    # Required selector pointing at the step output this block depends on.
    depends_on: Selector() = Field(
        description="Reference to the step output this block depends on.",
        examples=["$steps.some_previous_step"],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        # Single output: `plc_results`, declared as a list of values.
        return [
            OutputDefinition(
                name="plc_results",
                kind=[LIST_OF_VALUES_KIND],
            ),
        ]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        # Version specifier of the Execution Engine releases this block
        # declares compatibility with.
        return ">=1.0.0,<2.0.0"

PLCBlockV1 ¶

Bases: WorkflowBlock

A PLC communication workflow block using Ethernet/IP and pylogix.

Depending on the selected mode: - 'read': Reads specified tags. - 'write': Writes provided values to specified tags. - 'read_and_write': Reads and writes in one go.

In case of failures, errors are printed to terminal and the corresponding tag entry in the output is set to "ReadFailure" or "WriteFailure".

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/PLCethernetIP/v1.py

class PLCBlockV1(WorkflowBlock):
    """Workflow block that talks to a PLC over Ethernet/IP through pylogix.

    The selected mode decides what happens in a single execution:
    - 'read': the requested tags are read.
    - 'write': the provided values are written to their tags.
    - 'read_and_write': both of the above, reads first.

    A failing tag never aborts the run: the problem is logged and the tag's
    entry in the output is set to "ReadFailure" or "WriteFailure".
    """

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block."""
        return PLCBlockManifest

    def _read_single_tag(self, comm, tag):
        """Read one tag; return its value, or "ReadFailure" on any problem."""
        try:
            response = comm.Read(tag)
            if response.Status != "Success":
                logger.error("Error reading tag '%s': %s", tag, response.Status)
                return "ReadFailure"
            return response.Value
        except Exception as e:
            logger.error("Unhandled error reading tag '%s': %s", tag, e)
            return "ReadFailure"

    def _write_single_tag(self, comm, tag, value):
        """Write one tag; return "WriteSuccess", or "WriteFailure" on any problem."""
        try:
            response = comm.Write(tag, value)
            if response.Status != "Success":
                logger.error(
                    "Error writing tag '%s' with value '%s': %s",
                    tag,
                    value,
                    response.Status,
                )
                return "WriteFailure"
            return "WriteSuccess"
        except Exception as e:
            logger.error("Unhandled error writing tag '%s': %s", tag, e)
            return "WriteFailure"

    def run(
        self,
        plc_ip: str,
        mode: str,
        tags_to_read: List[str],
        tags_to_write: Dict[str, Union[int, float, str]],
        depends_on: any,
        image: Optional[WorkflowImageData] = None,
        metadata: Optional[VideoMetadata] = None,
    ) -> dict:
        """Execute the PLC reads and/or writes selected by ``mode``.

        Args:
            plc_ip (str): PLC IP address.
            mode (str): 'read', 'write', or 'read_and_write'.
            tags_to_read (List[str]): Tags to read if applicable.
            tags_to_write (Dict[str, Union[int, float, str]]): Tags to write if applicable.
            depends_on (any): The step output this block depends on.
            image (Optional[WorkflowImageData]): Not required for this block.
            metadata (Optional[VideoMetadata]): Not required for this block.

        Returns:
            dict: ``{"plc_results": [output]}`` where ``output`` carries a
            'read' and/or 'write' dictionary keyed by tag; a key is left out
            when its dictionary is empty.
        """
        wants_read = mode in ("read", "read_and_write")
        wants_write = mode in ("write", "read_and_write")
        reads = {}
        writes = {}

        with pylogix.PLC() as comm:
            comm.IPAddress = plc_ip

            if wants_read:
                for tag in tags_to_read:
                    reads[tag] = self._read_single_tag(comm, tag)

            if wants_write:
                for tag, value in tags_to_write.items():
                    writes[tag] = self._write_single_tag(comm, tag, value)

        # Only non-empty sections make it into the output.
        plc_output = {
            section: results
            for section, results in (("read", reads), ("write", writes))
            if results
        }
        return {"plc_results": [plc_output]}

Methods:¶

run ¶

run(
    plc_ip,
    mode,
    tags_to_read,
    tags_to_write,
    depends_on,
    image=None,
    metadata=None,
)

Run PLC read/write operations using pylogix over Ethernet/IP.

Parameters:

Name	Type	Description	Default
`plc_ip`	`str`	PLC IP address.	required
`mode`	`str`	'read', 'write', or 'read_and_write'.	required
`tags_to_read`	`List[str]`	Tags to read if applicable.	required
`tags_to_write`	`Dict[str, Union[int, float, str]]`	Tags to write if applicable.	required
`depends_on`	`any`	The step output this block depends on.	required
`image`	`Optional[WorkflowImageData]`	Not required for this block.	`None`
`metadata`	`Optional[VideoMetadata]`	Not required for this block.	`None`

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary with `plc_results` as a list containing one dictionary. That dictionary has 'read' and/or 'write' keys.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/PLCethernetIP/v1.py

def run(
    self,
    plc_ip: str,
    mode: str,
    tags_to_read: List[str],
    tags_to_write: Dict[str, Union[int, float, str]],
    depends_on: any,
    image: Optional[WorkflowImageData] = None,
    metadata: Optional[VideoMetadata] = None,
) -> dict:
    """Read and/or write PLC tags through one pylogix Ethernet/IP session.

    A single ``pylogix.PLC`` session is opened for the whole call and pointed
    at ``plc_ip``. Depending on ``mode`` every tag in ``tags_to_read`` is read
    and/or every ``tag -> value`` pair in ``tags_to_write`` is written, each
    through the per-tag helpers on this block.

    Args:
        plc_ip (str): PLC IP address.
        mode (str): 'read', 'write', or 'read_and_write'. Any other value
            performs no tag operation.
        tags_to_read (List[str]): Tags to read if applicable.
        tags_to_write (Dict[str, Union[int, float, str]]): Tags to write if applicable.
        depends_on (any): The step output this block depends on (unused here).
        image (Optional[WorkflowImageData]): Not required for this block.
        metadata (Optional[VideoMetadata]): Not required for this block.

    Returns:
        dict: ``{"plc_results": [entry]}`` where ``entry`` holds a 'read'
        and/or 'write' mapping; a section is omitted when it has no results.
    """
    values_read = {}
    values_written = {}

    with pylogix.PLC() as comm:
        comm.IPAddress = plc_ip

        if mode in ("read", "read_and_write"):
            for tag in tags_to_read:
                values_read[tag] = self._read_single_tag(comm, tag)

        if mode in ("write", "read_and_write"):
            for tag, value in tags_to_write.items():
                values_written[tag] = self._write_single_tag(comm, tag, value)

    # Only non-empty sections are reported, "read" before "write".
    plc_output = {}
    for section, results in (("read", values_read), ("write", values_written)):
        if results:
            plc_output[section] = results

    return {"plc_results": [plc_output]}

`enterprise/workflows/enterprise_blocks/sinks/event_writer`¶

inference.enterprise.workflows.enterprise_blocks.sinks.event_writer.v1 ¶

Classes¶

BlockManifest ¶

Manifest for the Event Writer sink block.

Sends structured events to the Event Ingestion Service via its v2 API. Each event includes a schema (quality_check, inventory_count, safety_alert, or custom), schema-specific data, one image entry with optional annotations, and optional flat key-value custom metadata.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/event_writer/v1.py

class BlockManifest(WorkflowBlockManifest):
    """Manifest for the Event Writer sink block.

    Sends structured events to the Event Ingestion Service via its v2 API.
    Each event includes a schema (quality_check, inventory_count, safety_alert,
    or custom), schema-specific data, one image entry with optional annotations,
    and optional flat key-value custom metadata.
    """

    # Block-level metadata attached to the generated JSON schema: display name,
    # version, descriptions, license, block type and UI placement hints.
    model_config = ConfigDict(
        json_schema_extra={
            "name": "Event Writer",
            "version": "v1",
            "short_description": "Write structured events to the Event Ingestion Service.",
            "long_description": LONG_DESCRIPTION,
            "license": "Roboflow Enterprise License",
            "block_type": "sink",
            "ui_manifest": {
                "section": "industrial",
                "icon": "fal fa-calendar-check",
                "blockPriority": 5,
                "enterprise_only": True,
                "local_only": True,
            },
        }
    )
    # Literal identifier of this block; the only accepted value for `type`.
    type: Literal["roboflow_enterprise/event_writer_sink@v1"]

    # --- Connection ---
    # Either a literal URL or a selector resolving to a string.
    event_ingestion_url: Union[Selector(kind=[STRING_KIND]), str] = Field(
        default="http://localhost:8001",
        description="Base URL of the Event Ingestion Service.",
        examples=["http://localhost:8001", "$inputs.event_ingestion_url"],
    )

    # --- Schema selector ---
    # No default is given, so this field is required. It is a plain Literal
    # (no Selector alternative), unlike most other fields of this manifest.
    event_schema: Literal[
        "quality_check", "inventory_count", "safety_alert", "custom"
    ] = Field(
        description="The event schema to use.",
        json_schema_extra={"always_visible": True},
    )

    # --- Images ---
    # `output_image` is required (no default); `input_image` is optional.
    output_image: Selector(kind=[IMAGE_KIND]) = Field(
        description="The output/visualization image. Sent as the primary display image.",
        json_schema_extra={"always_visible": True},
    )
    input_image: Optional[Selector(kind=[IMAGE_KIND])] = Field(
        default=None,
        description="The original input image (optional). Sent as the source image.",
    )
    image_label: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Label for the image entry.",
        examples=["defect-analysis", "$inputs.image_label"],
    )

    # --- Custom metadata ---
    # Values may be literals or untyped selectors. The limits quoted in the
    # description are not enforced by this field definition.
    # NOTE(review): presumably they are validated by the ingestion service - confirm.
    custom_metadata: Dict[
        str,
        Union[
            Selector(),
            str,
            float,
            bool,
            int,
        ],
    ] = Field(
        default_factory=dict,
        description="Flat key-value metadata (max 100 keys, values must be str/int/float/bool).",
        examples=[{"line": "A1", "shift": "morning"}],
        json_schema_extra={"always_visible": True},
    )

    # --- Quality Check fields ---
    # NOTE(review): `relevant_for` looks like a UI hint naming the event schemas
    # for which a field should be shown - confirm against the UI consumer.
    qc_result: Optional[
        Union[Selector(kind=[STRING_KIND]), Literal["pass", "fail"]]
    ] = Field(
        default=None,
        description="Quality check result: pass or fail.",
        examples=["pass", "fail", "$steps.qc_logic.result"],
        json_schema_extra={
            "relevant_for": QUALITY_CHECK_RELEVANT,
            "always_visible": True,
        },
    )

    # --- External ID (shared across schemas) ---
    external_id: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="External identifier for correlation with other systems (max 1000 chars).",
        examples=["batch-2025-001", "$inputs.external_id"],
        json_schema_extra={
            "relevant_for": ALL_DATA_SCHEMAS_RELEVANT,
        },
    )

    # --- Inventory Count fields ---
    location: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Location identifier for inventory count.",
        examples=["warehouse-A", "$inputs.location"],
        json_schema_extra={"relevant_for": INVENTORY_COUNT_RELEVANT},
    )
    item_count: Optional[Union[Selector(kind=[INTEGER_KIND]), int]] = Field(
        default=None,
        description="Number of items counted.",
        examples=[42, "$steps.counter.count"],
        json_schema_extra={"relevant_for": INVENTORY_COUNT_RELEVANT},
    )
    item_type: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Type of item being counted.",
        examples=["widget", "$inputs.item_type"],
        json_schema_extra={"relevant_for": INVENTORY_COUNT_RELEVANT},
    )

    # --- Safety Alert fields ---
    alert_type: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Alert type identifier (alphanumeric, underscores, hyphens).",
        examples=["no_hardhat", "$steps.classifier.top_class"],
        json_schema_extra={"relevant_for": SAFETY_ALERT_RELEVANT},
    )
    severity: Optional[
        Union[Selector(kind=[STRING_KIND]), Literal["low", "medium", "high"]]
    ] = Field(
        default=None,
        description="Severity level for the safety alert.",
        examples=["high", "$inputs.severity"],
        json_schema_extra={"relevant_for": SAFETY_ALERT_RELEVANT},
    )
    alert_description: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Description of the safety alert (max 10000 chars).",
        examples=["Worker detected without hardhat in zone B"],
        json_schema_extra={"relevant_for": SAFETY_ALERT_RELEVANT},
    )

    # --- Custom Event fields ---
    custom_value: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
        default=None,
        description="Arbitrary value for custom events (max 10000 chars).",
        examples=["anomaly detected at 14:32"],
        json_schema_extra={"relevant_for": CUSTOM_RELEVANT},
    )

    # --- Annotation pass-through ---
    # All four prediction inputs are optional selectors; each one is marked
    # with `additional_section` in its schema extras.
    object_detections: Optional[Selector(kind=[OBJECT_DETECTION_PREDICTION_KIND])] = (
        Field(
            default=None,
            description="Object detection predictions to attach to the image.",
            json_schema_extra={"additional_section": True},
        )
    )
    classifications: Optional[Selector(kind=[CLASSIFICATION_PREDICTION_KIND])] = Field(
        default=None,
        description="Classification predictions to attach to the image.",
        json_schema_extra={"additional_section": True},
    )
    instance_segmentations: Optional[
        Selector(kind=[INSTANCE_SEGMENTATION_PREDICTION_KIND])
    ] = Field(
        default=None,
        description="Instance segmentation predictions to attach to the image.",
        json_schema_extra={"additional_section": True},
    )
    keypoint_detections: Optional[
        Selector(kind=[KEYPOINT_DETECTION_PREDICTION_KIND])
    ] = Field(
        default=None,
        description="Keypoint detection predictions to attach to the image.",
        json_schema_extra={"additional_section": True},
    )

    # --- Execution control ---
    # Each of these accepts a literal or a selector of the matching kind.
    fire_and_forget: Union[bool, Selector(kind=[BOOLEAN_KIND])] = Field(
        default=True,
        description="If True, send the event asynchronously (no event_id returned). "
        "If False, wait for the response and return the event_id.",
        examples=[True, False, "$inputs.fire_and_forget"],
    )
    disable_sink: Union[bool, Selector(kind=[BOOLEAN_KIND])] = Field(
        default=False,
        description="If True, skip sending the event entirely.",
        examples=[False, "$inputs.disable_event_writer"],
    )
    request_timeout: Union[int, Selector(kind=[INTEGER_KIND])] = Field(
        default=5,
        description="HTTP request timeout in seconds.",
        examples=[5, 10],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Declare the block outputs: `error_status` (boolean), `event_id` and `message` (strings)."""
        return [
            OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
            OutputDefinition(name="event_id", kind=[STRING_KIND]),
            OutputDefinition(name="message", kind=[STRING_KIND]),
        ]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the version specifier of Execution Engine releases this block declares compatibility with."""
        return ">=1.3.0,<2.0.0"

EventWriterSinkBlockV1 ¶

Bases: WorkflowBlock

Sends structured events to the Event Ingestion Service via its v2 API.

Supports fire-and-forget (default) or synchronous execution modes. In fire-and-forget mode the HTTP request runs in the background and event_id will be empty. In synchronous mode the block waits for the response and returns the created event_id.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/event_writer/v1.py

class EventWriterSinkBlockV1(WorkflowBlock):
    """Workflow sink that posts structured events to the Event Ingestion Service (v2 API).

    Two execution modes are supported. In fire-and-forget mode (the manifest
    default) the HTTP request is handed to a background runner and the
    returned ``event_id`` is empty. Otherwise the block performs the request
    inline and reports the resulting ``event_id``.
    """

    def __init__(
        self,
        background_tasks: Optional[BackgroundTasks],
        thread_pool_executor: Optional[ThreadPoolExecutor],
    ):
        """Keep references to the optional background runners used for fire-and-forget sends."""
        self._background_tasks = background_tasks
        self._thread_pool_executor = thread_pool_executor

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        """Return the names of the constructor arguments this block expects."""
        return ["background_tasks", "thread_pool_executor"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block."""
        return BlockManifest

    def run(
        self,
        event_ingestion_url: str,
        event_schema: str,
        output_image: WorkflowImageData,
        fire_and_forget: bool,
        disable_sink: bool,
        request_timeout: int,
        input_image: Optional[WorkflowImageData] = None,
        image_label: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None,
        qc_result: Optional[str] = None,
        external_id: Optional[str] = None,
        location: Optional[str] = None,
        item_count: Optional[int] = None,
        item_type: Optional[str] = None,
        alert_type: Optional[str] = None,
        severity: Optional[str] = None,
        alert_description: Optional[str] = None,
        custom_value: Optional[str] = None,
        object_detections: Optional[Any] = None,
        classifications: Optional[Any] = None,
        instance_segmentations: Optional[Any] = None,
        keypoint_detections: Optional[Any] = None,
    ) -> BlockResult:
        """Assemble a v2 event payload and deliver it to the Event Ingestion Service.

        Args:
            event_ingestion_url (str): Base URL of the service; trailing slashes are stripped
                before ``/v2/events`` is appended.
            event_schema (str): One of 'quality_check', 'inventory_count', 'safety_alert', or 'custom'.
            output_image (WorkflowImageData): The output/visualization image.
            fire_and_forget (bool): When truthy and a background runner is available, the
                request is scheduled in the background and ``event_id`` is empty.
            disable_sink (bool): When truthy, nothing is built or sent.
            request_timeout (int): HTTP request timeout in seconds.
            input_image (Optional[WorkflowImageData]): Optional original input image.
            image_label (Optional[str]): Optional label for the image entry.
            custom_metadata (Optional[Dict[str, Any]]): Added to the payload unless it is None.
            qc_result, external_id, location, item_count, item_type, alert_type, severity,
            alert_description, custom_value: Schema-specific values forwarded to the
                event-data builder.
            object_detections, classifications, instance_segmentations, keypoint_detections:
                Optional predictions forwarded to the image-entry builder.

        Returns:
            dict: A dictionary with ``error_status`` (bool), ``event_id`` (str), and ``message`` (str).
        """
        if disable_sink:
            return {
                "error_status": False,
                "event_id": "",
                "message": "Sink was disabled by parameter `disable_sink`",
            }

        base_url = event_ingestion_url.rstrip("/")

        schema_fields = dict(
            qc_result=qc_result,
            external_id=external_id,
            location=location,
            item_count=item_count,
            item_type=item_type,
            alert_type=alert_type,
            severity=severity,
            alert_description=alert_description,
            custom_value=custom_value,
        )
        event_data = _build_event_data(event_schema=event_schema, **schema_fields)

        annotations = dict(
            object_detections=object_detections,
            classifications=classifications,
            instance_segmentations=instance_segmentations,
            keypoint_detections=keypoint_detections,
        )
        image_entry = _build_image_entry(
            output_image=output_image,
            input_image=input_image,
            image_label=image_label,
            **annotations,
        )

        event_body: Dict[str, Any] = {
            "inference_timestamp": datetime.now(timezone.utc).isoformat(),
            "event_schema": event_schema,
            "event_data": event_data,
            "images": [image_entry],
            "displayImagePosition": 0,
        }
        if custom_metadata is not None:
            event_body["custom_metadata"] = custom_metadata

        send_event = partial(
            _execute_event_request,
            url=f"{base_url}/v2/events",
            payload=event_body,
            api_key=os.environ.get("EVENT_INGESTION_API_KEY"),
            timeout=request_timeout,
        )

        if fire_and_forget:
            # Prefer the background-task runner, then the thread pool; with
            # neither available the request is executed inline below.
            scheduled = {
                "error_status": False,
                "event_id": "",
                "message": "Event sent in background task",
            }
            if self._background_tasks:
                self._background_tasks.add_task(send_event)
                return scheduled
            if self._thread_pool_executor:
                self._thread_pool_executor.submit(send_event)
                return scheduled

        failed, outcome_message, created_event_id = send_event()
        return {
            "error_status": failed,
            "event_id": created_event_id,
            "message": outcome_message,
        }

Methods:¶

run ¶

run(
    event_ingestion_url,
    event_schema,
    output_image,
    fire_and_forget,
    disable_sink,
    request_timeout,
    input_image=None,
    image_label=None,
    custom_metadata=None,
    qc_result=None,
    external_id=None,
    location=None,
    item_count=None,
    item_type=None,
    alert_type=None,
    severity=None,
    alert_description=None,
    custom_value=None,
    object_detections=None,
    classifications=None,
    instance_segmentations=None,
    keypoint_detections=None,
)

Build and send a v2 event to the Event Ingestion Service.

Parameters:

Name	Type	Description	Default
`event_ingestion_url`	`str`	Base URL of the Event Ingestion Service.	required
`event_schema`	`str`	One of 'quality_check', 'inventory_count', 'safety_alert', or 'custom'.	required
`output_image`	`WorkflowImageData`	The output/visualization image sent as the primary display image.	required
`fire_and_forget`	`bool`	When True the request runs in the background and event_id will be empty.	required
`disable_sink`	`bool`	When True the block is skipped entirely.	required
`request_timeout`	`int`	HTTP request timeout in seconds.	required

Returns:

Name	Type	Description
`dict`	`BlockResult`	A dictionary with `error_status` (bool), `event_id` (str), and `message` (str).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/event_writer/v1.py

def run(
    self,
    event_ingestion_url: str,
    event_schema: str,
    output_image: WorkflowImageData,
    fire_and_forget: bool,
    disable_sink: bool,
    request_timeout: int,
    input_image: Optional[WorkflowImageData] = None,
    image_label: Optional[str] = None,
    custom_metadata: Optional[Dict[str, Any]] = None,
    qc_result: Optional[str] = None,
    external_id: Optional[str] = None,
    location: Optional[str] = None,
    item_count: Optional[int] = None,
    item_type: Optional[str] = None,
    alert_type: Optional[str] = None,
    severity: Optional[str] = None,
    alert_description: Optional[str] = None,
    custom_value: Optional[str] = None,
    object_detections: Optional[Any] = None,
    classifications: Optional[Any] = None,
    instance_segmentations: Optional[Any] = None,
    keypoint_detections: Optional[Any] = None,
) -> BlockResult:
    """Assemble a v2 event payload and deliver it to the Event Ingestion Service.

    Args:
        event_ingestion_url (str): Base URL of the service; trailing slashes are stripped
            before ``/v2/events`` is appended.
        event_schema (str): One of 'quality_check', 'inventory_count', 'safety_alert', or 'custom'.
        output_image (WorkflowImageData): The output/visualization image.
        fire_and_forget (bool): When truthy and a background runner is available, the
            request is scheduled in the background and ``event_id`` is empty.
        disable_sink (bool): When truthy, nothing is built or sent.
        request_timeout (int): HTTP request timeout in seconds.
        input_image (Optional[WorkflowImageData]): Optional original input image.
        image_label (Optional[str]): Optional label for the image entry.
        custom_metadata (Optional[Dict[str, Any]]): Added to the payload unless it is None.
        qc_result, external_id, location, item_count, item_type, alert_type, severity,
        alert_description, custom_value: Schema-specific values forwarded to the
            event-data builder.
        object_detections, classifications, instance_segmentations, keypoint_detections:
            Optional predictions forwarded to the image-entry builder.

    Returns:
        dict: A dictionary with ``error_status`` (bool), ``event_id`` (str), and ``message`` (str).
    """
    if disable_sink:
        return {
            "error_status": False,
            "event_id": "",
            "message": "Sink was disabled by parameter `disable_sink`",
        }

    base_url = event_ingestion_url.rstrip("/")

    schema_fields = dict(
        qc_result=qc_result,
        external_id=external_id,
        location=location,
        item_count=item_count,
        item_type=item_type,
        alert_type=alert_type,
        severity=severity,
        alert_description=alert_description,
        custom_value=custom_value,
    )
    event_data = _build_event_data(event_schema=event_schema, **schema_fields)

    annotations = dict(
        object_detections=object_detections,
        classifications=classifications,
        instance_segmentations=instance_segmentations,
        keypoint_detections=keypoint_detections,
    )
    image_entry = _build_image_entry(
        output_image=output_image,
        input_image=input_image,
        image_label=image_label,
        **annotations,
    )

    event_body: Dict[str, Any] = {
        "inference_timestamp": datetime.now(timezone.utc).isoformat(),
        "event_schema": event_schema,
        "event_data": event_data,
        "images": [image_entry],
        "displayImagePosition": 0,
    }
    if custom_metadata is not None:
        event_body["custom_metadata"] = custom_metadata

    send_event = partial(
        _execute_event_request,
        url=f"{base_url}/v2/events",
        payload=event_body,
        api_key=os.environ.get("EVENT_INGESTION_API_KEY"),
        timeout=request_timeout,
    )

    if fire_and_forget:
        # Prefer the background-task runner, then the thread pool; with
        # neither available the request is executed inline below.
        scheduled = {
            "error_status": False,
            "event_id": "",
            "message": "Event sent in background task",
        }
        if self._background_tasks:
            self._background_tasks.add_task(send_event)
            return scheduled
        if self._thread_pool_executor:
            self._thread_pool_executor.submit(send_event)
            return scheduled

    failed, outcome_message, created_event_id = send_event()
    return {
        "error_status": failed,
        "event_id": created_event_id,
        "message": outcome_message,
    }

`enterprise/workflows/enterprise_blocks/sinks/microsoft_sql_server`¶

inference.enterprise.workflows.enterprise_blocks.sinks.microsoft_sql_server.v1 ¶

Classes¶

SQLServerConnectionError ¶

Bases: SQLServerError

Exception raised for connection-related errors

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/microsoft_sql_server/v1.py

class SQLServerConnectionError(SQLServerError):
    """Signals a connection-related SQL Server failure."""

SQLServerError ¶

Bases: Exception

Base exception for SQL Server related errors

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/microsoft_sql_server/v1.py

class SQLServerError(Exception):
    """Root of the exception hierarchy for SQL Server related errors."""

SQLServerInsertError ¶

Bases: SQLServerError

Exception raised for insert operation errors

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/microsoft_sql_server/v1.py

class SQLServerInsertError(SQLServerError):
    """Signals a failure of an insert operation."""

`enterprise/workflows/enterprise_blocks/sinks/opc_writer`¶

inference.enterprise.workflows.enterprise_blocks.sinks.opc_writer.v1 ¶

Classes¶

OPCUAConnectionManager ¶

Thread-safe connection manager for OPC UA clients with connection pooling and circuit breaker pattern.

Maintains a pool of connections keyed by (url, user_name) to avoid creating new connections for every write operation. Uses circuit breaker to fail fast when servers are unreachable.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

class OPCUAConnectionManager:
    """
    Thread-safe connection manager for OPC UA clients with connection pooling
    and circuit breaker pattern.

    Maintains a pool of connections keyed by (url, user_name) to avoid creating
    new connections for every write operation. Uses circuit breaker to fail fast
    when servers are unreachable.

    Locking overview:
        * ``_lock`` (class level) guards singleton creation and one-time init.
        * ``_global_lock`` guards the per-key lock table, the shared ThreadLoop
          and whole-pool operations (``close_all``, ``get_pool_stats``).
        * one lock per connection key serialises connect / evict for that key.
    """

    _instance: Optional["OPCUAConnectionManager"] = None
    _lock = threading.Lock()

    # Circuit breaker: how long to wait before trying a failed server again
    CIRCUIT_BREAKER_TIMEOUT_SECONDS = 2.0

    def __new__(cls) -> "OPCUAConnectionManager":
        """Singleton pattern to ensure one connection manager across the application."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    # Finish preparing the instance BEFORE publishing it: the
                    # outer check runs without the lock, so another thread may
                    # pick up `_instance` (and enter `__init__`) the moment it
                    # is assigned.
                    instance = super().__new__(cls)
                    instance._initialized = False
                    cls._instance = instance
        return cls._instance

    def __init__(self):
        """Initialise the pool state once; later constructions are no-ops."""
        if self._initialized:
            return
        # `__init__` runs on every `OPCUAConnectionManager()` call. Re-check
        # under the lock so two first callers cannot both reset the pool.
        with self._lock:
            if self._initialized:
                return
            self._connections: Dict[str, Client] = {}
            self._connection_locks: Dict[str, threading.Lock] = {}
            self._connection_metadata: Dict[str, dict] = {}
            self._connection_failures: Dict[str, float] = (
                {}
            )  # key -> timestamp of last failure
            self._global_lock = threading.Lock()
            self._tloop: Optional[ThreadLoop] = None
            self._initialized = True
            logger.debug("OPC UA Connection Manager initialized")

    def _get_tloop(self) -> ThreadLoop:
        """Get or create the shared ThreadLoop for all clients.

        Guarded by ``_global_lock`` so that two keys connecting at the same
        time cannot each start (and leak) their own ThreadLoop. Must not be
        called while ``_global_lock`` is already held.
        """
        with self._global_lock:
            if self._tloop is None or not self._tloop.is_alive():
                logger.debug("OPC UA Connection Manager creating shared ThreadLoop")
                self._tloop = ThreadLoop(timeout=120)
                self._tloop.start()
            return self._tloop

    def _stop_tloop(self) -> None:
        """Stop the shared ThreadLoop if it exists.

        Stop errors are logged and swallowed; the reference is dropped either
        way so the next client creation starts a fresh loop.
        """
        if self._tloop is not None and self._tloop.is_alive():
            logger.debug("OPC UA Connection Manager stopping shared ThreadLoop")
            try:
                self._tloop.loop.call_soon_threadsafe(self._tloop.loop.stop)
                self._tloop.join(timeout=2.0)
            except Exception as exc:
                logger.debug(f"OPC UA Connection Manager ThreadLoop stop error: {exc}")
            self._tloop = None

    def _get_connection_key(self, url: str, user_name: Optional[str]) -> str:
        """Generate a unique key for connection pooling."""
        return f"{url}|{user_name or ''}"

    def _get_connection_lock(self, key: str) -> threading.Lock:
        """Get or create a lock for a specific connection."""
        with self._global_lock:
            if key not in self._connection_locks:
                self._connection_locks[key] = threading.Lock()
            return self._connection_locks[key]

    def _create_client(
        self,
        url: str,
        user_name: Optional[str],
        password: Optional[str],
        timeout: int,
    ) -> Client:
        """Create and configure a new OPC UA client using the shared ThreadLoop.

        Credentials are applied only when both ``user_name`` and ``password``
        are truthy. The returned client is not yet connected.
        """
        logger.debug(f"OPC UA Connection Manager creating client for {url}")
        tloop = self._get_tloop()
        client = Client(url=url, tloop=tloop, sync_wrapper_timeout=timeout)
        if user_name and password:
            client.set_user(user_name)
            client.set_password(password)
        return client

    def _connect_with_retry(
        self,
        client: Client,
        url: str,
        max_retries: int = 3,
        base_backoff: float = 1.0,
    ) -> None:
        """
        Connect to OPC UA server with retry logic and exponential backoff.

        Args:
            client: The OPC UA client to connect
            url: Server URL (for logging)
            max_retries: Maximum number of connection attempts
            base_backoff: Base delay between retries (seconds), doubles each retry

        Raises:
            Exception: If all connection attempts fail. The message is prefixed
                with ``AUTH ERROR``, ``NETWORK ERROR`` or ``CONNECTION ERROR``
                and the underlying error is chained as ``__cause__``.
        """
        last_exception = None

        for attempt in range(max_retries):
            try:
                logger.debug(
                    f"OPC UA Connection Manager connecting to {url} "
                    f"(attempt {attempt + 1}/{max_retries})"
                )
                client.connect()
                logger.info(
                    f"OPC UA Connection Manager successfully connected to {url}"
                )
                return
            except BadUserAccessDenied as exc:
                # Auth errors should not be retried - they will keep failing
                logger.error(f"OPC UA Connection Manager authentication failed: {exc}")
                raise Exception(f"AUTH ERROR: {exc}") from exc
            except OSError as exc:
                last_exception = exc
                logger.warning(
                    f"OPC UA Connection Manager network error on attempt {attempt + 1}: {exc}"
                )
            except Exception as exc:
                last_exception = exc
                logger.warning(
                    f"OPC UA Connection Manager connection error on attempt {attempt + 1}: "
                    f"{type(exc).__name__}: {exc}"
                )

            # Don't sleep after the last attempt
            if attempt < max_retries - 1:
                backoff_time = base_backoff * (2**attempt)
                logger.debug(
                    f"OPC UA Connection Manager waiting {backoff_time}s before retry"
                )
                time.sleep(backoff_time)

        # All retries exhausted
        logger.error(
            f"OPC UA Connection Manager failed to connect to {url} "
            f"after {max_retries} attempts"
        )
        error_kind = (
            "NETWORK ERROR"
            if isinstance(last_exception, OSError)
            else "CONNECTION ERROR"
        )
        raise Exception(
            f"{error_kind}: Failed to connect after {max_retries} attempts. Last error: {last_exception}"
        ) from last_exception

    def _is_circuit_open(self, key: str) -> bool:
        """
        Check if circuit breaker is open (server recently failed).
        Returns True if we should NOT attempt connection (fail fast).
        """
        if key not in self._connection_failures:
            return False

        time_since_failure = time.time() - self._connection_failures[key]
        if time_since_failure < self.CIRCUIT_BREAKER_TIMEOUT_SECONDS:
            return True

        # Timeout expired, clear the failure record
        del self._connection_failures[key]
        return False

    def _record_failure(self, key: str) -> None:
        """Record a connection failure for circuit breaker."""
        self._connection_failures[key] = time.time()

    def _clear_failure(self, key: str) -> None:
        """Clear failure record after successful connection."""
        self._connection_failures.pop(key, None)

    def get_connection(
        self,
        url: str,
        user_name: Optional[str],
        password: Optional[str],
        timeout: int,
        max_retries: int = 1,
        base_backoff: float = 0.0,
    ) -> Client:
        """
        Get a connection from the pool or create a new one.

        This method is thread-safe and will reuse existing pooled connections.
        Uses circuit breaker pattern to fail fast for recently failed servers.

        Args:
            url: OPC UA server URL
            user_name: Optional username for authentication
            password: Optional password for authentication
            timeout: Connection timeout in seconds
            max_retries: Maximum number of connection attempts (default 1)
            base_backoff: Base delay between retries (default 0)

        Returns:
            A connected OPC UA client

        Raises:
            Exception: If connection fails or circuit breaker is open
        """
        key = self._get_connection_key(url, user_name)
        lock = self._get_connection_lock(key)

        with lock:
            # Circuit breaker: fail fast if server recently failed
            if self._is_circuit_open(key):
                logger.debug(
                    f"OPC UA Connection Manager circuit breaker open for {url}, "
                    f"failing fast (will retry in {self.CIRCUIT_BREAKER_TIMEOUT_SECONDS}s)"
                )
                raise Exception(
                    f"CIRCUIT OPEN: Server {url} recently failed, skipping connection attempt. "
                    f"Will retry after {self.CIRCUIT_BREAKER_TIMEOUT_SECONDS}s."
                )

            # Check if we have an existing connection
            if key in self._connections:
                logger.debug(f"OPC UA Connection Manager reusing connection for {url}")
                return self._connections[key]

            # Create new connection
            try:
                client = self._create_client(url, user_name, password, timeout)
                self._connect_with_retry(client, url, max_retries, base_backoff)
            except Exception:
                # Record failure for circuit breaker
                self._record_failure(key)
                raise

            # Success - clear any failure record and store in pool
            self._clear_failure(key)
            self._connections[key] = client
            # NOTE(review): the plaintext password is kept in memory here but is
            # never read back within this class - confirm it is still needed.
            self._connection_metadata[key] = {
                "url": url,
                "user_name": user_name,
                "password": password,
                "timeout": timeout,
                "connected_at": datetime.now(),
            }

            return client

    def _safe_disconnect(self, client: Client) -> None:
        """Safely disconnect a client, swallowing any errors."""
        try:
            logger.debug("OPC UA Connection Manager disconnecting client")
            client.disconnect()
        except Exception as exc:
            logger.debug(
                f"OPC UA Connection Manager disconnect error (non-fatal): {exc}"
            )

    def _drop_connection(
        self, url: str, user_name: Optional[str], action: str
    ) -> None:
        """Disconnect and forget the pooled connection for (url, user_name), if any.

        ``action`` is the verb used in the debug log line ("closed" or "invalidated").
        """
        key = self._get_connection_key(url, user_name)
        lock = self._get_connection_lock(key)

        with lock:
            client = self._connections.pop(key, None)
            if client is None:
                return
            self._safe_disconnect(client)
            self._connection_metadata.pop(key, None)
            logger.debug(f"OPC UA Connection Manager {action} connection for {url}")

    def release_connection(
        self, url: str, user_name: Optional[str], force_close: bool = False
    ) -> None:
        """
        Release a connection back to the pool.

        By default, connections are kept alive for reuse. Set force_close=True
        to immediately close the connection.

        Args:
            url: OPC UA server URL
            user_name: Optional username used for the connection
            force_close: If True, close the connection instead of keeping it
        """
        if not force_close:
            # Connection stays in pool for reuse
            return

        self._drop_connection(url, user_name, "closed")

    def invalidate_connection(self, url: str, user_name: Optional[str]) -> None:
        """
        Invalidate a connection, forcing it to be recreated on next use.

        Call this when a connection error occurs during an operation to ensure
        the next operation gets a fresh connection.

        Args:
            url: OPC UA server URL
            user_name: Optional username used for the connection
        """
        self._drop_connection(url, user_name, "invalidated")

    def close_all(self) -> None:
        """Close all connections in the pool and stop the shared ThreadLoop."""
        with self._global_lock:
            for client in list(self._connections.values()):
                self._safe_disconnect(client)
            self._connections.clear()
            self._connection_metadata.clear()
            self._stop_tloop()
            logger.info("OPC UA Connection Manager closed all connections")

    def get_pool_stats(self) -> dict:
        """Get statistics about the connection pool.

        Returns:
            A dict with ``total_connections`` and a ``connections`` list holding
            ``url``, ``user_name`` and ISO-formatted ``connected_at`` per entry.
            Passwords are never included.
        """
        with self._global_lock:
            # Entries are added/removed under per-key locks, not the global
            # lock, so iterate over a snapshot to avoid "dictionary changed
            # size during iteration".
            metadata_snapshot = list(self._connection_metadata.values())
            return {
                "total_connections": len(self._connections),
                "connections": [
                    {
                        "url": meta["url"],
                        "user_name": meta["user_name"],
                        "connected_at": meta["connected_at"].isoformat(),
                    }
                    for meta in metadata_snapshot
                ],
            }

Methods:¶

`__new__` ¶

__new__()

Singleton pattern to ensure one connection manager across the application.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def __new__(cls) -> "OPCUAConnectionManager":
    """Return the single shared connection manager, creating it on first use.

    ``cls._instance`` is checked once without the lock (fast path) and again
    while holding ``cls._lock`` so that only one thread creates the instance.

    The new object gets ``_initialized = False`` *before* it is assigned to
    ``cls._instance``. Publishing it first would let another thread on the
    lock-free fast path receive an instance that does not yet have the
    ``_initialized`` attribute.

    Returns:
        The shared ``OPCUAConnectionManager`` instance.
    """
    if cls._instance is None:
        with cls._lock:
            if cls._instance is None:
                instance = super().__new__(cls)
                instance._initialized = False
                # Publish only once the instance is fully prepared.
                cls._instance = instance
    return cls._instance

close_all ¶

close_all()

Close all connections in the pool and stop the shared ThreadLoop.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def close_all(self) -> None:
    """Disconnect every pooled client, empty the pool, and stop the shared ThreadLoop.

    The whole operation runs while holding the manager-wide lock.
    """
    with self._global_lock:
        # Snapshot the clients first so the pool is not iterated while disconnecting.
        pooled_clients = list(self._connections.values())
        for pooled_client in pooled_clients:
            self._safe_disconnect(pooled_client)
        self._connections.clear()
        self._connection_metadata.clear()
        self._stop_tloop()
        logger.info("OPC UA Connection Manager closed all connections")

get_connection ¶

get_connection(
    url,
    user_name,
    password,
    timeout,
    max_retries=1,
    base_backoff=0.0,
)

Get a connection from the pool or create a new one.

This method is thread-safe and will reuse existing healthy connections. Uses circuit breaker pattern to fail fast for recently failed servers.

Parameters:

Name	Type	Description	Default
`url`	`str`	OPC UA server URL	required
`user_name`	`Optional[str]`	Optional username for authentication	required
`password`	`Optional[str]`	Optional password for authentication	required
`timeout`	`int`	Connection timeout in seconds	required
`max_retries`	`int`	Maximum number of connection attempts (default 1)	`1`
`base_backoff`	`float`	Base delay between retries (default 0)	`0.0`

Returns:

Type	Description
`Client`	A connected OPC UA client

Raises:

Type	Description
`Exception`	If connection fails or circuit breaker is open

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def get_connection(
    self,
    url: str,
    user_name: Optional[str],
    password: Optional[str],
    timeout: int,
    max_retries: int = 1,
    base_backoff: float = 0.0,
) -> Client:
    """
    Return the pooled client for ``(url, user_name)``, connecting first if needed.

    Everything runs under the per-connection lock. The circuit breaker is
    consulted first; if it is open the call fails immediately. Otherwise an
    already-pooled client is returned as-is, or a new client is created,
    connected, and stored in the pool together with its metadata.

    Args:
        url: OPC UA server URL
        user_name: Optional username for authentication
        password: Optional password for authentication
        timeout: Connection timeout in seconds
        max_retries: Maximum number of connection attempts (default 1)
        base_backoff: Base delay between retries (default 0)

    Returns:
        The pooled or newly connected OPC UA client

    Raises:
        Exception: If the circuit breaker is open, or re-raised from a failed
            connection attempt after the failure has been recorded
    """
    pool_key = self._get_connection_key(url, user_name)

    with self._get_connection_lock(pool_key):
        # Fail fast while the circuit breaker is open for this server.
        if self._is_circuit_open(pool_key):
            logger.debug(
                f"OPC UA Connection Manager circuit breaker open for {url}, "
                f"failing fast (will retry in {self.CIRCUIT_BREAKER_TIMEOUT_SECONDS}s)"
            )
            raise Exception(
                f"CIRCUIT OPEN: Server {url} recently failed, skipping connection attempt. "
                f"Will retry after {self.CIRCUIT_BREAKER_TIMEOUT_SECONDS}s."
            )

        # A client already in the pool is handed back without further checks.
        if pool_key in self._connections:
            logger.debug(f"OPC UA Connection Manager reusing connection for {url}")
            return self._connections[pool_key]

        try:
            new_client = self._create_client(url, user_name, password, timeout)
            self._connect_with_retry(new_client, url, max_retries, base_backoff)

            # Connected: reset the failure record, then register client and metadata.
            self._clear_failure(pool_key)
            self._connections[pool_key] = new_client
            connection_info = {
                "url": url,
                "user_name": user_name,
                "password": password,
                "timeout": timeout,
                "connected_at": datetime.now(),
            }
            self._connection_metadata[pool_key] = connection_info

            return new_client
        except Exception:
            # Feed the circuit breaker, then let the caller see the original error.
            self._record_failure(pool_key)
            raise

get_pool_stats ¶

get_pool_stats()

Get statistics about the connection pool.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def get_pool_stats(self) -> dict:
    """Return a snapshot of the pool taken under the manager-wide lock.

    The result holds ``total_connections`` (number of pooled clients) and
    ``connections``, one entry per stored metadata record with its URL,
    user name, and ISO-formatted connect time.
    """
    with self._global_lock:
        connection_rows = []
        for meta in self._connection_metadata.values():
            connection_rows.append(
                {
                    "url": meta["url"],
                    "user_name": meta["user_name"],
                    "connected_at": meta["connected_at"].isoformat(),
                }
            )
        return {
            "total_connections": len(self._connections),
            "connections": connection_rows,
        }

invalidate_connection ¶

invalidate_connection(url, user_name)

Invalidate a connection, forcing it to be recreated on next use.

Call this when a connection error occurs during an operation to ensure the next operation gets a fresh connection.

Parameters:

Name	Type	Description	Default
`url`	`str`	OPC UA server URL	required
`user_name`	`Optional[str]`	Optional username used for the connection	required

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def invalidate_connection(self, url: str, user_name: Optional[str]) -> None:
    """
    Drop the pooled connection for ``(url, user_name)`` so the next use reconnects.

    Intended to be called after an operation hit a connection error. If no
    connection is pooled for the key, nothing happens.

    Args:
        url: OPC UA server URL
        user_name: Optional username used for the connection
    """
    pool_key = self._get_connection_key(url, user_name)

    with self._get_connection_lock(pool_key):
        if pool_key not in self._connections:
            return

        # Disconnect first, then forget both the client and its metadata.
        self._safe_disconnect(self._connections[pool_key])
        del self._connections[pool_key]
        self._connection_metadata.pop(pool_key, None)
        logger.debug(
            f"OPC UA Connection Manager invalidated connection for {url}"
        )

release_connection ¶

release_connection(url, user_name, force_close=False)

Release a connection back to the pool.

By default, connections are kept alive for reuse. Set force_close=True to immediately close the connection.

Parameters:

Name	Type	Description	Default
`url`	`str`	OPC UA server URL	required
`user_name`	`Optional[str]`	Optional username used for the connection	required
`force_close`	`bool`	If True, close the connection instead of keeping it	`False`

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def release_connection(
    self, url: str, user_name: Optional[str], force_close: bool = False
) -> None:
    """
    Hand a connection back to the pool, optionally closing it.

    With the default ``force_close=False`` this is a no-op: the connection
    simply stays pooled for reuse. With ``force_close=True`` the pooled
    connection (if any) is disconnected and removed along with its metadata.

    Args:
        url: OPC UA server URL
        user_name: Optional username used for the connection
        force_close: If True, close the connection instead of keeping it
    """
    if force_close:
        pool_key = self._get_connection_key(url, user_name)

        with self._get_connection_lock(pool_key):
            if pool_key in self._connections:
                self._safe_disconnect(self._connections[pool_key])
                del self._connections[pool_key]
                self._connection_metadata.pop(pool_key, None)
                logger.debug(f"OPC UA Connection Manager closed connection for {url}")

UnsupportedTypeError ¶

Bases: Exception

Raised when an unsupported value type is specified

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

class UnsupportedTypeError(Exception):
    """Error signalling that the specified value type is not supported."""

Functions:¶

get_available_namespaces ¶

get_available_namespaces(client)

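Get list of available namespaces from OPC UA server. If the namespaces cannot be fetched, the failure is logged and a single-element list containing the placeholder "<unable to fetch namespaces>" is returned.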

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def get_available_namespaces(client: Client) -> List[str]:
    """
    Fetch the namespace array from the OPC UA server.

    Any failure is logged at info level and treated as non-fatal: instead of
    raising, a single-element list holding the placeholder
    ``"<unable to fetch namespaces>"`` is returned.
    """
    try:
        fetch_namespaces = sync_async_client_method(
            AsyncClient.get_namespace_array
        )(client)
        return fetch_namespaces()
    except Exception as exc:
        logger.info(f"Failed to get namespace array (non-fatal): {exc}")
        return ["<unable to fetch namespaces>"]

get_connection_manager ¶

get_connection_manager()

Get the global OPC UA connection manager instance.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def get_connection_manager() -> OPCUAConnectionManager:
    """Return the module-level OPC UA connection manager, creating it on first call."""
    global _connection_manager
    if _connection_manager is not None:
        return _connection_manager
    _connection_manager = OPCUAConnectionManager()
    return _connection_manager

get_node_data_type ¶

get_node_data_type(var)

Get the data type of an OPC UA node. Returns a string representation of the type, or "Unknown" if unable to read.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def get_node_data_type(var) -> str:
    """
    Describe the data type of an OPC UA node as a string.

    The node's variant type is read and converted with ``str``. If that fails
    for any reason, the error is logged and "Unknown" is returned instead.
    """
    try:
        variant_type = var.read_data_type_as_variant_type()
        return str(variant_type)
    except Exception as exc:
        logger.info(f"Unable to read node data type: {exc}")
        return "Unknown"

opc_connect_and_write_value ¶

opc_connect_and_write_value(
    url,
    namespace,
    user_name,
    password,
    object_name,
    variable_name,
    value,
    timeout,
    node_lookup_mode="hierarchical",
    value_type="String",
    max_retries=1,
    retry_backoff_seconds=0.0,
)

Connect to OPC UA server and write a value using connection pooling.

Uses the connection manager to reuse existing connections. If no connection exists, attempts to create one. Fails fast on connection errors to avoid blocking the pipeline.

Parameters:

Name	Type	Description	Default
`url`	`str`	OPC UA server URL	required
`namespace`	`str`	Namespace URI or index	required
`user_name`	`Optional[str]`	Optional username for authentication	required
`password`	`Optional[str]`	Optional password for authentication	required
`object_name`	`str`	Target object path	required
`variable_name`	`str`	Variable to write	required
`value`	`Union[bool, float, int, str]`	Value to write	required
`timeout`	`int`	Connection timeout in seconds	required
`node_lookup_mode`	`Literal['hierarchical', 'direct']`	Path lookup strategy ('hierarchical' or 'direct')	`'hierarchical'`
`value_type`	`str`	OPC UA data type for the value	`'String'`
`max_retries`	`int`	Maximum number of connection attempts (default 1 = no retries)	`1`
`retry_backoff_seconds`	`float`	Base delay between retries (default 0 = no delay)	`0.0`

Returns:

Type	Description
`Tuple[bool, str]`	Tuple of (error_status, message)

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def opc_connect_and_write_value(
    url: str,
    namespace: str,
    user_name: Optional[str],
    password: Optional[str],
    object_name: str,
    variable_name: str,
    value: Union[bool, float, int, str],
    timeout: int,
    node_lookup_mode: Literal["hierarchical", "direct"] = "hierarchical",
    value_type: str = "String",
    max_retries: int = 1,
    retry_backoff_seconds: float = 0.0,
) -> Tuple[bool, str]:
    """
    Connect to OPC UA server and write a value using connection pooling.

    Uses the connection manager to reuse existing connections. If no connection
    exists, attempts to create one. Fails fast on connection errors to avoid
    blocking the pipeline.

    This function does not raise for failures inside the ``try`` block: any
    exception from obtaining the connection or writing the value is caught,
    logged, and reported through the returned tuple.

    Args:
        url: OPC UA server URL
        namespace: Namespace URI or index
        user_name: Optional username for authentication
        password: Optional password for authentication
        object_name: Target object path
        variable_name: Variable to write
        value: Value to write
        timeout: Connection timeout in seconds
        node_lookup_mode: Path lookup strategy ('hierarchical' or 'direct')
        value_type: OPC UA data type for the value
        max_retries: Maximum number of connection attempts (default 1 = no retries)
        retry_backoff_seconds: Base delay between retries (default 0 = no delay)

    Returns:
        Tuple of (error_status, message). ``error_status`` is ``False`` when the
        write succeeded and ``True`` when it failed; ``message`` describes the
        outcome and includes the error text on failure.
    """
    logger.debug(
        f"OPC Writer attempting to write value={value} to {url}/{object_name}/{variable_name}"
    )

    connection_manager = get_connection_manager()

    try:
        # Get connection from pool (will create new if needed)
        client = connection_manager.get_connection(
            url=url,
            user_name=user_name,
            password=password,
            timeout=timeout,
            max_retries=max_retries,
            base_backoff=retry_backoff_seconds,
        )

        # Perform the write operation
        _opc_write_value(
            client=client,
            namespace=namespace,
            object_name=object_name,
            variable_name=variable_name,
            value=value,
            node_lookup_mode=node_lookup_mode,
            value_type=value_type,
        )

        logger.debug(
            f"OPC Writer successfully wrote value to {url}/{object_name}/{variable_name}"
        )
        return False, "Value set successfully"

    except Exception as exc:
        # Classify the failure: errors of the USER_CONFIG_ERROR_TYPES kinds do
        # not invalidate the pooled connection; everything else does.
        is_user_config_error = isinstance(exc, USER_CONFIG_ERROR_TYPES)

        # Check the exception chain for wrapped errors
        # (only the direct ``__cause__`` is inspected, not deeper links).
        if not is_user_config_error and hasattr(exc, "__cause__") and exc.__cause__:
            is_user_config_error = isinstance(exc.__cause__, USER_CONFIG_ERROR_TYPES)

        if not is_user_config_error:
            logger.warning(
                f"OPC Writer error (invalidating connection): {type(exc).__name__}: {exc}"
            )
            # Drop the pooled connection so the next write starts from a fresh one.
            connection_manager.invalidate_connection(url, user_name)
        else:
            # User configuration errors - connection is fine, just log the error
            logger.error(f"OPC Writer configuration error: {type(exc).__name__}: {exc}")

        # Both failure kinds are reported to the caller the same way.
        return (
            True,
            f"Failed to write {value} to {object_name}:{variable_name} in {url}. Error: {exc}",
        )

safe_disconnect ¶

safe_disconnect(client)

Safely disconnect from OPC UA server, swallowing any errors

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/opc_writer/v1.py

def safe_disconnect(client: Client) -> None:
    """Safely disconnect from OPC UA server, swallowing any errors.

    Best-effort cleanup: any exception raised while disconnecting is logged at
    debug level and not propagated to the caller.

    Args:
        client: The OPC UA client to disconnect.
    """
    try:
        logger.debug("OPC Writer disconnecting from server")
        client.disconnect()
    except Exception as exc:
        # Deliberately non-fatal: a failed disconnect must not break the caller.
        logger.debug(f"OPC Writer disconnect error (non-fatal): {exc}")

`enterprise/workflows/enterprise_blocks/sinks/plc`¶

inference.enterprise.workflows.enterprise_blocks.sinks.plc.client ¶

Shared HTTP client helpers for the PLC Relay reader/writer blocks.

Both blocks talk to the on-device PLC Relay service over HTTP rather than opening a direct PLC connection. The relay owns the protocol (Allen-Bradley, Modbus, or Siemens S7), the device IP, and the tag schema.

These helpers use the relay's batch endpoints (/read_batch, /write_batch) so that reading or writing N tags costs a single HTTP round-trip and a single PLC transaction per frame, which matters at high FPS. The relay rejects the whole batch with an HTTP error only for structural problems (an unknown tag, a non-writable tag, or an empty/duplicate list); in that case every tag in the batch is reported as a failure and the relay's error detail is logged. A per-tag value problem (wrong type or out of range) does NOT fail the batch: the relay returns HTTP 200 and marks just that tag with success=false, so individual tag errors are reported and logged per tag.

Functions:¶

read_tags ¶

read_tags(session, base_url, tags, timeout)

Read a batch of tags through the relay's /read_batch endpoint.

Returns (tag_values, had_failure). tag_values maps each requested tag to its value, or to the READ_FAILURE sentinel when that tag could not be read. had_failure is True if any tag failed.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/client.py

def read_tags(
    session: requests.Session,
    base_url: str,
    tags: Iterable[str],
    timeout: int,
) -> Tuple[Dict[str, Any], bool]:
    """Fetch a batch of tag values from the relay's ``/read_batch`` endpoint.

    The result is ``(tag_values, had_failure)``. Every requested tag appears in
    ``tag_values``, mapped either to the value the relay returned or to the
    ``READ_FAILURE`` sentinel. ``had_failure`` is ``True`` as soon as at least one
    tag could not be read. An empty request short-circuits to ``({}, False)``
    without contacting the relay.
    """
    requested = list(tags)
    if not requested:
        return {}, False

    endpoint = f"{base_url}/read_batch"
    try:
        response = session.post(
            endpoint,
            json={"tags": requested},
            timeout=_request_timeout(timeout),
        )
    except (requests.exceptions.RequestException, ValueError) as e:
        # `requests` raises ValueError for a non-positive (connect, read) timeout
        # before anything is sent; handle it like any other unreachable-relay
        # failure so the step does not crash.
        logger.error(
            "Failed to reach PLC Relay while reading tags %s: %s", requested, e
        )
        return _all_failed(requested, READ_FAILURE), True

    if response.status_code != 200:
        # The relay rejected the batch as a whole (e.g. 404 for an unknown tag).
        logger.error(
            "Error reading tags %s from PLC Relay: HTTP %s: %s",
            requested,
            response.status_code,
            _extract_detail(response),
        )
        return _all_failed(requested, READ_FAILURE), True

    payload = _parse_json_object(response)
    if payload is None:
        logger.error(
            "Malformed success response reading tags %s from PLC Relay: %s",
            requested,
            response.text,
        )
        return _all_failed(requested, READ_FAILURE), True

    entries_by_name = _index_by_name(payload.get("tags"))
    tag_values: Dict[str, Any] = {}
    any_failed = False
    for tag in requested:
        entry = entries_by_name.get(tag)
        if entry is not None and not entry.get("error"):
            tag_values[tag] = entry.get("value")
            continue

        # Either the relay omitted this tag or it reported a per-tag error.
        if entry is None:
            logger.error("PLC Relay did not return a value for tag '%s'", tag)
        else:
            logger.error("Error reading tag '%s' from PLC: %s", tag, entry["error"])
        tag_values[tag] = READ_FAILURE
        any_failed = True
    return tag_values, any_failed

relay_base_url ¶

relay_base_url(ip_address, relay_port)

Build the relay base URL from a host/IP and port.

Accepts a bare host/IP (192.168.1.10 -> http://192.168.1.10:<port>) or a full URL (http://host:8007), which is used as-is.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/client.py

def relay_base_url(ip_address: str, relay_port: int) -> str:
    """Build the relay base URL from a host/IP and port.

    Accepts a bare host/IP (``192.168.1.10`` -> ``http://192.168.1.10:<port>``) or a
    full URL (``http://host:8007``), which is used as-is apart from stripping
    surrounding whitespace and trailing slashes.

    The scheme check is case-insensitive (URL schemes are case-insensitive), so
    ``HTTP://host:8007`` is recognised as a full URL instead of being prefixed
    with a second ``http://``.

    Args:
        ip_address: Bare host/IP or a full ``http(s)://`` URL.
        relay_port: Port appended when ``ip_address`` is a bare host/IP; ignored
            for full URLs.

    Returns:
        The base URL without a trailing slash.
    """
    addr = str(ip_address).strip()
    if addr.lower().startswith(("http://", "https://")):
        return addr.rstrip("/")
    return f"http://{addr}:{relay_port}"

write_tags ¶

write_tags(session, base_url, tags_to_write, timeout)

Write a batch of tags through the relay's /write_batch endpoint.

Returns (write_results, had_failure). write_results maps each tag to WRITE_SUCCESS or the WRITE_FAILURE sentinel. had_failure is True if any tag failed.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/client.py

def write_tags(
    session: requests.Session,
    base_url: str,
    tags_to_write: Mapping[str, Any],
    timeout: int,
) -> Tuple[Dict[str, str], bool]:
    """Send a batch of tag writes to the relay's ``/write_batch`` endpoint.

    The result is ``(write_results, had_failure)``. Every tag in ``tags_to_write``
    appears in ``write_results``, mapped to ``WRITE_SUCCESS`` or to the
    ``WRITE_FAILURE`` sentinel. ``had_failure`` is ``True`` as soon as at least one
    tag failed. An empty mapping short-circuits to ``({}, False)`` without
    contacting the relay.
    """
    if not tags_to_write:
        return {}, False

    tag_names = list(tags_to_write)
    write_requests = []
    for tag_name, tag_value in tags_to_write.items():
        write_requests.append({"name": tag_name, "value": tag_value})

    endpoint = f"{base_url}/write_batch"
    try:
        response = session.post(
            endpoint,
            json={"writes": write_requests},
            timeout=_request_timeout(timeout),
        )
    except (requests.exceptions.RequestException, ValueError) as e:
        # `requests` raises ValueError for a non-positive (connect, read) timeout
        # before anything is sent; handle it like any other unreachable-relay
        # failure so the step does not crash.
        logger.error(
            "Failed to reach PLC Relay while writing tags %s: %s", tag_names, e
        )
        return _all_failed(tag_names, WRITE_FAILURE), True

    if response.status_code != 200:
        # The relay rejected the batch as a whole: unknown tag (404), non-writable
        # tag (403), or an empty/duplicate write list (400). The detail names the
        # offending tag.
        logger.error(
            "Error writing tags %s to PLC Relay: HTTP %s: %s",
            tag_names,
            response.status_code,
            _extract_detail(response),
        )
        return _all_failed(tag_names, WRITE_FAILURE), True

    payload = _parse_json_object(response)
    if payload is None:
        logger.error(
            "Malformed success response writing tags %s to PLC Relay: %s",
            tag_names,
            response.text,
        )
        return _all_failed(tag_names, WRITE_FAILURE), True

    entries_by_name = _index_by_name(payload.get("results"))
    write_results: Dict[str, str] = {}
    any_failed = False
    for name, value in tags_to_write.items():
        entry = entries_by_name.get(name)
        if entry is not None and entry.get("success", False):
            write_results[name] = WRITE_SUCCESS
            continue

        if entry is None:
            logger.error("PLC Relay did not return a result for tag '%s'", name)
        else:
            # The request was accepted but this value was rejected (wrong type or
            # out of range for the tag's data type).
            logger.error(
                "Error writing tag '%s' with value '%s': %s",
                name,
                value,
                entry.get("error", WRITE_FAILURE),
            )
        write_results[name] = WRITE_FAILURE
        any_failed = True
    return write_results, any_failed

inference.enterprise.workflows.enterprise_blocks.sinks.plc.direct ¶

Direct (no-relay) PLC transports for the PLC Reader/Writer blocks.

Two protocols are supported when the block connects straight to the PLC instead of going through the on-device PLC Relay service:

EtherNet/IP via pylogix — tags are addressed by name (e.g. Program:Main.Tag).
Modbus TCP via pymodbus — tags are addressed as area:address strings (holding:100, coil:0, input:5, discrete:2); a bare number defaults to a holding register (100 == holding:100).

The connection object (ModbusTcpClient / pylogix.PLC) is created once via the make_*_client factories and then reused across frames by the block — opening and closing a TCP connection on every frame is a throughput/reliability problem for the high-FPS, frame-by-frame workflows these blocks target. The block owns the connection's lifetime; the read/write functions below take a live client, (re)connect it lazily if needed, and never close it.

Every read/write function returns (results, had_failure) with the same sentinels the relay client uses, so the blocks can treat all three transports uniformly. A failure on one tag is logged and does not stop the remaining tags.

Functions:¶

ethernet_read_tags ¶

ethernet_read_tags(comm, tags)

Read tags from a PLC over EtherNet/IP. Returns (tag_values, had_failure).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/direct.py

def ethernet_read_tags(comm, tags: List[str]) -> Tuple[Dict[str, Any], bool]:
    """Read each tag over EtherNet/IP, one at a time.

    Returns ``(tag_values, had_failure)``: the per-tag values as produced by the
    single-tag reader, and ``True`` if any individual read was not OK.
    """
    tag_values: Dict[str, Any] = {}
    any_failed = False
    for tag in tags:
        value, ok = _eip_read_one(comm, tag)
        tag_values[tag] = value
        if not ok:
            any_failed = True
    return tag_values, any_failed

ethernet_write_tags ¶

ethernet_write_tags(comm, tags_to_write)

Write tags to a PLC over EtherNet/IP. Returns (write_results, had_failure).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/direct.py

def ethernet_write_tags(
    comm, tags_to_write: Mapping[str, Any]
) -> Tuple[Dict[str, str], bool]:
    """Write each tag over EtherNet/IP, one at a time.

    Returns ``(write_results, had_failure)``: the per-tag status as produced by
    the single-tag writer, and ``True`` if any individual write was not OK.
    """
    write_results: Dict[str, str] = {}
    any_failed = False
    for tag, value in tags_to_write.items():
        status, ok = _eip_write_one(comm, tag, value)
        write_results[tag] = status
        if not ok:
            any_failed = True
    return write_results, any_failed

make_eip_client ¶

make_eip_client(ip_address, processor_slot)

Create a reusable pylogix EtherNet/IP connection for a PLC.

The returned object is reused across frames by the block and closed (via Close()) only when the block is torn down or the target changes; pylogix opens the underlying socket lazily on the first read/write.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/direct.py

def make_eip_client(ip_address: str, processor_slot: int) -> "pylogix.PLC":
    """Build a reusable pylogix EtherNet/IP connection object for a PLC.

    Only the target IP address and processor slot are configured here. The block
    keeps the returned object across frames and closes it (via ``Close()``) only
    when the block is torn down or the target changes; pylogix opens the
    underlying socket lazily on the first read/write.
    """
    plc = pylogix.PLC()
    plc.IPAddress = ip_address
    plc.ProcessorSlot = processor_slot
    return plc

make_modbus_client ¶

make_modbus_client(ip_address, port)

Create a reusable Modbus TCP client for a PLC.

The client is reused across frames by the block (and reconnected on demand by _ensure_modbus_connected) rather than opened and closed every frame.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/direct.py

def make_modbus_client(ip_address: str, port: int) -> ModbusTcpClient:
    """Build a reusable Modbus TCP client for a PLC.

    The block keeps this client across frames instead of opening and closing one
    per frame; ``_ensure_modbus_connected`` reconnects it on demand.
    """
    modbus_client = ModbusTcpClient(ip_address, port=port)
    return modbus_client

modbus_read_tags ¶

modbus_read_tags(client, unit_id, tags)

Read tags from a PLC over Modbus TCP. Returns (tag_values, had_failure).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/direct.py

def modbus_read_tags(
    client: ModbusTcpClient, unit_id: int, tags: List[str]
) -> Tuple[Dict[str, Any], bool]:
    """Read each tag over Modbus TCP, one at a time.

    Returns ``(tag_values, had_failure)``. If the client cannot be connected,
    every tag maps to ``READ_FAILURE`` and the flag is ``True`` whenever any tag
    was requested. Otherwise the flag is ``True`` if any single read was not OK.
    """
    if not _ensure_modbus_connected(client):
        logger.error("Failed to connect to Modbus PLC")
        return dict.fromkeys(tags, READ_FAILURE), bool(tags)

    tag_values: Dict[str, Any] = {}
    any_failed = False
    for tag in tags:
        value, ok = _modbus_read_one(client, tag, unit_id)
        tag_values[tag] = value
        if not ok:
            any_failed = True
    return tag_values, any_failed

modbus_write_tags ¶

modbus_write_tags(client, unit_id, tags_to_write)

Write tags to a PLC over Modbus TCP. Returns (write_results, had_failure).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/direct.py

def modbus_write_tags(
    client: ModbusTcpClient, unit_id: int, tags_to_write: Mapping[str, Any]
) -> Tuple[Dict[str, str], bool]:
    """Write each tag over Modbus TCP, one at a time.

    Returns ``(write_results, had_failure)``. If the client cannot be connected,
    every tag maps to ``WRITE_FAILURE`` and the flag is ``True`` whenever any tag
    was supplied. Otherwise the flag is ``True`` if any single write was not OK.
    """
    if not _ensure_modbus_connected(client):
        logger.error("Failed to connect to Modbus PLC")
        return dict.fromkeys(tags_to_write, WRITE_FAILURE), bool(tags_to_write)

    write_results: Dict[str, str] = {}
    any_failed = False
    for tag, value in tags_to_write.items():
        status, ok = _modbus_write_one(client, tag, value, unit_id)
        write_results[tag] = status
        if not ok:
            any_failed = True
    return write_results, any_failed

inference.enterprise.workflows.enterprise_blocks.sinks.plc.v1 ¶

Classes¶

PLCReaderBlockManifest ¶

Manifest for a block that reads PLC tag values (relay or direct connection).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/v1.py

class PLCReaderBlockManifest(WorkflowBlockManifest):
    """Manifest for a block that reads PLC tag values (relay or direct connection).

    Declares the block's identifier, its inputs (the tags to read plus the
    connection settings for the three connection modes), and its two outputs,
    ``tag_values`` and ``error_status``.
    """

    # Block metadata exposed through the JSON schema (name, descriptions,
    # license, block type, UI hints).
    model_config = ConfigDict(
        json_schema_extra={
            "name": "PLC Reader",
            "version": "v1",
            "short_description": "Read PLC tag values via the PLC Relay or a direct EtherNet/IP / Modbus connection.",
            "long_description": READER_LONG_DESCRIPTION,
            "license": "Roboflow Enterprise License",
            "block_type": "transformation",
            "ui_manifest": {**PLC_UI_MANIFEST_BASE, "blockPriority": 12},
        }
    )

    # Block type identifier used to select this manifest.
    type: Literal["roboflow_core/plc_reader@v1"]

    # Tags to read: either a literal list of strings or a selector resolving to
    # a list of values. Defaults to an empty list.
    tags_to_read: Union[
        List[str],
        Selector(kind=[LIST_OF_VALUES_KIND]),
        WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
    ] = Field(
        default_factory=list,
        description="PLC tags to read, entered comma-separated (e.g. `camera_msg, sku_number`). "
        "Relay and Direct (EtherNet/IP) modes use tag names. Direct (Modbus TCP) mode uses "
        "`area:address`, where area is `holding`, `input` (read-only), `coil`, or `discrete` "
        "(read-only); a bare number means a holding register (`100` = `holding:100`). "
        "Example for Modbus: `holding:100, coil:0`.",
        examples=[["camera_msg", "sku_number"], ["holding:100", "coil:0"]],
        json_schema_extra={"always_visible": True},
    )

    # Connection settings. Each field definition comes from a helper factory
    # that is not shown in this block; see those helpers for defaults and
    # descriptions.
    ip_address: Union[str, WorkflowParameterSelector(kind=[STRING_KIND])] = (
        _ip_address_field()
    )
    # One of the three supported transports.
    connection_mode: Literal["relay", "ethernet_ip", "modbus"] = (
        _connection_mode_field()
    )
    relay_port: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _relay_port_field()
    )
    request_timeout: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _request_timeout_field()
    )
    processor_slot: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _processor_slot_field()
    )
    modbus_port: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _modbus_port_field()
    )
    modbus_unit_id: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _modbus_unit_id_field()
    )

    # Attach the request_timeout validator.
    # NOTE(review): judging by its name, the helper rejects non-positive
    # timeouts — confirm against `_check_positive_request_timeout`.
    _validate_request_timeout = field_validator("request_timeout")(
        _check_positive_request_timeout
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Declare the block outputs: a ``tag_values`` dictionary and a boolean ``error_status``."""
        return [
            OutputDefinition(name="tag_values", kind=[DICTIONARY_KIND]),
            OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
        ]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the Execution Engine version range this block declares compatibility with."""
        return ">=1.0.0,<2.0.0"

PLCReaderBlockV1 ¶

Bases: _PLCConnectionMixin, WorkflowBlock

Reads PLC tag values over the PLC Relay or a direct EtherNet/IP / Modbus connection.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/v1.py

class PLCReaderBlockV1(_PLCConnectionMixin, WorkflowBlock):
    """Reads PLC tag values over the PLC Relay or a direct EtherNet/IP / Modbus connection.

    The actual transport work is delegated to `self._read`, which is not
    defined in this class body.
    """

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class that describes this block."""
        manifest_cls = PLCReaderBlockManifest
        return manifest_cls

    def run(
        self,
        tags_to_read: List[str],
        ip_address: str = "127.0.0.1",
        connection_mode: str = "relay",
        relay_port: int = DEFAULT_RELAY_PORT,
        request_timeout: int = 10,
        processor_slot: int = 0,
        modbus_port: int = 502,
        modbus_unit_id: int = 1,
        image: Optional[WorkflowImageData] = None,
        metadata: Optional[VideoMetadata] = None,
    ) -> dict:
        """Read the requested tags from the PLC.

        Args:
            tags_to_read: Tags to read.
            ip_address: Address of the PLC / relay.
            connection_mode: Transport selector handed to `self._read`.
            relay_port: Port used for the relay transport.
            request_timeout: Timeout handed to `self._read`.
            processor_slot: Processor slot handed to `self._read`.
            modbus_port: Modbus TCP port.
            modbus_unit_id: Modbus unit id.
            image: Accepted but not used by this method.
            metadata: Accepted but not used by this method.

        Returns:
            dict: `tag_values` and `error_status`, taken from the two values
            returned by `self._read`.
        """
        # `_read` takes its arguments positionally; the order below is significant.
        read_args = (
            connection_mode,
            ip_address,
            relay_port,
            processor_slot,
            modbus_port,
            modbus_unit_id,
            request_timeout,
            tags_to_read,
        )
        values, failed = self._read(*read_args)
        return {"tag_values": values, "error_status": failed}

Methods:¶

run ¶

run(
    tags_to_read,
    ip_address="127.0.0.1",
    connection_mode="relay",
    relay_port=DEFAULT_RELAY_PORT,
    request_timeout=10,
    processor_slot=0,
    modbus_port=502,
    modbus_unit_id=1,
    image=None,
    metadata=None,
)

Read tags from the PLC. Returns tag_values and error_status.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/v1.py

def run(
    self,
    tags_to_read: List[str],
    ip_address: str = "127.0.0.1",
    connection_mode: str = "relay",
    relay_port: int = DEFAULT_RELAY_PORT,
    request_timeout: int = 10,
    processor_slot: int = 0,
    modbus_port: int = 502,
    modbus_unit_id: int = 1,
    image: Optional[WorkflowImageData] = None,
    metadata: Optional[VideoMetadata] = None,
) -> dict:
    """Read the requested tags from the PLC.

    Args:
        tags_to_read: Tags to read.
        ip_address: Address of the PLC / relay.
        connection_mode: Transport selector handed to `self._read`.
        relay_port: Port used for the relay transport.
        request_timeout: Timeout handed to `self._read`.
        processor_slot: Processor slot handed to `self._read`.
        modbus_port: Modbus TCP port.
        modbus_unit_id: Modbus unit id.
        image: Accepted but not used by this method.
        metadata: Accepted but not used by this method.

    Returns:
        dict: `tag_values` and `error_status`, taken from the two values
        returned by `self._read`.
    """
    # `_read` takes its arguments positionally; the order below is significant.
    read_args = (
        connection_mode,
        ip_address,
        relay_port,
        processor_slot,
        modbus_port,
        modbus_unit_id,
        request_timeout,
        tags_to_read,
    )
    values, failed = self._read(*read_args)
    return {"tag_values": values, "error_status": failed}

PLCWriterBlockManifest ¶

Manifest for a block that writes PLC tag values (relay or direct connection).

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/v1.py

class PLCWriterBlockManifest(WorkflowBlockManifest):
    """Manifest for a block that writes PLC tag values (relay or direct connection)."""

    # Block metadata attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "name": "PLC Writer",
            "version": "v1",
            "short_description": "Write PLC tag values via the PLC Relay or a direct EtherNet/IP / Modbus connection.",
            "long_description": WRITER_LONG_DESCRIPTION,
            "license": "Roboflow Enterprise License",
            "block_type": "sink",
            "ui_manifest": {**PLC_UI_MANIFEST_BASE, "blockPriority": 13},
        }
    )

    # Discriminator identifying this block type in a workflow definition.
    type: Literal["roboflow_core/plc_writer@v1"]

    # Exactly one tag per block; a literal string or a string-kind parameter selector.
    tag: Union[str, WorkflowParameterSelector(kind=[STRING_KIND])] = Field(
        description="The single PLC tag to write. Relay and Direct (EtherNet/IP) modes use a "
        "tag name (e.g. `camera_fault`); Direct (Modbus TCP) mode uses `area:address` "
        "(`holding:100`, `coil:0`; a bare number is a holding register, and only `holding` "
        "registers and `coil`s are writable, not the read-only `input` / `discrete` areas). "
        "To write several tags, add one PLC Writer block per tag.",
        examples=["camera_fault", "holding:100"],
        json_schema_extra={"always_visible": True},
    )

    # `value` accepts a literal or a selector (e.g. a previous step's output). `str` is allowed
    # in the schema for all modes; `_validate_write_value_type` then rejects *literal* string
    # values for relay and Modbus (the relay contract is bool/int/float; Modbus registers/coils
    # are numeric/boolean). Direct (EtherNet/IP) keeps `str` for Logix STRING tags. A selector
    # resolves at runtime and is not statically inspectable, so it is skipped at validation time.
    value: Union[bool, int, float, str, Selector()] = Field(
        description="The value to write to the tag. May be a fixed value or a reference to a "
        "workflow input or a previous step's output. Must be a boolean, integer, or float, "
        "except Direct (EtherNet/IP) mode, which also accepts strings (for Logix STRING tags).",
        examples=[True, 5, "$steps.counter.count"],
        json_schema_extra={"always_visible": True},
    )

    # Ordering-only dependency: optional, defaults to None.
    depends_on: Optional[Selector()] = Field(
        default=None,
        description="Optional reference to a step this write should run after, for when the "
        "write order matters but the tag/value are not themselves derived from that step. "
        "Dependencies are otherwise inferred from selector-valued `tag` / `value`, so this is "
        "not needed for input- or step-driven writes.",
        examples=["$steps.some_previous_step"],
    )

    # Connection settings; the Field definitions come from the `_*_field()` helpers,
    # which are not defined inside this class.
    ip_address: Union[str, WorkflowParameterSelector(kind=[STRING_KIND])] = (
        _ip_address_field()
    )
    connection_mode: Literal["relay", "ethernet_ip", "modbus"] = (
        _connection_mode_field()
    )
    relay_port: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _relay_port_field()
    )
    request_timeout: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _request_timeout_field()
    )
    processor_slot: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _processor_slot_field()
    )
    modbus_port: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _modbus_port_field()
    )
    modbus_unit_id: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = (
        _modbus_unit_id_field()
    )
    # Kill switch: a literal bool or a boolean-kind selector; defaults to False.
    disable_sink: Union[bool, Selector(kind=[BOOLEAN_KIND])] = Field(
        default=False,
        description="If True, skip the write to the PLC and return an empty result.",
        examples=[False, "$inputs.disable_plc_writer"],
        json_schema_extra={"additional_section": True},
    )

    # Registers `_check_positive_request_timeout` as the field validator for
    # `request_timeout` (helper body not shown in this class).
    _validate_request_timeout = field_validator("request_timeout")(
        _check_positive_request_timeout
    )

    @model_validator(mode="after")
    def _validate_write_value_type(self):
        """Reject a literal string `value` unless `connection_mode` is "ethernet_ip".

        Returns:
            The validated manifest instance (`self`).

        Raises:
            ValueError: If `value` is a `str` for which `_is_selector` is falsy
                and `connection_mode` is not "ethernet_ip".
        """
        # A literal string value is only meaningful for Direct (EtherNet/IP) (Logix STRING tags).
        # The relay accepts only bool/int/float, and Modbus registers/coils are numeric/boolean,
        # so reject a literal string for those modes at validation time. A selector resolves at
        # runtime and is not statically inspectable, so it is allowed (the relay / Modbus
        # transports reject a bad resolved value per tag).
        if self.connection_mode == "ethernet_ip":
            return self
        if isinstance(self.value, str) and not _is_selector(self.value):
            raise ValueError(
                f"A literal string value is only supported in Direct (EtherNet/IP) mode; "
                f"'{self.connection_mode}' mode accepts a boolean, integer, or float (or a "
                f"selector that resolves to one)."
            )
        return self

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Declare the block outputs: `write_result` (string) and `error_status` (boolean)."""
        return [
            OutputDefinition(name="write_result", kind=[STRING_KIND]),
            OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
        ]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the Execution Engine version specifier this block declares."""
        return ">=1.0.0,<2.0.0"

PLCWriterBlockV1 ¶

Bases: _PLCConnectionMixin, WorkflowBlock

Writes PLC tag values over the PLC Relay or a direct EtherNet/IP / Modbus connection.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/v1.py

class PLCWriterBlockV1(_PLCConnectionMixin, WorkflowBlock):
    """Writes PLC tag values over the PLC Relay or a direct EtherNet/IP / Modbus connection.

    The actual transport work is delegated to `self._write`, which is not
    defined in this class body.
    """

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class that describes this block."""
        manifest_cls = PLCWriterBlockManifest
        return manifest_cls

    def run(
        self,
        tag: str,
        value: Union[bool, int, float, str],
        depends_on: Optional[Any] = None,
        ip_address: str = "127.0.0.1",
        connection_mode: str = "relay",
        relay_port: int = DEFAULT_RELAY_PORT,
        request_timeout: int = 10,
        processor_slot: int = 0,
        modbus_port: int = 502,
        modbus_unit_id: int = 1,
        disable_sink: bool = False,
        image: Optional[WorkflowImageData] = None,
        metadata: Optional[VideoMetadata] = None,
    ) -> dict:
        """Write one tag to the PLC.

        Args:
            tag: Tag to write.
            value: Value to write to `tag`.
            depends_on: Accepted but not used by this method.
            ip_address: Address of the PLC / relay.
            connection_mode: Transport selector handed to `self._write`.
            relay_port: Port used for the relay transport.
            request_timeout: Timeout handed to `self._write`.
            processor_slot: Processor slot handed to `self._write`.
            modbus_port: Modbus TCP port.
            modbus_unit_id: Modbus unit id.
            disable_sink: When truthy, no write is attempted.
            image: Accepted but not used by this method.
            metadata: Accepted but not used by this method.

        Returns:
            dict: `write_result` and `error_status`. With `disable_sink` set the
            result is an empty string and `error_status` is False. Otherwise
            `write_result` is this tag's entry in the mapping returned by
            `self._write`, or `WRITE_FAILURE` when the tag is absent from it.
        """
        if disable_sink:
            return {"write_result": "", "error_status": False}

        # `_write` takes a tag -> value mapping; this block always sends one entry.
        payload = {tag: value}
        results, failed = self._write(
            connection_mode,
            ip_address,
            relay_port,
            processor_slot,
            modbus_port,
            modbus_unit_id,
            request_timeout,
            payload,
        )
        tag_result = results.get(tag, WRITE_FAILURE)
        return {"write_result": tag_result, "error_status": failed}

Methods:¶

run ¶

run(
    tag,
    value,
    depends_on=None,
    ip_address="127.0.0.1",
    connection_mode="relay",
    relay_port=DEFAULT_RELAY_PORT,
    request_timeout=10,
    processor_slot=0,
    modbus_port=502,
    modbus_unit_id=1,
    disable_sink=False,
    image=None,
    metadata=None,
)

Write a single tag to the PLC. Returns write_result and error_status.

Source code in inference/enterprise/workflows/enterprise_blocks/sinks/plc/v1.py

def run(
    self,
    tag: str,
    value: Union[bool, int, float, str],
    depends_on: Optional[Any] = None,
    ip_address: str = "127.0.0.1",
    connection_mode: str = "relay",
    relay_port: int = DEFAULT_RELAY_PORT,
    request_timeout: int = 10,
    processor_slot: int = 0,
    modbus_port: int = 502,
    modbus_unit_id: int = 1,
    disable_sink: bool = False,
    image: Optional[WorkflowImageData] = None,
    metadata: Optional[VideoMetadata] = None,
) -> dict:
    """Write one tag to the PLC.

    Args:
        tag: Tag to write.
        value: Value to write to `tag`.
        depends_on: Accepted but not used by this method.
        ip_address: Address of the PLC / relay.
        connection_mode: Transport selector handed to `self._write`.
        relay_port: Port used for the relay transport.
        request_timeout: Timeout handed to `self._write`.
        processor_slot: Processor slot handed to `self._write`.
        modbus_port: Modbus TCP port.
        modbus_unit_id: Modbus unit id.
        disable_sink: When truthy, no write is attempted.
        image: Accepted but not used by this method.
        metadata: Accepted but not used by this method.

    Returns:
        dict: `write_result` and `error_status`. With `disable_sink` set the
        result is an empty string and `error_status` is False. Otherwise
        `write_result` is this tag's entry in the mapping returned by
        `self._write`, or `WRITE_FAILURE` when the tag is absent from it.
    """
    if disable_sink:
        return {"write_result": "", "error_status": False}

    # `_write` takes a tag -> value mapping; this block always sends one entry.
    payload = {tag: value}
    results, failed = self._write(
        connection_mode,
        ip_address,
        relay_port,
        processor_slot,
        modbus_port,
        modbus_unit_id,
        request_timeout,
        payload,
    )
    tag_result = results.get(tag, WRITE_FAILURE)
    return {"write_result": tag_result, "error_status": failed}

Functions:¶

`models/clip`¶

inference.models.clip.clip_inference_models ¶

Classes¶

InferenceModelsClipAdapter ¶

Bases: Model

Roboflow ONNX ClipModel model.

This class is responsible for handling the ONNX ClipModel model, including loading the model, preprocessing the input, and performing inference.

Attributes:

Name	Type	Description
`visual_onnx_session`	`InferenceSession`	ONNX Runtime session for visual inference.
`textual_onnx_session`	`InferenceSession`	ONNX Runtime session for textual inference.
`resolution`	`int`	The resolution of the input image.
`clip_preprocess`	`function`	Function to preprocess the image.

Source code in inference/models/clip/clip_inference_models.py

class InferenceModelsClipAdapter(Model):
    """Adapter exposing a CLIP model loaded via `AutoModel` through the `Model` interface.

    The wrapped model embeds images and text, and `compare` scores a subject
    against one or more prompts with cosine similarity.

    Attributes:
        metrics (dict): Initialised to zero inferences and zero average inference time.
        api_key (str): The key passed to the constructor, or the module-level
            `API_KEY` when none (or a falsy value) is given.
        task_type (str): Always "embedding".
        _model (Union[ClipOnnx, ClipTorch]): The model returned by
            `AutoModel.from_pretrained`.
    """

    def __init__(
        self,
        model_id: str = CLIP_MODEL_ID,
        api_key: str = None,
        **kwargs,
    ):
        """Load the CLIP model.

        Args:
            model_id (str): Model identifier or path handed to `AutoModel.from_pretrained`.
            api_key (str, optional): API key; falls back to `API_KEY` when falsy.
            **kwargs: `countinference` and `service_secret` are read to build the
                extra weights-provider headers; all keyword arguments are also
                forwarded to `AutoModel.from_pretrained`.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}

        self.api_key = api_key if api_key else API_KEY
        self.task_type = "embedding"
        weights_provider_extra_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Offer every valid backend that has not been explicitly disabled.
        backend = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: Union[ClipOnnx, ClipTorch] = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=weights_provider_extra_headers,
            backend=backend,
            **kwargs,
        )

    def compare(
        self,
        subject: Any,
        prompt: Any,
        subject_type: str = "image",
        prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
        **kwargs,
    ) -> Union[List[float], Dict[str, float]]:
        """
        Compares the subject with the prompt to calculate similarity scores.

        Args:
            subject (Any): The subject data to be compared. Can be either an image or text.
            prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values. A dictionary that has both a "type" and a "value" key is treated as a single prompt rather than as a mapping of prompts.
            subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
            prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Only the strings "image" and "text" are accepted by this implementation; any other value raises ValueError. Defaults to "text".
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

        Raises:
            ValueError: If subject_type or prompt_type is neither "image" nor "text".
            ValueError: If the number of prompts exceeds the maximum batch size.
        """

        if subject_type == "image":
            subject_embeddings = self.embed_image(subject)
        elif subject_type == "text":
            subject_embeddings = self.embed_text(subject)
        else:
            # Interpolate the offending value (the message used to contain a
            # literal, never-formatted "{request.subject_type}" placeholder).
            raise ValueError(
                f"subject_type must be either 'image' or 'text', but got {subject_type!r}"
            )

        # A dict carrying both "type" and "value" is one prompt object, not a
        # mapping of named prompts.
        prompt_keys = None
        if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
            prompt_keys = list(prompt)
            prompt = [prompt[k] for k in prompt_keys]
        elif not isinstance(prompt, list):
            prompt = [prompt]

        if len(prompt) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
            )

        if prompt_type == "image":
            prompt_embeddings = self.embed_image(prompt)
        elif prompt_type == "text":
            prompt_embeddings = self.embed_text(prompt)
        else:
            raise ValueError(
                f"prompt_type must be either 'image' or 'text', but got {prompt_type!r}"
            )

        similarities = [
            cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
        ]

        # Re-attach the caller's keys when the prompts arrived as a mapping.
        if prompt_keys is not None:
            similarities = dict(zip(prompt_keys, similarities))

        return similarities

    def make_compare_response(
        self, similarities: Union[List[float], Dict[str, float]]
    ) -> ClipCompareResponse:
        """
        Creates a ClipCompareResponse object from the provided similarity data.

        Args:
            similarities (Union[List[float], Dict[str, float]]): A list or dictionary containing similarity scores.

        Returns:
            ClipCompareResponse: An instance of the ClipCompareResponse with the given similarity scores.

        Example:
            Assuming `ClipCompareResponse` expects a dictionary of string-float pairs:

            >>> make_compare_response({"image1": 0.98, "image2": 0.76})
            ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})
        """
        response = ClipCompareResponse(similarity=similarities)
        return response

    def embed_image(
        self,
        image: Any,
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds an image or a list of images using the Clip model.

        Args:
            image (Any): The image or list of images to be embedded. Image can be in any format that is acceptable by the preproc_image method.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            np.ndarray: The embeddings of the image(s) as a numpy array.

        Raises:
            ValueError: If the number of images in the list exceeds the maximum batch size.
        """
        if isinstance(image, list):
            if len(image) > CLIP_MAX_BATCH_SIZE:
                raise ValueError(
                    f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
                )
            imgs = [self.preproc_image(i) for i in image]
            # NOTE(review): the preprocessed arrays are joined along axis 0;
            # confirm this is the batch layout `embed_images` expects.
            img_in = np.concatenate(imgs, axis=0)
        else:
            img_in = self.preproc_image(image)
        embeddings = self._model.embed_images(images=img_in)
        return embeddings.cpu().numpy()

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        """Embed already-preprocessed image data.

        Args:
            img_in (np.ndarray): Image data passed straight to the model's `embed_images`.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Tuple[np.ndarray]: A one-element tuple holding the embeddings as a numpy array.
        """
        embeddings = self._model.embed_images(images=img_in)
        return (embeddings.cpu().numpy(),)

    def make_embed_image_response(
        self, embeddings: np.ndarray
    ) -> ClipEmbeddingResponse:
        """
        Converts the given embeddings into a ClipEmbeddingResponse object.

        Args:
            embeddings (np.ndarray): A numpy array containing the embeddings for an image or images.

        Returns:
            ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

        Example:
            >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
            >>> make_embed_image_response(embeddings_array)
            ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
        """
        response = ClipEmbeddingResponse(embeddings=embeddings.tolist())

        return response

    def embed_text(
        self,
        text: Union[str, List[str]],
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds a text or a list of texts using the Clip model.

        Args:
            text (Union[str, List[str]]): The text string or list of text strings to be embedded.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            np.ndarray: The embeddings of the text or texts as a numpy array.

        Notes:
            This method performs no batch-size check of its own; a single string is wrapped in a one-element list before being passed to the model.
        """
        if isinstance(text, list):
            texts = text
        else:
            texts = [text]
        embeddings = self._model.embed_text(texts=texts)
        return embeddings.cpu().numpy()

    def make_embed_text_response(self, embeddings: np.ndarray) -> ClipEmbeddingResponse:
        """
        Converts the given text embeddings into a ClipEmbeddingResponse object.

        Args:
            embeddings (np.ndarray): A numpy array containing the embeddings for a text or texts.

        Returns:
            ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

        Example:
            >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
            >>> make_embed_text_response(embeddings_array)
            ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
        """
        response = ClipEmbeddingResponse(embeddings=embeddings.tolist())
        return response

    def infer_from_request(
        self, request: ClipInferenceRequest
    ) -> ClipEmbeddingResponse:
        """Routes the request to the appropriate inference function.

        Args:
            request (ClipInferenceRequest): The request object containing the inference details.

        Returns:
            ClipEmbeddingResponse: The response object containing the embeddings.
            For a `ClipCompareRequest` the object returned is the one built by
            `make_compare_response`. In every case its `time` attribute is set
            to the elapsed wall time of this call.

        Raises:
            ValueError: If the request is not one of the supported request types.
        """
        t1 = perf_counter()
        if isinstance(request, ClipImageEmbeddingRequest):
            infer_func = self.embed_image
            make_response_func = self.make_embed_image_response
        elif isinstance(request, ClipTextEmbeddingRequest):
            infer_func = self.embed_text
            make_response_func = self.make_embed_text_response
        elif isinstance(request, ClipCompareRequest):
            infer_func = self.compare
            make_response_func = self.make_compare_response
        else:
            raise ValueError(
                f"Request type {type(request)} is not a valid ClipInferenceRequest"
            )
        # Request fields are passed as keyword arguments; fields the target
        # does not name land in its **kwargs.
        data = infer_func(**request.dict())
        response = make_response_func(data)
        response.time = perf_counter() - t1
        return response

    def make_response(self, embeddings, *args, **kwargs) -> InferenceResponse:
        """Wrap image embeddings in a one-element list of embedding responses."""
        return [self.make_embed_image_response(embeddings)]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Turn the first element of `predictions` into a one-element response list.

        `preprocess_return_metadata` is accepted but not used.
        """
        return [self.make_embed_image_response(predictions[0])]

    def infer(self, image: Any, **kwargs) -> Any:
        """Embeds an image
        - image:
            can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        """
        return super().infer(image, **kwargs)

    def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
        """Preprocesses an inference request image.

        Args:
            image (InferenceRequestImage): The object containing information necessary to load the image for inference.

        Returns:
            np.ndarray: The result of `load_image_bgr(image)`.
        """
        return load_image_bgr(image)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Load the image and pair it with an empty `PreprocessReturnMetadata`."""
        return self.preproc_image(image), PreprocessReturnMetadata({})

Methods:¶

compare ¶

compare(
    subject,
    prompt,
    subject_type="image",
    prompt_type="text",
    **kwargs
)

Compares the subject with the prompt to calculate similarity scores.

Parameters:

Name	Type	Description	Default
`subject`	`Any`	The subject data to be compared. Can be either an image or text.	required
`prompt`	`Any`	The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.	required
`subject_type`	`str`	Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".	`'image'`
`prompt_type`	`Union[str, List[str], Dict[str, Any]]`	Specifies the type of the prompt data. Can be "image", "text", list of these types, or a dictionary containing these types. Defaults to "text".	`'text'`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`Union[List[float], Dict[str, float]]`	A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

Raises:

Type	Description
`ValueError`	If subject_type or prompt_type is neither "image" nor "text".
`ValueError`	If the number of prompts exceeds the maximum batch size.

Source code in inference/models/clip/clip_inference_models.py

def compare(
    self,
    subject: Any,
    prompt: Any,
    subject_type: str = "image",
    prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
    **kwargs,
) -> Union[List[float], Dict[str, float]]:
    """
    Compares the subject with the prompt to calculate similarity scores.

    Args:
        subject (Any): The subject data to be compared. Can be either an image or text.
        prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values. A dictionary that has both a "type" and a "value" key is treated as a single prompt rather than as a mapping of prompts.
        subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
        prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Only the strings "image" and "text" are accepted by this implementation; any other value raises ValueError. Defaults to "text".
        **kwargs: Additional keyword arguments (unused).

    Returns:
        Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

    Raises:
        ValueError: If subject_type or prompt_type is neither "image" nor "text".
        ValueError: If the number of prompts exceeds the maximum batch size.
    """

    if subject_type == "image":
        subject_embeddings = self.embed_image(subject)
    elif subject_type == "text":
        subject_embeddings = self.embed_text(subject)
    else:
        # Interpolate the offending value (the message used to contain a
        # literal, never-formatted "{request.subject_type}" placeholder).
        raise ValueError(
            f"subject_type must be either 'image' or 'text', but got {subject_type!r}"
        )

    # A dict carrying both "type" and "value" is one prompt object, not a
    # mapping of named prompts.
    prompt_keys = None
    if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
        prompt_keys = list(prompt)
        prompt = [prompt[k] for k in prompt_keys]
    elif not isinstance(prompt, list):
        prompt = [prompt]

    if len(prompt) > CLIP_MAX_BATCH_SIZE:
        raise ValueError(
            f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
        )

    if prompt_type == "image":
        prompt_embeddings = self.embed_image(prompt)
    elif prompt_type == "text":
        prompt_embeddings = self.embed_text(prompt)
    else:
        raise ValueError(
            f"prompt_type must be either 'image' or 'text', but got {prompt_type!r}"
        )

    similarities = [
        cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
    ]

    # Re-attach the caller's keys when the prompts arrived as a mapping.
    if prompt_keys is not None:
        similarities = dict(zip(prompt_keys, similarities))

    return similarities

embed_image ¶

embed_image(image, **kwargs)

Embeds an image or a list of images using the Clip model.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image or list of images to be embedded. Image can be in any format that is acceptable by the preproc_image method.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the image(s) as a numpy array.

Raises:

Type	Description
`ValueError`	If the number of images in the list exceeds the maximum batch size.

Notes

The function measures performance using perf_counter and also has support for ONNX session to get embeddings.

Source code in inference/models/clip/clip_inference_models.py

def embed_image(
    self,
    image: Any,
    **kwargs,
) -> np.ndarray:
    """
    Embeds an image or a list of images using the Clip model.

    Args:
        image (Any): The image or list of images to be embedded. Image can be in any format that is acceptable by the preproc_image method.
        **kwargs: Additional keyword arguments (unused).

    Returns:
        np.ndarray: The embeddings of the image(s) as a numpy array.

    Raises:
        ValueError: If the number of images in the list exceeds the maximum batch size.
    """
    if isinstance(image, list):
        if len(image) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
            )
        imgs = [self.preproc_image(i) for i in image]
        # NOTE(review): the preprocessed arrays are joined along axis 0;
        # confirm this is the batch layout `embed_images` expects.
        img_in = np.concatenate(imgs, axis=0)
    else:
        img_in = self.preproc_image(image)
    embeddings = self._model.embed_images(images=img_in)
    return embeddings.cpu().numpy()

embed_text ¶

embed_text(text, **kwargs)

Embeds a text or a list of texts using the Clip model.

Parameters:

Name	Type	Description	Default
`text`	`Union[str, List[str]]`	The text string or list of text strings to be embedded.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the text or texts as a numpy array.

Raises:

Type	Description
`ValueError`	If the number of text strings in the list exceeds the maximum batch size.

Notes

The function utilizes an ONNX session to compute embeddings and measures the embedding time with perf_counter.

Source code in inference/models/clip/clip_inference_models.py

def embed_text(
    self,
    text: Union[str, List[str]],
    **kwargs,
) -> np.ndarray:
    """
    Embed one text or a list of texts with the wrapped Clip model.

    Args:
        text (Union[str, List[str]]): A single string or a list of strings. A non-list value is wrapped in a one-element list before being handed to the model.
        **kwargs: Additional keyword arguments (unused).

    Returns:
        np.ndarray: The model's text embeddings, moved to CPU and converted with `.numpy()`.
    """
    batch = text if isinstance(text, list) else [text]
    return self._model.embed_text(texts=batch).cpu().numpy()

infer ¶

infer(image, **kwargs)

Embeds an image. `image` can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Source code in inference/models/clip/clip_inference_models.py

def infer(self, image: Any, **kwargs) -> Any:
    """Embed an image by delegating to the parent class's `infer`.

    Args:
        image (Any): Can be a BGR numpy array, filepath, InferenceRequestImage,
            PIL Image, byte-string, etc.
        **kwargs: Forwarded unchanged to the parent implementation.

    Returns:
        Any: Whatever the parent class's `infer` returns.
    """
    return super().infer(image, **kwargs)

infer_from_request ¶

infer_from_request(request)

Routes the request to the appropriate inference function.

Parameters:

Name	Type	Description	Default
`request`	`ClipInferenceRequest`	The request object containing the inference details.	required

Returns:

Name	Type	Description
`ClipEmbeddingResponse`	`ClipEmbeddingResponse`	The response object containing the embeddings.

Source code in inference/models/clip/clip_inference_models.py

def infer_from_request(
    self, request: ClipInferenceRequest
) -> ClipEmbeddingResponse:
    """Dispatch a request to the matching inference and response builders.

    The request is matched, in order, against ClipImageEmbeddingRequest,
    ClipTextEmbeddingRequest and ClipCompareRequest. The fields of the request
    (``request.dict()``) are passed as keyword arguments to the selected
    inference method, and the elapsed wall-clock time is stored on
    ``response.time``.

    Args:
        request (ClipInferenceRequest): The request object containing the inference details.

    Returns:
        ClipEmbeddingResponse: The response object produced by the selected
        response builder (for compare requests this is whatever
        ``make_compare_response`` returns).

    Raises:
        ValueError: If the request is not an instance of any supported request type.
    """
    started_at = perf_counter()
    routes = (
        (
            ClipImageEmbeddingRequest,
            self.embed_image,
            self.make_embed_image_response,
        ),
        (
            ClipTextEmbeddingRequest,
            self.embed_text,
            self.make_embed_text_response,
        ),
        (
            ClipCompareRequest,
            self.compare,
            self.make_compare_response,
        ),
    )
    for request_cls, infer_func, make_response_func in routes:
        if isinstance(request, request_cls):
            break
    else:
        # No route matched: the request type is unsupported.
        raise ValueError(
            f"Request type {type(request)} is not a valid ClipInferenceRequest"
        )
    response = make_response_func(infer_func(**request.dict()))
    response.time = perf_counter() - started_at
    return response

make_compare_response ¶

make_compare_response(similarities)

Creates a ClipCompareResponse object from the provided similarity data.

Parameters:

Name	Type	Description	Default
`similarities`	`Union[List[float], Dict[str, float]]`	A list or dictionary containing similarity scores.	required

Returns:

Name	Type	Description
`ClipCompareResponse`	`ClipCompareResponse`	An instance of the ClipCompareResponse with the given similarity scores.

Example

Assuming ClipCompareResponse expects a dictionary of string-float pairs:

>>> make_compare_response({"image1": 0.98, "image2": 0.76})
ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})

Source code in inference/models/clip/clip_inference_models.py

def make_compare_response(
    self, similarities: Union[List[float], Dict[str, float]]
) -> ClipCompareResponse:
    """
    Wrap similarity scores in a ClipCompareResponse.

    Args:
        similarities (Union[List[float], Dict[str, float]]): A list or dictionary containing similarity scores.

    Returns:
        ClipCompareResponse: A response whose ``similarity`` field is the given scores.

    Example:
        >>> make_compare_response({"image1": 0.98, "image2": 0.76})
        ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})
    """
    return ClipCompareResponse(similarity=similarities)

make_embed_image_response ¶

make_embed_image_response(embeddings)

Converts the given embeddings into a ClipEmbeddingResponse object.

Parameters:

Name	Type	Description	Default
`embeddings`	`ndarray`	A numpy array containing the embeddings for an image or images.	required

Returns:

Name	Type	Description
`ClipEmbeddingResponse`	`ClipEmbeddingResponse`	An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

Example

>>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
>>> make_embed_image_response(embeddings_array)
ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])

Source code in inference/models/clip/clip_inference_models.py

def make_embed_image_response(
    self, embeddings: np.ndarray
) -> ClipEmbeddingResponse:
    """
    Build a ClipEmbeddingResponse from image embeddings.

    Args:
        embeddings (np.ndarray): A numpy array containing the embeddings for an image or images.

    Returns:
        ClipEmbeddingResponse: A response holding ``embeddings.tolist()``.

    Example:
        >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
        >>> make_embed_image_response(embeddings_array)
        ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
    """
    embeddings_as_list = embeddings.tolist()
    return ClipEmbeddingResponse(embeddings=embeddings_as_list)

make_embed_text_response ¶

make_embed_text_response(embeddings)

Converts the given text embeddings into a ClipEmbeddingResponse object.

Parameters:

Name	Type	Description	Default
`embeddings`	`ndarray`	A numpy array containing the embeddings for a text or texts.	required

Returns:

Name	Type	Description
`ClipEmbeddingResponse`	`ClipEmbeddingResponse`	An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

Example

>>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
>>> make_embed_text_response(embeddings_array)
ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])

Source code in inference/models/clip/clip_inference_models.py

def make_embed_text_response(self, embeddings: np.ndarray) -> ClipEmbeddingResponse:
    """
    Build a ClipEmbeddingResponse from text embeddings.

    Args:
        embeddings (np.ndarray): A numpy array containing the embeddings for a text or texts.

    Returns:
        ClipEmbeddingResponse: A response holding ``embeddings.tolist()``.

    Example:
        >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
        >>> make_embed_text_response(embeddings_array)
        ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
    """
    embeddings_as_list = embeddings.tolist()
    return ClipEmbeddingResponse(embeddings=embeddings_as_list)

preproc_image ¶

preproc_image(image)

Preprocesses an inference request image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The object containing information necessary to load the image for inference.	required

Returns:

Type	Description
`ndarray`	np.ndarray: A numpy array of the preprocessed image pixel data.

Source code in inference/models/clip/clip_inference_models.py

def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
    """Load an inference request image through ``load_image_bgr``.

    Args:
        image (InferenceRequestImage): The object containing information necessary to load the image for inference.

    Returns:
        np.ndarray: The value returned by ``load_image_bgr`` for this image.
        NOTE(review): presumably pixel data in BGR channel order, judging by
        the helper's name - confirm against ``load_image_bgr``.
    """
    loaded_image = load_image_bgr(image)
    return loaded_image

Functions:¶

inference.models.clip.clip_model ¶

Classes¶

Clip ¶

Bases: OnnxRoboflowCoreModel

Roboflow ONNX ClipModel model.

This class is responsible for handling the ONNX ClipModel model, including loading the model, preprocessing the input, and performing inference.

Attributes:

Name	Type	Description
`visual_onnx_session`	`InferenceSession`	ONNX Runtime session for visual inference.
`textual_onnx_session`	`InferenceSession`	ONNX Runtime session for textual inference.
`resolution`	`int`	The resolution of the input image.
`clip_preprocess`	`function`	Function to preprocess the image.

Source code in inference/models/clip/clip_model.py

class Clip(OnnxRoboflowCoreModel):
    """Roboflow ONNX ClipModel model.

    This class is responsible for handling the ONNX ClipModel model, including
    loading the model, preprocessing the input, and performing inference.

    Attributes:
        visual_onnx_session (onnxruntime.InferenceSession): ONNX Runtime session for visual inference.
        textual_onnx_session (onnxruntime.InferenceSession): ONNX Runtime session for textual inference.
        resolution (int): The resolution of the input image, read from the visual session's first input shape.
        clip_preprocess (function): Function to preprocess the image.
        task_type (str): Set to "embedding" at the end of initialization.
    """

    def __init__(
        self,
        *args,
        model_id: str = CLIP_MODEL_ID,
        onnxruntime_execution_providers: List[
            str
        ] = get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS),
        **kwargs,
    ):
        """Initializes the Clip with the given arguments and keyword arguments.

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            model_id (str): Model identifier forwarded to the parent initializer. Defaults to CLIP_MODEL_ID.
            onnxruntime_execution_providers (List[str]): Execution providers handed to both ONNX Runtime sessions.
            **kwargs: Keyword arguments forwarded to the parent initializer.

        Raises:
            OnnxProviderNotAvailable: If any provider listed in REQUIRED_ONNX_PROVIDERS is not reported by onnxruntime.
        """
        # Stored before the parent initializer runs, matching the original order.
        self.onnxruntime_execution_providers = onnxruntime_execution_providers
        t1 = perf_counter()
        super().__init__(*args, model_id=model_id, **kwargs)
        # Create an ONNX Runtime Session with a list of execution providers in priority order.
        # ORT attempts to load providers until one is successful. This keeps the code across
        # devices identical.
        self.log("Creating inference sessions")
        self.visual_onnx_session = onnxruntime.InferenceSession(
            self.cache_file("visual.onnx"),
            providers=self.onnxruntime_execution_providers,
        )
        # Each session gets its own lock; every run() call below is made while holding it.
        self._visual_session_lock = Lock()
        self.textual_onnx_session = onnxruntime.InferenceSession(
            self.cache_file("textual.onnx"),
            providers=self.onnxruntime_execution_providers,
        )
        self._textual_session_lock = Lock()
        if REQUIRED_ONNX_PROVIDERS:
            available_providers = onnxruntime.get_available_providers()
            for provider in REQUIRED_ONNX_PROVIDERS:
                if provider not in available_providers:
                    raise OnnxProviderNotAvailable(
                        f"Required ONNX Execution Provider {provider} is not availble. Check that you are using the correct docker image on a supported device."
                    )

        # The third dimension of the first visual input is used as the image resolution.
        self.resolution = self.visual_onnx_session.get_inputs()[0].shape[2]

        self.clip_preprocess = clip.clip._transform(self.resolution)
        self.log(f"CLIP model loaded in {perf_counter() - t1:.2f} seconds")
        self.task_type = "embedding"

    def compare(
        self,
        subject: Any,
        prompt: Any,
        subject_type: str = "image",
        prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
        **kwargs,
    ) -> Union[List[float], Dict[str, float]]:
        """
        Compares the subject with the prompt to calculate similarity scores.

        Args:
            subject (Any): The subject data to be compared. Can be either an image or text.
            prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values. A dictionary that contains both a "type" and a "value" key is treated as a single prompt value rather than as a mapping of prompts.
            subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
            prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Only "image" and "text" are accepted by this implementation. Defaults to "text".
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

        Raises:
            ValueError: If subject_type or prompt_type is neither "image" nor "text".
            ValueError: If the number of prompts exceeds the maximum batch size.
        """
        if subject_type == "image":
            subject_embeddings = self.embed_image(subject)
        elif subject_type == "text":
            subject_embeddings = self.embed_text(subject)
        else:
            # f-string so the offending value actually appears in the message.
            raise ValueError(
                f"subject_type must be either 'image' or 'text', but got {subject_type}"
            )

        if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
            # Mapping of name -> prompt: remember the keys so scores can be re-keyed.
            prompt_keys = list(prompt.keys())
            prompt = [prompt[k] for k in prompt_keys]
        else:
            prompt_keys = None
            if not isinstance(prompt, list):
                prompt = [prompt]

        if len(prompt) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
            )

        if prompt_type == "image":
            prompt_embeddings = self.embed_image(prompt)
        elif prompt_type == "text":
            prompt_embeddings = self.embed_text(prompt)
        else:
            raise ValueError(
                f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
            )

        similarities = [
            cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
        ]

        if prompt_keys is not None:
            similarities = dict(zip(prompt_keys, similarities))

        return similarities

    def make_compare_response(
        self, similarities: Union[List[float], Dict[str, float]]
    ) -> ClipCompareResponse:
        """
        Creates a ClipCompareResponse object from the provided similarity data.

        Args:
            similarities (Union[List[float], Dict[str, float]]): A list or dictionary containing similarity scores.

        Returns:
            ClipCompareResponse: An instance of the ClipCompareResponse with the given similarity scores.

        Example:
            Assuming `ClipCompareResponse` expects a dictionary of string-float pairs:

            >>> make_compare_response({"image1": 0.98, "image2": 0.76})
            ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})
        """
        response = ClipCompareResponse(similarity=similarities)
        return response

    def embed_image(
        self,
        image: Any,
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds an image or a list of images using the Clip model.

        Args:
            image (Any): The image or list of images to be embedded. Image can be in any format that is acceptable by the preproc_image method.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            np.ndarray: The embeddings of the image(s) as a numpy array (first output of the visual ONNX session).

        Raises:
            ValueError: If the number of images in the list exceeds the maximum batch size.
        """
        if isinstance(image, list):
            if len(image) > CLIP_MAX_BATCH_SIZE:
                raise ValueError(
                    f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
                )
            # Each preprocessed image carries a leading axis; join them along it.
            imgs = [self.preproc_image(i) for i in image]
            img_in = np.concatenate(imgs, axis=0)
        else:
            img_in = self.preproc_image(image)

        onnx_input_image = {self.visual_onnx_session.get_inputs()[0].name: img_in}
        with self._visual_session_lock:
            return self.visual_onnx_session.run(None, onnx_input_image)[0]

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        """Run the visual ONNX session on an already preprocessed input.

        Args:
            img_in (np.ndarray): Preprocessed input fed to the session's first input.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Tuple[np.ndarray]: One-element tuple holding the session's first output.
        """
        onnx_input_image = {self.visual_onnx_session.get_inputs()[0].name: img_in}
        with self._visual_session_lock:
            embeddings = self.visual_onnx_session.run(None, onnx_input_image)[0]
        return (embeddings,)

    def make_embed_image_response(
        self, embeddings: np.ndarray
    ) -> ClipEmbeddingResponse:
        """
        Converts the given embeddings into a ClipEmbeddingResponse object.

        Args:
            embeddings (np.ndarray): A numpy array containing the embeddings for an image or images.

        Returns:
            ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

        Example:
            >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
            >>> make_embed_image_response(embeddings_array)
            ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
        """
        response = ClipEmbeddingResponse(embeddings=embeddings.tolist())

        return response

    def embed_text(
        self,
        text: Union[str, List[str]],
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds a text or a list of texts using the Clip model.

        The texts are passed through ``create_batches`` with
        ``batch_size=CLIP_MAX_BATCH_SIZE``; each batch is tokenized, run through
        the textual ONNX session, and the per-batch outputs are concatenated.

        Args:
            text (Union[str, List[str]]): The text string or list of text strings to be embedded.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            np.ndarray: The embeddings of the text or texts as a numpy array.
        """
        if isinstance(text, list):
            texts = text
        else:
            texts = [text]
        results = []
        for texts_batch in create_batches(
            sequence=texts, batch_size=CLIP_MAX_BATCH_SIZE
        ):
            # Token ids are cast to int32 before being fed to the session.
            tokenized_batch = clip.tokenize(texts_batch).numpy().astype(np.int32)
            onnx_input_text = {
                self.textual_onnx_session.get_inputs()[0].name: tokenized_batch
            }
            with self._textual_session_lock:
                embeddings = self.textual_onnx_session.run(None, onnx_input_text)[0]
            results.append(embeddings)
        return np.concatenate(results, axis=0)

    def make_embed_text_response(self, embeddings: np.ndarray) -> ClipEmbeddingResponse:
        """
        Converts the given text embeddings into a ClipEmbeddingResponse object.

        Args:
            embeddings (np.ndarray): A numpy array containing the embeddings for a text or texts.

        Returns:
            ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

        Example:
            >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
            >>> make_embed_text_response(embeddings_array)
            ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
        """
        response = ClipEmbeddingResponse(embeddings=embeddings.tolist())
        return response

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: The list of file names.
        """
        return ["textual.onnx", "visual.onnx"]

    def infer_from_request(
        self, request: ClipInferenceRequest
    ) -> ClipEmbeddingResponse:
        """Routes the request to the appropriate inference function.

        Args:
            request (ClipInferenceRequest): The request object containing the inference details.

        Returns:
            ClipEmbeddingResponse: The response object built by the matching response builder, with ``time`` set to the elapsed seconds.

        Raises:
            ValueError: If the request is not one of the supported request types.
        """
        t1 = perf_counter()
        if isinstance(request, ClipImageEmbeddingRequest):
            infer_func = self.embed_image
            make_response_func = self.make_embed_image_response
        elif isinstance(request, ClipTextEmbeddingRequest):
            infer_func = self.embed_text
            make_response_func = self.make_embed_text_response
        elif isinstance(request, ClipCompareRequest):
            infer_func = self.compare
            make_response_func = self.make_compare_response
        else:
            raise ValueError(
                f"Request type {type(request)} is not a valid ClipInferenceRequest"
            )
        # Request fields are passed as keyword arguments to the chosen function.
        data = infer_func(**request.dict())
        response = make_response_func(data)
        response.time = perf_counter() - t1
        return response

    def make_response(self, embeddings, *args, **kwargs) -> InferenceResponse:
        """Wrap image embeddings in a one-element list of ClipEmbeddingResponse."""
        return [self.make_embed_image_response(embeddings)]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Turn the first element of ``predictions`` into a one-element response list.

        Args:
            predictions (Tuple[np.ndarray]): Output of ``predict``; only index 0 is used.
            preprocess_return_metadata (PreprocessReturnMetadata): Unused here.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Any: A list containing a single ClipEmbeddingResponse.
        """
        return [self.make_embed_image_response(predictions[0])]

    def infer(self, image: Any, **kwargs) -> Any:
        """Embeds an image
        - image:
            can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        """
        return super().infer(image, **kwargs)

    def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
        """Preprocesses an inference request image.

        The image is loaded as RGB, wrapped in a PIL image, passed through
        ``self.clip_preprocess``, given a leading axis and cast to float32.

        Args:
            image (InferenceRequestImage): The object containing information necessary to load the image for inference.

        Returns:
            np.ndarray: A numpy array of the preprocessed image pixel data.
        """
        pil_image = Image.fromarray(load_image_rgb(image))
        preprocessed_image = self.clip_preprocess(pil_image)

        img_in = np.expand_dims(preprocessed_image, axis=0)

        return img_in.astype(np.float32)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Preprocess one image and pair it with empty preprocessing metadata."""
        return self.preproc_image(image), PreprocessReturnMetadata({})

Methods:¶

__init__ ¶

__init__(
    *args,
    model_id=CLIP_MODEL_ID,
    onnxruntime_execution_providers=get_onnxruntime_execution_providers(
        ONNXRUNTIME_EXECUTION_PROVIDERS
    ),
    **kwargs
)

Initializes the Clip with the given arguments and keyword arguments.

Source code in inference/models/clip/clip_model.py

def __init__(
    self,
    *args,
    model_id: str = CLIP_MODEL_ID,
    onnxruntime_execution_providers: List[
        str
    ] = get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS),
    **kwargs,
):
    """Initializes the Clip with the given arguments and keyword arguments.

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        model_id (str): Model identifier forwarded to the parent initializer. Defaults to CLIP_MODEL_ID.
        onnxruntime_execution_providers (List[str]): Execution providers handed to both ONNX Runtime sessions.
        **kwargs: Keyword arguments forwarded to the parent initializer.

    Raises:
        OnnxProviderNotAvailable: If any provider listed in REQUIRED_ONNX_PROVIDERS is not reported by onnxruntime.
    """
    # Stored before the parent initializer is called.
    self.onnxruntime_execution_providers = onnxruntime_execution_providers
    # Start of the load timer reported in the final log message.
    t1 = perf_counter()
    super().__init__(*args, model_id=model_id, **kwargs)
    # Create an ONNX Runtime Session with a list of execution providers in priority order.
    # ORT attempts to load providers until one is successful. This keeps the code across
    # devices identical.
    self.log("Creating inference sessions")
    self.visual_onnx_session = onnxruntime.InferenceSession(
        self.cache_file("visual.onnx"),
        providers=self.onnxruntime_execution_providers,
    )
    # One lock per session, created right after the session it accompanies.
    self._visual_session_lock = Lock()
    self.textual_onnx_session = onnxruntime.InferenceSession(
        self.cache_file("textual.onnx"),
        providers=self.onnxruntime_execution_providers,
    )
    self._textual_session_lock = Lock()
    # Note: this check runs after both sessions have already been created.
    if REQUIRED_ONNX_PROVIDERS:
        available_providers = onnxruntime.get_available_providers()
        for provider in REQUIRED_ONNX_PROVIDERS:
            if provider not in available_providers:
                raise OnnxProviderNotAvailable(
                    f"Required ONNX Execution Provider {provider} is not availble. Check that you are using the correct docker image on a supported device."
                )

    # The third dimension of the first visual input is used as the image resolution.
    self.resolution = self.visual_onnx_session.get_inputs()[0].shape[2]

    # Image transform built for that resolution.
    self.clip_preprocess = clip.clip._transform(self.resolution)
    self.log(f"CLIP model loaded in {perf_counter() - t1:.2f} seconds")
    self.task_type = "embedding"

compare ¶

compare(
    subject,
    prompt,
    subject_type="image",
    prompt_type="text",
    **kwargs
)

Compares the subject with the prompt to calculate similarity scores.

Parameters:

Name	Type	Description	Default
`subject`	`Any`	The subject data to be compared. Can be either an image or text.	required
`prompt`	`Any`	The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.	required
`subject_type`	`str`	Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".	`'image'`
`prompt_type`	`Union[str, List[str], Dict[str, Any]]`	Specifies the type of the prompt data. Can be "image", "text", list of these types, or a dictionary containing these types. Defaults to "text".	`'text'`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`Union[List[float], Dict[str, float]]`	Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

Raises:

Type	Description
`ValueError`	If subject_type or prompt_type is neither "image" nor "text".
`ValueError`	If the number of prompts exceeds the maximum batch size.

Source code in inference/models/clip/clip_model.py

def compare(
    self,
    subject: Any,
    prompt: Any,
    subject_type: str = "image",
    prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
    **kwargs,
) -> Union[List[float], Dict[str, float]]:
    """
    Compares the subject with the prompt to calculate similarity scores.

    Args:
        subject (Any): The subject data to be compared. Can be either an image or text.
        prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values. A dictionary that contains both a "type" and a "value" key is treated as a single prompt value rather than as a mapping of prompts.
        subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
        prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Only "image" and "text" are accepted by this implementation. Defaults to "text".
        **kwargs: Additional keyword arguments (unused).

    Returns:
        Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

    Raises:
        ValueError: If subject_type or prompt_type is neither "image" nor "text".
        ValueError: If the number of prompts exceeds the maximum batch size.
    """
    if subject_type == "image":
        subject_embeddings = self.embed_image(subject)
    elif subject_type == "text":
        subject_embeddings = self.embed_text(subject)
    else:
        # f-string so the offending value actually appears in the message.
        raise ValueError(
            f"subject_type must be either 'image' or 'text', but got {subject_type}"
        )

    if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
        # Mapping of name -> prompt: remember the keys so scores can be re-keyed.
        prompt_keys = list(prompt.keys())
        prompt = [prompt[k] for k in prompt_keys]
    else:
        prompt_keys = None
        if not isinstance(prompt, list):
            prompt = [prompt]

    if len(prompt) > CLIP_MAX_BATCH_SIZE:
        raise ValueError(
            f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
        )

    if prompt_type == "image":
        prompt_embeddings = self.embed_image(prompt)
    elif prompt_type == "text":
        prompt_embeddings = self.embed_text(prompt)
    else:
        raise ValueError(
            f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
        )

    similarities = [
        cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
    ]

    if prompt_keys is not None:
        similarities = dict(zip(prompt_keys, similarities))

    return similarities

embed_image ¶

embed_image(image, **kwargs)

Embeds an image or a list of images using the Clip model.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image or list of images to be embedded. Image can be in any format that is acceptable by the preproc_image method.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the image(s) as a numpy array.

Raises:

Type	Description
`ValueError`	If the number of images in the list exceeds the maximum batch size.

Notes

The function measures performance using perf_counter and also has support for ONNX session to get embeddings.

Source code in inference/models/clip/clip_model.py

def embed_image(
    self,
    image: Any,
    **kwargs,
) -> np.ndarray:
    """
    Embeds an image or a list of images using the Clip model.

    Args:
        image (Any): The image or list of images to be embedded. Image can be in any format that is acceptable by the preproc_image method.
        **kwargs: Additional keyword arguments (unused).

    Returns:
        np.ndarray: The embeddings of the image(s) as a numpy array (first output of the visual ONNX session).

    Raises:
        ValueError: If the number of images in the list exceeds the maximum batch size.
    """
    if isinstance(image, list):
        if len(image) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
            )
        # Preprocess each image separately, then join the results along axis 0.
        imgs = [self.preproc_image(i) for i in image]
        img_in = np.concatenate(imgs, axis=0)
    else:
        img_in = self.preproc_image(image)

    onnx_input_image = {self.visual_onnx_session.get_inputs()[0].name: img_in}
    # The session is only run while holding its lock.
    with self._visual_session_lock:
        return self.visual_onnx_session.run(None, onnx_input_image)[0]

embed_text ¶

embed_text(text, **kwargs)

Embeds a text or a list of texts using the Clip model.

Parameters:

Name	Type	Description	Default
`text`	`Union[str, List[str]]`	The text string or list of text strings to be embedded.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the text or texts as a numpy array.

Raises:

Type	Description
`ValueError`	If the number of text strings in the list exceeds the maximum batch size.

Notes

The function utilizes an ONNX session to compute embeddings and measures the embedding time with perf_counter.

Source code in inference/models/clip/clip_model.py

def embed_text(
    self,
    text: Union[str, List[str]],
    **kwargs,
) -> np.ndarray:
    """
    Embed one text or a list of texts with the textual ONNX session.

    A single string is wrapped in a one-element list. The texts are passed
    through ``create_batches`` with ``batch_size=CLIP_MAX_BATCH_SIZE``; each
    batch is tokenized with ``clip.tokenize``, cast to int32, and run through
    the textual session while holding its lock. Per-batch outputs are
    concatenated along axis 0.

    Args:
        text (Union[str, List[str]]): The text string or list of text strings to be embedded.
        **kwargs: Accepted for interface compatibility; not used by this method.

    Returns:
        np.ndarray: The embeddings of the text or texts as a numpy array.
    """
    texts = text if isinstance(text, list) else [text]
    batch_embeddings = []
    for batch in create_batches(sequence=texts, batch_size=CLIP_MAX_BATCH_SIZE):
        token_ids = clip.tokenize(batch).numpy().astype(np.int32)
        input_name = self.textual_onnx_session.get_inputs()[0].name
        with self._textual_session_lock:
            outputs = self.textual_onnx_session.run(None, {input_name: token_ids})
        batch_embeddings.append(outputs[0])
    return np.concatenate(batch_embeddings, axis=0)

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Gets the list of files required for inference.

Returns:

Type	Description
`List[str]`	List[str]: The list of file names.

Source code in inference/models/clip/clip_model.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Return the names of the model files needed for inference.

    Returns:
        List[str]: The textual and visual ONNX file names, in that order.
    """
    required_files = ["textual.onnx", "visual.onnx"]
    return required_files

infer ¶

infer(image, **kwargs)

Embeds an image. `image` can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Source code in inference/models/clip/clip_model.py

def infer(self, image: Any, **kwargs) -> Any:
    """Embed an image by delegating to the parent class's ``infer``.

    Args:
        image (Any): Image to embed; can be a BGR numpy array, filepath,
            InferenceRequestImage, PIL Image, byte-string, etc.
        **kwargs: Forwarded unchanged to the parent implementation.

    Returns:
        Any: Whatever the parent class's ``infer`` returns.
    """
    return super().infer(image, **kwargs)

infer_from_request ¶

infer_from_request(request)

Routes the request to the appropriate inference function.

Parameters:

Name	Type	Description	Default
`request`	`ClipInferenceRequest`	The request object containing the inference details.	required

Returns:

Name	Type	Description
`ClipEmbeddingResponse`	`ClipEmbeddingResponse`	The response object containing the embeddings.

Source code in inference/models/clip/clip_model.py

def infer_from_request(
    self, request: ClipInferenceRequest
) -> ClipEmbeddingResponse:
    """Dispatch a CLIP request to the matching inference routine.

    The request type selects both the function that computes the result and
    the function that wraps it into a response. The request's fields (via
    ``request.dict()``) are passed to the inference function as keyword
    arguments.

    Args:
        request (ClipInferenceRequest): The request object containing the inference details.

    Returns:
        ClipEmbeddingResponse: The response built for the request, with its
            ``time`` attribute set to the elapsed ``perf_counter`` time of
            this call.

    Raises:
        ValueError: If the request is none of the supported request types.
    """
    started_at = perf_counter()

    # Checked in order; the first matching request type wins.
    routes = (
        (ClipImageEmbeddingRequest, "embed_image", "make_embed_image_response"),
        (ClipTextEmbeddingRequest, "embed_text", "make_embed_text_response"),
        (ClipCompareRequest, "compare", "make_compare_response"),
    )
    for request_type, infer_name, response_name in routes:
        if isinstance(request, request_type):
            infer_func = getattr(self, infer_name)
            make_response_func = getattr(self, response_name)
            break
    else:
        raise ValueError(
            f"Request type {type(request)} is not a valid ClipInferenceRequest"
        )

    payload = infer_func(**request.dict())
    response = make_response_func(payload)
    response.time = perf_counter() - started_at
    return response

make_compare_response ¶

make_compare_response(similarities)

Creates a ClipCompareResponse object from the provided similarity data.

Parameters:

Name	Type	Description	Default
`similarities`	`Union[List[float], Dict[str, float]]`	A list or dictionary containing similarity scores.	required

Returns:

Name	Type	Description
`ClipCompareResponse`	`ClipCompareResponse`	An instance of the ClipCompareResponse with the given similarity scores.

Example

Assuming ClipCompareResponse expects a dictionary of string-float pairs:

make_compare_response({"image1": 0.98, "image2": 0.76}) ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})

Source code in inference/models/clip/clip_model.py

def make_compare_response(
    self, similarities: Union[List[float], Dict[str, float]]
) -> ClipCompareResponse:
    """
    Wrap similarity scores in a ClipCompareResponse.

    Args:
        similarities (Union[List[float], Dict[str, float]]): Similarity scores, either as a list or keyed by name.

    Returns:
        ClipCompareResponse: A response whose ``similarity`` field is the given scores, passed through unchanged.

    Example:
        With a dictionary of string-float pairs:

        >>> make_compare_response({"image1": 0.98, "image2": 0.76})
        ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})
    """
    return ClipCompareResponse(similarity=similarities)

make_embed_image_response ¶

make_embed_image_response(embeddings)

Converts the given embeddings into a ClipEmbeddingResponse object.

Parameters:

Name	Type	Description	Default
`embeddings`	`ndarray`	A numpy array containing the embeddings for an image or images.	required

Returns:

Name	Type	Description
`ClipEmbeddingResponse`	`ClipEmbeddingResponse`	An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

Example

embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]]) make_embed_image_response(embeddings_array) ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])

Source code in inference/models/clip/clip_model.py

def make_embed_image_response(
    self, embeddings: np.ndarray
) -> ClipEmbeddingResponse:
    """
    Wrap image embeddings in a ClipEmbeddingResponse.

    Args:
        embeddings (np.ndarray): A numpy array containing the embeddings for an image or images.

    Returns:
        ClipEmbeddingResponse: A response whose ``embeddings`` field is the array converted with ``tolist()``.

    Example:
        >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
        >>> make_embed_image_response(embeddings_array)
        ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
    """
    embeddings_as_list = embeddings.tolist()
    return ClipEmbeddingResponse(embeddings=embeddings_as_list)

make_embed_text_response ¶

make_embed_text_response(embeddings)

Converts the given text embeddings into a ClipEmbeddingResponse object.

Parameters:

Name	Type	Description	Default
`embeddings`	`ndarray`	A numpy array containing the embeddings for a text or texts.	required

Returns:

Name	Type	Description
`ClipEmbeddingResponse`	`ClipEmbeddingResponse`	An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

Example

embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]]) make_embed_text_response(embeddings_array) ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])

Source code in inference/models/clip/clip_model.py

def make_embed_text_response(self, embeddings: np.ndarray) -> ClipEmbeddingResponse:
    """
    Wrap text embeddings in a ClipEmbeddingResponse.

    Args:
        embeddings (np.ndarray): A numpy array containing the embeddings for a text or texts.

    Returns:
        ClipEmbeddingResponse: A response whose ``embeddings`` field is the array converted with ``tolist()``.

    Example:
        >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
        >>> make_embed_text_response(embeddings_array)
        ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
    """
    embeddings_as_list = embeddings.tolist()
    return ClipEmbeddingResponse(embeddings=embeddings_as_list)

preproc_image ¶

preproc_image(image)

Preprocesses an inference request image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The object containing information necessary to load the image for inference.	required

Returns:

Type	Description
`ndarray`	np.ndarray: A numpy array of the preprocessed image pixel data.

Source code in inference/models/clip/clip_model.py

def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
    """Turn a request image into a float32 batch for the CLIP model.

    The image is loaded with ``load_image_rgb``, converted to a PIL image,
    run through ``self.clip_preprocess``, and given a leading axis of
    length one.

    Args:
        image (InferenceRequestImage): The object containing information necessary to load the image for inference.

    Returns:
        np.ndarray: The preprocessed pixel data as ``float32`` with a new
            leading axis.
    """
    rgb_pixels = load_image_rgb(image)
    clip_input = self.clip_preprocess(Image.fromarray(rgb_pixels))
    batched = np.expand_dims(clip_input, axis=0)
    return batched.astype(np.float32)

Functions:¶

`models/deep_lab_v3_plus`¶

inference.models.deep_lab_v3_plus.deep_lab_v3_plus_segmentation ¶

Classes¶

DeepLabV3PlusSemanticSegmentation ¶

Bases: SemanticSegmentationBaseOnnxRoboflowInferenceModel

DeepLabV3Plus Semantic Segmentation ONNX Inference Model.

This class is responsible for performing semantic segmentation using the DeepLabV3Plus model with ONNX runtime.

Attributes:

Name	Type	Description
`weights_file`	`str`	Path to the ONNX weights file.

Methods:

Name	Description
`predict`	Performs inference on the given image using the ONNX session.

Source code in inference/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation.py

class DeepLabV3PlusSemanticSegmentation(
    SemanticSegmentationBaseOnnxRoboflowInferenceModel
):
    """DeepLabV3Plus Semantic Segmentation ONNX Inference Model.

    This class only supplies model-specific configuration (normalisation
    statistics and the weights file name); all inference behaviour comes from
    the base class.

    Attributes:
        preprocess_means (list): Per-channel means used for preprocessing.
        preprocess_stds (list): Per-channel standard deviations used for
            preprocessing.
        weights_file (str): File name of the ONNX weights.
    """

    # Normalisation statistics chosen to match the parameters used at
    # training time.
    # NOTE(review): these look like the usual ImageNet mean/std values and are
    # presumably in RGB order - confirm against the training pipeline.
    preprocess_means = [0.485, 0.456, 0.406]
    preprocess_stds = [0.229, 0.224, 0.225]

    @property
    def weights_file(self) -> str:
        """Gets the weights file for the DeepLabV3Plus model.

        Returns:
            str: The constant file name ``"weights.onnx"``.
        """
        return "weights.onnx"

Attributes¶

weights_file `property` ¶

weights_file

Gets the weights file for the DeepLabV3Plus model.

Returns:

Name	Type	Description
`str`	`str`	Path to the ONNX weights file.

`models/depth_anything_v3/architecture`¶

inference.models.depth_anything_v3.architecture.da3 ¶

Classes¶

DepthAnything3Net ¶

Bases: Module

Depth Anything 3 network for depth estimation. Simplified for single-view depth-only inference.

This network consists of: - Backbone: DinoV2 feature extractor - Head: DualDPT for depth prediction

Returns:

Type	Description
	Dictionary containing:
	depth: Predicted depth map (B, H, W)
	depth_conf: Depth confidence map (B, H, W)

Source code in inference/models/depth_anything_v3/architecture/da3.py

class DepthAnything3Net(nn.Module):
    """
    Depth Anything 3 network, reduced to single-view, depth-only inference.

    Composition:
    - ``backbone``: a DinoV2 feature extractor
    - ``head``: a DualDPT head that turns backbone features into predictions

    ``forward`` returns the dictionary produced by the head.
    """

    PATCH_SIZE = 14

    def __init__(
        self,
        backbone_name: str,
        out_layers: list,
        alt_start: int,
        qknorm_start: int,
        rope_start: int,
        cat_token: bool,
        head_dim_in: int,
        head_output_dim: int,
        head_features: int,
        head_out_channels: list,
    ):
        """
        Build the backbone and the depth head.

        Args:
            backbone_name: DinoV2 backbone variant ("vits" or "vitb")
            out_layers: Layer indices to extract features from
            alt_start: Layer index to start alternating attention
            qknorm_start: Layer index to start QK normalization
            rope_start: Layer index to start RoPE
            cat_token: Whether to concatenate local and global tokens
            head_dim_in: Input dimension for the head
            head_output_dim: Output dimension for the head
            head_features: Feature dimension in the head
            head_out_channels: Output channel dimensions per stage
        """
        super().__init__()

        backbone_config = dict(
            name=backbone_name,
            out_layers=out_layers,
            alt_start=alt_start,
            qknorm_start=qknorm_start,
            rope_start=rope_start,
            cat_token=cat_token,
        )
        self.backbone = DinoV2(**backbone_config)

        head_config = dict(
            dim_in=head_dim_in,
            output_dim=head_output_dim,
            features=head_features,
            out_channels=head_out_channels,
        )
        self.head = DualDPT(**head_config)

        # Only records the preferred device; nothing is moved here.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

    def forward(
        self,
        x: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """
        Run the backbone and then the depth head.

        Args:
            x: Input images (B, N, 3, H, W) where N=1 for single-view

        Returns:
            Dictionary containing depth predictions
        """
        # The backbone returns a pair; only the first item is used.
        feats, _unused = self.backbone(x)
        height = x.shape[-2]
        width = x.shape[-1]

        # The head runs with autocast explicitly disabled.
        with torch.autocast(device_type=x.device.type, enabled=False):
            return self._process_depth_head(feats, height, width)

    def _process_depth_head(
        self, feats: list[torch.Tensor], H: int, W: int
    ) -> Dict[str, torch.Tensor]:
        """Feed backbone features to the head with ``patch_start_idx=0``."""
        head_output = self.head(feats, H, W, patch_start_idx=0)
        return head_output

Methods:¶

`__init__` ¶

__init__(
    backbone_name,
    out_layers,
    alt_start,
    qknorm_start,
    rope_start,
    cat_token,
    head_dim_in,
    head_output_dim,
    head_features,
    head_out_channels,
)

Initialize DepthAnything3Net.

Parameters:

Name	Type	Description	Default
`backbone_name`	`str`	DinoV2 backbone variant ("vits" or "vitb")	required
`out_layers`	`list`	Layer indices to extract features from	required
`alt_start`	`int`	Layer index to start alternating attention	required
`qknorm_start`	`int`	Layer index to start QK normalization	required
`rope_start`	`int`	Layer index to start RoPE	required
`cat_token`	`bool`	Whether to concatenate local and global tokens	required
`head_dim_in`	`int`	Input dimension for the head	required
`head_output_dim`	`int`	Output dimension for the head	required
`head_features`	`int`	Feature dimension in the head	required
`head_out_channels`	`list`	Output channel dimensions per stage	required

Source code in inference/models/depth_anything_v3/architecture/da3.py

def __init__(
    self,
    backbone_name: str,
    out_layers: list,
    alt_start: int,
    qknorm_start: int,
    rope_start: int,
    cat_token: bool,
    head_dim_in: int,
    head_output_dim: int,
    head_features: int,
    head_out_channels: list,
):
    """
    Build the backbone and the depth head.

    Args:
        backbone_name: DinoV2 backbone variant ("vits" or "vitb")
        out_layers: Layer indices to extract features from
        alt_start: Layer index to start alternating attention
        qknorm_start: Layer index to start QK normalization
        rope_start: Layer index to start RoPE
        cat_token: Whether to concatenate local and global tokens
        head_dim_in: Input dimension for the head
        head_output_dim: Output dimension for the head
        head_features: Feature dimension in the head
        head_out_channels: Output channel dimensions per stage
    """
    super().__init__()

    backbone_config = dict(
        name=backbone_name,
        out_layers=out_layers,
        alt_start=alt_start,
        qknorm_start=qknorm_start,
        rope_start=rope_start,
        cat_token=cat_token,
    )
    self.backbone = DinoV2(**backbone_config)

    head_config = dict(
        dim_in=head_dim_in,
        output_dim=head_output_dim,
        features=head_features,
        out_channels=head_out_channels,
    )
    self.head = DualDPT(**head_config)

    # Only records the preferred device; nothing is moved here.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)

forward ¶

forward(x)

Forward pass through the network.

Parameters:

Name	Type	Description	Default
`x`	`Tensor`	Input images (B, N, 3, H, W) where N=1 for single-view	required

Returns:

Type	Description
`Dict[str, Tensor]`	Dictionary containing depth predictions

Source code in inference/models/depth_anything_v3/architecture/da3.py

def forward(
    self,
    x: torch.Tensor,
) -> Dict[str, torch.Tensor]:
    """
    Run the backbone and then the depth head.

    Args:
        x: Input images (B, N, 3, H, W) where N=1 for single-view

    Returns:
        Dictionary containing depth predictions
    """
    # The backbone returns a pair; only the first item is used.
    feats, _unused = self.backbone(x)
    height = x.shape[-2]
    width = x.shape[-1]

    # The head runs with autocast explicitly disabled.
    with torch.autocast(device_type=x.device.type, enabled=False):
        return self._process_depth_head(feats, height, width)

inference.models.depth_anything_v3.architecture.dpt ¶

Classes¶

FeatureFusionBlock ¶

Bases: Module

Top-down fusion block

Source code in inference/models/depth_anything_v3/architecture/dpt.py

class FeatureFusionBlock(nn.Module):
    """Top-down fusion block.

    Optionally adds a residual-refined skip input to the incoming feature
    map, refines the sum, resizes it bilinearly, and applies a 1x1 output
    convolution.
    """

    def __init__(
        self,
        features: int,
        activation: nn.Module,
        deconv: bool = False,
        bn: bool = False,
        expand: bool = False,
        align_corners: bool = True,
        size: Tuple[int, int] = None,
        has_residual: bool = True,
        groups: int = 1,
    ) -> None:
        """Create the fusion block.

        Args:
            features: Channel count of the incoming feature maps.
            activation: Activation module handed to the residual units.
            deconv: Accepted but not used by this implementation.
            bn: Forwarded to the residual units.
            expand: If true, the output convolution halves the channel count.
            align_corners: Passed to the bilinear resize in ``forward``.
            size: Default output size used when ``forward`` gets no ``size``.
            has_residual: Whether to build and use the skip-input residual unit.
            groups: Group count for all convolutions in the block.
        """
        super().__init__()
        self.align_corners = align_corners
        self.size = size
        self.has_residual = has_residual

        # The skip-input unit only exists when the block takes a skip input.
        if has_residual:
            self.resConfUnit1 = ResidualConvUnit(features, activation, bn, groups=groups)
        else:
            self.resConfUnit1 = None
        self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=groups)

        if expand:
            out_features = features // 2
        else:
            out_features = features
        self.out_conv = nn.Conv2d(
            features, out_features, 1, 1, 0, bias=True, groups=groups
        )
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs: torch.Tensor, size: Tuple[int, int] = None) -> torch.Tensor:
        """Fuse the inputs and resize the result.

        Args:
            *xs: ``xs[0]`` is the incoming feature map; ``xs[1]``, if present,
                is the skip input added after passing through the first
                residual unit.
            size: Target spatial size. Falls back to ``self.size``, and to a
                2x upscale when both are ``None``.

        Returns:
            The fused, resized and projected feature map.
        """
        fused = xs[0]
        use_skip = self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None
        if use_skip:
            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))

        fused = self.resConfUnit2(fused)

        # Per-call size wins over the constructor default; no size means 2x.
        target_size = self.size if size is None else size
        if target_size is None:
            up_kwargs = {"scale_factor": 2}
        else:
            up_kwargs = {"size": target_size}

        fused = custom_interpolate(
            fused, **up_kwargs, mode="bilinear", align_corners=self.align_corners
        )
        return self.out_conv(fused)

ResidualConvUnit ¶

Bases: Module

Lightweight residual convolution block for fusion

Source code in inference/models/depth_anything_v3/architecture/dpt.py

class ResidualConvUnit(nn.Module):
    """Lightweight residual convolution block for fusion.

    Computes ``conv2(act(conv1(act(x)))) + x`` with two 3x3 convolutions
    that keep the channel count.
    """

    def __init__(
        self, features: int, activation: nn.Module, bn: bool, groups: int = 1
    ) -> None:
        """Create the unit.

        Args:
            features: Channel count in and out of both convolutions.
            activation: Activation module applied before each convolution.
            bn: Stored on the instance; the norm slots are left as ``None``
                regardless of its value.
            groups: Group count for both convolutions.
        """
        super().__init__()
        self.bn = bn
        self.groups = groups

        conv_options = dict(
            kernel_size=3, stride=1, padding=1, bias=True, groups=groups
        )
        self.conv1 = nn.Conv2d(features, features, **conv_options)
        self.conv2 = nn.Conv2d(features, features, **conv_options)

        # Norm slots are kept but never populated here.
        self.norm1 = None
        self.norm2 = None

        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the two activation/convolution stages and add the input back."""
        hidden = self.conv1(self.activation(x))
        if self.norm1 is not None:
            hidden = self.norm1(hidden)

        hidden = self.conv2(self.activation(hidden))
        if self.norm2 is not None:
            hidden = self.norm2(hidden)

        return self.skip_add.add(hidden, x)

Functions:¶

inference.models.depth_anything_v3.architecture.dualdpt ¶

Classes¶

DualDPT ¶

Bases: Module

Dual-head DPT for dense prediction with an auxiliary head. Simplified for single-view depth estimation - only depth output is used.

Source code in inference/models/depth_anything_v3/architecture/dualdpt.py

class DualDPT(nn.Module):
    """
    Dual-head DPT for dense prediction with an auxiliary head.
    Simplified for single-view depth estimation - only depth output is used.

    The auxiliary ("ray") modules are still constructed so that their
    parameters exist for weight loading, but the inference path
    (``_forward_impl``) does not evaluate them.
    """

    def __init__(
        self,
        dim_in: int,
        *,
        patch_size: int = 14,
        output_dim: int = 2,
        activation: str = "exp",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: Sequence[int] = (256, 512, 1024, 1024),
        pos_embed: bool = True,
        down_ratio: int = 1,
        aux_pyramid_levels: int = 4,
        aux_out1_conv_num: int = 5,
        head_names: Tuple[str, str] = ("depth", "ray"),
    ) -> None:
        """Build the projection, resize, fusion and output layers.

        Args:
            dim_in: Channel dimension of the incoming token features.
            patch_size: Patch edge length; ``H // patch_size`` and
                ``W // patch_size`` give the token grid size.
            output_dim: Channels of the main output conv. The last channel is
                read as confidence, the remaining ones as the prediction.
            activation: Activation name applied to the prediction channels
                (see ``_apply_activation_single``).
            conf_activation: Activation name applied to the confidence channel.
            features: Channel width of the fusion blocks.
            out_channels: Per-stage channel widths after projection (four
                stages are used).
            pos_embed: Whether to add the UV positional embedding.
            down_ratio: Divisor applied to the output resolution.
            aux_pyramid_levels: Number of auxiliary pyramid levels kept.
            aux_out1_conv_num: Number of convs in each auxiliary output block
                (1, 3 or 5).
            head_names: ``(main, aux)`` head names; the main name is used as
                the key prefix of the returned dictionary.
        """
        super().__init__()

        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.down_ratio = down_ratio

        self.aux_levels = aux_pyramid_levels
        self.aux_out1_conv_num = aux_out1_conv_num

        self.head_main, self.head_aux = head_names

        # Which entries of the incoming feature list feed the four stages.
        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)

        self.norm = nn.LayerNorm(dim_in)
        self.projects = nn.ModuleList(
            [
                nn.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0)
                for oc in out_channels
            ]
        )

        # Stage 0 upsamples x4, stage 1 upsamples x2, stage 2 is unchanged and
        # stage 3 downsamples x2, producing a feature pyramid.
        self.resize_layers = nn.ModuleList(
            [
                nn.ConvTranspose2d(
                    out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0
                ),
                nn.ConvTranspose2d(
                    out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0
                ),
                nn.Identity(),
                nn.Conv2d(
                    out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1
                ),
            ]
        )

        self.scratch = _make_scratch(list(out_channels), features, expand=False)

        # Main fusion chain
        self.scratch.refinenet1 = _make_fusion_block(features)
        self.scratch.refinenet2 = _make_fusion_block(features)
        self.scratch.refinenet3 = _make_fusion_block(features)
        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)

        head_features_1 = features
        head_features_2 = 32
        self.scratch.output_conv1 = nn.Conv2d(
            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
        )
        self.scratch.output_conv2 = nn.Sequential(
            nn.Conv2d(
                head_features_1 // 2,
                head_features_2,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(inplace=True),
            nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
        )

        # Auxiliary fusion chain (for ray head - not used for inference but needed for weight loading)
        self.scratch.refinenet1_aux = _make_fusion_block(features)
        self.scratch.refinenet2_aux = _make_fusion_block(features)
        self.scratch.refinenet3_aux = _make_fusion_block(features)
        self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False)

        self.scratch.output_conv1_aux = nn.ModuleList(
            [self._make_aux_out1_block(head_features_1) for _ in range(self.aux_levels)]
        )

        use_ln = True
        ln_seq = (
            [
                Permute((0, 2, 3, 1)),
                nn.LayerNorm(head_features_2),
                Permute((0, 3, 1, 2)),
            ]
            if use_ln
            else []
        )
        self.scratch.output_conv2_aux = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(
                        head_features_1 // 2,
                        head_features_2,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),
                    *ln_seq,
                    nn.ReLU(inplace=True),
                    nn.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0),
                )
                for _ in range(self.aux_levels)
            ]
        )

    def forward(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
        chunk_size: int = 8,
    ) -> Dict[str, torch.Tensor]:
        """Run the head, optionally in chunks along the flattened B*S axis.

        Args:
            feats: Per-layer features; ``feat[0]`` of each entry is a
                ``(B, S, N, C)`` tensor.
            H: Input image height.
            W: Input image width.
            patch_start_idx: Index of the first patch token along the token
                axis; earlier tokens are dropped.
            chunk_size: Maximum number of flattened samples per pass. ``None``
                or a value ``>= S`` processes everything in one pass.

        Returns:
            Dictionary with the main prediction and its confidence, each
            reshaped back to a leading ``(B, S, ...)``.
        """
        B, S, N, C = feats[0][0].shape
        feats = [feat[0].reshape(B * S, N, C) for feat in feats]
        if chunk_size is None or chunk_size >= S:
            out_dict = self._forward_impl(feats, H, W, patch_start_idx)
            out_dict = {k: v.reshape(B, S, *v.shape[1:]) for k, v in out_dict.items()}
            return out_dict
        out_dicts = []
        for s0 in range(0, B * S, chunk_size):
            s1 = min(s0 + chunk_size, B * S)
            out_dict = self._forward_impl(
                [feat[s0:s1] for feat in feats],
                H,
                W,
                patch_start_idx,
            )
            out_dicts.append(out_dict)
        # Stitch the per-chunk outputs back together along the sample axis.
        out_dict = {
            k: torch.cat([chunk_out[k] for chunk_out in out_dicts], dim=0)
            for k in out_dicts[0].keys()
        }
        out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
        return out_dict

    def _forward_impl(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
    ) -> Dict[str, torch.Tensor]:
        """Compute the main prediction and confidence for one chunk.

        Args:
            feats: Per-layer features, each ``(B, N, C)`` (already flattened
                over batch and view).
            H: Input image height.
            W: Input image width.
            patch_start_idx: Index of the first patch token to keep.

        Returns:
            ``{head_main: prediction, f"{head_main}_conf": confidence}``.
        """
        B, _, C = feats[0].shape
        ph, pw = H // self.patch_size, W // self.patch_size
        resized_feats = []
        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
            x = feats[take_idx][:, patch_start_idx:]
            x = self.norm(x)
            # Tokens -> channel-first feature map on the patch grid.
            x = x.permute(0, 2, 1).reshape(B, C, ph, pw)

            x = self.projects[stage_idx](x)
            if self.pos_embed:
                x = self._add_pos_embed(x, W, H)
            x = self.resize_layers[stage_idx](x)
            resized_feats.append(x)

        # Only compute main fusion for depth (skip aux for inference):
        # the auxiliary chain's result is never used here, so it is not run.
        fused_main, _ = self._fuse(resized_feats, compute_aux=False)

        h_out = int(ph * self.patch_size / self.down_ratio)
        w_out = int(pw * self.patch_size / self.down_ratio)

        fused_main = custom_interpolate(
            fused_main, (h_out, w_out), mode="bilinear", align_corners=True
        )
        if self.pos_embed:
            fused_main = self._add_pos_embed(fused_main, W, H)

        main_logits = self.scratch.output_conv2(fused_main)
        # Channel-last so the last channel can be split off as confidence.
        fmap = main_logits.permute(0, 2, 3, 1)
        main_pred = self._apply_activation_single(fmap[..., :-1], self.activation)
        main_conf = self._apply_activation_single(fmap[..., -1], self.conf_activation)

        return {
            self.head_main: main_pred.squeeze(-1),
            f"{self.head_main}_conf": main_conf,
        }

    def _fuse(
        self, feats: List[torch.Tensor], compute_aux: bool = True
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Run the top-down fusion chain(s) over the four pyramid levels.

        Args:
            feats: The four resized feature maps, finest first.
            compute_aux: When true (the default), the auxiliary chain is
                evaluated alongside the main one. When false the auxiliary
                modules are skipped and the returned list is empty; the main
                output is the same either way because the auxiliary chain
                never feeds back into the main chain.

        Returns:
            ``(main, aux_list)``: the main fused map after ``output_conv1`` and
            the per-level auxiliary maps after their output blocks.
        """
        l1, l2, l3, l4 = feats

        l1_rn = self.scratch.layer1_rn(l1)
        l2_rn = self.scratch.layer2_rn(l2)
        l3_rn = self.scratch.layer3_rn(l3)
        l4_rn = self.scratch.layer4_rn(l4)

        aux_list: List[torch.Tensor] = []

        # At every level the main block runs before its auxiliary twin, in the
        # same order as when both chains are evaluated.
        out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
        if compute_aux:
            aux_out = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
            if self.aux_levels >= 4:
                aux_list.append(aux_out)

        out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
        if compute_aux:
            aux_out = self.scratch.refinenet3_aux(aux_out, l3_rn, size=l2_rn.shape[2:])
            if self.aux_levels >= 3:
                aux_list.append(aux_out)

        out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
        if compute_aux:
            aux_out = self.scratch.refinenet2_aux(aux_out, l2_rn, size=l1_rn.shape[2:])
            if self.aux_levels >= 2:
                aux_list.append(aux_out)

        out = self.scratch.refinenet1(out, l1_rn)
        if compute_aux:
            aux_out = self.scratch.refinenet1_aux(aux_out, l1_rn)
            aux_list.append(aux_out)

        out = self.scratch.output_conv1(out)
        aux_list = [
            self.scratch.output_conv1_aux[i](aux) for i, aux in enumerate(aux_list)
        ]

        return out, aux_list

    def _add_pos_embed(
        self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1
    ) -> torch.Tensor:
        """Add a scaled sinusoidal UV positional embedding to ``x``.

        Args:
            x: Channel-first feature map ``(B, C, h, w)``.
            W: Image width, used with ``H`` for the grid aspect ratio.
            H: Image height.
            ratio: Scale applied to the embedding before it is added.

        Returns:
            ``x`` plus the embedding, in ``x``'s dtype.
        """
        pw, ph = x.shape[-1], x.shape[-2]
        pe = create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
        pe = position_grid_to_embed(pe, x.shape[1]) * ratio
        # (h, w, C) -> (1, C, h, w), then broadcast over the batch.
        pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
        return x + pe.to(x.dtype)

    def _make_aux_out1_block(self, in_ch: int) -> nn.Sequential:
        """Build one auxiliary output block of 1, 3 or 5 stacked 3x3 convs.

        The convs alternate between ``in_ch`` and ``in_ch // 2`` channels and
        always end on ``in_ch // 2``.

        Raises:
            ValueError: If ``aux_out1_conv_num`` is not 1, 3 or 5.
        """
        if self.aux_out1_conv_num == 5:
            return nn.Sequential(
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
                nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
                nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
            )
        if self.aux_out1_conv_num == 3:
            return nn.Sequential(
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
                nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
            )
        if self.aux_out1_conv_num == 1:
            return nn.Sequential(nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1))
        raise ValueError(f"aux_out1_conv_num {self.aux_out1_conv_num} not supported")

    def _apply_activation_single(
        self, x: torch.Tensor, activation: str = "linear"
    ) -> torch.Tensor:
        """Apply the activation named by ``activation`` to ``x``.

        Recognised names (case-insensitive): ``exp``, ``expm1``, ``expp1``
        (``exp(x) + 1``), ``relu``, ``sigmoid``, ``softplus``, ``tanh``. Any
        other value, including ``"linear"``, returns ``x`` unchanged.
        """
        act = activation.lower() if isinstance(activation, str) else activation
        if act == "exp":
            return torch.exp(x)
        if act == "expm1":
            return torch.expm1(x)
        if act == "expp1":
            return torch.exp(x) + 1
        if act == "relu":
            return torch.relu(x)
        if act == "sigmoid":
            return torch.sigmoid(x)
        if act == "softplus":
            return torch.nn.functional.softplus(x)
        if act == "tanh":
            return torch.tanh(x)
        return x

Functions:¶

inference.models.depth_anything_v3.architecture.head_utils ¶

Classes¶

Permute ¶

Bases: Module

nn.Module wrapper around Tensor.permute for cleaner nn.Sequential usage.

Source code in inference/models/depth_anything_v3/architecture/head_utils.py

class Permute(nn.Module):
    """Module form of ``Tensor.permute`` so it can sit inside ``nn.Sequential``."""

    # Axis order applied in ``forward``.
    dims: Tuple[int, ...]

    def __init__(self, dims: Tuple[int, ...]) -> None:
        """Store the axis order to apply."""
        super().__init__()
        self.dims = dims

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` with its axes reordered according to ``self.dims``."""
        reordered = x.permute(*self.dims)
        return reordered

Functions:¶

create_uv_grid ¶

create_uv_grid(
    width,
    height,
    aspect_ratio=None,
    dtype=None,
    device=None,
)

Create a normalized UV grid of shape (height, width, 2).

Source code in inference/models/depth_anything_v3/architecture/head_utils.py

def create_uv_grid(
    width: int,
    height: int,
    aspect_ratio: Union[float, None] = None,
    dtype: Union[torch.dtype, None] = None,
    device: Union[torch.device, None] = None,
) -> torch.Tensor:
    """Create a normalized UV grid of shape (height, width, 2).

    The last axis holds ``(u, v)``: ``u`` varies along the width axis and
    ``v`` along the height axis. Coordinates are centred on zero and scaled
    so that the half-extents are proportional to
    ``(aspect_ratio, 1) / sqrt(aspect_ratio**2 + 1)``, shrunk by
    ``(n - 1) / n`` along each axis.

    Args:
        width: Number of grid columns.
        height: Number of grid rows.
        aspect_ratio: Width-to-height ratio used for scaling; defaults to
            ``width / height``.
        dtype: Dtype of the returned tensor (torch default if ``None``).
        device: Device of the returned tensor (torch default if ``None``).

    Returns:
        torch.Tensor: The ``(height, width, 2)`` grid.
    """
    if aspect_ratio is None:
        aspect_ratio = float(width) / float(height)

    diag_factor = (aspect_ratio**2 + 1.0) ** 0.5
    span_x = aspect_ratio / diag_factor
    span_y = 1.0 / diag_factor

    # Symmetric extents around zero along each axis.
    right_x = span_x * (width - 1) / width
    bottom_y = span_y * (height - 1) / height

    x_coords = torch.linspace(-right_x, right_x, steps=width, dtype=dtype, device=device)
    y_coords = torch.linspace(-bottom_y, bottom_y, steps=height, dtype=dtype, device=device)

    # "xy" indexing puts the y (height) axis first in the output shape.
    uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
    return torch.stack((uu, vv), dim=-1)

custom_interpolate ¶

custom_interpolate(
    x,
    size=None,
    scale_factor=None,
    mode="bilinear",
    align_corners=True,
)

Safe interpolation implementation to avoid INT_MAX overflow.

Source code in inference/models/depth_anything_v3/architecture/head_utils.py

def custom_interpolate(
    x: torch.Tensor,
    size: Union[Tuple[int, int], None] = None,
    scale_factor: Union[float, None] = None,
    mode: str = "bilinear",
    align_corners: bool = True,
) -> torch.Tensor:
    """Interpolate ``x``, splitting the batch when the output would be huge.

    If the output element count (``size[0] * size[1] * x.shape[0] *
    x.shape[1]``) exceeds 1610612736 (1.5 * 2**30), ``x`` is chunked along
    dim 0, each chunk is interpolated separately, and the results are
    concatenated.

    Args:
        x: Input tensor; its last two axes are resized.
        size: Target ``(height, width)``. Derived from ``scale_factor`` when
            omitted.
        scale_factor: Multiplier for the last two axes, used only when
            ``size`` is ``None``.
        mode: Interpolation mode passed to ``F.interpolate``.
        align_corners: Passed to ``F.interpolate``.

    Returns:
        torch.Tensor: The resized tensor.
    """
    if size is None:
        assert scale_factor is not None, "Either size or scale_factor must be provided."
        in_height, in_width = x.shape[-2], x.shape[-1]
        size = (int(in_height * scale_factor), int(in_width * scale_factor))

    element_budget = 1610612736
    out_elements = size[0] * size[1] * x.shape[0] * x.shape[1]

    # Common case: small enough for a single call.
    if out_elements <= element_budget:
        return F.interpolate(x, size=size, mode=mode, align_corners=align_corners)

    n_chunks = (out_elements // element_budget) + 1
    pieces = []
    for part in torch.chunk(x, chunks=n_chunks, dim=0):
        pieces.append(
            F.interpolate(part, size=size, mode=mode, align_corners=align_corners)
        )
    return torch.cat(pieces, dim=0).contiguous()

make_sincos_pos_embed ¶

make_sincos_pos_embed(embed_dim, pos, omega_0=100)

Generate 1D positional embedding from a given grid using sine and cosine functions.

Source code in inference/models/depth_anything_v3/architecture/head_utils.py

def make_sincos_pos_embed(
    embed_dim: int, pos: torch.Tensor, omega_0: float = 100
) -> torch.Tensor:
    """Build a 1D sine/cosine positional embedding for the given positions.

    Args:
        embed_dim: Output embedding width; must be even.
        pos: Positions to encode. Flattened to 1D before use.
        omega_0: Base of the frequency schedule; frequency ``k`` is
            ``1 / omega_0 ** (k / (embed_dim / 2))``.

    Returns:
        A float32 tensor of shape ``(pos.numel(), embed_dim)`` whose first half
        of columns holds the sines and whose second half holds the cosines.

    Raises:
        AssertionError: If ``embed_dim`` is odd.
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    exponents = torch.arange(half_dim, dtype=torch.float32, device=pos.device)
    exponents = exponents / (embed_dim / 2.0)
    inv_freq = 1.0 / (omega_0**exponents)

    # Outer product: one row per position, one column per frequency.
    angles = torch.einsum("m,d->md", pos.reshape(-1), inv_freq)

    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).float()

position_grid_to_embed ¶

position_grid_to_embed(pos_grid, embed_dim, omega_0=100)

Convert 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)

Source code in inference/models/depth_anything_v3/architecture/head_utils.py

def position_grid_to_embed(
    pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100
) -> torch.Tensor:
    """
    Turn a 2D position grid (HxWx2) into sinusoidal embeddings (HxWxC).

    Each of the two grid channels is encoded separately with
    ``make_sincos_pos_embed`` using ``embed_dim // 2`` features; the two
    encodings are concatenated along the last axis (channel 0 first).

    Args:
        pos_grid: Tensor of shape ``(H, W, 2)``.
        embed_dim: Total embedding width ``C`` of the result.
        omega_0: Frequency base forwarded to ``make_sincos_pos_embed``.

    Returns:
        Tensor of shape ``(H, W, embed_dim)``.

    Raises:
        AssertionError: If the last dimension of ``pos_grid`` is not 2.
    """
    H, W, grid_dim = pos_grid.shape
    assert grid_dim == 2
    flat_positions = pos_grid.reshape(-1, grid_dim)
    per_axis_dim = embed_dim // 2

    axis_embeddings = [
        make_sincos_pos_embed(per_axis_dim, flat_positions[:, axis], omega_0=omega_0)
        for axis in range(grid_dim)
    ]

    return torch.cat(axis_embeddings, dim=-1).view(H, W, embed_dim)

`models/depth_anything_v3/architecture/layers`¶

inference.models.depth_anything_v3.architecture.layers.drop_path ¶

Classes¶

DropPath ¶

Bases: Module

Drop paths (Stochastic Depth) per sample.

Source code in inference/models/depth_anything_v3/architecture/layers/drop_path.py

class DropPath(nn.Module):
    """Module wrapper for per-sample drop path (Stochastic Depth).

    The actual dropping is delegated to the ``drop_path`` function; this class
    only stores the probability and supplies the module's training flag.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        # Stored as given (may be None) and handed to ``drop_path`` unchanged.
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` to ``x`` using the stored probability and ``self.training``."""
        probability, is_training = self.drop_prob, self.training
        return drop_path(x, probability, is_training)

inference.models.depth_anything_v3.architecture.layers.patch_embed ¶

Classes¶

PatchEmbed ¶

Bases: Module

2D image to patch embedding: (B,C,H,W) -> (B,N,D)

Parameters:

Name	Type	Description	Default
`img_size`	`Union[int, Tuple[int, int]]`	Image size.	`224`
`patch_size`	`Union[int, Tuple[int, int]]`	Patch token size.	`16`
`in_chans`	`int`	Number of input image channels.	`3`
`embed_dim`	`int`	Number of linear projection output channels.	`768`
`norm_layer`	`Optional[Callable]`	Normalization layer.	`None`

Source code in inference/models/depth_anything_v3/architecture/layers/patch_embed.py

class PatchEmbed(nn.Module):
    """
    Project an image batch into patch embeddings: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
        flatten_embedding: When False, ``forward`` reshapes its output to
            (B, H', W', D), where H' and W' are the spatial dimensions of the
            projection output, instead of returning (B, N, D).
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        img_hw = make_2tuple(img_size)
        patch_hw = make_2tuple(patch_size)
        grid_h = img_hw[0] // patch_hw[0]
        grid_w = img_hw[1] // patch_hw[1]

        self.img_size = img_hw
        self.patch_size = patch_hw
        self.patches_resolution = (grid_h, grid_w)
        self.num_patches = grid_h * grid_w

        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.flatten_embedding = flatten_embedding

        # Kernel size equal to stride: each patch is projected independently.
        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_hw, stride=patch_hw
        )
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Embed a (B, C, H, W) batch; H and W must be multiples of the patch size."""
        _, _, in_h, in_w = x.shape
        patch_h, patch_w = self.patch_size

        assert (
            in_h % patch_h == 0
        ), f"Input image height {in_h} is not a multiple of patch height {patch_h}"
        assert (
            in_w % patch_w == 0
        ), f"Input image width {in_w} is not a multiple of patch width: {patch_w}"

        projected = self.proj(x)  # B C H W
        out_h, out_w = projected.size(2), projected.size(3)
        tokens = projected.flatten(2).transpose(1, 2)  # B HW C
        tokens = self.norm(tokens)
        if self.flatten_embedding:
            return tokens
        return tokens.reshape(-1, out_h, out_w, self.embed_dim)  # B H W C

inference.models.depth_anything_v3.architecture.layers.rope ¶

Classes¶

PositionGetter ¶

Generates and caches 2D spatial positions for patches in a grid.

Source code in inference/models/depth_anything_v3/architecture/layers/rope.py

class PositionGetter:
    """Generates and caches 2D spatial positions for patches in a grid.

    Positions are the cartesian product of ``arange(height)`` and
    ``arange(width)``, i.e. one ``(y, x)`` pair per grid cell, with ``y``
    varying slowest. The grid for each ``(height, width)`` is built once and
    kept in ``position_cache``.
    """

    def __init__(self):
        self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}

    def __call__(
        self, batch_size: int, height: int, width: int, device: torch.device
    ) -> torch.Tensor:
        """Return the position grid repeated for every batch element.

        Args:
            batch_size: Number of copies along the leading dimension.
            height: Number of grid rows.
            width: Number of grid columns.
            device: Device the returned tensor must live on.

        Returns:
            A freshly allocated tensor of shape
            ``(batch_size, height * width, 2)`` on ``device``.
        """
        grid_key = (height, width)
        if grid_key not in self.position_cache:
            y_coords = torch.arange(height, device=device)
            x_coords = torch.arange(width, device=device)
            self.position_cache[grid_key] = torch.cartesian_prod(y_coords, x_coords)

        cached_positions = self.position_cache[grid_key]
        # The cache key does not include the device, so the stored grid may
        # have been built for another device; move it when needed (``.to`` is
        # a no-op when the tensor is already there).
        if device is not None:
            cached_positions = cached_positions.to(device)

        return (
            cached_positions.view(1, height * width, 2)
            .expand(batch_size, -1, -1)
            .clone()
        )

RotaryPositionEmbedding2D ¶

Bases: Module

2D Rotary Position Embedding implementation.

Source code in inference/models/depth_anything_v3/architecture/layers/rope.py

class RotaryPositionEmbedding2D(nn.Module):
    """2D Rotary Position Embedding implementation.

    The feature dimension of the tokens is split in two halves: the first is
    rotated according to the first position coordinate and the second
    according to the second coordinate. Cos/sin lookup tables are cached per
    ``(dim, seq_len, device, dtype)``.
    """

    def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0):
        super().__init__()
        self.base_frequency = frequency
        # Stored only; none of the methods in this class read it.
        self.scaling_factor = scaling_factor
        self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {}

    def _compute_frequency_components(
        self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return cached ``(cos, sin)`` tables of shape ``(seq_len, dim)``, building them on a miss."""
        cache_key = (dim, seq_len, device, dtype)
        cached = self.frequency_cache.get(cache_key)
        if cached is not None:
            return cached

        exponents = torch.arange(0, dim, 2, device=device).float() / dim
        inv_freq = 1.0 / (self.base_frequency**exponents)
        positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)

        angles = torch.einsum("i,j->ij", positions, inv_freq).to(dtype)
        # Duplicate the angles so both halves of a feature vector share them.
        angles = torch.cat((angles, angles), dim=-1)

        tables = (angles.cos().to(dtype), angles.sin().to(dtype))
        self.frequency_cache[cache_key] = tables
        return tables

    @staticmethod
    def _rotate_features(x: torch.Tensor) -> torch.Tensor:
        """Map the halves ``(a, b)`` of the last dimension to ``(-b, a)``."""
        half = x.shape[-1] // 2
        first_half = x[..., :half]
        second_half = x[..., half:]
        return torch.cat((-second_half, first_half), dim=-1)

    def _apply_1d_rope(
        self,
        tokens: torch.Tensor,
        positions: torch.Tensor,
        cos_comp: torch.Tensor,
        sin_comp: torch.Tensor,
    ) -> torch.Tensor:
        """Rotate ``tokens`` by the table rows selected by ``positions``."""
        # Look up one table row per position, then add an axis at dim 1 so the
        # result broadcasts against that dimension of ``tokens``.
        cos = F.embedding(positions, cos_comp)[:, None, :, :]
        sin = F.embedding(positions, sin_comp)[:, None, :, :]
        rotated = self._rotate_features(tokens)
        return (tokens * cos) + (rotated * sin)

    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
        """Apply the 2D rotary embedding.

        Args:
            tokens: Tensor whose last dimension is even.
            positions: Integer tensor of shape ``(batch_size, n_tokens, 2)``.

        Returns:
            Tensor with the same shape as ``tokens``.

        Raises:
            AssertionError: If the feature dimension is odd or ``positions``
                does not have the documented shape.
        """
        assert tokens.size(-1) % 2 == 0, "Feature dimension must be even"
        assert (
            positions.ndim == 3 and positions.shape[-1] == 2
        ), "Positions must have shape (batch_size, n_tokens, 2)"

        per_axis_dim = tokens.size(-1) // 2
        table_len = int(positions.max()) + 1
        cos_comp, sin_comp = self._compute_frequency_components(
            per_axis_dim, table_len, tokens.device, tokens.dtype
        )

        first_half, second_half = tokens.chunk(2, dim=-1)
        rotated_halves = (
            self._apply_1d_rope(first_half, positions[..., 0], cos_comp, sin_comp),
            self._apply_1d_rope(second_half, positions[..., 1], cos_comp, sin_comp),
        )
        return torch.cat(rotated_halves, dim=-1)

`models/depth_anything_v3`¶

inference.models.depth_anything_v3.depth_anything_v3 ¶

Classes¶

DepthAnythingV3 ¶

Bases: DepthAnythingV2

Depth Anything V3 model for monocular depth estimation.

This model uses the Depth Anything V3 architecture with DinoV2 backbone and DualDPT head for dense depth prediction.

Note: Unlike V2, V3 is not HuggingFace Transformers compatible, so the architecture is vendored in and model loading is custom. However, the external interface (inputs/outputs) matches V2.

Source code in inference/models/depth_anything_v3/depth_anything_v3.py

class DepthAnythingV3(DepthAnythingV2):
    """
    Depth Anything V3 model for monocular depth estimation.

    This model uses the Depth Anything V3 architecture with DinoV2 backbone
    and DualDPT head for dense depth prediction.

    Note: Unlike V2, V3 is not HuggingFace Transformers compatible, so the
    architecture is vendored in and model loading is custom. However, the
    external interface (inputs/outputs) matches V2.
    """

    endpoint = "depth-anything-v3/small"

    def __init__(self, *args, **kwargs):
        """Run the parent initializer, then apply device-specific adjustments.

        Any exception raised by the parent initializer is printed and
        re-raised unchanged.
        """

        try:
            super().__init__(*args, **kwargs)
        except Exception as e:
            print(f"Error initializing depth estimation model: {str(e)}")
            raise

        # Set appropriate dtype based on device
        if self.device.type == "mps":
            self.model = self.model.to(torch.float32)  # MPS prefers float32
        elif self.device.type == "cpu":
            warnings.warn(
                "Running DepthAnythingV3 on CPU. This may be very slow. Consider using GPU or MPS if available."
            )

    def initialize_model(self, **kwargs):
        """Initialize the model with vendored architecture instead of HF Transformers.

        Sets ``self.device``, ``self.dtype``, ``self.config``, ``self.model``
        and ``self.processor``.
        """
        # Determine device
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")
            warnings.warn(
                "Running DepthAnythingV3 on CPU. This may be slow. "
                "Consider using GPU or MPS if available."
            )

        # Determine dtype
        if self.device.type == "cuda":
            self.dtype = (
                torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
            )
        elif self.device.type == "mps":
            self.dtype = torch.float32  # MPS works better with float32
        else:
            self.dtype = torch.float32

        # Load configuration from config.json
        config_path = self._get_config_path()
        self.config = parse_config(config_path)

        # Build model with vendored architecture
        self.model = DepthAnything3Net(**self.config)

        # Load weights
        self._load_weights()

        # Move model to device and set eval mode
        self.model = self.model.to(self.device, dtype=self.dtype)
        self.model.eval()

        # Load processor from cache dir (uses preprocessor_config.json)
        self.processor = AutoImageProcessor.from_pretrained(self.cache_dir)

    def _load_weights(self):
        """Load pretrained weights from the model cache into ``self.model``.

        Keys are remapped with ``convert_state_dict`` and loaded non-strictly;
        missing or unexpected keys that do not belong to the known unused
        sub-modules are reported through ``warnings.warn``.
        """
        weights_path = self._get_model_weights_path()

        if weights_path.endswith(".safetensors"):
            state_dict = load_safetensors(weights_path)
        else:
            # NOTE(review): torch.load unpickles the file, so it must only be
            # used on trusted checkpoints. This branch is currently not
            # reached because _get_model_weights_path only ever returns the
            # "model.safetensors" path.
            state_dict = torch.load(weights_path, map_location="cpu")

        # Convert state dict format
        state_dict = convert_state_dict(state_dict)

        # Load weights (strict=False to handle missing aux weights)
        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)

        # Filter out expected missing keys:
        # - cam_enc, cam_dec: Camera encoder/decoder (not used for depth-only)
        # - gs_head, gs_adapter: Gaussian splatting head (not used)
        # - output_conv2_aux: Auxiliary ray prediction heads (not used for depth-only)
        expected_missing = [
            "cam_enc",
            "cam_dec",
            "gs_head",
            "gs_adapter",
            "output_conv2_aux",
        ]
        unexpected_filtered = [
            k for k in unexpected if not any(skip in k for skip in expected_missing)
        ]
        missing_filtered = [
            k for k in missing if not any(skip in k for skip in expected_missing)
        ]

        if missing_filtered:
            warnings.warn(f"Missing keys when loading weights: {missing_filtered}")
        if unexpected_filtered:
            warnings.warn(
                f"Unexpected keys when loading weights: {unexpected_filtered}"
            )

    def _get_config_path(self) -> str:
        """Get path to model config file.

        Raises:
            FileNotFoundError: If ``config.json`` is not in the cache directory.
        """
        cache_dir = Path(self.cache_dir)
        config_file = cache_dir / "config.json"
        if config_file.exists():
            return str(config_file)
        raise FileNotFoundError(
            f"Could not find config.json in {cache_dir}. "
            f"Expected config.json to be downloaded alongside model weights."
        )

    def _get_model_weights_path(self) -> str:
        """Get path to model weights file.

        Raises:
            FileNotFoundError: If ``model.safetensors`` is not in the cache directory.
        """
        cache_dir = Path(self.cache_dir)

        # Only model.safetensors is looked up
        weights_file = cache_dir / "model.safetensors"
        if weights_file.exists():
            return str(weights_file)
        else:
            raise FileNotFoundError(f"Could not find {weights_file} in {cache_dir}")

    def predict(self, image_in: Image.Image, prompt="", history=None, **kwargs):
        """
        Run depth prediction on an input image.

        Unlike V2, the vendored DepthAnything3Net expects a tensor directly
        with shape (B, N, 3, H, W) where N=1 for single-view inference.

        Args:
            image_in: Input image; its ``height`` and ``width`` define the
                size of the returned depth map.
            prompt: Not used by this method.
            history: Not used by this method.
            **kwargs: Not used by this method.

        Returns:
            A one-element tuple holding a dict with:
                - ``"image"``: ``WorkflowImageData`` wrapping the viridis
                  colour rendering of the depth map.
                - ``"normalized_depth"``: numpy array with values in [0, 1],
                  inverted so the largest raw model output maps to 0.

        Raises:
            ValueError: If the predicted depth map is constant.
        """
        from inference.core.workflows.execution_engine.entities.base import (
            ImageParentMetadata,
            WorkflowImageData,
        )

        # Process input image using the HF processor
        inputs = self.processor(images=image_in, return_tensors="pt")

        # Extract pixel_values and add the N dimension
        # Processor outputs: (B, C, H, W) -> Model expects: (B, N, C, H, W)
        pixel_values = inputs["pixel_values"]
        pixel_values = pixel_values.unsqueeze(1)  # Add N=1 dimension

        # Move to device and dtype
        pixel_values = pixel_values.to(self.device, dtype=self.dtype)

        # Run inference
        with torch.inference_mode():
            outputs = self.model(pixel_values)

            # Extract depth from model output
            # Model returns dict with 'depth' key containing (B, S, H, W) tensor
            # where S=1 for single-view, so we squeeze it to (B, H, W)
            depth_map = outputs["depth"].squeeze(1)

            # Resize back to original image size
            depth_map = torch.nn.functional.interpolate(
                depth_map.unsqueeze(1),
                size=(image_in.height, image_in.width),
                mode="bilinear",
                align_corners=False,
            )
            # Drop only the channel and batch axes. A bare squeeze() would
            # also collapse a height or width of 1 and break the 2D indexing
            # of the colour map output below.
            depth_map = depth_map.squeeze(1).squeeze(0)

            depth_map = depth_map.to(torch.float32).cpu().numpy()

            # Normalize depth values
            depth_min = depth_map.min()
            depth_max = depth_map.max()
            if depth_max == depth_min:
                raise ValueError("Depth map has no variation (min equals max)")
            normalized_depth = (depth_map - depth_min) / (depth_max - depth_min)
            normalized_depth = 1 - normalized_depth

            # Create visualization
            depth_for_viz = (normalized_depth * 255.0).astype(np.uint8)
            cmap = plt.get_cmap("viridis")
            colored_depth = (cmap(depth_for_viz)[:, :, :3] * 255).astype(np.uint8)

            # Convert numpy array to WorkflowImageData
            parent_metadata = ImageParentMetadata(parent_id=f"{uuid4()}")
            colored_depth_image = WorkflowImageData(
                numpy_image=colored_depth, parent_metadata=parent_metadata
            )

            result = {
                "image": colored_depth_image,
                "normalized_depth": normalized_depth,
            }

            return (result,)

Methods:¶

initialize_model ¶

initialize_model(**kwargs)

Initialize the model with vendored architecture instead of HF Transformers.

Source code in inference/models/depth_anything_v3/depth_anything_v3.py

def initialize_model(self, **kwargs):
    """Set up the vendored DepthAnything3Net model instead of an HF Transformers one.

    Picks the device (CUDA, then MPS, then CPU) and a matching dtype, parses
    ``config.json``, builds the network, loads its weights, moves it to the
    chosen device/dtype in eval mode, and loads the image processor from the
    cache directory.
    """
    # Device preference: CUDA first, then MPS, CPU as the fallback.
    if torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)

    if device_name == "cpu":
        warnings.warn(
            "Running DepthAnythingV3 on CPU. This may be slow. "
            "Consider using GPU or MPS if available."
        )

    # Half precision only on CUDA (bfloat16 when supported); float32 elsewhere.
    if device_name == "cuda":
        if torch.cuda.is_bf16_supported():
            self.dtype = torch.bfloat16
        else:
            self.dtype = torch.float16
    else:
        self.dtype = torch.float32

    # Configuration -> architecture -> weights
    self.config = parse_config(self._get_config_path())
    self.model = DepthAnything3Net(**self.config)
    self._load_weights()

    # Place the model on the target device and switch to eval mode
    self.model = self.model.to(self.device, dtype=self.dtype)
    self.model.eval()

    # The processor is read from the cache dir (uses preprocessor_config.json)
    self.processor = AutoImageProcessor.from_pretrained(self.cache_dir)

predict ¶

predict(image_in, prompt='', history=None, **kwargs)

Run depth prediction on an input image.

Unlike V2, the vendored DepthAnything3Net expects a tensor directly with shape (B, N, 3, H, W) where N=1 for single-view inference.

Source code in inference/models/depth_anything_v3/depth_anything_v3.py

def predict(self, image_in: Image.Image, prompt="", history=None, **kwargs):
    """
    Run depth prediction on an input image.

    Unlike V2, the vendored DepthAnything3Net expects a tensor directly
    with shape (B, N, 3, H, W) where N=1 for single-view inference.

    Args:
        image_in: Input image; its ``height`` and ``width`` define the size
            of the returned depth map.
        prompt: Not used by this method.
        history: Not used by this method.
        **kwargs: Not used by this method.

    Returns:
        A one-element tuple holding a dict with:
            - ``"image"``: ``WorkflowImageData`` wrapping the viridis colour
              rendering of the depth map.
            - ``"normalized_depth"``: numpy array with values in [0, 1],
              inverted so the largest raw model output maps to 0.

    Raises:
        ValueError: If the predicted depth map is constant.
    """
    from inference.core.workflows.execution_engine.entities.base import (
        ImageParentMetadata,
        WorkflowImageData,
    )

    # Process input image using the HF processor
    inputs = self.processor(images=image_in, return_tensors="pt")

    # Extract pixel_values and add the N dimension
    # Processor outputs: (B, C, H, W) -> Model expects: (B, N, C, H, W)
    pixel_values = inputs["pixel_values"]
    pixel_values = pixel_values.unsqueeze(1)  # Add N=1 dimension

    # Move to device and dtype
    pixel_values = pixel_values.to(self.device, dtype=self.dtype)

    # Run inference
    with torch.inference_mode():
        outputs = self.model(pixel_values)

        # Extract depth from model output
        # Model returns dict with 'depth' key containing (B, S, H, W) tensor
        # where S=1 for single-view, so we squeeze it to (B, H, W)
        depth_map = outputs["depth"].squeeze(1)

        # Resize back to original image size
        depth_map = torch.nn.functional.interpolate(
            depth_map.unsqueeze(1),
            size=(image_in.height, image_in.width),
            mode="bilinear",
            align_corners=False,
        )
        # Drop only the channel and batch axes. A bare squeeze() would also
        # collapse a height or width of 1 and break the 2D indexing of the
        # colour map output below.
        depth_map = depth_map.squeeze(1).squeeze(0)

        depth_map = depth_map.to(torch.float32).cpu().numpy()

        # Normalize depth values
        depth_min = depth_map.min()
        depth_max = depth_map.max()
        if depth_max == depth_min:
            raise ValueError("Depth map has no variation (min equals max)")
        normalized_depth = (depth_map - depth_min) / (depth_max - depth_min)
        normalized_depth = 1 - normalized_depth

        # Create visualization
        depth_for_viz = (normalized_depth * 255.0).astype(np.uint8)
        cmap = plt.get_cmap("viridis")
        colored_depth = (cmap(depth_for_viz)[:, :, :3] * 255).astype(np.uint8)

        # Convert numpy array to WorkflowImageData
        parent_metadata = ImageParentMetadata(parent_id=f"{uuid4()}")
        colored_depth_image = WorkflowImageData(
            numpy_image=colored_depth, parent_metadata=parent_metadata
        )

        result = {
            "image": colored_depth_image,
            "normalized_depth": normalized_depth,
        }

        return (result,)

Functions:¶

convert_state_dict ¶

convert_state_dict(state_dict)

Convert state dict from official DA3 format to our simplified format.

Source code in inference/models/depth_anything_v3/depth_anything_v3.py

def convert_state_dict(state_dict: dict) -> dict:
    """
    Remap keys of an official DA3 state dict to the simplified format.

    For every entry: a leading ``"model."`` is stripped, every occurrence of
    ``"net."`` is replaced by ``"backbone."``, and entries whose resulting key
    mentions the camera encoder/decoder (``cam_enc``, ``cam_dec``) or the GS
    head (``gs_head``, ``gs_adapter``) are dropped because they are not used
    for depth-only inference.

    Args:
        state_dict: Mapping from parameter name to value.

    Returns:
        A new dict with the remapped keys; the input is not modified.
    """
    prefix = "model."
    skipped_markers = ("cam_enc", "cam_dec", "gs_head", "gs_adapter")

    def _remap(name):
        if name.startswith(prefix):
            name = name[len(prefix) :]
        return name.replace("net.", "backbone.")

    remapped = ((_remap(name), value) for name, value in state_dict.items())
    return {
        name: value
        for name, value in remapped
        if not any(marker in name for marker in skipped_markers)
    }

parse_config ¶

parse_config(config_path)

Parse the config.json file from HuggingFace/official DA3 format.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the config.json file	required

Returns:

Type	Description
`dict`	Dictionary with model configuration parameters

Source code in inference/models/depth_anything_v3/depth_anything_v3.py

def parse_config(config_path: str) -> dict:
    """
    Read a HuggingFace/official DA3 ``config.json`` and flatten it.

    The settings are taken from the top-level ``"config"`` object when it is
    present, otherwise from the top level itself. Backbone settings come from
    its ``"net"`` section and head settings from its ``"head"`` section; any
    missing entry falls back to the default shown in the code.

    Args:
        config_path: Path to the config.json file

    Returns:
        Dictionary with model configuration parameters
    """
    with open(config_path, "r") as f:
        loaded = json.load(f)

    root = loaded.get("config", loaded)
    net_section = root.get("net", {})
    head_section = root.get("head", {})

    return {
        # Backbone (net) configuration
        "backbone_name": net_section.get("name", "vitb"),
        "out_layers": net_section.get("out_layers", [5, 7, 9, 11]),
        "alt_start": net_section.get("alt_start", 4),
        "qknorm_start": net_section.get("qknorm_start", 4),
        "rope_start": net_section.get("rope_start", 4),
        "cat_token": net_section.get("cat_token", True),
        # Head configuration
        "head_dim_in": head_section.get("dim_in", 1536),
        "head_output_dim": head_section.get("output_dim", 2),
        "head_features": head_section.get("features", 128),
        "head_out_channels": head_section.get("out_channels", [96, 192, 384, 768]),
    }

`models/dinov3`¶

inference.models.dinov3.dinov3_classification ¶

Classes¶

DinoV3Classification ¶

Bases: ClassificationBaseOnnxRoboflowInferenceModel

DinoV3Classification handles classification inference for Dinov3 linear probe models using ONNX.

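Inherits: ClassificationBaseOnnxRoboflowInferenceModel (base class for ONNX Roboflow classification inference).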

Attributes:

Name	Type	Description
`multiclass`	`bool`	A flag that specifies if the model should handle multiclass classification.

Source code in inference/models/dinov3/dinov3_classification.py

class DinoV3Classification(ClassificationBaseOnnxRoboflowInferenceModel):
    """Classification inference for Dinov3 linear probe models using ONNX.

    Inherits:
        ClassificationBaseOnnxRoboflowInferenceModel: Base class for ONNX
            Roboflow classification inference.

    Attributes:
        multiclass (bool): A flag that specifies if the model should handle multiclass classification.
    """

    # Per-channel normalization constants, overriding the base class values.
    preprocess_means = [0.485, 0.456, 0.406]
    preprocess_stds = [0.229, 0.224, 0.225]

    def __init__(self, *args, **kwargs):
        """Initializes the DinoV3Classification instance.

        Args:
            *args: Positional arguments forwarded to the base class.
            **kwargs: Keyword arguments forwarded to the base class.
        """
        super().__init__(*args, **kwargs)
        # Read from the model environment; False when the key is absent.
        self.multiclass = self.environment.get("MULTICLASS", False)

    @property
    def weights_file(self) -> str:
        """Name of the ONNX weights file to use.

        Returns 'weights.onnx' when AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
        and LAMBDA are all truthy; otherwise returns 'best.onnx'.

        Returns:
            str: File name of the weights file.
        """
        use_lambda_weights = AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and LAMBDA
        return "weights.onnx" if use_lambda_weights else "best.onnx"

Attributes¶

weights_file `property` ¶

weights_file

Determines the weights file to be used based on the availability of AWS keys.

If AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and LAMBDA are all set, it returns the path to 'weights.onnx'. Otherwise, it returns the path to 'best.onnx'.

Returns:

Name	Type	Description
`str`	`str`	Path to the weights file.

Methods:¶

`__init__` ¶

__init__(*args, **kwargs)

Initializes the DinoV3Classification instance.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/dinov3/dinov3_classification.py

def __init__(self, *args, **kwargs):
    """Initializes the DinoV3Classification instance.

    Args:
        *args: Positional arguments forwarded to the base class.
        **kwargs: Keyword arguments forwarded to the base class.
    """
    super().__init__(*args, **kwargs)
    # Read from the model environment; False when the key is absent.
    environment = self.environment
    self.multiclass = environment.get("MULTICLASS", False)

`models/doctr`¶

inference.models.doctr.doctr_model ¶

Classes¶

DocTR ¶

Bases: RoboflowCoreModel

Source code in inference/models/doctr/doctr_model.py

class DocTR(RoboflowCoreModel):
    """OCR model that pairs a ``db_resnet50`` text detector with a
    ``crnn_vgg16_bn`` text recognizer through ``ocr_predictor``."""

    def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn", **kwargs):
        """Initializes the DocTR model.

        Creates the detection and recognition sub-models, copies their
        ``model.pt`` files into ``{MODEL_CACHE_DIR}/doctr/models/`` and builds
        the OCR predictor from them.

        Args:
            *args: Variable length argument list (not used here).
            model_id: Stored as ``self.endpoint``.
            **kwargs: Arbitrary keyword arguments; only ``api_key`` is read.
        """
        api_key = kwargs.get("api_key")
        self.api_key = api_key
        self.dataset_id = "doctr"
        self.version_id = "default"
        self.endpoint = model_id

        self.det_model = DocTRDet(api_key=api_key)
        self.rec_model = DocTRRec(api_key=api_key)

        os.makedirs(f"{MODEL_CACHE_DIR}/doctr/models/", exist_ok=True)

        detector_weights_path = (
            f"{MODEL_CACHE_DIR}/doctr/models/{self.det_model.version_id}.pt"
        )
        shutil.copyfile(
            f"{MODEL_CACHE_DIR}/doctr_det/{self.det_model.version_id}/model.pt",
            detector_weights_path,
        )
        recognizer_weights_path = (
            f"{MODEL_CACHE_DIR}/doctr/models/{self.rec_model.version_id}.pt"
        )
        shutil.copyfile(
            f"{MODEL_CACHE_DIR}/doctr_rec/{self.rec_model.version_id}/model.pt",
            recognizer_weights_path,
        )

        # Architectures are created without pretrained weights; the state
        # dicts copied above are loaded explicitly.
        det_model = db_resnet50(pretrained=False, pretrained_backbone=False)
        det_model.load_state_dict(
            torch.load(detector_weights_path, map_location=DEVICE, weights_only=True)
        )

        reco_model = crnn_vgg16_bn(pretrained=False, pretrained_backbone=False)
        reco_model.load_state_dict(
            torch.load(recognizer_weights_path, map_location=DEVICE, weights_only=True)
        )

        self.model = ocr_predictor(
            det_arch=det_model,
            reco_arch=reco_model,
            pretrained=False,
        )
        self.task_type = "ocr"

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clear the caches of both the detection and the recognition sub-model."""
        self.det_model.clear_cache(delete_from_disk=delete_from_disk)
        self.rec_model.clear_cache(delete_from_disk=delete_from_disk)

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """
        DocTR pre-processes images as part of its inference pipeline.

        Thus, no preprocessing is required here; this method does nothing and
        returns ``None``.
        """
        pass

    def infer_from_request(
        self, request: DoctrOCRInferenceRequest
    ) -> Union[OCRInferenceResponse, List]:
        """Run OCR for a request holding one image or a list of images.

        Args:
            request: The inference request. When ``request.image`` is a list,
                each image is processed separately on a shallow copy of the
                request.

        Returns:
            A single ``OCRInferenceResponse``, or a list of them (one per
            image) when the request carried a list.
        """
        if isinstance(request.image, list):
            response = []
            request_copy = copy.copy(request)
            for image in request.image:
                request_copy.image = image
                response.append(self.single_request(request=request_copy))
            return response
        return self.single_request(request)

    def single_request(self, request: DoctrOCRInferenceRequest) -> OCRInferenceResponse:
        """Run OCR for a single-image request and wrap the result with timing.

        Image size and predictions are included in the response only when
        ``request.generate_bounding_boxes`` is truthy.
        """
        t1 = perf_counter()
        result = self.infer(**request.dict())
        if not isinstance(result, tuple):
            result = (result, None, None)
        # maintaining backwards compatibility with previous implementation
        if request.generate_bounding_boxes:
            return OCRInferenceResponse(
                result=result[0],
                image=result[1],
                predictions=result[2],
                time=perf_counter() - t1,
            )
        else:
            return OCRInferenceResponse(
                result=result[0],
                time=perf_counter() - t1,
            )

    def infer(
        self, image: Any, **kwargs
    ) -> Union[
        str, Tuple[str, InferenceResponseImage, List[ObjectDetectionPrediction]]
    ]:
        """
        Run inference on a provided image.

        Args:
            image: Anything accepted by ``load_image`` (e.g. a BGR numpy
                array, filepath, InferenceRequestImage, PIL Image,
                byte-string, etc.).
            **kwargs: Only ``generate_bounding_boxes`` is read (default
                ``False``); other entries are ignored.

        Returns:
            The recognized words joined by single spaces. When
            ``generate_bounding_boxes`` is truthy, a tuple of that string, an
            ``InferenceResponseImage`` with the input image size, and one
            ``ObjectDetectionPrediction`` per recognized word.
        """

        img = load_image(image)

        # The predictor input is built from a file path, so the image is
        # written to a temporary JPEG first.
        with tempfile.NamedTemporaryFile(suffix=".jpg") as f:
            image = Image.fromarray(img[0])

            image.save(f.name)

            doc = DocumentFile.from_images([f.name])

            result = self.model(doc).export()

            blocks = result["pages"][0]["blocks"]
            page_dimensions = result["pages"][0]["dimensions"]

            words = [
                word
                for block in blocks
                for line in block["lines"]
                for word in line["words"]
            ]

            result = " ".join([word["value"] for word in words])
            # maintaining backwards compatibility with previous implementation
            if not kwargs.get("generate_bounding_boxes", False):
                return result

            bounding_boxes = [
                _geometry_to_bbox(page_dimensions, word["geometry"]) for word in words
            ]
            # bbox is indexed as (x_min, y_min, x_max, y_max); predictions use
            # the box centre plus width/height.
            objects = [
                ObjectDetectionPrediction(
                    **{
                        "x": bbox[0] + (bbox[2] - bbox[0]) // 2,
                        "y": bbox[1] + (bbox[3] - bbox[1]) // 2,
                        "width": bbox[2] - bbox[0],
                        "height": bbox[3] - bbox[1],
                        "confidence": float(word["objectness_score"]),
                        "class": word["value"],
                        "class_id": 0,
                        "detection_id": str(uuid.uuid4()),
                    }
                )
                for word, bbox in zip(words, bounding_boxes)
            ]
            image_height, image_width = img[0].shape[:2]
            return (
                result,
                InferenceResponseImage(width=image_width, height=image_height),
                objects,
            )

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, e.g., ["model.pt"].
        """
        return ["model.pt"]

Methods:¶

`__init__` ¶

__init__(
    *args, model_id="doctr_rec/crnn_vgg16_bn", **kwargs
)

Initializes the DocTR model.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/doctr/doctr_model.py

def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn", **kwargs):
    """Initializes the DocTR model.

    Instantiates the detection and recognition weight holders, copies their
    ``model.pt`` files into a shared ``doctr/models`` cache directory, loads
    both state dicts and assembles the OCR predictor stored in ``self.model``.

    Args:
        *args: Variable length argument list (not used by this initializer).
        model_id (str): Identifier stored verbatim as ``self.endpoint``.
        **kwargs: Arbitrary keyword arguments; only ``api_key`` is read.
    """
    self.api_key = kwargs.get("api_key")
    self.dataset_id = "doctr"
    self.version_id = "default"
    self.endpoint = model_id

    # Reuse the key resolved above instead of querying kwargs repeatedly.
    self.det_model = DocTRDet(api_key=self.api_key)
    self.rec_model = DocTRRec(api_key=self.api_key)

    os.makedirs(f"{MODEL_CACHE_DIR}/doctr/models/", exist_ok=True)

    # Copy each sub-model's weights next to each other, named after the
    # sub-model's version id.
    detector_weights_path = (
        f"{MODEL_CACHE_DIR}/doctr/models/{self.det_model.version_id}.pt"
    )
    shutil.copyfile(
        f"{MODEL_CACHE_DIR}/doctr_det/{self.det_model.version_id}/model.pt",
        detector_weights_path,
    )
    recognizer_weights_path = (
        f"{MODEL_CACHE_DIR}/doctr/models/{self.rec_model.version_id}.pt"
    )
    shutil.copyfile(
        f"{MODEL_CACHE_DIR}/doctr_rec/{self.rec_model.version_id}/model.pt",
        recognizer_weights_path,
    )

    # Architectures are built without pretrained weights; the state dicts
    # copied above are loaded explicitly.
    det_model = db_resnet50(pretrained=False, pretrained_backbone=False)
    det_model.load_state_dict(
        torch.load(detector_weights_path, map_location=DEVICE, weights_only=True)
    )

    reco_model = crnn_vgg16_bn(pretrained=False, pretrained_backbone=False)
    reco_model.load_state_dict(
        torch.load(recognizer_weights_path, map_location=DEVICE, weights_only=True)
    )

    self.model = ocr_predictor(
        det_arch=det_model,
        reco_arch=reco_model,
        pretrained=False,
    )
    self.task_type = "ocr"

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name	Type	Description
`list`	`list`	A list of required files for inference, e.g., ["model.pt"].

Source code in inference/models/doctr/doctr_model.py

def get_infer_bucket_file_list(self) -> list:
    """Return the names of the files required for inference.

    Returns:
        list: A one-element list containing "model.pt".
    """
    weights_file = "model.pt"
    return [weights_file]

infer ¶

infer(image, **kwargs)

Run inference on a provided image. The image can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to run OCR on.	required

Returns:

Name	Type	Description
`OCRInferenceResponse`	`Union[str, Tuple[str, InferenceResponseImage, List[ObjectDetectionPrediction]]]`	The inference response.

Source code in inference/models/doctr/doctr_model.py

def infer(
    self, image: Any, **kwargs
) -> Union[
    str, Tuple[str, InferenceResponseImage, List[ObjectDetectionPrediction]]
]:
    """
    Run OCR on a provided image.

    Args:
        image: The input image; can be a BGR numpy array, filepath,
            InferenceRequestImage, PIL Image, byte-string, etc.
        **kwargs: Only ``generate_bounding_boxes`` (default ``False``) is read.

    Returns:
        The recognised words joined by single spaces when
        ``generate_bounding_boxes`` is falsy. Otherwise a tuple of that
        string, an ``InferenceResponseImage`` with the input image size, and
        one ``ObjectDetectionPrediction`` per recognised word.
    """

    img = load_image(image)

    # The image is handed to the predictor through a temporary JPEG file. The
    # file is only needed until the export is produced, so the context is
    # closed before any post-processing.
    with tempfile.NamedTemporaryFile(suffix=".jpg") as f:
        Image.fromarray(img[0]).save(f.name)
        doc = DocumentFile.from_images([f.name])
        exported = self.model(doc).export()

    # Only the first page of the export is read.
    page = exported["pages"][0]
    blocks = page["blocks"]
    page_dimensions = page["dimensions"]

    # Flatten the blocks -> lines -> words hierarchy into one word list.
    words = [
        word
        for block in blocks
        for line in block["lines"]
        for word in line["words"]
    ]

    text = " ".join([word["value"] for word in words])
    # maintaining backwards compatibility with previous implementation
    if not kwargs.get("generate_bounding_boxes", False):
        return text

    bounding_boxes = [
        _geometry_to_bbox(page_dimensions, word["geometry"]) for word in words
    ]
    # Each bbox is read as (x_min, y_min, x_max, y_max) and converted to a
    # centre point plus width/height.
    objects = [
        ObjectDetectionPrediction(
            **{
                "x": bbox[0] + (bbox[2] - bbox[0]) // 2,
                "y": bbox[1] + (bbox[3] - bbox[1]) // 2,
                "width": bbox[2] - bbox[0],
                "height": bbox[3] - bbox[1],
                "confidence": float(word["objectness_score"]),
                "class": word["value"],
                "class_id": 0,
                "detection_id": str(uuid.uuid4()),
            }
        )
        for word, bbox in zip(words, bounding_boxes)
    ]
    image_height, image_width = img[0].shape[:2]
    return (
        text,
        InferenceResponseImage(width=image_width, height=image_height),
        objects,
    )

preprocess_image ¶

preprocess_image(image)

DocTR pre-processes images as part of its inference pipeline.

Thus, no preprocessing is required here.

Source code in inference/models/doctr/doctr_model.py

def preprocess_image(self, image: Image.Image) -> Image.Image:
    """
    Intentional no-op.

    DocTR pre-processes images as part of its own inference pipeline, so
    nothing is done here: the argument is ignored and ``None`` is returned.
    """
    return None

DocTRDet ¶

Bases: RoboflowCoreModel

DocTR class for document Optical Character Recognition (OCR).

Attributes:

Name	Type	Description
`doctr`		The DocTR model.
`ort_session`		ONNX runtime inference session.

Source code in inference/models/doctr/doctr_model.py

class DocTRDet(RoboflowCoreModel):
    """Core-model wrapper for the DocTR detection weights.

    The class only selects a default model id (``doctr_det/db_resnet50_v2``)
    and declares the single weights file it needs; everything else is
    delegated to ``RoboflowCoreModel``.
    """

    def __init__(self, *args, model_id: str = "doctr_det/db_resnet50_v2", **kwargs):
        """Initializes the DocTR detection model wrapper.

        Args:
            *args: Variable length argument list, forwarded to the base class.
            model_id (str): Model identifier forwarded to the base class.
            **kwargs: Arbitrary keyword arguments, forwarded to the base class.
        """
        # The previous bare call to get_infer_bucket_file_list() discarded its
        # result and has been dropped.
        super().__init__(*args, model_id=model_id, **kwargs)

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Delegate cache clearing to the base class.

        Args:
            delete_from_disk (bool): Forwarded unchanged to the base class.
        """
        super().clear_cache(delete_from_disk=delete_from_disk)

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, i.e. ["model.pt"].
        """
        return ["model.pt"]

Methods:¶

__init__ ¶

__init__(
    *args, model_id="doctr_det/db_resnet50_v2", **kwargs
)

Initializes the DocTR model.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/doctr/doctr_model.py

def __init__(self, *args, model_id: str = "doctr_det/db_resnet50_v2", **kwargs):
    """Initializes the DocTR detection model wrapper.

    Args:
        *args: Variable length argument list, forwarded to the base class.
        model_id (str): Model identifier forwarded to the base class.
        **kwargs: Arbitrary keyword arguments, forwarded to the base class.
    """
    # The previous bare call to get_infer_bucket_file_list() discarded its
    # result and has been dropped.
    super().__init__(*args, model_id=model_id, **kwargs)

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name	Type	Description
`list`	`list`	A list of required files for inference, e.g., ["model.pt"].

Source code in inference/models/doctr/doctr_model.py

def get_infer_bucket_file_list(self) -> list:
    """List the files that must be available for inference.

    Returns:
        list: A one-element list containing "model.pt".
    """
    file_names = []
    file_names.append("model.pt")
    return file_names

DocTRRec ¶

Bases: RoboflowCoreModel

Source code in inference/models/doctr/doctr_model.py

class DocTRRec(RoboflowCoreModel):
    """Core-model wrapper for the DocTR recognition weights.

    The class only selects a default model id
    (``doctr_rec/crnn_vgg16_bn_v2``) and declares the single weights file it
    needs; everything else is delegated to ``RoboflowCoreModel``.
    """

    def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn_v2", **kwargs):
        """Initializes the DocTR recognition model wrapper.

        Args:
            *args: Variable length argument list, forwarded to the base class.
            model_id (str): Model identifier forwarded to the base class.
            **kwargs: Arbitrary keyword arguments, forwarded to the base class.
        """
        # The previous bare call to get_infer_bucket_file_list() discarded its
        # result and has been dropped.
        super().__init__(*args, model_id=model_id, **kwargs)

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Delegate cache clearing to the base class.

        Args:
            delete_from_disk (bool): Forwarded unchanged to the base class.
        """
        super().clear_cache(delete_from_disk=delete_from_disk)

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, i.e. ["model.pt"].
        """
        return ["model.pt"]

Methods:¶

__init__ ¶

__init__(
    *args, model_id="doctr_rec/crnn_vgg16_bn_v2", **kwargs
)

Initializes the DocTR model.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/doctr/doctr_model.py

def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn_v2", **kwargs):
    """Initializes the DocTR recognition model wrapper.

    Args:
        *args: Variable length argument list, forwarded to the base class.
        model_id (str): Model identifier forwarded to the base class.
        **kwargs: Arbitrary keyword arguments, forwarded to the base class.
    """
    # The previous bare call to get_infer_bucket_file_list() discarded its
    # result and has been dropped.
    super().__init__(*args, model_id=model_id, **kwargs)

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name	Type	Description
`list`	`list`	A list of required files for inference, e.g., ["model.pt"].

Source code in inference/models/doctr/doctr_model.py

def get_infer_bucket_file_list(self) -> list:
    """Report which files are needed to run inference.

    Returns:
        list: A one-element list containing "model.pt".
    """
    return list(("model.pt",))

Functions:¶

`models/easy_ocr`¶

inference.models.easy_ocr.easy_ocr ¶

Classes¶

EasyOCR ¶

Bases: RoboflowCoreModel

Roboflow EasyOCR model implementation.

This class is responsible for handling the EasyOCR model, including loading the model, preprocessing the input, and performing inference.

Source code in inference/models/easy_ocr/easy_ocr.py

class EasyOCR(RoboflowCoreModel):
    """Roboflow EasyOCR model implementation.

    This class is responsible for handling the EasyOCR model, including
    loading the model, preprocessing the input, and performing inference.
    """

    def __init__(
        self,
        model_id: str = "easy_ocr/english_g2",
        device: str = DEVICE,
        *args,
        **kwargs,
    ):
        """Initializes EasyOCR with the given arguments and keyword arguments.

        Args:
            model_id (str): Model identifier of the form "<prefix>/<recognizer>".
                It is lower-cased once and that form is used everywhere.
            device (str): Stored as ``self.device``.
            *args: Forwarded to the base class.
            **kwargs: Forwarded to the base class.
        """
        # Normalise once: the base class receives the lower-cased id, so the
        # recognizer name and the cache paths below must use the same form.
        model_id = model_id.lower()

        super().__init__(model_id=model_id, *args, **kwargs)
        self.device = device
        self.task_type = "ocr"
        self.recognizer = model_id.split("/")[1]

        # Duplicate the weights under "<recognizer>.pth", the name used for
        # ``recog_network`` in ``predict``.
        shutil.copyfile(
            get_cache_file_path(file="weights.pt", model_id=model_id),
            get_cache_file_path(file=f"{self.recognizer}.pth", model_id=model_id),
        )

    def predict(self, image_in: np.ndarray, prompt="", history=None, **kwargs):
        """Run EasyOCR on an image and return plain-Python results.

        Args:
            image_in (np.ndarray): Image passed to ``Reader.readtext``.
            prompt: Unused; kept for interface compatibility.
            history: Unused; kept for interface compatibility.
            **kwargs: ``language_codes`` (default ``["en"]``) and ``quantize``
                (default ``False``) are read; everything else is ignored.

        Returns:
            list: One ``(box, text, confidence)`` tuple per detection, with
            numpy scalars converted to built-in numbers.
        """
        language_codes = kwargs.get("language_codes", ["en"])
        quantize = kwargs.get("quantize", False)
        # NOTE(review): a new Reader is built on every call — confirm whether
        # caching it per (language_codes, quantize) would be acceptable.
        reader = easyocr.Reader(
            language_codes,
            download_enabled=False,
            user_network_directory=f"{MODEL_CACHE_DIR}/easy_ocr/{self.recognizer}/",
            model_storage_directory=f"{MODEL_CACHE_DIR}/easy_ocr/{self.recognizer}/",
            detect_network="craft",
            recog_network=self.recognizer,
            detector=True,
            recognizer=True,
            gpu=True,
            quantize=quantize,
        )

        results = reader.readtext(image_in)
        # convert native EasyOCR results from numpy to standard python types
        results = [
            (
                [
                    [x.item() if not isinstance(x, (int, float)) else x for x in c]
                    for c in res[0]
                ],
                res[1],
                res[2].item() if not isinstance(res[2], (int, float)) else res[2],
            )
            for res in results
        ]

        return results

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, ...],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Pass predictions and preprocessing metadata through unchanged."""
        return predictions, preprocess_return_metadata

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Load the image and describe its size.

        Returns:
            The loaded image array and an ``InferenceResponseImage`` carrying
            its width and height.
        """
        image = load_image(image)[0]
        return image, InferenceResponseImage(
            width=image.shape[1], height=image.shape[0]
        )

    def infer_from_request(
        self, request: EasyOCRInferenceRequest
    ) -> Union[OCRInferenceResponse, List]:
        """Run OCR for a request holding one image or a list of images.

        Returns:
            A list of responses (one per image) when ``request.image`` is a
            list, otherwise a single response.
        """
        if isinstance(request.image, list):
            response = []
            # Work on a shallow copy so the caller's request keeps its list.
            request_copy = copy.copy(request)
            for image in request.image:
                request_copy.image = image
                response.append(self.single_request(request=request_copy))
            return response
        return self.single_request(request)

    def single_request(self, request: EasyOCRInferenceRequest) -> OCRInferenceResponse:
        """Run OCR for a single-image request and build the response.

        The response text is all recognised strings joined by spaces; each
        detection becomes an ``ObjectDetectionPrediction`` whose class is the
        recognised string.
        """
        t1 = perf_counter()
        prediction_result, image_metadata = self.infer(**request.dict())
        strings = [res[1] for res in prediction_result]
        return OCRInferenceResponse(
            result=" ".join(strings),
            image=image_metadata,
            predictions=[
                ObjectDetectionPrediction(
                    # box[0] and box[2] are used as opposite corners of the box.
                    **{
                        "x": box[0][0] + (box[2][0] - box[0][0]) // 2,
                        "y": box[0][1] + (box[2][1] - box[0][1]) // 2,
                        "width": box[2][0] - box[0][0],
                        "height": box[2][1] - box[0][1],
                        "confidence": float(confidence),
                        "class": string,
                        "class_id": 0,
                        "detection_id": str(uuid.uuid4()),
                    }
                )
                for box, string, confidence in prediction_result
            ],
            time=perf_counter() - t1,
        )

    def get_infer_bucket_file_list(self) -> List[str]:
        """Return the names of the files required for inference."""
        return ["weights.pt", "craft_mlt_25k.pth"]

Methods:¶

__init__ ¶

__init__(
    model_id="easy_ocr/english_g2",
    device=DEVICE,
    *args,
    **kwargs
)

Initializes EasyOCR with the given arguments and keyword arguments.

Source code in inference/models/easy_ocr/easy_ocr.py

def __init__(
    self,
    model_id: str = "easy_ocr/english_g2",
    device: str = DEVICE,
    *args,
    **kwargs,
):
    """Initializes EasyOCR with the given arguments and keyword arguments.

    Args:
        model_id (str): Model identifier of the form "<prefix>/<recognizer>".
            It is lower-cased once and that form is used everywhere.
        device (str): Stored as ``self.device``.
        *args: Forwarded to the base class.
        **kwargs: Forwarded to the base class.
    """
    # Normalise once: the base class receives the lower-cased id, so the
    # recognizer name and the cache paths below must use the same form.
    model_id = model_id.lower()

    super().__init__(model_id=model_id, *args, **kwargs)
    self.device = device
    self.task_type = "ocr"
    self.recognizer = model_id.split("/")[1]

    # Duplicate the weights under "<recognizer>.pth" in the model's cache.
    shutil.copyfile(
        get_cache_file_path(file="weights.pt", model_id=model_id),
        get_cache_file_path(file=f"{self.recognizer}.pth", model_id=model_id),
    )

Functions:¶

inference.models.easy_ocr.easy_ocr_inference_models ¶

Classes¶

InferenceModelsEasyOCRAdapter ¶

Bases: Model

Roboflow EasyOCR model implementation.

This class is responsible for handling the EasyOCR model, including loading the model, preprocessing the input, and performing inference.

Source code in inference/models/easy_ocr/easy_ocr_inference_models.py

class InferenceModelsEasyOCRAdapter(Model):
    """Adapter exposing an ``inference_models`` EasyOCR model through ``Model``.

    The wrapped model is loaded with ``AutoModel.from_pretrained`` and its
    output is converted into ``OCRInferenceResponse`` objects.
    """

    def __init__(
        self, model_id: str = "easy_ocr/english_g2", api_key: str = None, **kwargs
    ):
        """Load the underlying EasyOCR model.

        Args:
            model_id (str): Identifier or path handed to ``AutoModel.from_pretrained``.
            api_key (str): API key; falls back to ``API_KEY`` when falsy.
            **kwargs: ``countinference`` and ``service_secret`` are used to
                build extra weights-provider headers; all kwargs are also
                forwarded to ``AutoModel.from_pretrained``.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.api_key = api_key or API_KEY
        self.task_type = "ocr"

        weights_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Every valid backend that has not been explicitly disabled.
        enabled_backends = VALID_INFERENCE_MODELS_BACKENDS.difference(
            DISABLED_INFERENCE_MODELS_BACKENDS
        )
        self._model: EasyOCRTorch = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=weights_headers,
            backend=list(enabled_backends),
            **kwargs,
        )

    def predict(self, image_in: np.ndarray, **kwargs) -> Tuple[str, Detections]:
        """Run the wrapped model and return the first text/structure pair."""
        texts, structures = self._model.infer(images=image_in, **kwargs)
        return texts[0], structures[0]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, ...],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Pass predictions and preprocessing metadata through unchanged."""
        return predictions, preprocess_return_metadata

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, InferenceResponseImage]:
        """Load the image and describe its width and height."""
        image = load_image_bgr(image)
        size_info = InferenceResponseImage(
            width=image.shape[1], height=image.shape[0]
        )
        return image, size_info

    def infer_from_request(
        self, request: EasyOCRInferenceRequest
    ) -> Union[OCRInferenceResponse, List]:
        """Run OCR for a request holding one image or a list of images."""
        if type(request.image) is not list:
            return self.single_request(request)

        # Work on a shallow copy so the caller's request keeps its list.
        per_image_request = copy.copy(request)
        responses = []
        for single_image in request.image:
            per_image_request.image = single_image
            responses.append(self.single_request(request=per_image_request))
        return responses

    def single_request(self, request: EasyOCRInferenceRequest) -> OCRInferenceResponse:
        """Run OCR for a single-image request and build the response."""
        started_at = perf_counter()
        infer_kwargs = request.dict()
        # The confidence passed to ``infer`` is always forced to 0.0.
        infer_kwargs["confidence"] = 0.0
        prediction_result, image_metadata = self.infer(**infer_kwargs)

        detections = prediction_result[1]
        word_predictions = []
        for idx in range(detections.xyxy.shape[0]):
            left, top, right, bottom = detections.xyxy[idx].tolist()
            # Built as a dictionary because 'class' is a reserved word in Python.
            fields = {
                "x": (left + right) / 2,
                "y": (top + bottom) / 2,
                "width": right - left,
                "height": bottom - top,
                "confidence": 1.0,  # confidence is not returned by the model
                "class": detections.bboxes_metadata[idx]["text"],
                "class_id": 0,  # every detection shares class id 0
                "detection_id": str(uuid.uuid4()),
            }
            word_predictions.append(ObjectDetectionPrediction(**fields))

        return OCRInferenceResponse(
            result=prediction_result[0],
            image=image_metadata,
            predictions=word_predictions,
            time=perf_counter() - started_at,
        )

`models/florence2`¶

inference.models.florence2.utils ¶

Functions:¶

import_class_from_file ¶

import_class_from_file(
    file_path, class_name, alias_name=None
)

Emulates what huggingface transformers does to load remote code with trust_remote_code=True, but allows us to use the class directly so that we don't have to load untrusted code.

Source code in inference/models/florence2/utils.py

def import_class_from_file(file_path, class_name, alias_name=None):
    """
    Emulates what huggingface transformers does to load remote code with trust_remote_code=True,
    but allows us to use the class directly so that we don't have to load untrusted code.

    The file is executed as a module named after the file (without extension)
    and registered in ``sys.modules`` under that name. The directory above the
    file's directory is placed on ``sys.path`` only for the duration of the call.

    Args:
        file_path: Path to the Python file that defines the class.
        class_name: Name of the attribute to fetch from the executed module.
        alias_name: Optional name under which the class is additionally stored
            in this module's globals.

    Returns:
        The class object named ``class_name``.

    Raises:
        Exception: Whatever loading or executing the module raises, or
            ``AttributeError`` if ``class_name`` is missing. In that case the
            previous ``sys.modules`` entry for the module name is restored.
    """
    file_path = os.path.abspath(file_path)
    module_name = os.path.splitext(os.path.basename(file_path))[0]
    module_dir = os.path.dirname(file_path)
    parent_dir = os.path.dirname(module_dir)

    sys.path.insert(0, parent_dir)

    previous_module = sys.modules.get(module_name)
    injected = False
    try:
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        module = importlib.util.module_from_spec(spec)

        # Register before executing so the module is visible in sys.modules
        # while its own code runs.
        sys.modules[module_name] = module
        injected = True

        # Manually set the __package__ attribute to the parent package
        module.__package__ = os.path.basename(module_dir)

        spec.loader.exec_module(module)
        cls = getattr(module, class_name)
        if alias_name:
            globals()[alias_name] = cls
        return cls
    except Exception:
        # Roll back the sys.modules change so a failed import leaves no
        # half-initialised module behind.
        if injected:
            if previous_module is not None:
                sys.modules[module_name] = previous_module
            else:
                sys.modules.pop(module_name, None)
        raise
    finally:
        # Remove our entry by value rather than by position: the executed code
        # may itself have prepended to sys.path, in which case pop(0) would
        # drop the wrong entry and leave parent_dir behind.
        try:
            sys.path.remove(parent_dir)
        except ValueError:
            # Already removed by the executed code; nothing to undo.
            pass

`models/gaze`¶

inference.models.gaze.gaze ¶

Classes¶

Gaze ¶

Deprecated. Raises FeatureDeprecatedError on instantiation.

The legacy registry entry under ("gaze", "l2cs") still resolves to this class so model-id lookups return a clear deprecation error rather than KeyError. The underlying L2CS-Net + MediaPipe implementation has been removed from inference.

Source code in inference/models/gaze/gaze.py

class Gaze:
    """Deprecated placeholder; instantiation raises FeatureDeprecatedError.

    The legacy registry entry under ("gaze", "l2cs") still resolves to
    this class so model-id lookups return a clear deprecation error rather
    than KeyError. The underlying L2CS-Net + MediaPipe implementation has
    been removed from inference.
    """

    def __init__(self, *args, **kwargs):
        # All arguments are ignored; construction always fails.
        deprecation_details = {
            "feature": "Gaze (L2CS-Net) model",
            "reason": "MediaPipe dependency removed from inference.",
        }
        raise FeatureDeprecatedError(**deprecation_details)

inference.models.gaze.gaze_inference_models ¶

Classes¶

InferenceModelsGazeAdapter ¶

Deprecated. Raises FeatureDeprecatedError on instantiation.

The cross-tree adapter that bridged the legacy inference/ Gaze API onto inference_models.model_pipelines.face_and_gaze_detection is no longer wired — the new tree's FaceAndGazeDetectionMPAndL2CS pipeline and the underlying MediaPipe dependency have been removed.

The stub class is kept so the registry adapter dispatch at inference/models/utils.py still resolves ("gaze", "l2cs") and surfaces a clear deprecation error instead of an ImportError.

Source code in inference/models/gaze/gaze_inference_models.py

class InferenceModelsGazeAdapter:
    """Deprecated placeholder; instantiation raises FeatureDeprecatedError.

    The cross-tree adapter that bridged the legacy `inference/` Gaze API
    onto `inference_models.model_pipelines.face_and_gaze_detection` is no
    longer wired — the new tree's `FaceAndGazeDetectionMPAndL2CS` pipeline
    and the underlying MediaPipe dependency have been removed.

    The stub class is kept so the registry adapter dispatch at
    `inference/models/utils.py` still resolves `("gaze", "l2cs")` and
    surfaces a clear deprecation error instead of an ImportError.
    """

    def __init__(self, *args, **kwargs):
        # All arguments are ignored; construction always fails.
        deprecation_details = {
            "feature": "Gaze (L2CS-Net) model via inference_models adapter",
            "reason": "MediaPipe dependency removed from inference.",
        }
        raise FeatureDeprecatedError(**deprecation_details)

`models/grounding_dino`¶

inference.models.grounding_dino.grounding_dino ¶

Classes¶

GroundingDINO ¶

Bases: RoboflowCoreModel

GroundingDINO class for zero-shot object detection.

Attributes:

Name	Type	Description
`model`		The GroundingDINO model.

Source code in inference/models/grounding_dino/grounding_dino.py

class GroundingDINO(RoboflowCoreModel):
    """GroundingDINO class for zero-shot object detection.

    Attributes:
        model: The GroundingDINO model.
        class_names: The ``text`` prompts of the most recent ``infer`` call
            (set by ``infer``; not present before the first call).
    """

    def __init__(
        self, *args, model_id="grounding_dino/groundingdino_swint_ogc", **kwargs
    ):
        """Initializes the GroundingDINO model.

        Args:
            *args: Variable length argument list, forwarded to the base class.
            model_id: Model identifier; also selects the cache directory the
                checkpoint is read from.
            **kwargs: Arbitrary keyword arguments, forwarded to the base class.
        """

        super().__init__(*args, model_id=model_id, **kwargs)

        GROUNDING_DINO_CACHE_DIR = get_cache_dir(model_id=model_id)

        # Imported here so the config file can be located inside the
        # installed groundingdino package.
        import groundingdino.config as _gd_config

        GROUNDING_DINO_CONFIG_PATH = os.path.join(
            os.path.dirname(_gd_config.__file__),
            "GroundingDINO_SwinT_OGC.py",
        )

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(GROUNDING_DINO_CACHE_DIR, exist_ok=True)

        self.model = Model(
            model_config_path=GROUNDING_DINO_CONFIG_PATH,
            model_checkpoint_path=os.path.join(
                GROUNDING_DINO_CACHE_DIR, "groundingdino_swint_ogc.pth"
            ),
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        self.task_type = "object-detection"

    def preproc_image(self, image: Any):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            The result of ``load_image_bgr(image)``.
        """
        np_image = load_image_bgr(image)
        return np_image

    def infer_from_request(
        self,
        request: GroundingDINOInferenceRequest,
    ) -> ObjectDetectionInferenceResponse:
        """
        Perform inference based on the details provided in the request, and return the associated responses.
        """
        result = self.infer(**request.dict())
        return result

    def infer(
        self,
        image: InferenceRequestImage,
        text: List[str] = None,
        class_filter: list = None,
        box_threshold=0.5,
        text_threshold=0.5,
        class_agnostic_nms=CLASS_AGNOSTIC_NMS,
        **kwargs
    ):
        """
        Run inference on a provided image.

        Args:
            image: The input image; can be a BGR numpy array, filepath,
                InferenceRequestImage, PIL Image, byte-string, etc.
            text (List[str]): Class prompts to detect; also used as the class
                names of the returned predictions.
            class_filter (Optional[List[str]]): A list of class names to keep,
                if provided. When empty or ``None`` no filtering is applied.
            box_threshold: Forwarded to ``predict_with_classes``.
            text_threshold: Forwarded to ``predict_with_classes``.
            class_agnostic_nms: Whether NMS ignores class ids.
            **kwargs: Ignored.

        Returns:
            ObjectDetectionInferenceResponse: The inference response.
        """
        t1 = perf_counter()
        image = self.preproc_image(image)
        img_dims = image.shape

        detections = self.model.predict_with_classes(
            image=image,
            classes=text,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
        )

        self.class_names = text

        if class_agnostic_nms:
            detections = detections.with_nms(class_agnostic=True)
        else:
            detections = detections.with_nms()

        xywh_bboxes = [xyxy_to_xywh(detection) for detection in detections.xyxy]

        t2 = perf_counter() - t1

        predictions = []
        for i in range(len(xywh_bboxes)):
            raw_class_id = detections.class_id[i]
            # The original guard implies class ids may be None; such
            # detections cannot be named, so skip them before any lookup.
            if raw_class_id is None:
                continue
            class_id = int(raw_class_id)
            class_name = self.class_names[class_id]
            # The class id comes from detections.class_id; an xyxy row only
            # holds the four box coordinates.
            if class_filter and class_name not in class_filter:
                continue
            predictions.append(
                ObjectDetectionPrediction(
                    **{
                        "x": xywh_bboxes[i][0],
                        "y": xywh_bboxes[i][1],
                        "width": xywh_bboxes[i][2],
                        "height": xywh_bboxes[i][3],
                        "confidence": detections.confidence[i],
                        "class": class_name,
                        "class_id": class_id,
                    }
                )
            )

        responses = ObjectDetectionInferenceResponse(
            predictions=predictions,
            image=InferenceResponseImage(width=img_dims[1], height=img_dims[0]),
            time=t2,
        )
        return responses

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, i.e.
            ["groundingdino_swint_ogc.pth"].
        """
        return ["groundingdino_swint_ogc.pth"]

Methods:¶

__init__ ¶

__init__(
    *args,
    model_id="grounding_dino/groundingdino_swint_ogc",
    **kwargs
)

Initializes the GroundingDINO model.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/grounding_dino/grounding_dino.py

def __init__(
    self, *args, model_id="grounding_dino/groundingdino_swint_ogc", **kwargs
):
    """Initializes the GroundingDINO model.

    Args:
        *args: Variable length argument list, forwarded to the base class.
        model_id: Model identifier; also selects the cache directory the
            checkpoint is read from.
        **kwargs: Arbitrary keyword arguments, forwarded to the base class.
    """

    super().__init__(*args, model_id=model_id, **kwargs)

    GROUNDING_DINO_CACHE_DIR = get_cache_dir(model_id=model_id)

    # Imported here so the config file can be located inside the installed
    # groundingdino package.
    import groundingdino.config as _gd_config

    GROUNDING_DINO_CONFIG_PATH = os.path.join(
        os.path.dirname(_gd_config.__file__),
        "GroundingDINO_SwinT_OGC.py",
    )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(GROUNDING_DINO_CACHE_DIR, exist_ok=True)

    self.model = Model(
        model_config_path=GROUNDING_DINO_CONFIG_PATH,
        model_checkpoint_path=os.path.join(
            GROUNDING_DINO_CACHE_DIR, "groundingdino_swint_ogc.pth"
        ),
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    self.task_type = "object-detection"

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name	Type	Description
`list`	`list`	A list of required files for inference, i.e., ["groundingdino_swint_ogc.pth"].

Source code in inference/models/grounding_dino/grounding_dino.py

def get_infer_bucket_file_list(self) -> list:
    """Return the names of the files required for inference.

    Returns:
        list: A one-element list containing the checkpoint file name,
        "groundingdino_swint_ogc.pth".
    """
    checkpoint_file = "groundingdino_swint_ogc.pth"
    return [checkpoint_file]

infer ¶

infer(
    image,
    text=None,
    class_filter=None,
    box_threshold=0.5,
    text_threshold=0.5,
    class_agnostic_nms=CLASS_AGNOSTIC_NMS,
    **kwargs
)

Run inference on a provided image. The image can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to run inference on.	required
`class_filter`	`Optional[List[str]]`	A list of class names to filter, if provided.	`None`

Returns:

Name	Type	Description
`ObjectDetectionInferenceResponse`		The inference response.

Source code in inference/models/grounding_dino/grounding_dino.py

def infer(
    self,
    image: InferenceRequestImage,
    text: List[str] = None,
    class_filter: list = None,
    box_threshold=0.5,
    text_threshold=0.5,
    class_agnostic_nms=CLASS_AGNOSTIC_NMS,
    **kwargs
):
    """
    Run inference on a provided image.

    Args:
        image: The input image; can be a BGR numpy array, filepath,
            InferenceRequestImage, PIL Image, byte-string, etc.
        text (List[str]): Class prompts to detect; also used as the class
            names of the returned predictions.
        class_filter (Optional[List[str]]): A list of class names to keep,
            if provided. When empty or ``None`` no filtering is applied.
        box_threshold: Forwarded to ``predict_with_classes``.
        text_threshold: Forwarded to ``predict_with_classes``.
        class_agnostic_nms: Whether NMS ignores class ids.
        **kwargs: Ignored.

    Returns:
        ObjectDetectionInferenceResponse: The inference response.
    """
    t1 = perf_counter()
    image = self.preproc_image(image)
    img_dims = image.shape

    detections = self.model.predict_with_classes(
        image=image,
        classes=text,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
    )

    self.class_names = text

    if class_agnostic_nms:
        detections = detections.with_nms(class_agnostic=True)
    else:
        detections = detections.with_nms()

    xywh_bboxes = [xyxy_to_xywh(detection) for detection in detections.xyxy]

    t2 = perf_counter() - t1

    predictions = []
    for i in range(len(xywh_bboxes)):
        raw_class_id = detections.class_id[i]
        # The original guard implies class ids may be None; such detections
        # cannot be named, so skip them before any lookup.
        if raw_class_id is None:
            continue
        class_id = int(raw_class_id)
        class_name = self.class_names[class_id]
        # The class id comes from detections.class_id; an xyxy row only holds
        # the four box coordinates.
        if class_filter and class_name not in class_filter:
            continue
        predictions.append(
            ObjectDetectionPrediction(
                **{
                    "x": xywh_bboxes[i][0],
                    "y": xywh_bboxes[i][1],
                    "width": xywh_bboxes[i][2],
                    "height": xywh_bboxes[i][3],
                    "confidence": detections.confidence[i],
                    "class": class_name,
                    "class_id": class_id,
                }
            )
        )

    responses = ObjectDetectionInferenceResponse(
        predictions=predictions,
        image=InferenceResponseImage(width=img_dims[1], height=img_dims[0]),
        time=t2,
    )
    return responses

infer_from_request ¶

infer_from_request(request)

Perform inference based on the details provided in the request, and return the associated responses.

Source code in inference/models/grounding_dino/grounding_dino.py

def infer_from_request(
    self,
    request: GroundingDINOInferenceRequest,
) -> ObjectDetectionInferenceResponse:
    """Run inference using the fields of ``request`` as keyword arguments.

    Args:
        request (GroundingDINOInferenceRequest): Request whose ``dict()`` output is
            unpacked into ``self.infer``.

    Returns:
        ObjectDetectionInferenceResponse: Whatever ``self.infer`` returns for those arguments.
    """
    infer_kwargs = request.dict()
    return self.infer(**infer_kwargs)

preproc_image ¶

preproc_image(image)

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
`np.array`	The preprocessed image.

Source code in inference/models/grounding_dino/grounding_dino.py

def preproc_image(self, image: Any):
    """Prepare an image for inference by delegating to ``load_image_bgr``.

    Args:
        image (Any): The image to preprocess, in any form ``load_image_bgr`` accepts.

    Returns:
        np.array: The value returned by ``load_image_bgr`` for ``image``.
    """
    return load_image_bgr(image)

Functions:¶

inference.models.grounding_dino.grounding_dino_inference_models ¶

Classes¶

InferenceModelsGroundingDINOAdapter ¶

Bases: Model

GroundingDINO class for zero-shot object detection.

Attributes:

Name	Type	Description
`model`		The GroundingDINO model.

Source code in inference/models/grounding_dino/grounding_dino_inference_models.py

class InferenceModelsGroundingDINOAdapter(Model):
    """GroundingDINO adapter for zero-shot object detection.

    Attributes:
        metrics (dict): Inference counters, initialised to zero in ``__init__``.
        api_key (str): API key used when loading the model; falls back to ``API_KEY``.
        task_type (str): Always ``"object-detection"``.
        _model: The GroundingDINO model returned by ``AutoModel.from_pretrained``.
    """

    def __init__(
        self,
        model_id: str = "grounding_dino/groundingdino_swint_ogc",
        api_key: str = None,
        **kwargs
    ):
        """Load the GroundingDINO model identified by ``model_id``.

        Args:
            model_id (str): Identifier or path handed to ``AutoModel.from_pretrained``.
            api_key (str): API key for loading the model; ``API_KEY`` is used when falsy.
            **kwargs: Forwarded unchanged to ``AutoModel.from_pretrained``. The
                ``countinference`` and ``service_secret`` entries are additionally read
                to build the extra weights-provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}

        self.api_key = api_key if api_key else API_KEY

        self.task_type = "object-detection"

        extra_weights_provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Offer every valid backend that has not been explicitly disabled.
        backend = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: GroundingDinoForObjectDetectionTorch = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=extra_weights_provider_headers,
            backend=backend,
            **kwargs,
        )

    def preproc_image(self, image: Any):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            np.array: The value returned by ``load_image_bgr`` for ``image``.
        """
        return load_image_bgr(image)

    def infer_from_request(
        self,
        request: GroundingDINOInferenceRequest,
    ) -> ObjectDetectionInferenceResponse:
        """
        Perform inference based on the details provided in the request, and return the associated responses.

        Args:
            request (GroundingDINOInferenceRequest): Request whose ``dict()`` output is
                unpacked into ``infer`` as keyword arguments.

        Returns:
            ObjectDetectionInferenceResponse: The response produced by ``infer``.
        """
        result = self.infer(**request.dict())
        return result

    def infer(
        self,
        image: InferenceRequestImage,
        text: List[str] = None,
        class_filter: list = None,
        box_threshold=0.5,
        text_threshold=0.5,
        class_agnostic_nms=CLASS_AGNOSTIC_NMS,
        **kwargs
    ) -> ObjectDetectionInferenceResponse:
        """
        Run inference on a provided image.

        Args:
            image: The image to run on; can be a BGR numpy array, filepath,
                InferenceRequestImage, PIL Image, byte-string, etc.
            text (List[str]): Class names to prompt the model with. Detected class ids
                are used as indices into this list. Required.
            class_filter (Optional[List[str]]): A list of class names to keep; when
                falsy, every detection is returned.
            box_threshold: Passed to the model as ``box_threshold``.
            text_threshold: Passed to the model as ``text_threshold``.
            class_agnostic_nms: Passed to the model as ``class_agnostic_nms``.
            **kwargs: Accepted and ignored, so that request fields that are not
                parameters of this method can be unpacked into the call.

        Returns:
            ObjectDetectionInferenceResponse: Predictions in centre-x / centre-y /
            width / height form, the input image dimensions, and the time spent on
            preprocessing plus the model call.

        Raises:
            ValueError: If ``text`` is ``None``.
        """
        if text is None:
            raise ValueError(
                "`text` parameter is required for GroundingDINO inference."
            )
        t1 = perf_counter()
        image = self.preproc_image(image)
        img_dims = image.shape

        # The model is called with a single image, so only the first result is used.
        detections = self._model.infer(
            images=image,
            classes=text,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            class_agnostic_nms=class_agnostic_nms,
        )[0]
        # Timing deliberately stops here: response assembly below is not included.
        t2 = perf_counter() - t1
        predictions_for_image = []
        for instance_id in range(detections.xyxy.shape[0]):
            x_min, y_min, x_max, y_max = detections.xyxy[instance_id].tolist()
            class_id = detections.class_id[instance_id].item()
            class_name = text[class_id]
            # Drop filtered-out detections before doing any further work on them.
            if class_filter and class_name not in class_filter:
                continue
            confidence = detections.confidence[instance_id].item()
            predictions_for_image.append(
                ObjectDetectionPrediction(
                    # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                    **{
                        "x": (x_min + x_max) / 2,
                        "y": (y_min + y_max) / 2,
                        "width": x_max - x_min,
                        "height": y_max - y_min,
                        "confidence": confidence,
                        "class": class_name,
                        "class_id": class_id,
                    }
                )
            )
        return ObjectDetectionInferenceResponse(
            predictions=predictions_for_image,
            image=InferenceResponseImage(width=img_dims[1], height=img_dims[0]),
            time=t2,
        )

Methods:¶

infer ¶

infer(
    image,
    text=None,
    class_filter=None,
    box_threshold=0.5,
    text_threshold=0.5,
    class_agnostic_nms=CLASS_AGNOSTIC_NMS,
    **kwargs
)

Run inference on a provided image. The image can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Parameters:

Name	Type	Description	Default
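`image`	`InferenceRequestImage`	The image to run inference on (BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.).	required
`text`	`List[str]`	Class names to prompt the model with. A `ValueError` is raised when it is `None`.	`None`
`class_filter`	`Optional[List[str]]`	A list of class names to filter, if provided.	`None`
`box_threshold`		Box threshold passed to the model.	`0.5`
`text_threshold`		Text threshold passed to the model.	`0.5`
`class_agnostic_nms`		Class-agnostic NMS flag passed to the model.	`CLASS_AGNOSTIC_NMS`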

Returns:

Name	Type	Description
`ObjectDetectionInferenceResponse`		The inference response.

Source code in inference/models/grounding_dino/grounding_dino_inference_models.py

def infer(
    self,
    image: InferenceRequestImage,
    text: List[str] = None,
    class_filter: list = None,
    box_threshold=0.5,
    text_threshold=0.5,
    class_agnostic_nms=CLASS_AGNOSTIC_NMS,
    **kwargs
) -> ObjectDetectionInferenceResponse:
    """
    Run inference on a provided image.

    Args:
        image: The image to run on; can be a BGR numpy array, filepath,
            InferenceRequestImage, PIL Image, byte-string, etc.
        text (List[str]): Class names to prompt the model with. Detected class ids
            are used as indices into this list. Required.
        class_filter (Optional[List[str]]): A list of class names to keep; when
            falsy, every detection is returned.
        box_threshold: Passed to the model as ``box_threshold``.
        text_threshold: Passed to the model as ``text_threshold``.
        class_agnostic_nms: Passed to the model as ``class_agnostic_nms``.
        **kwargs: Accepted and ignored, so that request fields that are not
            parameters of this method can be unpacked into the call.

    Returns:
        ObjectDetectionInferenceResponse: Predictions in centre-x / centre-y /
        width / height form, the input image dimensions, and the time spent on
        preprocessing plus the model call.

    Raises:
        ValueError: If ``text`` is ``None``.
    """
    if text is None:
        raise ValueError(
            "`text` parameter is required for GroundingDINO inference."
        )
    t1 = perf_counter()
    image = self.preproc_image(image)
    img_dims = image.shape

    # The model is called with a single image, so only the first result is used.
    detections = self._model.infer(
        images=image,
        classes=text,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        class_agnostic_nms=class_agnostic_nms,
    )[0]
    # Timing deliberately stops here: response assembly below is not included.
    t2 = perf_counter() - t1
    predictions_for_image = []
    for instance_id in range(detections.xyxy.shape[0]):
        x_min, y_min, x_max, y_max = detections.xyxy[instance_id].tolist()
        class_id = detections.class_id[instance_id].item()
        class_name = text[class_id]
        # Drop filtered-out detections before doing any further work on them.
        if class_filter and class_name not in class_filter:
            continue
        confidence = detections.confidence[instance_id].item()
        predictions_for_image.append(
            ObjectDetectionPrediction(
                # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                **{
                    "x": (x_min + x_max) / 2,
                    "y": (y_min + y_max) / 2,
                    "width": x_max - x_min,
                    "height": y_max - y_min,
                    "confidence": confidence,
                    "class": class_name,
                    "class_id": class_id,
                }
            )
        )
    return ObjectDetectionInferenceResponse(
        predictions=predictions_for_image,
        image=InferenceResponseImage(width=img_dims[1], height=img_dims[0]),
        time=t2,
    )

infer_from_request ¶

infer_from_request(request)

Perform inference based on the details provided in the request, and return the associated responses.

Source code in inference/models/grounding_dino/grounding_dino_inference_models.py

def infer_from_request(
    self,
    request: GroundingDINOInferenceRequest,
) -> ObjectDetectionInferenceResponse:
    """Run inference using the fields of ``request`` as keyword arguments.

    Args:
        request (GroundingDINOInferenceRequest): Request whose ``dict()`` output is
            unpacked into ``self.infer``.

    Returns:
        ObjectDetectionInferenceResponse: Whatever ``self.infer`` returns for those arguments.
    """
    infer_kwargs = request.dict()
    return self.infer(**infer_kwargs)

preproc_image ¶

preproc_image(image)

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
`np.array`	The preprocessed image.

Source code in inference/models/grounding_dino/grounding_dino_inference_models.py

def preproc_image(self, image: Any):
    """Prepare an image for inference by delegating to ``load_image_bgr``.

    Args:
        image (Any): The image to preprocess, in any form ``load_image_bgr`` accepts.

    Returns:
        np.array: The value returned by ``load_image_bgr`` for ``image``.
    """
    loaded_image = load_image_bgr(image)
    return loaded_image

Functions:¶

`models/owlv2`¶

inference.models.owlv2.owlv2 ¶

Classes¶

OwlV2 ¶

Source code in inference/models/owlv2/owlv2.py

class OwlV2(RoboflowInferenceModel):
    """Object detector built on an OWLv2 model, prompted with example boxes.

    Images are embedded once and cached by hash. Class prompts are built from
    labelled boxes on training images and compared against the embeddings of the
    image under inference.

    Attributes:
        task_type (str): Always ``"object-detection"``.
        box_format (str): Always ``"xywh"``.
        owlv2_lock (RLock): Held while writing to (and, in one place, snapshotting)
            the caches created by ``reset_cache``.
        image_size (tuple): ``(height, width)`` read from the processor configuration.
        image_mean (torch.Tensor): Processor mean viewed as ``(1, 3, 1, 1)``.
        image_std (torch.Tensor): Processor std viewed as ``(1, 3, 1, 1)``.
        model: The model object exposed by ``Owlv2Singleton``.
    """

    task_type = "object-detection"
    box_format = "xywh"

    def __init__(self, model_id=f"owlv2/{OWLV2_VERSION_ID}", *args, **kwargs):
        """Initialise the model and its caches.

        Args:
            model_id (str): Identifier of the form ``"<dataset_id>/<version_id>"``.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor.

        Raises:
            InvalidModelIDError: If ``version_id`` was not set by the parent
                constructor and ``model_id`` does not split into exactly two
                ``/``-separated chunks.
        """
        super().__init__(model_id, *args, **kwargs)
        # TODO: owlv2 makes use of version_id - version_id is being dropped so this class needs to be refactored

        self.owlv2_lock = RLock()

        if self.version_id is None:
            owlv2_model_id_chunks = model_id.split("/")
            if len(owlv2_model_id_chunks) != 2:
                # Format the message eagerly: exceptions do not interpolate
                # printf-style arguments the way logging calls do.
                raise InvalidModelIDError(f"Model ID: `{model_id}` is invalid.")
            self.dataset_id = owlv2_model_id_chunks[0]
            self.version_id = owlv2_model_id_chunks[1]
        # Hugging Face repo ids always use "/", whereas os.path.join would
        # produce a backslash on Windows.
        hf_id = f"google/{self.version_id}"
        processor = Owlv2Processor.from_pretrained(hf_id)
        self.image_size = (
            processor.image_processor.size.height,
            processor.image_processor.size.width,
        )
        self.image_mean = torch.tensor(
            processor.image_processor.image_mean, device=DEVICE
        ).view(1, 3, 1, 1)
        self.image_std = torch.tensor(
            processor.image_processor.image_std, device=DEVICE
        ).view(1, 3, 1, 1)
        self.model = Owlv2Singleton(hf_id).model
        self.reset_cache()

    def reset_cache(self):
        """Replace every cache with a fresh, empty ``LimitedSizeDict``."""
        # each entry should be on the order of 300*4KB, so 1000 is 400MB of CUDA memory
        self.image_embed_cache = LimitedSizeDict(size_limit=OWLV2_IMAGE_CACHE_SIZE)
        # CPU-side copy of image embeddings, bounded by CPU_IMAGE_EMBED_CACHE_SIZE
        self.cpu_image_embed_cache = LimitedSizeDict(
            size_limit=CPU_IMAGE_EMBED_CACHE_SIZE
        )
        # each entry should be on the order of 10 bytes, so 1000 is 10KB
        self.image_size_cache = LimitedSizeDict(size_limit=OWLV2_IMAGE_CACHE_SIZE)
        # entry size will vary depending on the number of samples, but 10 should be safe
        self.class_embeddings_cache = LimitedSizeDict(size_limit=OWLV2_MODEL_CACHE_SIZE)

    def draw_predictions(
        self,
        inference_request,
        inference_response,
    ) -> bytes:
        """Draw predictions from an inference response onto the original image provided by an inference request

        Each distinct class name in the response gets a colour from
        ``DEFAULT_COLOR_PALETTE``, assigned in sorted name order and wrapping
        around when there are more classes than colours.

        Args:
            inference_request (ObjectDetectionInferenceRequest): The inference request containing the image on which to draw predictions
            inference_response (ObjectDetectionInferenceResponse): The inference response containing predictions to be drawn

        Returns:
            bytes: The value returned by ``draw_detection_predictions``
            (presumably the encoded image - confirm against that helper).
        """
        all_class_names = sorted(
            {prediction.class_name for prediction in inference_response.predictions}
        )

        return draw_detection_predictions(
            inference_request=inference_request,
            inference_response=inference_response,
            colors={
                class_name: DEFAULT_COLOR_PALETTE[i % len(DEFAULT_COLOR_PALETTE)]
                for (i, class_name) in enumerate(all_class_names)
            },
        )

    def download_weights(self) -> None:
        """Do nothing; per the original note, weights are downloaded from Hugging Face."""
        # Download from huggingface
        pass

    def get_image_embeds(self, image_hash: Hash) -> Optional[tuple]:
        """Look up cached embeddings for an image.

        Args:
            image_hash (Hash): Hash identifying the image.

        Returns:
            Optional[tuple]: The entry from ``image_embed_cache`` if present;
            otherwise the entry from ``cpu_image_embed_cache`` with each tensor
            moved to ``DEVICE``; otherwise ``None``.
        """
        image_embed_cache_hit = self.image_embed_cache.get(image_hash)
        if image_embed_cache_hit is not None:
            return image_embed_cache_hit
        cpu_image_embed_cache_hit = self.cpu_image_embed_cache.get(image_hash)
        if cpu_image_embed_cache_hit is not None:
            return tuple(t.to(DEVICE) for t in cpu_image_embed_cache_hit)
        return None

    def compute_image_size(
        self, image: Union[np.ndarray, LazyImageRetrievalWrapper]
    ) -> Tuple[int, int]:
        """Return the first two array dimensions of ``image`` in reversed order.

        For an array laid out as ``(height, width, ...)`` this is
        ``(width, height)``. Sizes of wrapped images are cached by image hash so
        a repeated image does not have to be decoded again.

        Args:
            image: A numpy array or a ``LazyImageRetrievalWrapper``.

        Returns:
            Tuple[int, int]: ``shape[:2][::-1]`` of the underlying array.
        """
        if not isinstance(image, LazyImageRetrievalWrapper):
            return image.shape[:2][::-1]

        image_size = self.image_size_cache.get(image.image_hash)
        if image_size is None:
            np_img = image.image_as_numpy
            image_size = np_img.shape[:2][::-1]
            with self.owlv2_lock:
                self.image_size_cache[image.image_hash] = image_size
        return image_size

    @torch.no_grad()
    def embed_image(
        self, image: Union[np.ndarray, LazyImageRetrievalWrapper]
    ) -> Tuple[Hash, tuple]:
        """Compute (or fetch from cache) the embeddings for an image.

        Args:
            image: A numpy array or a ``LazyImageRetrievalWrapper``.

        Returns:
            Tuple[Hash, tuple]: The image hash and a tuple
            ``(objectness, boxes, image_class_embeds, logit_shift, logit_scale)``
            after filtering by ``filter_tensors_by_objectness``. Newly computed
            tuples are stored in ``image_embed_cache``.
        """
        if isinstance(image, LazyImageRetrievalWrapper):
            image_hash = image.image_hash
        else:
            image_hash = hash_function(image.tobytes())

        image_embeds = self.get_image_embeds(image_hash)
        if image_embeds is not None:
            return image_hash, image_embeds

        np_image = (
            image.image_as_numpy
            if isinstance(image, LazyImageRetrievalWrapper)
            else image
        )
        pixel_values = preprocess_image(
            np_image, self.image_size, self.image_mean, self.image_std
        )

        # torch 2.4 lets you use "cuda:0" as device_type
        # but this crashes in 2.3
        # so we parse DEVICE as a string to make it work in both 2.3 and 2.4
        # as we don't know a priori our torch version
        device_str = "cuda" if str(DEVICE).startswith("cuda") else "cpu"
        # we disable autocast on CPU for stability, although it's possible using bfloat16 would work
        with torch.autocast(
            device_type=device_str, dtype=torch.float16, enabled=device_str == "cuda"
        ):
            image_embeds, _ = self.model.image_embedder(pixel_values=pixel_values)
            batch_size, h, w, dim = image_embeds.shape
            # Flatten the spatial grid so each location becomes one candidate.
            image_features = image_embeds.reshape(batch_size, h * w, dim)
            objectness = self.model.objectness_predictor(image_features)
            boxes = self.model.box_predictor(image_features, feature_map=image_embeds)
        image_class_embeds = self.model.class_head.dense0(image_features)
        # L2-normalise; the epsilon guards against division by zero.
        image_class_embeds /= (
            torch.linalg.norm(image_class_embeds, ord=2, dim=-1, keepdim=True) + 1e-6
        )
        logit_shift = self.model.class_head.logit_shift(image_features)
        logit_scale = (
            self.model.class_head.elu(self.model.class_head.logit_scale(image_features))
            + 1
        )
        objectness = objectness.sigmoid()

        objectness, boxes, image_class_embeds, logit_shift, logit_scale = (
            filter_tensors_by_objectness(
                objectness, boxes, image_class_embeds, logit_shift, logit_scale
            )
        )
        image_embeds = (
            objectness,
            boxes,
            image_class_embeds,
            logit_shift,
            logit_scale,
        )
        with self.owlv2_lock:
            self.image_embed_cache[image_hash] = image_embeds

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        if isinstance(image, LazyImageRetrievalWrapper):
            image.unload_numpy_image()  # Clears both _image_as_numpy and image if needed.

        return image_hash, image_embeds

    def get_query_embedding(
        self,
        query_spec: QuerySpecType,
        iou_threshold: float,
        precomputed_embeddings: Optional[Dict[Hash, Tuple[torch.Tensor]]] = None,
    ) -> Optional[torch.Tensor]:
        """Collect class embeddings of the image boxes that best match query boxes.

        For every query box, the image box with the highest IoU is selected; the
        match is kept only if that IoU is strictly greater than ``iou_threshold``.

        Args:
            query_spec: Mapping from image hash to the query boxes for that image.
            iou_threshold (float): Minimum IoU (exclusive) for a match to be kept.
            precomputed_embeddings: Optional mapping from image hash to an
                embeddings tuple; consulted before the caches.

        Returns:
            Optional[torch.Tensor]: The kept embeddings concatenated along
            dimension 0, or ``None`` when nothing was kept. Because unmatched
            query boxes are dropped, the result can have fewer rows than there
            were query boxes.

        Raises:
            KeyError: If an image has no precomputed or cached embeddings.
        """
        # NOTE: for now we're handling each image separately
        query_embeds = []
        if precomputed_embeddings is None:
            precomputed_embeddings = {}
        for image_hash, query_boxes in query_spec.items():
            if image_hash in precomputed_embeddings:
                image_embeds = precomputed_embeddings[image_hash]
            else:
                image_embeds = self.get_image_embeds(image_hash)
            if image_embeds is None:
                raise KeyError("We didn't embed the image first!")
            _objectness, image_boxes, image_class_embeds, _, _ = image_embeds

            query_boxes_tensor = torch.tensor(
                query_boxes, dtype=image_boxes.dtype, device=image_boxes.device
            )
            if image_boxes.numel() == 0 or query_boxes_tensor.numel() == 0:
                continue
            iou, _ = box_iou(
                to_corners(image_boxes), to_corners(query_boxes_tensor)
            )  # (num image boxes, num query boxes)
            # Best-matching image box for each query box.
            ious, indices = torch.max(iou, dim=0)
            # keep only matches whose IoU exceeds the threshold
            iou_mask = ious > iou_threshold
            indices = indices[iou_mask]
            if indices.numel() == 0:
                continue

            embeds = image_class_embeds[indices]
            query_embeds.append(embeds)
        if not query_embeds:
            return None
        query = torch.cat(query_embeds, dim=0)
        return query

    def infer_from_embed(
        self,
        image_hash: Hash,
        query_embeddings: Dict[str, PosNegDictType],
        confidence: float,
        iou_threshold: float,
        max_detections: int = MAX_DETECTIONS,
        image_embeds: Optional[tuple] = None,
    ) -> List[Dict]:
        """Predict boxes for an already-embedded image.

        Args:
            image_hash (Hash): Hash of the image; used to look up embeddings when
                ``image_embeds`` is not supplied.
            query_embeddings: Mapping from class name to its positive/negative
                embeddings.
            confidence (float): Passed to ``get_class_preds_from_embeds``.
            iou_threshold (float): Used both per class and for the final NMS.
            max_detections (int): Maximum number of predictions kept after NMS.
            image_embeds (Optional[tuple]): Embeddings to use instead of a cache lookup.

        Returns:
            List[Dict]: One dict per prediction with keys ``class_name``, ``x``,
            ``y``, ``w``, ``h`` and ``confidence``. Empty when ``query_embeddings``
            is empty.

        Raises:
            KeyError: If no embeddings are available for the image.
        """
        if image_embeds is None:
            image_embeds = self.get_image_embeds(image_hash)
        if image_embeds is None:
            raise KeyError("We didn't embed the image first!")
        _, image_boxes, image_class_embeds, _, _ = image_embeds
        class_map, class_names = make_class_map(query_embeddings)
        all_predicted_boxes, all_predicted_classes, all_predicted_scores = [], [], []
        for class_name, pos_neg_embedding_dict in query_embeddings.items():
            boxes, classes, scores = get_class_preds_from_embeds(
                pos_neg_embedding_dict,
                image_class_embeds,
                confidence,
                image_boxes,
                class_map,
                class_name,
                iou_threshold,
            )

            all_predicted_boxes.append(boxes)
            all_predicted_classes.append(classes)
            all_predicted_scores.append(scores)

        if not all_predicted_boxes:
            return []

        all_predicted_boxes = torch.cat(all_predicted_boxes, dim=0)
        all_predicted_classes = torch.cat(all_predicted_classes, dim=0)
        all_predicted_scores = torch.cat(all_predicted_scores, dim=0)

        # run nms on all predictions
        survival_indices = torchvision.ops.nms(
            to_corners(all_predicted_boxes), all_predicted_scores, iou_threshold
        )
        all_predicted_boxes = all_predicted_boxes[survival_indices]
        all_predicted_classes = all_predicted_classes[survival_indices]
        all_predicted_scores = all_predicted_scores[survival_indices]

        if len(all_predicted_boxes) > max_detections:
            all_predicted_boxes = all_predicted_boxes[:max_detections]
            all_predicted_classes = all_predicted_classes[:max_detections]
            all_predicted_scores = all_predicted_scores[:max_detections]

        # move tensors to numpy before returning
        all_predicted_boxes = all_predicted_boxes.cpu().numpy()
        all_predicted_classes = all_predicted_classes.cpu().numpy()
        all_predicted_scores = all_predicted_scores.cpu().numpy()

        return [
            {
                "class_name": class_names[int(c)],
                "x": float(x),
                "y": float(y),
                "w": float(w),
                "h": float(h),
                "confidence": float(score),
            }
            for c, (x, y, w, h), score in zip(
                all_predicted_classes, all_predicted_boxes, all_predicted_scores
            )
        ]

    def infer(
        self,
        image: Any,
        training_data: List[Any],
        confidence: float = 0.99,
        iou_threshold: float = 0.3,
        max_detections: int = MAX_DETECTIONS,
        **kwargs,
    ):
        """Run detection on ``image`` using prompts built from ``training_data``.

        Args:
            image (Any): A single image or a list of images.
            training_data (List[Any]): Training examples, as accepted by
                ``make_class_embeddings_dict``.
            confidence (float): Confidence threshold forwarded to inference.
            iou_threshold (float): IoU threshold used for prompt matching and NMS.
            max_detections (int): Maximum number of predictions per image.
            **kwargs: Accepted and ignored.

        Returns:
            The list of responses built by ``make_response``, one per image.
        """
        class_embeddings_dict = self.make_class_embeddings_dict(
            training_data, iou_threshold
        )
        return self.infer_from_embedding_dict(
            image,
            class_embeddings_dict,
            confidence,
            iou_threshold,
            max_detections=max_detections,
        )

    def infer_from_embedding_dict(
        self,
        image: Any,
        class_embeddings_dict: Dict[str, PosNegDictType],
        confidence: float,
        iou_threshold: float,
        max_detections: int = MAX_DETECTIONS,
        **kwargs,
    ):
        """Run detection on one or more images with precomputed class embeddings.

        Args:
            image (Any): A single image or a list of images.
            class_embeddings_dict: Mapping from class name to its
                positive/negative embeddings.
            confidence (float): Confidence threshold forwarded to ``infer_from_embed``.
            iou_threshold (float): IoU threshold forwarded to ``infer_from_embed``.
            max_detections (int): Maximum number of predictions per image.
            **kwargs: Accepted and ignored.

        Returns:
            The list of responses built by ``make_response``, one per image, with
            class ids taken from the sorted class names.
        """
        images = image if isinstance(image, list) else [image]
        wrapped_images = [LazyImageRetrievalWrapper(img) for img in images]

        results = []
        image_sizes = []
        for image_wrapper in wrapped_images:
            # happy path here is that both image size and image embeddings are cached
            # in which case we avoid loading the image at all
            image_size = self.compute_image_size(image_wrapper)
            image_sizes.append(image_size)
            image_hash, image_embeds = self.embed_image(image_wrapper)
            image_wrapper.unload_numpy_image()
            result = self.infer_from_embed(
                image_hash,
                class_embeddings_dict,
                confidence,
                iou_threshold,
                max_detections=max_detections,
                image_embeds=image_embeds,
            )
            results.append(result)
        return self.make_response(results, image_sizes, sorted(class_embeddings_dict))

    def make_class_embeddings_dict(
        self,
        training_data: List[Any],
        iou_threshold: float,
        return_image_embeds: bool = False,
    ) -> Union[
        Dict[str, PosNegDictType],
        Tuple[Dict[str, PosNegDictType], Dict[Hash, tuple]],
    ]:
        """Build per-class positive/negative embeddings from labelled training images.

        Results are cached by a hash of the training data.

        Args:
            training_data (List[Any]): Items with an ``"image"`` entry and a
                ``"boxes"`` entry. Each box provides ``"x"``, ``"y"``, ``"w"``,
                ``"h"``, ``"cls"`` and ``"negative"``.
            iou_threshold (float): Forwarded to ``get_query_embedding``.
            return_image_embeds (bool): When true, also return image embeddings.

        Returns:
            The class embeddings dict, mapping each class name to
            ``{"positive": tensor-or-None, "negative": tensor-or-None}``. When
            ``return_image_embeds`` is true, a tuple of that dict and a mapping
            from image hash to embeddings. On a cache hit that mapping is a
            snapshot of the whole ``cpu_image_embed_cache``; otherwise it holds
            CPU copies of the embeddings of the images processed in this call.
        """
        wrapped_training_data = [
            {
                "image": LazyImageRetrievalWrapper(train_image["image"]),
                "boxes": train_image["boxes"],
            }
            for train_image in training_data
        ]

        wrapped_training_data_hash = hash_wrapped_training_data(wrapped_training_data)

        if (
            class_embeddings_dict := self.class_embeddings_cache.get(
                wrapped_training_data_hash
            )
        ) is not None:
            if not return_image_embeds:
                return class_embeddings_dict
            # The per-image embeddings were not kept alongside the cached class
            # embeddings, so hand back whatever the CPU cache currently holds.
            with self.owlv2_lock:
                return_image_embeds_dict = dict(self.cpu_image_embed_cache.items())
            return class_embeddings_dict, return_image_embeds_dict

        class_embeddings_dict = defaultdict(lambda: {"positive": [], "negative": []})

        bool_to_literal = {True: "positive", False: "negative"}
        return_image_embeds_dict = dict()

        for train_image in wrapped_training_data:
            image_size = self.compute_image_size(train_image["image"])
            image_hash, image_embeds = self.embed_image(train_image["image"])
            if return_image_embeds:
                return_image_embeds_dict[image_hash] = tuple(
                    t.to("cpu") for t in image_embeds
                )
            # grab and normalize box prompts for this image
            boxes = train_image["boxes"]
            # Every coordinate is divided by the longer image side.
            longest_side = max(image_size)
            coords = [
                tuple(
                    c / longest_side
                    for c in (box["x"], box["y"], box["w"], box["h"])
                )
                for box in boxes
            ]
            classes = [box["cls"] for box in boxes]
            is_positive = [not box["negative"] for box in boxes]
            query_spec = {image_hash: coords}
            precomputed_embeddings = {image_hash: image_embeds}
            # compute the embeddings for the box prompts
            embeddings = self.get_query_embedding(
                query_spec,
                iou_threshold,
                precomputed_embeddings=precomputed_embeddings,
            )

            del train_image

            if embeddings is None:
                continue

            # NOTE(review): get_query_embedding drops query boxes whose best IoU
            # does not exceed the threshold, so `embeddings` can have fewer rows
            # than `boxes`. In that case this zip pairs embeddings with the
            # class/polarity of a different box - confirm and address.
            for embedding, class_name, is_pos in zip(embeddings, classes, is_positive):
                class_embeddings_dict[class_name][bool_to_literal[is_pos]].append(
                    embedding
                )
        # Convert lists of embeddings to tensors.
        class_embeddings_dict = {
            k: {
                "positive": torch.stack(v["positive"]) if v["positive"] else None,
                "negative": torch.stack(v["negative"]) if v["negative"] else None,
            }
            for k, v in class_embeddings_dict.items()
        }

        with self.owlv2_lock:
            self.class_embeddings_cache[wrapped_training_data_hash] = (
                class_embeddings_dict
            )
        if return_image_embeds:
            return class_embeddings_dict, return_image_embeds_dict

        return class_embeddings_dict

    def make_response(self, predictions, image_sizes, class_names):
        """Convert raw per-image predictions into response objects.

        Args:
            predictions: One list of prediction dicts per image, with the keys
                produced by ``infer_from_embed``.
            image_sizes: One ``(width, height)`` pair per image, index-aligned
                with ``predictions``.
            class_names: Class names; a prediction's ``class_id`` is the index
                of its class name in this sequence.

        Returns:
            list: One ``ObjectDetectionInferenceResponse`` per image.
        """
        responses = []
        for ind, batch_predictions in enumerate(predictions):
            image_size = image_sizes[ind]
            # Coordinates are multiplied by the longer image side, mirroring
            # the division applied to prompt boxes in make_class_embeddings_dict.
            scale = max(image_size)
            responses.append(
                ObjectDetectionInferenceResponse(
                    predictions=[
                        ObjectDetectionPrediction(
                            # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                            **{
                                "x": pred["x"] * scale,
                                "y": pred["y"] * scale,
                                "width": pred["w"] * scale,
                                "height": pred["h"] * scale,
                                "confidence": pred["confidence"],
                                "class": pred["class_name"],
                                "class_id": class_names.index(pred["class_name"]),
                            }
                        )
                        for pred in batch_predictions
                    ],
                    image=InferenceResponseImage(
                        width=image_size[0], height=image_size[1]
                    ),
                )
            )
        return responses

Methods:¶

draw_predictions ¶

draw_predictions(inference_request, inference_response)

Draw predictions from an inference response onto the original image provided by an inference request

Parameters:

Name	Type	Description	Default
`inference_request`	`ObjectDetectionInferenceRequest`	The inference request containing the image on which to draw predictions	required
`inference_response`	`ObjectDetectionInferenceResponse`	The inference response containing predictions to be drawn	required

Returns:

Name	Type	Description
`str`	`bytes`	A base64 encoded image string

Source code in inference/models/owlv2/owlv2.py

def draw_predictions(
    self,
    inference_request,
    inference_response,
) -> bytes:
    """Draw predictions from an inference response onto the original image provided by an inference request

    Each distinct class name in the response gets a colour from
    ``DEFAULT_COLOR_PALETTE``, assigned in sorted name order and wrapping
    around when there are more classes than colours.

    Args:
        inference_request (ObjectDetectionInferenceRequest): The inference request containing the image on which to draw predictions
        inference_response (ObjectDetectionInferenceResponse): The inference response containing predictions to be drawn

    Returns:
        bytes: The value returned by ``draw_detection_predictions``
        (presumably the encoded image - confirm against that helper).
    """
    all_class_names = sorted(
        {prediction.class_name for prediction in inference_response.predictions}
    )

    return draw_detection_predictions(
        inference_request=inference_request,
        inference_response=inference_response,
        colors={
            class_name: DEFAULT_COLOR_PALETTE[i % len(DEFAULT_COLOR_PALETTE)]
            for (i, class_name) in enumerate(all_class_names)
        },
    )

SerializedOwlV2 ¶

Source code in inference/models/owlv2/owlv2.py

class SerializedOwlV2(RoboflowInferenceModel):
    """OWLv2 detector driven by a serialized set of class embeddings.

    ``save_model`` writes a ``torch.save`` payload with the keys
    ``huggingface_id``, ``train_data_dict``, ``class_names``, ``roboflow_id``
    and ``image_embeds``; ``load_model_artifacts_from_cache`` reads it back.
    Inference and drawing are delegated to an ``OwlV2`` instance shared
    through ``get_or_create_owlv2_instance``.
    """

    task_type = "object-detection"
    box_format = "xywh"

    # File name of the serialized payload inside a save/cache directory.
    weights_file_path = "weights.pt"

    # Cache of OwlV2 instances to avoid creating new ones for each serialize_training_data call
    # This improves performance by reusing model instances across serialization operations
    _base_owlv2_instances = {}

    @classmethod
    def get_or_create_owlv2_instance(cls, roboflow_id: str) -> OwlV2:
        """Get an existing OwlV2 instance from cache or create a new one if it doesn't exist.

        Args:
            roboflow_id: The model ID for the OwlV2 model

        Returns:
            An OwlV2 instance
        """
        if roboflow_id in cls._base_owlv2_instances:
            return cls._base_owlv2_instances[roboflow_id]
        owlv2 = OwlV2(model_id=roboflow_id)
        cls._base_owlv2_instances[roboflow_id] = owlv2
        return owlv2

    @staticmethod
    def _load_serialized_data(path: str) -> Dict:
        """Load a payload written by ``save_model`` from ``path``.

        NOTE(security): ``weights_only=False`` lets ``torch.load`` unpickle
        arbitrary objects, so ``path`` must only ever point at trusted data.
        """
        if DEVICE == "cpu":
            # Remap tensors that were saved from another device onto the CPU.
            return torch.load(path, map_location="cpu", weights_only=False)
        return torch.load(path, weights_only=False)

    @staticmethod
    def _install_image_embed_cache(owlv2: OwlV2, image_embeds) -> None:
        """Attach ``image_embeds`` to ``owlv2`` as its CPU image-embedding cache."""
        if isinstance(image_embeds, LimitedSizeDict):
            owlv2.cpu_image_embed_cache = image_embeds
            return
        # Plain mappings are copied into a size-limited cache entry by entry.
        cache = LimitedSizeDict(size_limit=CPU_IMAGE_EMBED_CACHE_SIZE)
        for key, value in image_embeds.items():
            cache[key] = value
        owlv2.cpu_image_embed_cache = cache

    @classmethod
    def serialize_training_data(
        cls,
        training_data: List[Any],
        hf_id: str = f"google/{OWLV2_VERSION_ID}",
        iou_threshold: float = 0.3,
        save_dir: str = os.path.join(MODEL_CACHE_DIR, "owl-v2-serialized-data"),
        previous_embeddings_file: Optional[str] = None,
    ):
        """Compute class embeddings for ``training_data`` and save them to disk.

        Args:
            training_data: Training examples forwarded to ``make_class_embeddings_dict``.
            hf_id: Hugging Face model id; ``google/`` is replaced with ``owlv2/`` to derive the Roboflow id.
            iou_threshold: Forwarded to ``make_class_embeddings_dict``.
            save_dir: Directory the payload is written to.
            previous_embeddings_file: Optional payload from an earlier run. Only its
                ``image_embeds`` are reused, to pre-populate the image embedding cache;
                the class embeddings are always recomputed.

        Returns:
            Path of the written payload file.
        """
        roboflow_id = hf_id.replace("google/", "owlv2/")

        owlv2 = cls.get_or_create_owlv2_instance(roboflow_id)

        if previous_embeddings_file is not None:
            model_data = cls._load_serialized_data(previous_embeddings_file)
            cls._install_image_embed_cache(owlv2, model_data["image_embeds"])

        train_data_dict, image_embeds = owlv2.make_class_embeddings_dict(
            training_data, iou_threshold, return_image_embeds=True
        )
        return cls.save_model(
            hf_id, roboflow_id, train_data_dict, image_embeds, save_dir
        )

    @classmethod
    def save_model(
        cls,
        hf_id: str,
        roboflow_id: str,
        train_data_dict: Dict,
        image_embeds: Dict,
        save_dir: str,
    ):
        """Write the serialized payload into ``save_dir`` and return its path.

        ``save_dir`` is created when missing. ``class_names`` is derived from
        the keys of ``train_data_dict``.
        """
        train_data_dict = {
            "huggingface_id": hf_id,
            "train_data_dict": train_data_dict,
            "class_names": list(train_data_dict.keys()),
            "roboflow_id": roboflow_id,
            "image_embeds": image_embeds,
        }
        train_data_path = os.path.join(save_dir, cls.weights_file_path)
        os.makedirs(save_dir, exist_ok=True)
        torch.save(train_data_dict, train_data_path)
        return train_data_path

    def infer_from_request(
        self,
        request: ObjectDetectionInferenceRequest,
    ) -> Union[
        List[ObjectDetectionInferenceResponse], ObjectDetectionInferenceResponse
    ]:
        """Delegate to the base implementation; overridden to narrow the type hints."""
        return super().infer_from_request(request)

    def __init__(self, model_id, *args, **kwargs):
        """Initialize the base model, then fetch/load the model artifacts."""
        super().__init__(model_id, *args, **kwargs)
        self.get_model_artifacts(**kwargs)

    def get_infer_bucket_file_list(self):
        """Return an empty list: no additional bucket files are requested."""
        return []

    def download_model_artefacts_from_s3(self):
        """Always raise: this model cannot be fetched through the S3 path."""
        raise NotImplementedError("Owlv2 not currently supported on hosted inference")

    def download_model_artifacts_from_roboflow_api(
        self,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
        **kwargs,
    ):
        """Download the serialized weights via the Roboflow API into the cache.

        When ``self.version_id`` is set, the versioned model endpoint is used;
        otherwise the instant-model endpoint is queried. The whole download
        runs under a per-model file lock (120 second timeout).

        Raises:
            ModelArtefactError: If the API response lacks the expected weight URL keys.
        """
        logger.info("Downloading OWLv2 model artifacts")

        # Use the same lock file pattern as in clear_cache
        lock_dir = MODEL_CACHE_DIR + "/_file_locks"  # Dedicated lock directory
        os.makedirs(lock_dir, exist_ok=True)  # Ensure lock directory exists.
        lock_file = os.path.join(lock_dir, f"{os.path.basename(self.cache_dir)}.lock")
        try:
            lock = FileLock(lock_file, timeout=120)  # 120 second timeout for downloads
            with lock:
                if self.version_id is not None:
                    api_data = get_roboflow_model_data(
                        api_key=self.api_key,
                        model_id=self.endpoint,
                        endpoint_type=ModelEndpointType.OWLV2,
                        device_id=self.device_id,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    api_data = api_data["owlv2"]
                    if "model" not in api_data:
                        raise ModelArtefactError(
                            "Could not find `model` key in roboflow API model description response."
                        )
                    logger.info("Downloading OWLv2 model weights for %s", self.endpoint)
                    model_weights_response = get_from_url(
                        api_data["model"], json_response=False
                    )
                else:
                    logger.info("Getting OWLv2 model data for %s", self.endpoint)
                    api_data = get_roboflow_instant_model_data(
                        api_key=self.api_key,
                        model_id=self.endpoint,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    if (
                        "modelFiles" not in api_data
                        or "owlv2" not in api_data["modelFiles"]
                        or "model" not in api_data["modelFiles"]["owlv2"]
                    ):
                        raise ModelArtefactError(
                            "Could not find `modelFiles` key or `modelFiles`.`owlv2` or `modelFiles`.`owlv2`.`model` key in roboflow API model description response."
                        )
                    logger.info("Downloading OWLv2 model weights for %s", self.endpoint)
                    model_weights_response = get_from_url(
                        api_data["modelFiles"]["owlv2"]["model"], json_response=False
                    )
                save_bytes_in_cache(
                    content=model_weights_response.content,
                    file=self.weights_file,
                    model_id=self.endpoint,
                )
                logger.info("OWLv2 model weights saved to cache")
        except Exception as e:
            # Log for visibility, then let the caller see the original error.
            logger.error("Error downloading OWLv2 model artifacts: %s", e)
            raise
        finally:
            # NOTE(review): this removes the lock file even when the lock was
            # never acquired (e.g. on timeout) — confirm that is intended.
            try:
                if os.path.exists(lock_file):
                    os.unlink(lock_file)  # Clean up lock file
            except OSError:
                pass  # Best effort cleanup

    def load_model_artifacts_from_cache(self):
        """Load the cached payload and wire it to the shared OwlV2 instance.

        NOTE(security): the payload is unpickled with ``weights_only=False``;
        presumably this is the file stored by
        ``download_model_artifacts_from_roboflow_api`` — it must be trusted.
        """
        self.model_data = self._load_serialized_data(
            self.cache_file(self.weights_file)
        )
        self.class_names = self.model_data["class_names"]
        self.train_data_dict = self.model_data["train_data_dict"]
        self.huggingface_id = self.model_data["huggingface_id"]
        self.roboflow_id = self.model_data["roboflow_id"]
        # Use the same cached OwlV2 instance mechanism to avoid creating duplicates
        self.owlv2 = self.__class__.get_or_create_owlv2_instance(self.roboflow_id)
        self._install_image_embed_cache(self.owlv2, self.model_data["image_embeds"])

    @property
    def weights_file(self):
        """File name of the serialized payload (``weights_file_path``)."""
        return self.weights_file_path

    def infer(
        self,
        image,
        confidence: float = 0.99,
        iou_threshold: float = 0.3,
        max_detections: int = MAX_DETECTIONS,
        **kwargs,
    ):
        """Run detection on ``image`` using the loaded class embeddings.

        All arguments are forwarded to ``OwlV2.infer_from_embedding_dict``
        together with ``self.train_data_dict``; its result is returned as is.
        """
        logger.debug("Inferring OWLv2 model")
        result = self.owlv2.infer_from_embedding_dict(
            image,
            self.train_data_dict,
            confidence=confidence,
            iou_threshold=iou_threshold,
            max_detections=max_detections,
            **kwargs,
        )
        logger.debug("OWLv2 model inference complete")
        return result

    def draw_predictions(
        self,
        inference_request: ObjectDetectionInferenceRequest,
        inference_response: ObjectDetectionInferenceResponse,
    ):
        """Draw the response's predictions by delegating to the OwlV2 instance."""
        return self.owlv2.draw_predictions(
            inference_request,
            inference_response,
        )

    def save_small_model_without_image_embeds(
        self, save_dir: str = os.path.join(MODEL_CACHE_DIR, "owl-v2-serialized-data")
    ):
        """Save the payload with an empty ``image_embeds`` mapping.

        Also replaces the image embedding cache of the shared OwlV2 instance
        with an empty one.

        Returns:
            Path of the written payload file.
        """
        self.owlv2.cpu_image_embed_cache = LimitedSizeDict(
            size_limit=CPU_IMAGE_EMBED_CACHE_SIZE
        )
        return self.save_model(
            self.huggingface_id,
            self.roboflow_id,
            self.train_data_dict,
            {},
            save_dir,
        )

Methods:¶

get_or_create_owlv2_instance `classmethod` ¶

get_or_create_owlv2_instance(roboflow_id)

Get an existing OwlV2 instance from cache or create a new one if it doesn't exist.

Parameters:

Name	Type	Description	Default
`roboflow_id`	`str`	The model ID for the OwlV2 model	required

Returns:

Type	Description
`OwlV2`	An OwlV2 instance

Source code in inference/models/owlv2/owlv2.py

@classmethod
def get_or_create_owlv2_instance(cls, roboflow_id: str) -> OwlV2:
    """Return the shared OwlV2 instance for ``roboflow_id``, creating it on first use.

    Instances are kept in the class-level ``_base_owlv2_instances`` mapping,
    keyed by model ID.

    Args:
        roboflow_id: The model ID for the OwlV2 model

    Returns:
        An OwlV2 instance
    """
    instances = cls._base_owlv2_instances
    if roboflow_id not in instances:
        # First request for this ID: build the model and remember it.
        instances[roboflow_id] = OwlV2(model_id=roboflow_id)
    return instances[roboflow_id]

Functions:¶

preprocess_image ¶

preprocess_image(
    np_image, image_size, image_mean, image_std
)

Preprocess an image for OWLv2 by resizing, normalizing, and padding it. This is much faster than using the Owlv2Processor directly, as we ensure we use GPU if available.

Parameters:

Name	Type	Description	Default
`np_image`	`ndarray`	The image to preprocess, with shape (H, W, 3)	required
`image_size`	`tuple[int, int]`	The target size of the image	required
`image_mean`	`Tensor`	The mean of the image, on DEVICE, with shape (1, 3, 1, 1)	required
`image_std`	`Tensor`	The standard deviation of the image, on DEVICE, with shape (1, 3, 1, 1)	required

Returns:

Type	Description
`Tensor`	torch.Tensor: The preprocessed image, on DEVICE, with shape (1, 3, H, W)

Source code in inference/models/owlv2/owlv2.py

def preprocess_image(
    np_image: np.ndarray,
    image_size: Tuple[int, int],
    image_mean: torch.Tensor,
    image_std: torch.Tensor,
) -> torch.Tensor:
    """Preprocess an image for OWLv2 by resizing, normalizing, and padding it.
    This is much faster than using the Owlv2Processor directly, as we ensure we use GPU if available.

    The image is scaled (aspect ratio preserved) to fit inside ``image_size``,
    placed in the top-left corner of a canvas filled with 0.5, and the whole
    canvas is then normalized with ``image_mean`` / ``image_std``.

    Args:
        np_image (np.ndarray): The image to preprocess, with shape (H, W, 3)
        image_size (tuple[int, int]): The target (height, width) of the image
        image_mean (torch.Tensor): The mean of the image, on DEVICE, with shape (1, 3, 1, 1)
        image_std (torch.Tensor): The standard deviation of the image, on DEVICE, with shape (1, 3, 1, 1)

    Returns:
        torch.Tensor: The preprocessed image, on DEVICE, with shape (1, 3, H, W)
    """
    current_size = np_image.shape[:2]

    r = min(image_size[0] / current_size[0], image_size[1] / current_size[1])
    # Clamp to one pixel: for extreme aspect ratios the truncation would
    # otherwise produce a zero-sized side, which F.interpolate rejects.
    target_size = (
        max(1, int(r * current_size[0])),
        max(1, int(r * current_size[1])),
    )

    torch_image = (
        torch.tensor(np_image)
        .permute(2, 0, 1)
        .unsqueeze(0)
        .to(DEVICE)
        .to(dtype=torch.float32)
        / 255.0
    )
    torch_image = F.interpolate(
        torch_image, size=target_size, mode="bilinear", align_corners=False
    )

    # Canvas pre-filled with 0.5; the resized image overwrites its top-left region.
    padded_image_tensor = torch.ones((1, 3, *image_size), device=DEVICE) * 0.5
    padded_image_tensor[:, :, : torch_image.shape[2], : torch_image.shape[3]] = (
        torch_image
    )

    padded_image_tensor = (padded_image_tensor - image_mean) / image_std

    return padded_image_tensor

`models/paligemma`¶

inference.models.paligemma.paligemma ¶

Classes¶

LoRAPaliGemma ¶

Bases: LoRATransformerModel

By using this model, you agree to the terms listed at https://ai.google.dev/gemma/terms

Source code in inference/models/paligemma/paligemma.py

class LoRAPaliGemma(LoRATransformerModel):
    """PaliGemma base model combined with a LoRA adapter.

    By using this model you agree to the terms listed at https://ai.google.dev/gemma/terms
    """

    generation_includes_input = True
    transformers_class = PaliGemmaForConditionalGeneration
    load_base_from_roboflow = True

    def initialize_model(self, **kwargs):
        """Load the base model, apply the LoRA adapter and build the processor.

        The adapter configuration is read from ``self.cache_dir``. Its
        ``revision`` is also tried as a ``torch`` dtype name and, when it
        resolves, replaces ``self.dtype``. The base model comes either from
        Roboflow (``load_base_from_roboflow``) or from the Hugging Face id
        stored in the adapter configuration.
        """
        import torch

        lora_config = LoraConfig.from_pretrained(self.cache_dir, device_map=DEVICE)
        base_model_id = lora_config.base_model_name_or_path
        revision = lora_config.revision
        if revision is not None:
            try:
                self.dtype = getattr(torch, revision)
            except AttributeError:
                # Not a torch attribute name: keep the current dtype.
                pass

        if self.load_base_from_roboflow:
            # The Roboflow-provided base is addressed by the returned id alone,
            # so neither a revision nor a token is passed on.
            model_load_id = self.get_lora_base_from_roboflow(base_model_id, revision)
            cache_dir = model_load_id
            revision = None
            token = None
        else:
            model_load_id = base_model_id
            cache_dir = os.path.join(MODEL_CACHE_DIR, "huggingface")
            token = self.huggingface_token

        loaded_base = self.transformers_class.from_pretrained(
            model_load_id,
            revision=revision,
            device_map=DEVICE,
            cache_dir=cache_dir,
            token=token,
            attn_implementation=_get_paligemma_attn_implementation(),
        )
        self.base_model = loaded_base.to(self.dtype)

        adapted = PeftModel.from_pretrained(self.base_model, self.cache_dir)
        self.model = adapted.eval().to(self.dtype)

        self.model.merge_and_unload()

        self.processor = self.processor_class.from_pretrained(
            model_load_id, revision=revision, cache_dir=cache_dir, token=token
        )

PaliGemma ¶

Bases: TransformerModel

By using this model, you agree to the terms listed at https://ai.google.dev/gemma/terms

Source code in inference/models/paligemma/paligemma.py

class PaliGemma(TransformerModel):
    """PaliGemma model wrapper.

    By using this model you agree to the terms listed at https://ai.google.dev/gemma/terms
    """

    generation_includes_input = True
    transformers_class = PaliGemmaForConditionalGeneration

    def initialize_model(self, **kwargs):
        """Load the model weights and the matching processor.

        Both are loaded from ``self.cache_dir`` when ``load_base_from_roboflow``
        is set, and from ``self.dataset_id`` otherwise. The model is switched
        to eval mode and cast to ``self.dtype``.
        """
        source = self.cache_dir if self.load_base_from_roboflow else self.dataset_id

        loaded = self.transformers_class.from_pretrained(
            source,
            cache_dir=self.cache_dir,
            device_map=DEVICE,
            token=self.huggingface_token,
            torch_dtype=self.default_dtype,
            attn_implementation=_get_paligemma_attn_implementation(),
        )
        self.model = loaded.eval().to(self.dtype)

        self.processor = self.processor_class.from_pretrained(
            source, cache_dir=self.cache_dir, token=self.huggingface_token
        )

`models/perception_encoder`¶

inference.models.perception_encoder.perception_encoder ¶

Classes¶

PerceptionEncoder ¶

Bases: RoboflowCoreModel

Roboflow Perception Encoder model implementation.

This class is responsible for handling the Perception Encoder model, including loading the model, preprocessing the input, and performing inference.

Attributes:

Name	Type	Description
`model`	`CLIP`	The PE-CLIP model instance.
`preprocess`	`function`	Function to preprocess the image.
`tokenizer`	`function`	Function to tokenize text.
`device`	`str`	The device to run inference on (cuda/cpu).

Source code in inference/models/perception_encoder/perception_encoder.py

class PerceptionEncoder(RoboflowCoreModel):
    """Roboflow Perception Encoder model implementation.

    This class is responsible for handling the Perception Encoder model, including
    loading the model, preprocessing the input, and performing inference.

    Attributes:
        model (pe.CLIP): The PE-CLIP model instance.
        preprocessor (function): Function to preprocess the image.
        tokenizer (function): Function to tokenize text.
        device (str): The device to run inference on (cuda/cpu).
        task_type (str): Always ``"embedding"``.
    """

    def __init__(
        self,
        model_id: str = PERCEPTION_ENCODER_MODEL_ID,
        device: str = DEVICE,
        *args,
        **kwargs,
    ):
        """Initializes the PerceptionEncoder with the given arguments and keyword arguments.

        Args:
            model_id (str): Model identifier. It is lower-cased for the base class,
                while its last ``/``-separated segment (original casing) selects the model config.
            device (str): Device the model is moved to.
        """
        super().__init__(model_id=model_id.lower(), *args, **kwargs)
        self.device = device
        self.log("Creating PE-CLIP model")
        # Parse model config from model_id (format: perception-encoder/PE-Core-L14-336)
        model_config = model_id.split("/")[-1]
        checkpoint_path = os.path.join(self.cache_dir, "model.pt")
        self.model = pe.CLIP.from_config(
            model_config, pretrained=True, checkpoint_path=checkpoint_path
        )
        self.model = self.model.to(device)
        self.model.eval()

        self.preprocessor = transforms.get_image_transform(self.model.image_size)
        self.tokenizer = transforms.get_text_tokenizer(self.model.context_length)

        self.task_type = "embedding"

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference."""
        return ["model.pt"]  # The checkpoint that __init__ loads from the cache directory

    def initialize_model(self, **kwargs) -> None:
        """Initialize the model. Not needed for PE-CLIP as it's loaded in __init__."""
        pass

    def preproc_image(self, image: InferenceRequestImage) -> torch.Tensor:
        """Preprocesses an inference request image.

        The image is loaded as RGB, run through ``self.preprocessor`` and
        given a leading batch dimension.
        """
        pil_image = Image.fromarray(load_image_rgb(image))
        preprocessed_image = self.preprocessor(pil_image)
        return preprocessed_image.unsqueeze(0)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[torch.Tensor, PreprocessReturnMetadata]:
        """Preprocess ``image`` and pair it with empty preprocessing metadata."""
        return self.preproc_image(image), PreprocessReturnMetadata({})

    def compare(
        self,
        subject: Any,
        prompt: Any,
        subject_type: str = "image",
        prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
        **kwargs,
    ) -> Union[List[float], Dict[str, float]]:
        """
        Compares the subject with the prompt to calculate similarity scores.

        Args:
            subject (Any): The subject data to be compared. Can be either an image or text.
            prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.
            subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
            prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Only the strings "image" and "text" are accepted by the current implementation. Defaults to "text".
            **kwargs: Additional keyword arguments.

        Returns:
            Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s).

        Raises:
            ValueError: If ``subject_type`` or ``prompt_type`` is neither "image" nor "text",
                or if more than ``CLIP_MAX_BATCH_SIZE`` prompts are given.
        """
        if subject_type == "image":
            subject_embeddings = self.embed_image(subject)
        elif subject_type == "text":
            subject_embeddings = self.embed_text(subject)
        else:
            raise ValueError(
                f"subject_type must be either 'image' or 'text', but got {subject_type}"
            )

        # A dict holding both "type" and "value" is treated as one prompt,
        # not as a mapping of named prompts.
        if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
            prompt_keys = prompt.keys()
            prompt = [prompt[k] for k in prompt_keys]
            prompt_obj = "dict"
        else:
            if not isinstance(prompt, list):
                prompt = [prompt]
            prompt_obj = "list"

        if len(prompt) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
            )

        if prompt_type == "image":
            prompt_embeddings = self.embed_image(prompt)
        elif prompt_type == "text":
            prompt_embeddings = self.embed_text(prompt)
        else:
            raise ValueError(
                f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
            )

        similarities = [
            cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
        ]

        if prompt_obj == "dict":
            similarities = dict(zip(prompt_keys, similarities))

        return similarities

    def make_compare_response(
        self, similarities: Union[List[float], Dict[str, float]]
    ) -> PerceptionEncoderCompareResponse:
        """Creates a PerceptionEncoderCompareResponse object from the provided similarity data."""
        response = PerceptionEncoderCompareResponse(similarity=similarities)
        return response

    def embed_image(
        self,
        image: Any,
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds an image or a list of images using the PE-CLIP model.

        Args:
            image (Any): The image or list of images to be embedded.
            **kwargs: Additional keyword arguments.

        Returns:
            np.ndarray: The embeddings of the image(s) as a numpy array.

        Raises:
            ValueError: If a list with more than ``CLIP_MAX_BATCH_SIZE`` images is given.
        """
        if isinstance(image, list):
            if len(image) > CLIP_MAX_BATCH_SIZE:
                raise ValueError(
                    f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
                )
            imgs = [self.preproc_image(i) for i in image]
            img_in = torch.cat(imgs, dim=0).to(self.device)
        else:
            img_in = self.preproc_image(image).to(self.device)

        with torch.inference_mode():
            if self.device == "cpu" or self.device == "mps":
                image_features, _, _ = self.model(img_in, None)
            else:
                # Autocast is only enabled for devices other than cpu/mps.
                with torch.autocast(self.device):
                    image_features, _, _ = self.model(img_in, None)
            # Convert to float32 before converting to numpy
            embeddings = image_features.float().cpu().numpy()

        return embeddings

    def embed_text(
        self,
        text: Union[str, List[str]],
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds a text or a list of texts using the PE-CLIP model.

        Texts are processed in batches of at most ``CLIP_MAX_BATCH_SIZE`` and
        the per-batch embeddings are concatenated along the first axis.

        Args:
            text (Union[str, List[str]]): The text string or list of text strings to be embedded.
            **kwargs: Additional keyword arguments.

        Returns:
            np.ndarray: The embeddings of the text or texts as a numpy array.
        """
        if isinstance(text, list):
            texts = text
        else:
            texts = [text]

        results = []
        for texts_batch in create_batches(
            sequence=texts, batch_size=CLIP_MAX_BATCH_SIZE
        ):
            tokenized = self.tokenizer(texts_batch).to(self.device)
            # No autocast on cpu/mps; autocast on every other device
            if self.device == "cpu" or self.device == "mps":
                with torch.no_grad():
                    _, text_features, _ = self.model(None, tokenized)
            else:
                with torch.inference_mode(), torch.autocast(self.device):
                    _, text_features, _ = self.model(None, tokenized)

            # Convert to float32 before converting to numpy
            embeddings = text_features.float().cpu().numpy()
            results.append(embeddings)

        return np.concatenate(results, axis=0)

    def predict(self, img_in: torch.Tensor, **kwargs) -> Tuple[np.ndarray]:
        """Predict embeddings for an input tensor.

        Args:
            img_in (torch.Tensor): The input tensor to get embeddings for.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[np.ndarray]: A tuple containing the embeddings as a numpy array.
        """
        img_in = img_in.to(self.device)
        if self.device == "cpu" or self.device == "mps":
            with torch.inference_mode():
                image_features, _, _ = self.model(img_in, None)
        else:
            with torch.inference_mode(), torch.autocast(self.device):
                image_features, _, _ = self.model(img_in, None)

        embeddings = image_features.float().cpu().numpy()
        return (embeddings,)

    def make_embed_image_response(
        self, embeddings: np.ndarray
    ) -> PerceptionEncoderEmbeddingResponse:
        """Converts the given embeddings into a PerceptionEncoderEmbeddingResponse object."""
        response = PerceptionEncoderEmbeddingResponse(embeddings=embeddings.tolist())
        return response

    def make_embed_text_response(
        self, embeddings: np.ndarray
    ) -> PerceptionEncoderEmbeddingResponse:
        """Converts the given text embeddings into a PerceptionEncoderEmbeddingResponse object."""
        response = PerceptionEncoderEmbeddingResponse(embeddings=embeddings.tolist())
        return response

    def infer_from_request(
        self, request: PerceptionEncoderInferenceRequest
    ) -> PerceptionEncoderEmbeddingResponse:
        """Routes the request to the appropriate inference function.

        The request's fields are passed as keyword arguments to the selected
        function, and the elapsed time is stored on the response.

        Raises:
            ValueError: If ``request`` is not one of the supported request types.
        """
        t1 = perf_counter()
        if isinstance(request, PerceptionEncoderImageEmbeddingRequest):
            infer_func = self.embed_image
            make_response_func = self.make_embed_image_response
        elif isinstance(request, PerceptionEncoderTextEmbeddingRequest):
            infer_func = self.embed_text
            make_response_func = self.make_embed_text_response
        elif isinstance(request, PerceptionEncoderCompareRequest):
            infer_func = self.compare
            make_response_func = self.make_compare_response
        else:
            raise ValueError(
                f"Request type {type(request)} is not a valid PerceptionEncoderInferenceRequest"
            )
        data = infer_func(**request.dict())
        response = make_response_func(data)
        response.time = perf_counter() - t1
        return response

    def make_response(self, embeddings, *args, **kwargs) -> InferenceResponse:
        """Wrap ``embeddings`` in a one-element list of embedding responses."""
        return [self.make_embed_image_response(embeddings)]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Wrap the first element of ``predictions`` in a one-element list of embedding responses."""
        return [self.make_embed_image_response(predictions[0])]

    def infer(self, image: Any, **kwargs) -> Any:
        """Embeds an image"""
        return super().infer(image, **kwargs)

Methods:¶

__init__ ¶

__init__(
    model_id=PERCEPTION_ENCODER_MODEL_ID,
    device=DEVICE,
    *args,
    **kwargs
)

Initializes the PerceptionEncoder with the given arguments and keyword arguments.

Source code in inference/models/perception_encoder/perception_encoder.py

def __init__(
    self,
    model_id: str = PERCEPTION_ENCODER_MODEL_ID,
    device: str = DEVICE,
    *args,
    **kwargs,
):
    """Initializes the PerceptionEncoder with the given arguments and keyword arguments.

    Args:
        model_id (str): Model identifier. It is lower-cased for the base class,
            while its last ``/``-separated segment (original casing) selects the model config.
        device (str): Device the model is moved to.
    """
    super().__init__(model_id=model_id.lower(), *args, **kwargs)
    self.device = device
    self.log("Creating PE-CLIP model")
    # Parse model config from model_id (format: perception-encoder/PE-Core-L14-336)
    model_config = model_id.split("/")[-1]
    checkpoint_path = os.path.join(self.cache_dir, "model.pt")
    self.model = pe.CLIP.from_config(
        model_config, pretrained=True, checkpoint_path=checkpoint_path
    )
    self.model = self.model.to(device)
    self.model.eval()

    self.preprocessor = transforms.get_image_transform(self.model.image_size)
    self.tokenizer = transforms.get_text_tokenizer(self.model.context_length)

    self.task_type = "embedding"

compare ¶

compare(
    subject,
    prompt,
    subject_type="image",
    prompt_type="text",
    **kwargs
)

Compares the subject with the prompt to calculate similarity scores.

Parameters:

Name	Type	Description	Default
`subject`	`Any`	The subject data to be compared. Can be either an image or text.	required
`prompt`	`Any`	The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.	required
`subject_type`	`str`	Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".	`'image'`
`prompt_type`	`Union[str, List[str], Dict[str, Any]]`	Specifies the type of the prompt data. Can be "image", "text", list of these types, or a dictionary containing these types. Defaults to "text".	`'text'`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`Union[List[float], Dict[str, float]]`	Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s).

Source code in inference/models/perception_encoder/perception_encoder.py

def compare(
    self,
    subject: Any,
    prompt: Any,
    subject_type: str = "image",
    prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
    **kwargs,
) -> Union[List[float], Dict[str, float]]:
    """
    Scores how similar a subject is to one or more prompts.

    The subject and every prompt are embedded (as image or text) and the
    cosine similarity between the subject and each prompt is returned.

    Args:
        subject (Any): The subject data to be compared. Can be either an image or text.
        prompt (Any): A single prompt, a list of prompts, or a dictionary of named prompts. A dictionary holding both "type" and "value" keys counts as a single prompt.
        subject_type (str, optional): Either "image" or "text". Defaults to "image".
        prompt_type (Union[str, List[str], Dict[str, Any]], optional): Either "image" or "text"; any other value is rejected. Defaults to "text".
        **kwargs: Additional keyword arguments.

    Returns:
        Union[List[float], Dict[str, float]]: Similarity scores, as a list in prompt order, or as a dictionary keyed like ``prompt`` when named prompts were given.

    Raises:
        ValueError: For an unsupported ``subject_type`` / ``prompt_type`` or when more than ``CLIP_MAX_BATCH_SIZE`` prompts are passed.
    """
    # The subject is embedded first, so an invalid subject_type fails before
    # the prompt is inspected at all.
    if subject_type == "image":
        subject_embeddings = self.embed_image(subject)
    elif subject_type == "text":
        subject_embeddings = self.embed_text(subject)
    else:
        raise ValueError(
            f"subject_type must be either 'image' or 'text', but got {subject_type}"
        )

    named_prompts = isinstance(prompt, dict) and (
        "type" not in prompt or "value" not in prompt
    )
    if named_prompts:
        prompt_names = prompt.keys()
        prompt = [prompt[name] for name in prompt_names]
    elif not isinstance(prompt, list):
        prompt = [prompt]

    if len(prompt) > CLIP_MAX_BATCH_SIZE:
        raise ValueError(
            f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
        )

    if prompt_type == "image":
        prompt_embeddings = self.embed_image(prompt)
    elif prompt_type == "text":
        prompt_embeddings = self.embed_text(prompt)
    else:
        raise ValueError(
            f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
        )

    scores = []
    for prompt_embedding in prompt_embeddings:
        scores.append(cosine_similarity(subject_embeddings, prompt_embedding))

    if named_prompts:
        return dict(zip(prompt_names, scores))
    return scores

embed_image ¶

embed_image(image, **kwargs)

Embeds an image or a list of images using the PE-CLIP model.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image or list of images to be embedded.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the image(s) as a numpy array.

Source code in inference/models/perception_encoder/perception_encoder.py

def embed_image(
    self,
    image: Any,
    **kwargs,
) -> np.ndarray:
    """
    Embeds an image or a list of images using the PE-CLIP model.

    Every input is passed through ``self.preproc_image``; list inputs are
    concatenated along dimension 0 before the forward pass. On the "cpu" and
    "mps" devices the model runs under ``torch.inference_mode()`` only; on
    any other device ``torch.autocast`` is enabled as well.

    Args:
        image (Any): The image or list of images to be embedded.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        np.ndarray: The embeddings of the image(s) as a float32 numpy array.

    Raises:
        ValueError: If a list with more than ``CLIP_MAX_BATCH_SIZE`` images
            is passed.
    """
    if isinstance(image, list):
        if len(image) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
            )
        img_in = torch.cat([self.preproc_image(i) for i in image], dim=0)
    else:
        img_in = self.preproc_image(image)
    img_in = img_in.to(self.device)

    with torch.inference_mode():
        if self.device == "cpu" or self.device == "mps":
            image_features, _, _ = self.model(img_in, None)
        else:
            # Mixed precision is only enabled off the CPU-like devices.
            with torch.autocast(self.device):
                image_features, _, _ = self.model(img_in, None)
        # Convert to float32 before converting to numpy
        embeddings = image_features.float().cpu().numpy()

    return embeddings

embed_text ¶

embed_text(text, **kwargs)

Embeds a text or a list of texts using the PE-CLIP model.

Parameters:

Name	Type	Description	Default
`text`	`Union[str, List[str]]`	The text string or list of text strings to be embedded.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the text or texts as a numpy array.

Source code in inference/models/perception_encoder/perception_encoder.py

def embed_text(
    self,
    text: Union[str, List[str]],
    **kwargs,
) -> np.ndarray:
    """
    Embeds a text or a list of texts using the PE-CLIP model.

    A single string is wrapped into a one-element list. The texts are split
    with ``create_batches`` into chunks of at most ``CLIP_MAX_BATCH_SIZE``,
    each chunk is tokenized and embedded, and the per-chunk results are
    concatenated along axis 0.

    Args:
        text (Union[str, List[str]]): The text string or list of text strings to be embedded.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        np.ndarray: The embeddings of the text or texts as a numpy array.
    """
    texts = text if isinstance(text, list) else [text]

    per_batch = []
    for chunk in create_batches(sequence=texts, batch_size=CLIP_MAX_BATCH_SIZE):
        tokens = self.tokenizer(chunk).to(self.device)
        if self.device in ("cpu", "mps"):
            # CPU-like devices: gradient-free forward pass without autocast
            with torch.no_grad():
                _, text_features, _ = self.model(None, tokens)
        else:
            # Other devices: inference mode plus autocast
            with torch.inference_mode():
                with torch.autocast(self.device):
                    _, text_features, _ = self.model(None, tokens)

        # Cast to float32 first so the numpy conversion is always valid
        per_batch.append(text_features.float().cpu().numpy())

    return np.concatenate(per_batch, axis=0)

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Gets the list of files required for inference.

Source code in inference/models/perception_encoder/perception_encoder.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Gets the list of files required for inference.

    Returns:
        List[str]: A one-element list containing ``"model.pt"``.
    """
    # NOTE(review): an earlier comment here said no files are needed because
    # the model is downloaded from HuggingFace, yet "model.pt" is listed —
    # confirm which of the two is intended.
    required_files = ["model.pt"]
    return required_files

infer ¶

infer(image, **kwargs)

Embeds an image

Source code in inference/models/perception_encoder/perception_encoder.py

def infer(self, image: Any, **kwargs) -> Any:
    """Embeds an image

    Thin pass-through: ``image`` and ``**kwargs`` are forwarded unchanged to
    the parent class's ``infer`` and its result is returned as-is.
    """
    return super().infer(image, **kwargs)

infer_from_request ¶

infer_from_request(request)

Routes the request to the appropriate inference function.

Source code in inference/models/perception_encoder/perception_encoder.py

def infer_from_request(
    self, request: PerceptionEncoderInferenceRequest
) -> PerceptionEncoderEmbeddingResponse:
    """Routes the request to the appropriate inference function.

    The request class selects a pair of (inference function, response
    builder): image embedding, text embedding, or comparison. The request is
    expanded with ``request.dict()`` into keyword arguments, and the elapsed
    wall-clock time is stored on ``response.time``.

    Note that a compare request yields the object built by
    ``make_compare_response`` even though the return annotation names the
    embedding response type.

    Raises:
        ValueError: If the request is none of the three supported types.
    """
    started_at = perf_counter()
    routes = (
        (
            PerceptionEncoderImageEmbeddingRequest,
            self.embed_image,
            self.make_embed_image_response,
        ),
        (
            PerceptionEncoderTextEmbeddingRequest,
            self.embed_text,
            self.make_embed_text_response,
        ),
        (
            PerceptionEncoderCompareRequest,
            self.compare,
            self.make_compare_response,
        ),
    )
    # First matching route wins; the ``else`` runs only if nothing matched.
    for request_cls, infer_func, make_response_func in routes:
        if isinstance(request, request_cls):
            break
    else:
        raise ValueError(
            f"Request type {type(request)} is not a valid PerceptionEncoderInferenceRequest"
        )

    response = make_response_func(infer_func(**request.dict()))
    response.time = perf_counter() - started_at
    return response

initialize_model ¶

initialize_model(**kwargs)

Initialize the model. Not needed for PE-CLIP as it's loaded in init.

Source code in inference/models/perception_encoder/perception_encoder.py

def initialize_model(self, **kwargs) -> None:
    """Initialize the model. Not needed for PE-CLIP as it's loaded in __init__.

    Deliberate no-op: any keyword arguments are accepted and ignored.
    """
    return None

make_compare_response ¶

make_compare_response(similarities)

Creates a PerceptionEncoderCompareResponse object from the provided similarity data.

Source code in inference/models/perception_encoder/perception_encoder.py

def make_compare_response(
    self, similarities: Union[List[float], Dict[str, float]]
) -> PerceptionEncoderCompareResponse:
    """Creates a PerceptionEncoderCompareResponse object from the provided similarity data.

    Args:
        similarities: Similarity scores, passed through unchanged as the
            response's ``similarity`` field.
    """
    return PerceptionEncoderCompareResponse(similarity=similarities)

make_embed_image_response ¶

make_embed_image_response(embeddings)

Converts the given embeddings into a PerceptionEncoderEmbeddingResponse object.

Source code in inference/models/perception_encoder/perception_encoder.py

def make_embed_image_response(
    self, embeddings: np.ndarray
) -> PerceptionEncoderEmbeddingResponse:
    """Converts the given embeddings into a PerceptionEncoderEmbeddingResponse object.

    Args:
        embeddings: Array of embeddings; converted with ``tolist()`` into
            nested Python lists before being placed on the response.
    """
    as_lists = embeddings.tolist()
    return PerceptionEncoderEmbeddingResponse(embeddings=as_lists)

make_embed_text_response ¶

make_embed_text_response(embeddings)

Converts the given text embeddings into a PerceptionEncoderEmbeddingResponse object.

Source code in inference/models/perception_encoder/perception_encoder.py

def make_embed_text_response(
    self, embeddings: np.ndarray
) -> PerceptionEncoderEmbeddingResponse:
    """Converts the given text embeddings into a PerceptionEncoderEmbeddingResponse object.

    Args:
        embeddings: Array of text embeddings; converted with ``tolist()``
            into nested Python lists before being placed on the response.
    """
    as_lists = embeddings.tolist()
    return PerceptionEncoderEmbeddingResponse(embeddings=as_lists)

predict ¶

predict(img_in, **kwargs)

Predict embeddings for an input tensor.

Parameters:

Name	Type	Description	Default
`img_in`	`Tensor`	The input tensor to get embeddings for.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`Tuple[ndarray]`	Tuple[np.ndarray]: A tuple containing the embeddings as a numpy array.

Source code in inference/models/perception_encoder/perception_encoder.py

def predict(self, img_in: torch.Tensor, **kwargs) -> Tuple[np.ndarray]:
    """Predict embeddings for an input tensor.

    The tensor is moved to ``self.device`` and fed to the model with no text
    input. On "cpu" and "mps" only ``torch.inference_mode()`` is active; on
    any other device ``torch.autocast`` is enabled as well.

    Args:
        img_in (torch.Tensor): The input tensor to get embeddings for.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        Tuple[np.ndarray]: A one-element tuple holding the embeddings as a
        float32 numpy array.
    """
    device_input = img_in.to(self.device)

    with torch.inference_mode():
        if self.device in ("cpu", "mps"):
            image_features, _, _ = self.model(device_input, None)
        else:
            with torch.autocast(self.device):
                image_features, _, _ = self.model(device_input, None)

    # Cast to float32 before handing the data to numpy
    return (image_features.float().cpu().numpy(),)

preproc_image ¶

preproc_image(image)

Preprocesses an inference request image.

Source code in inference/models/perception_encoder/perception_encoder.py

def preproc_image(self, image: InferenceRequestImage) -> torch.Tensor:
    """Preprocesses an inference request image.

    The image is loaded via ``load_image_rgb``, wrapped in a PIL image, run
    through ``self.preprocessor``, and given a new leading dimension with
    ``unsqueeze(0)``.
    """
    rgb_array = load_image_rgb(image)
    tensor = self.preprocessor(Image.fromarray(rgb_array))
    return tensor.unsqueeze(0)

Functions:¶

inference.models.perception_encoder.perception_encoder_inference_models ¶

Classes¶

InferenceModelsPerceptionEncoderAdapter ¶

Bases: Model

Roboflow Perception Encoder model implementation.

This class is responsible for handling the Perception Encoder model, including loading the model, preprocessing the input, and performing inference.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

class InferenceModelsPerceptionEncoderAdapter(Model):
    """Roboflow Perception Encoder model implementation.

    Wraps a model obtained from ``AutoModel.from_pretrained`` (kept as
    ``self._model``) and exposes image embedding, text embedding and
    subject/prompt comparison, plus the preprocess / predict / postprocess
    hooks used by the base ``Model``.
    """

    def __init__(
        self, model_id: str = PERCEPTION_ENCODER_MODEL_ID, api_key: str = None, **kwargs
    ):
        """Load the underlying model.

        Args:
            model_id (str): Model identifier. When it starts with
                ``perception_encoder/`` that substring is replaced by
                ``perception-encoder/``.
            api_key (str, optional): API key; falls back to ``API_KEY`` when
                falsy.
            **kwargs: ``countinference`` and ``service_secret`` are read to
                build the extra weights-provider headers; all keyword
                arguments are also forwarded to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        if model_id.startswith("perception_encoder/"):
            model_id = model_id.replace("perception_encoder/", "perception-encoder/")

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.api_key = api_key or API_KEY
        self.task_type = "embedding"

        provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Every valid backend that has not been explicitly disabled.
        enabled_backends = VALID_INFERENCE_MODELS_BACKENDS.difference(
            DISABLED_INFERENCE_MODELS_BACKENDS
        )
        self._model: PerceptionEncoderTorch = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=provider_headers,
            backend=list(enabled_backends),
            **kwargs,
        )

    def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
        """Preprocesses an inference request image by loading it with ``load_image_bgr``."""
        loaded = load_image_bgr(image)
        return loaded

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[torch.Tensor, PreprocessReturnMetadata]:
        """Load the image and pair it with empty preprocessing metadata.

        The first element is whatever ``preproc_image`` returns (annotated
        there as ``np.ndarray``); the second is ``PreprocessReturnMetadata({})``.
        """
        loaded = self.preproc_image(image)
        metadata = PreprocessReturnMetadata({})
        return loaded, metadata

    def compare(
        self,
        subject: Any,
        prompt: Any,
        subject_type: str = "image",
        prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
        **kwargs,
    ) -> Union[List[float], Dict[str, float]]:
        """
        Compares the subject with the prompt to calculate similarity scores.

        Args:
            subject (Any): The subject data to be compared. Can be either an image or text.
            prompt (Any): The prompt data to be compared against the subject. Can be a
                single value, a list of values, or a dictionary of values. A dictionary
                containing both the keys "type" and "value" is treated as a single value.
            subject_type (str, optional): Must be either "image" or "text". Defaults to "image".
            prompt_type (optional): Despite the wider annotation, only the strings
                "image" and "text" are accepted here. Defaults to "text".
            **kwargs: Additional keyword arguments (not used by this method).

        Returns:
            Union[List[float], Dict[str, float]]: One ``cosine_similarity`` result per
            prompt; a dictionary keyed like the input when ``prompt`` was a dictionary
            of values, otherwise a list.

        Raises:
            ValueError: If ``subject_type`` or ``prompt_type`` is unsupported, or if
                more than ``CLIP_MAX_BATCH_SIZE`` prompts are given.
        """
        if subject_type not in ("image", "text"):
            raise ValueError(
                f"subject_type must be either 'image' or 'text', but got {subject_type}"
            )
        embed_subject = self.embed_image if subject_type == "image" else self.embed_text
        subject_embeddings = embed_subject(subject)

        # A dict is a keyed collection of prompts unless it looks like a
        # single {"type": ..., "value": ...} payload.
        keyed = isinstance(prompt, dict) and (
            "type" not in prompt or "value" not in prompt
        )
        if keyed:
            prompt_keys = prompt.keys()
            prompt = [prompt[key] for key in prompt_keys]
        elif not isinstance(prompt, list):
            prompt = [prompt]

        if len(prompt) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
            )

        if prompt_type not in ("image", "text"):
            raise ValueError(
                f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
            )
        embed_prompt = self.embed_image if prompt_type == "image" else self.embed_text
        prompt_embeddings = embed_prompt(prompt)

        scores = [
            cosine_similarity(subject_embeddings, embedding)
            for embedding in prompt_embeddings
        ]
        return dict(zip(prompt_keys, scores)) if keyed else scores

    def make_compare_response(
        self, similarities: Union[List[float], Dict[str, float]]
    ) -> PerceptionEncoderCompareResponse:
        """Creates a PerceptionEncoderCompareResponse object from the provided similarity data."""
        return PerceptionEncoderCompareResponse(similarity=similarities)

    def embed_image(
        self,
        image: Any,
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds an image or a list of images using the PE-CLIP model.

        Args:
            image (Any): The image or list of images to be embedded.
            **kwargs: Additional keyword arguments (not used by this method).

        Returns:
            np.ndarray: The embeddings of the image(s) as a numpy array.

        Raises:
            ValueError: If a list with more than ``CLIP_MAX_BATCH_SIZE`` images is passed.
        """
        if not isinstance(image, list):
            batch = [self.preproc_image(image)]
        else:
            if len(image) > CLIP_MAX_BATCH_SIZE:
                raise ValueError(
                    f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
                )
            batch = [self.preproc_image(item) for item in image]

        image_embeddings = self._model.embed_images(batch)
        return image_embeddings.cpu().numpy()

    def embed_text(
        self,
        text: Union[str, List[str]],
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds a text or a list of texts using the PE-CLIP model.

        Args:
            text (Union[str, List[str]]): The text string or list of text strings to be embedded.
            **kwargs: Additional keyword arguments (not used by this method).

        Returns:
            np.ndarray: The embeddings of the text or texts as a numpy array.

        Raises:
            ValueError: If more than ``CLIP_MAX_BATCH_SIZE`` texts are passed.
        """
        texts = text if isinstance(text, list) else [text]
        if len(texts) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of texts that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
            )
        text_embeddings = self._model.embed_text(texts)
        return text_embeddings.cpu().numpy()

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        """Predict embeddings for a preprocessed input.

        Args:
            img_in (np.ndarray): The input handed straight to
                ``self._model.embed_images``.
            **kwargs: Additional keyword arguments (not used by this method).

        Returns:
            Tuple[np.ndarray]: A one-element tuple containing the embeddings as a numpy array.
        """
        image_embeddings = self._model.embed_images(img_in)
        return (image_embeddings.cpu().numpy(),)

    def make_embed_image_response(
        self, embeddings: np.ndarray
    ) -> PerceptionEncoderEmbeddingResponse:
        """Converts the given embeddings into a PerceptionEncoderEmbeddingResponse object."""
        as_lists = embeddings.tolist()
        return PerceptionEncoderEmbeddingResponse(embeddings=as_lists)

    def make_embed_text_response(
        self, embeddings: np.ndarray
    ) -> PerceptionEncoderEmbeddingResponse:
        """Converts the given text embeddings into a PerceptionEncoderEmbeddingResponse object."""
        as_lists = embeddings.tolist()
        return PerceptionEncoderEmbeddingResponse(embeddings=as_lists)

    def infer_from_request(
        self, request: PerceptionEncoderInferenceRequest
    ) -> PerceptionEncoderEmbeddingResponse:
        """Routes the request to the appropriate inference function.

        The request is expanded with ``request.dict()`` into keyword
        arguments for the selected function, and the elapsed wall-clock time
        is stored on ``response.time``. A compare request yields the object
        built by ``make_compare_response``.

        Raises:
            ValueError: If the request is none of the three supported types.
        """
        started_at = perf_counter()
        routes = (
            (
                PerceptionEncoderImageEmbeddingRequest,
                self.embed_image,
                self.make_embed_image_response,
            ),
            (
                PerceptionEncoderTextEmbeddingRequest,
                self.embed_text,
                self.make_embed_text_response,
            ),
            (
                PerceptionEncoderCompareRequest,
                self.compare,
                self.make_compare_response,
            ),
        )
        # First matching route wins; the ``else`` runs only if nothing matched.
        for request_cls, infer_func, make_response_func in routes:
            if isinstance(request, request_cls):
                break
        else:
            raise ValueError(
                f"Request type {type(request)} is not a valid PerceptionEncoderInferenceRequest"
            )

        response = make_response_func(infer_func(**request.dict()))
        response.time = perf_counter() - started_at
        return response

    def make_response(self, embeddings, *args, **kwargs) -> InferenceResponse:
        """Wrap the embeddings in a one-element list of image-embedding responses."""
        response = self.make_embed_image_response(embeddings)
        return [response]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Turn ``predictions[0]`` into a one-element list of embedding responses.

        ``preprocess_return_metadata`` and ``**kwargs`` are not used here.
        """
        response = self.make_embed_image_response(predictions[0])
        return [response]

    def infer(self, image: Any, **kwargs) -> Any:
        """Embeds an image by delegating unchanged to the parent class ``infer``."""
        return super().infer(image, **kwargs)

Methods:¶

compare ¶

compare(
    subject,
    prompt,
    subject_type="image",
    prompt_type="text",
    **kwargs
)

Compares the subject with the prompt to calculate similarity scores.

Parameters:

Name	Type	Description	Default
`subject`	`Any`	The subject data to be compared. Can be either an image or text.	required
`prompt`	`Any`	The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.	required
`subject_type`	`str`	Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".	`'image'`
`prompt_type`	`Union[str, List[str], Dict[str, Any]]`	Specifies the type of the prompt data. Can be "image", "text", list of these types, or a dictionary containing these types. Defaults to "text".	`'text'`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`Union[List[float], Dict[str, float]]`	Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s).

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def compare(
    self,
    subject: Any,
    prompt: Any,
    subject_type: str = "image",
    prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
    **kwargs,
) -> Union[List[float], Dict[str, float]]:
    """
    Compares the subject with the prompt to calculate similarity scores.

    Args:
        subject (Any): The subject data to be compared. Can be either an image or text.
        prompt (Any): The prompt data to be compared against the subject. Can be a
            single value, a list of values, or a dictionary of values. A dictionary
            containing both the keys "type" and "value" is treated as a single value.
        subject_type (str, optional): Must be either "image" or "text". Defaults to "image".
        prompt_type (optional): Despite the wider annotation, only the strings
            "image" and "text" are accepted here. Defaults to "text".
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        Union[List[float], Dict[str, float]]: One ``cosine_similarity`` result per
        prompt; a dictionary keyed like the input when ``prompt`` was a dictionary
        of values, otherwise a list.

    Raises:
        ValueError: If ``subject_type`` or ``prompt_type`` is unsupported, or if
            more than ``CLIP_MAX_BATCH_SIZE`` prompts are given.
    """
    if subject_type not in ("image", "text"):
        raise ValueError(
            f"subject_type must be either 'image' or 'text', but got {subject_type}"
        )
    embed_subject = self.embed_image if subject_type == "image" else self.embed_text
    subject_embeddings = embed_subject(subject)

    # A dict is a keyed collection of prompts unless it looks like a single
    # {"type": ..., "value": ...} payload.
    keyed = isinstance(prompt, dict) and (
        "type" not in prompt or "value" not in prompt
    )
    if keyed:
        prompt_keys = prompt.keys()
        prompt = [prompt[key] for key in prompt_keys]
    elif not isinstance(prompt, list):
        prompt = [prompt]

    if len(prompt) > CLIP_MAX_BATCH_SIZE:
        raise ValueError(
            f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
        )

    if prompt_type not in ("image", "text"):
        raise ValueError(
            f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
        )
    embed_prompt = self.embed_image if prompt_type == "image" else self.embed_text
    prompt_embeddings = embed_prompt(prompt)

    scores = [
        cosine_similarity(subject_embeddings, embedding)
        for embedding in prompt_embeddings
    ]
    return dict(zip(prompt_keys, scores)) if keyed else scores

embed_image ¶

embed_image(image, **kwargs)

Embeds an image or a list of images using the PE-CLIP model.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image or list of images to be embedded.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the image(s) as a numpy array.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def embed_image(
    self,
    image: Any,
    **kwargs,
) -> np.ndarray:
    """
    Embeds an image or a list of images using the PE-CLIP model.

    Each image is loaded with ``self.preproc_image`` and the resulting list
    is passed to ``self._model.embed_images``.

    Args:
        image (Any): The image or list of images to be embedded.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        np.ndarray: The embeddings of the image(s) as a numpy array.

    Raises:
        ValueError: If a list with more than ``CLIP_MAX_BATCH_SIZE`` images is passed.
    """
    if not isinstance(image, list):
        batch = [self.preproc_image(image)]
    else:
        if len(image) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
            )
        batch = [self.preproc_image(item) for item in image]

    image_embeddings = self._model.embed_images(batch)
    return image_embeddings.cpu().numpy()

embed_text ¶

embed_text(text, **kwargs)

Embeds a text or a list of texts using the PE-CLIP model.

Parameters:

Name	Type	Description	Default
`text`	`Union[str, List[str]]`	The text string or list of text strings to be embedded.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`ndarray`	np.ndarray: The embeddings of the text or texts as a numpy array.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def embed_text(
    self,
    text: Union[str, List[str]],
    **kwargs,
) -> np.ndarray:
    """
    Embeds a text or a list of texts using the PE-CLIP model.

    A single string is wrapped into a one-element list before being passed
    to ``self._model.embed_text``.

    Args:
        text (Union[str, List[str]]): The text string or list of text strings to be embedded.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        np.ndarray: The embeddings of the text or texts as a numpy array.

    Raises:
        ValueError: If more than ``CLIP_MAX_BATCH_SIZE`` texts are passed.
    """
    texts = text if isinstance(text, list) else [text]
    if len(texts) > CLIP_MAX_BATCH_SIZE:
        raise ValueError(
            f"The maximum number of texts that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
        )
    text_embeddings = self._model.embed_text(texts)
    return text_embeddings.cpu().numpy()

infer ¶

infer(image, **kwargs)

Embeds an image

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def infer(self, image: Any, **kwargs) -> Any:
    """Embeds an image

    Thin pass-through: ``image`` and ``**kwargs`` are forwarded unchanged to
    the parent class's ``infer`` and its result is returned as-is.
    """
    return super().infer(image, **kwargs)

infer_from_request ¶

infer_from_request(request)

Routes the request to the appropriate inference function.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def infer_from_request(
    self, request: PerceptionEncoderInferenceRequest
) -> PerceptionEncoderEmbeddingResponse:
    """Routes the request to the appropriate inference function.

    The request class selects a pair of (inference function, response
    builder): image embedding, text embedding, or comparison. The request is
    expanded with ``request.dict()`` into keyword arguments, and the elapsed
    wall-clock time is stored on ``response.time``.

    Note that a compare request yields the object built by
    ``make_compare_response`` even though the return annotation names the
    embedding response type.

    Raises:
        ValueError: If the request is none of the three supported types.
    """
    started_at = perf_counter()
    routes = (
        (
            PerceptionEncoderImageEmbeddingRequest,
            self.embed_image,
            self.make_embed_image_response,
        ),
        (
            PerceptionEncoderTextEmbeddingRequest,
            self.embed_text,
            self.make_embed_text_response,
        ),
        (
            PerceptionEncoderCompareRequest,
            self.compare,
            self.make_compare_response,
        ),
    )
    # First matching route wins; the ``else`` runs only if nothing matched.
    for request_cls, infer_func, make_response_func in routes:
        if isinstance(request, request_cls):
            break
    else:
        raise ValueError(
            f"Request type {type(request)} is not a valid PerceptionEncoderInferenceRequest"
        )

    response = make_response_func(infer_func(**request.dict()))
    response.time = perf_counter() - started_at
    return response

make_compare_response ¶

make_compare_response(similarities)

Creates a PerceptionEncoderCompareResponse object from the provided similarity data.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def make_compare_response(
    self, similarities: Union[List[float], Dict[str, float]]
) -> PerceptionEncoderCompareResponse:
    """Creates a PerceptionEncoderCompareResponse object from the provided similarity data.

    Args:
        similarities: Similarity scores, passed through unchanged as the
            response's ``similarity`` field.
    """
    return PerceptionEncoderCompareResponse(similarity=similarities)

make_embed_image_response ¶

make_embed_image_response(embeddings)

Converts the given embeddings into a PerceptionEncoderEmbeddingResponse object.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def make_embed_image_response(
    self, embeddings: np.ndarray
) -> PerceptionEncoderEmbeddingResponse:
    """Converts the given embeddings into a PerceptionEncoderEmbeddingResponse object.

    Args:
        embeddings: Array of embeddings; converted with ``tolist()`` into
            nested Python lists before being placed on the response.
    """
    as_lists = embeddings.tolist()
    return PerceptionEncoderEmbeddingResponse(embeddings=as_lists)

make_embed_text_response ¶

make_embed_text_response(embeddings)

Converts the given text embeddings into a PerceptionEncoderEmbeddingResponse object.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def make_embed_text_response(
    self, embeddings: np.ndarray
) -> PerceptionEncoderEmbeddingResponse:
    """Converts the given text embeddings into a PerceptionEncoderEmbeddingResponse object.

    Args:
        embeddings: Array of text embeddings; converted with ``tolist()``
            into nested Python lists before being placed on the response.
    """
    as_lists = embeddings.tolist()
    return PerceptionEncoderEmbeddingResponse(embeddings=as_lists)

predict ¶

predict(img_in, **kwargs)

Predict embeddings for an input tensor.

Parameters:

Name	Type	Description	Default
`img_in`	`Tensor`	The input tensor to get embeddings for.	required
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
`Tuple[ndarray]`	Tuple[np.ndarray]: A tuple containing the embeddings as a numpy array.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
    """Predict embeddings for a preprocessed input.

    Args:
        img_in (np.ndarray): The input handed straight to
            ``self._model.embed_images``.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        Tuple[np.ndarray]: A one-element tuple containing the embeddings as a numpy array.
    """
    image_embeddings = self._model.embed_images(img_in)
    return (image_embeddings.cpu().numpy(),)

preproc_image ¶

preproc_image(image)

Preprocesses an inference request image.

Source code in inference/models/perception_encoder/perception_encoder_inference_models.py

def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
    """Preprocesses an inference request image by loading it with ``load_image_bgr``."""
    loaded = load_image_bgr(image)
    return loaded

Functions:¶

`models/perception_encoder/vision_encoder`¶

inference.models.perception_encoder.vision_encoder.config ¶

Include all available vision encoder configurations.

Classes¶

PEConfig `dataclass` ¶

Vision Tower Config.

Source code in inference/models/perception_encoder/vision_encoder/config.py

@dataclass
class PEConfig:
    """Vision Tower Config.

    Plain dataclass of vision-tower hyperparameters. The first six fields
    are required; every other field has a default.
    """

    # Required fields (no defaults).
    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    output_dim: Optional[int]

    # ``None`` is the default, so the annotation is Optional.
    ls_init_value: Optional[float] = None
    drop_path: float = 0.0

    # Plain int default (not a 1-tuple), so the value matches its annotation.
    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True

    pool_type: str = "attn"
    attn_pooler_heads: int = 8

    use_ln_pre: bool = True
    use_ln_post: bool = True

PETextConfig `dataclass` ¶

Text Tower Config.

Source code in inference/models/perception_encoder/vision_encoder/config.py

@dataclass
class PETextConfig:
    """Text Tower Config.

    Plain dataclass of text-tower hyperparameters. The first five fields are
    required; ``mlp_ratio`` and ``vocab_size`` have defaults.
    """

    # Required fields (no defaults).
    context_length: int
    width: int
    heads: int
    layers: int

    output_dim: int

    # Optional fields with defaults.
    mlp_ratio: float = 4.0
    vocab_size: int = 49408

inference.models.perception_encoder.vision_encoder.pe ¶

Classes¶

SelfAttention ¶

Bases: Module

Implements sequence packed attention and RoPE

Source code in inference/models/perception_encoder/vision_encoder/pe.py

class SelfAttention(nn.Module):
    r"""
    Implements sequence packed attention and RoPE.

    Multi-head self-attention with a fused q/k/v input projection
    (``in_proj_weight`` of shape ``(3 * embed_dim, embed_dim)`` and
    ``in_proj_bias`` of shape ``(3 * embed_dim,)``) and a linear output
    projection. An optional ``rope`` module, when given, is called as
    ``rope(q, k)`` and must return the transformed ``(q, k)`` pair.

    The projection parameters are allocated with ``torch.empty`` and are not
    initialised by ``__init__``; call ``init_tensors()`` or load weights
    before use.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        rope: Optional[nn.Module] = None,
    ):
        """Set up the projections.

        Args:
            embed_dim: Size of the last dimension of the input.
            num_heads: Number of attention heads; must divide ``embed_dim``.
            rope: Optional module applied to the queries and keys.
        """
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim

        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"

        # Parameter names mirror nn.MultiheadAttention (in_proj_weight,
        # in_proj_bias, out_proj) to stay compatible with it.
        self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
        self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

        self.rope = rope
        # Softmax scaling factor: 1 / sqrt(head_dim).
        self.scale = self.head_dim ** (-0.5)

    def init_tensors(self):
        """Xavier-initialise the input projection weight and zero both biases.

        ``out_proj.weight`` is left as created by ``nn.Linear``.
        """
        xavier_uniform_(self.in_proj_weight)
        constant_(self.in_proj_bias, 0.0)
        constant_(self.out_proj.bias, 0.0)

    def forward(self, x, attn_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor unpacked as ``(batch, seq, embed_dim)``.
            attn_mask: Accepted for interface compatibility.
                NOTE(review): this argument is not forwarded — the attention
                call below always receives ``attn_mask=None``. Confirm that
                ignoring the mask is intended.

        Returns:
            The output projection of the attention result, with the heads
            merged back into the last dimension.
        """
        batch, seq, embed_dim = x.shape
        proj = F.linear(x, self.in_proj_weight, self.in_proj_bias)

        # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
        # Shapes: (b, s, 3E) -> (b, s, 3, E) -> (1, b, s, 3, E)
        #         -> (3, b, s, 1, E) -> (3, b, s, E)
        proj = (
            proj.unflatten(-1, (3, embed_dim))
            .unsqueeze(0)
            .transpose(0, -2)
            .squeeze(-2)
            .contiguous()
        )
        q, k, v = proj[0], proj[1], proj[2]

        # Split the embedding dimension into heads: (b, s, h*d) -> (b, h, s, d)
        q = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
        k = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
        v = rearrange(v, "b s (h d) -> b h s d", h=self.num_heads)

        if self.rope:
            q, k = self.rope(q, k)

        attn = F.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=self.scale
        )
        # Merge the heads back: (b, h, s, d) -> (b, s, h*d)
        attn = rearrange(attn, "b h s d -> b s (h d)")

        return F.linear(attn, self.out_proj.weight, self.out_proj.bias)

Transformer ¶

Bases: Module

Source code in inference/models/perception_encoder/vision_encoder/pe.py

class Transformer(nn.Module):
    """Sequential stack of ``ResidualAttentionBlock`` layers.

    Attributes:
        width: Value of the ``width`` constructor argument.
        layers: Number of blocks currently held in ``resblocks``.
        grad_checkpointing: When true, blocks are run through ``checkpoint``
            (outside of TorchScript scripting).
        resblocks: ``nn.ModuleList`` with the residual attention blocks.
    """

    def __init__(
        self,
        width: int,
        layers: int,
        heads: int,
        mlp_ratio: float = 4.0,
        ls_init_value: float = None,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        drop_path: float = 0.0,
        rope: Optional[nn.Module] = None,
    ):
        super().__init__()
        self.width = width
        self.layers = layers
        self.grad_checkpointing = False

        # Every block is built from the same configuration; ``rope`` is the
        # same object for all of them.
        blocks = []
        for _ in range(layers):
            blocks.append(
                ResidualAttentionBlock(
                    width,
                    heads,
                    mlp_ratio,
                    ls_init_value=ls_init_value,
                    act_layer=act_layer,
                    norm_layer=norm_layer,
                    drop_path=drop_path,
                    rope=rope,
                )
            )
        self.resblocks = nn.ModuleList(blocks)

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Turn gradient checkpointing of the blocks on or off."""
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def truncate(self, layer_idx: int):
        """Delete layers so the last layer is the given layer index.

        ``layer_idx`` is reduced modulo the current layer count, so negative
        values count from the end.
        """
        last_kept = (self.layers + layer_idx) % self.layers
        self.layers = last_kept + 1
        self.resblocks = nn.ModuleList(self.resblocks[: self.layers])

    def forward(
        self,
        x: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        layer_idx: int = -1,
    ):
        """Apply the blocks in order and return the output of block ``layer_idx``.

        Args:
            x: Input passed to the first block.
            attn_mask: Forwarded unchanged to every block.
            layer_idx: Index of the last block to run, taken modulo the layer
                count (``-1`` runs the whole stack).

        Returns:
            The output of the last block that was run.
        """
        stop_idx = (self.layers + layer_idx) % self.layers

        for i, r in enumerate(self.resblocks):
            run_plain = not self.grad_checkpointing or torch.jit.is_scripting()
            if run_plain:
                x = r(x, attn_mask=attn_mask)
            else:
                # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
                # Arguments are positional here; presumably the block's
                # forward takes (q, k, v, attn_mask) - confirm against
                # ResidualAttentionBlock.
                x = checkpoint(r, x, None, None, attn_mask)

            if i == stop_idx:
                break

        return x

Methods:¶

truncate ¶

truncate(layer_idx)

Delete layers so the last layer is the given layer index.

Source code in inference/models/perception_encoder/vision_encoder/pe.py

@torch.jit.ignore
def truncate(self, layer_idx: int):
    """Delete layers so the last layer is the given layer index.

    ``layer_idx`` is reduced modulo the current layer count, so negative
    values count from the end. Updates ``self.layers`` and rebuilds
    ``self.resblocks`` with only the kept blocks.
    """
    last_kept = (self.layers + layer_idx) % self.layers
    self.layers = last_kept + 1
    kept_blocks = self.resblocks[: self.layers]
    self.resblocks = nn.ModuleList(kept_blocks)

VisionTransformer ¶

Bases: Module

Source code in inference/models/perception_encoder/vision_encoder/pe.py

class VisionTransformer(nn.Module):
    """Vision Transformer image encoder.

    Pipeline (see ``forward_features`` and ``forward``): non-overlapping patch
    embedding through ``conv1``, optional class token, optional absolute
    position embedding (bilinearly resampled when the input grid differs from
    the stored grid), optional 2D RoPE, ``ln_pre``, the ``Transformer`` stack,
    ``ln_post``, pooling, and an optional projection by ``proj``.
    """

    def __init__(
        self,
        patch_size: int,
        width: int,
        layers: int,
        heads: int,
        mlp_ratio: float,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = partial(nn.LayerNorm, eps=1e-5),
        use_ln_pre: bool = True,
        use_ln_post: bool = True,
        ls_init_value: Optional[float] = None,
        drop_path: float = 0.0,
        image_size: int = 448,  # Pretrain image size only; you can pass in any image size
        use_abs_posemb: bool = True,
        use_rope2d: bool = True,
        use_cls_token: bool = False,
        output_dim: Optional[int] = 1280,
        attn_pooler_heads: int = 8,
        pool_type: Literal["attn", "tok", "avg", "none"] = "attn",
    ):
        """Build the encoder and initialize its tensors.

        Args:
            patch_size: Kernel size and stride of the patch-embedding conv.
            width: Channel width of the token embeddings.
            layers: Number of transformer blocks.
            heads: Number of attention heads in the transformer blocks.
            mlp_ratio: Forwarded to ``Transformer``.
            act_layer: Activation constructor forwarded to sub-modules.
            norm_layer: Normalization constructor forwarded to sub-modules.
            use_ln_pre: Apply ``norm_layer`` before the transformer
                (``nn.Identity`` otherwise).
            use_ln_post: Apply ``norm_layer`` after the transformer
                (``nn.Identity`` otherwise).
            ls_init_value: Forwarded to ``Transformer``.
            drop_path: Forwarded to ``Transformer``.
            image_size: Image size that determines the stored absolute
                position embedding grid.
            use_abs_posemb: Add a learned absolute position embedding.
            use_rope2d: Create a ``Rope2D`` helper and pass it to the
                transformer.
            use_cls_token: Prepend a learned class token.
            output_dim: Size of the final projection; ``None`` disables the
                projection and makes ``output_dim`` equal to ``width``.
            attn_pooler_heads: Head count of the attention pooler.
            pool_type: One of ``"attn"``, ``"tok"``, ``"avg"``, ``"none"``.
        """
        super().__init__()
        assert pool_type in ("attn", "tok", "avg", "none")
        self.pool_type = pool_type
        self.patch_size = patch_size

        self.output_dim = output_dim or width
        self.proj_dim = output_dim
        self.heads = heads
        self.width = width
        self.layers = layers

        self.use_abs_posemb = use_abs_posemb
        self.use_cls_token = use_cls_token
        self.use_rope2d = use_rope2d
        self.image_size = image_size

        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
        )
        # ``None`` when RoPE is disabled; every use of ``self.rope`` must
        # tolerate that.
        self.rope = (
            Rope2D(
                dim=width // heads,
                use_cls_token=self.use_cls_token,
            )
            if self.use_rope2d
            else None
        )

        self.ln_pre = norm_layer(width) if use_ln_pre else nn.Identity()
        self.ln_post = norm_layer(self.width) if use_ln_post else nn.Identity()

        self.transformer = Transformer(
            width,
            layers,
            heads,
            mlp_ratio,
            ls_init_value=ls_init_value,
            act_layer=act_layer,
            norm_layer=norm_layer,
            drop_path=drop_path,
            rope=self.rope,
        )

        if pool_type == "attn":
            self.attn_pool = AttentionPooling(
                embed_dim=width,
                num_heads=attn_pooler_heads,
                act_layer=act_layer,
                norm_layer=norm_layer,
            )
        else:
            self.attn_pool = None

        self.init_tensors()

    def init_tensors(self):
        """Initialize sub-module tensors and create the learned embeddings.

        Recursively calls ``init_tensors()`` on every child module that
        defines it, initializes the RoPE helper when one exists, and then
        (re)creates ``class_embedding``, ``positional_embedding`` and ``proj``
        according to the configuration flags.
        """

        def init_submodule_tensors(module):
            for name, child in module.named_children():
                if hasattr(child, "init_tensors"):
                    logger.debug(f"Initializing tensors for submodule: {name}")
                    child.init_tensors()
                init_submodule_tensors(child)

        init_submodule_tensors(self)
        # The RoPE helper is initialized explicitly (the recursion above only
        # walks ``named_children``). It is ``None`` when ``use_rope2d`` is
        # False, so guard the call instead of raising AttributeError.
        if self.rope is not None:
            self.rope.init_tensors()

        # class embeddings and positional embeddings
        init_scale = self.width**-0.5

        if self.use_cls_token:
            self.class_embedding = nn.Parameter(init_scale * torch.randn(self.width))

        if self.use_abs_posemb:
            self.posemb_grid_size = self.image_size // self.patch_size
            self.positional_embedding = nn.Parameter(
                init_scale
                * torch.randn(
                    int(self.use_cls_token) + self.posemb_grid_size**2, self.width
                )
            )

        if self.proj_dim is not None:
            self.proj = nn.Parameter(
                init_scale * torch.randn(self.width, self.proj_dim)
            )

    def load_ckpt(self, ckpt_path: str):
        """Load weights from ``ckpt_path`` non-strictly and log key mismatches.

        The checkpoint may be a raw state dict or wrap one under
        ``"state_dict"`` or ``"weights"``. ``"module."`` substrings are
        removed from keys, and if any key starts with ``"visual."`` only keys
        containing ``"visual"`` are kept, with ``"visual."`` removed.

        Args:
            ckpt_path: Path of the checkpoint file.
        """
        # Load onto CPU so checkpoints saved from an accelerator can be read
        # on machines without one; load_state_dict copies the values into the
        # existing parameters afterwards.
        _sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)
        if "state_dict" in _sd:
            _sd = _sd["state_dict"]
        elif "weights" in _sd:
            _sd = _sd["weights"]

        # for backwards compatibility
        _sd = {k.replace("module.", ""): v for k, v in _sd.items()}
        if any(k.startswith("visual.") for k in _sd):
            _sd = {k.replace("visual.", ""): v for k, v in _sd.items() if "visual" in k}

        m, u = self.load_state_dict(_sd, strict=False)
        # Reported through the logger only; library code should not also
        # write the same lines to stdout.
        logger.info(f"Missing keys for loading vision encoder: {m}")
        logger.info(f"Unexpected keys for loading vision encoder: {u}")

    def truncate(self, layer_idx: int):
        """Delete layers so the last layer is the given layer index."""
        self.transformer.truncate(layer_idx)
        self.layers = self.transformer.layers

    @classmethod
    def from_config(
        cls,
        name: str,
        pretrained: bool = False,
        checkpoint_path: Optional[str] = None,
        **kwdargs,
    ):
        """Build a model from a named entry of ``PE_VISION_CONFIG``.

        Args:
            name: Key into ``PE_VISION_CONFIG``.
            pretrained: When true, load the checkpoint resolved by
                ``fetch_pe_checkpoint(name, checkpoint_path)``.
            checkpoint_path: Forwarded to ``fetch_pe_checkpoint``.
            **kwdargs: Overrides applied on top of the named config.

        Returns:
            The constructed model.

        Raises:
            RuntimeError: If ``name`` is not a known config.
        """
        if name not in PE_VISION_CONFIG:
            raise RuntimeError(f"{name} not found in configs.")

        args = asdict(PE_VISION_CONFIG[name])
        args.update(kwdargs)

        model = cls(**args)
        if pretrained:
            model.load_ckpt(fetch_pe_checkpoint(name, checkpoint_path))

        return model

    @classmethod
    def available_configs(cls):
        """Return the names of all entries in ``PE_VISION_CONFIG``."""
        return list(PE_VISION_CONFIG.keys())

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Turn gradient checkpointing of the transformer on or off."""
        self.transformer.set_grad_checkpointing(enable=enable)

    def _sample_abs_posemb(self, grid_h: int, grid_w: int):
        """Interpolates the absolute position embedding if necessary.

        Returns the stored embedding with a leading batch axis when the
        requested grid equals the stored grid; otherwise the patch part is
        bilinearly resampled to ``(grid_h, grid_w)`` and the class-token row
        (if any) is re-attached unchanged.
        """
        if self.posemb_grid_size == grid_h and self.posemb_grid_size == grid_w:
            return self.positional_embedding[None, ...]

        pos_embed = self.positional_embedding
        if self.use_cls_token:
            cls_token_embed, pos_embed = pos_embed[:1], pos_embed[1:]

        # (grid*grid, width) -> (1, width, grid, grid) for F.interpolate.
        pos_embed = (
            pos_embed.reshape(1, self.posemb_grid_size, self.posemb_grid_size, -1)
            .permute(0, 3, 1, 2)
            .contiguous()
        )
        pos_embed = F.interpolate(
            pos_embed, size=(grid_h, grid_w), mode="bilinear", align_corners=False
        )
        # Back to (grid_h * grid_w, width).
        pos_embed = pos_embed.permute(0, 2, 3, 1).reshape(-1, self.width).contiguous()

        if self.use_cls_token:
            pos_embed = torch.cat([cls_token_embed, pos_embed], dim=0)

        return pos_embed[None, ...]

    def _pool(self, x: torch.Tensor):
        """Reduce the token axis of ``x`` according to ``self.pool_type``."""
        if self.pool_type == "tok":
            return x[:, 0]
        elif self.pool_type == "avg":
            return x.mean(dim=1)
        elif self.pool_type == "attn":
            return self.attn_pool(x).squeeze(1)
        elif self.pool_type == "none":
            return x
        else:
            raise NotImplementedError

    def forward_features(
        self,
        x: torch.Tensor,
        norm: bool = False,
        layer_idx: int = -1,
        strip_cls_token: bool = False,
    ):
        """Compute per-token features for a batch of images.

        Args:
            x: Image batch, unpacked as ``(batch, channels, height, width)``.
            norm: Apply ``ln_post`` to the transformer output.
            layer_idx: Forwarded to the transformer as the last layer to run.
            strip_cls_token: Drop the first token when a class token is used.

        Returns:
            Token features of shape ``(batch, tokens, width)``.
        """
        batch, _, h, w = x.shape
        grid_h, grid_w = h // self.patch_size, w // self.patch_size

        x = self.conv1(x)
        # (batch, width, gh, gw) -> (batch, gh * gw, width)
        x = x.permute(0, 2, 3, 1).reshape(batch, -1, self.width)

        if self.use_cls_token:
            x = torch.cat(
                [self.class_embedding.view(1, 1, -1).expand(batch, -1, -1), x],
                dim=1,
            )

        if self.use_abs_posemb:
            x = x + self._sample_abs_posemb(grid_h, grid_w)

        if self.use_rope2d:
            self.rope.update_grid(x.device, grid_h, grid_w)

        x = self.ln_pre(x)
        x = self.transformer(x, layer_idx=layer_idx)

        if norm:
            x = self.ln_post(x)

        if strip_cls_token and self.use_cls_token:
            x = x[:, 1:, :]

        return x

    def forward(self, x: torch.Tensor, **kwargs):
        """Encode images: normalized features, pooling, optional projection.

        ``**kwargs`` are forwarded to ``forward_features``.
        """
        x = self.forward_features(x, norm=True, **kwargs)
        x = self._pool(x)

        if self.proj_dim is not None:
            x = x @ self.proj

        return x

Methods:¶

truncate ¶

truncate(layer_idx)

Delete layers so the last layer is the given layer index.

Source code in inference/models/perception_encoder/vision_encoder/pe.py

def truncate(self, layer_idx: int):
    """Delete layers so the last layer is the given layer index.

    Delegates the truncation to ``self.transformer`` and then mirrors its
    updated layer count onto ``self.layers``.
    """
    stack = self.transformer
    stack.truncate(layer_idx)
    self.layers = stack.layers

inference.models.perception_encoder.vision_encoder.rope ¶

Classes¶

Rope2D ¶

Helper class to apply RoPE2D as well as interpolate on the fly.

Source code in inference/models/perception_encoder/vision_encoder/rope.py

class Rope2D:
    """Helper class to apply RoPE2D as well as interpolate on the fly.

    ``init_tensors`` must be called before ``update_grid`` (it creates
    ``self.rope``), and ``update_grid`` must be called before the instance is
    applied to queries and keys (it creates ``self.freq``).
    """

    def __init__(self, dim, use_cls_token=False):
        self.dim, self.use_cls_token = dim, use_cls_token
        # Cache: the grid the current ``freq`` table was built for.
        self.grid_size = None
        self.freq = None

    def init_tensors(self):
        """Create the rotary embedding; each spatial axis gets ``dim // 2``."""
        self.rope = RotaryEmbedding(self.dim // 2)

    def _grid_freqs(self, device, grid_h, grid_w):
        """Build the frequency table for a ``grid_h`` x ``grid_w`` grid."""
        # With a cls token, positions start at 1 to leave space for the cls
        # token to be (0, 0).
        first = 1 if self.use_cls_token else 0
        rows = torch.arange(first, grid_h + first, device=device)
        cols = torch.arange(first, grid_w + first, device=device)

        freqs_y = self.rope(rows)[:, None].expand(grid_h, grid_w, -1)
        freqs_x = self.rope(cols)[None, :].expand(grid_h, grid_w, -1)
        table = torch.cat([freqs_x, freqs_y], dim=-1).reshape(grid_h * grid_w, -1)

        if self.use_cls_token:
            cls_row = torch.zeros(1, table.shape[-1], device=device)
            table = torch.cat([cls_row, table], dim=0)

        return table[None, ...]

    def update_grid(self, device, grid_h, grid_w):
        """Rebuild the frequency table if the grid changed; move it to ``device``."""
        requested = (grid_h, grid_w)
        if self.grid_size != requested:
            self.grid_size = requested
            self.rope = self.rope.to(device)
            self.freq = self._grid_freqs(device, grid_h, grid_w)

        self.freq = self.freq.to(device)

    def __call__(self, q, k):
        """Apply the cached rotary frequencies to ``q`` and ``k``."""
        # A head axis is inserted so the table broadcasts over
        # (batch, heads, seq, dim) inputs.
        freqs = self.freq[:, None, :, :]
        q = apply_rotary_emb(freqs, q)
        k = apply_rotary_emb(freqs, k)
        return q, k

inference.models.perception_encoder.vision_encoder.tokenizer ¶

CLIP tokenizer

Classes¶

SimpleTokenizer ¶

Bases: object

Source code in inference/models/perception_encoder/vision_encoder/tokenizer.py

class SimpleTokenizer(object):
    """Byte-level BPE tokenizer.

    Text is cleaned, split with a regex, mapped byte-by-byte to printable
    unicode characters (``bytes_to_unicode``), merged with the BPE merge
    table, and converted to integer ids. Calling the instance returns a
    padded ``LongTensor`` of token ids.
    """

    def __init__(
        self,
        bpe_path: str = default_bpe(),
        additional_special_tokens: Optional[List[str]] = None,
        context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
        clean: str = "lower",
        reduction_mask: str = "",
    ):
        """Load the merge table and build the vocabulary.

        Args:
            bpe_path: Path to the gzip-compressed BPE merges file.
            additional_special_tokens: Extra special tokens appended after
                ``<start_of_text>`` and ``<end_of_text>``.
            context_length: Default output length used by ``__call__``.
            clean: Name passed to ``get_clean_fn`` to select text cleaning.
            reduction_mask: Name passed to ``get_reduction_mask_fn``; an empty
                string selects plain truncation.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Use a context manager so the gzip handle is closed deterministically.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split("\n")
        # Skips the first line and caps the number of merges; presumably
        # 49152 is the target vocab size minus 256 byte tokens and 2 special
        # tokens - TODO confirm.
        merges = merges[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        special_tokens = ["<start_of_text>", "<end_of_text>"]
        if additional_special_tokens:
            special_tokens += additional_special_tokens
        vocab.extend(special_tokens)
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Special tokens are pre-seeded in the cache so bpe() returns them
        # unsplit.
        self.cache = {t: t for t in special_tokens}
        special = "|".join(special_tokens)
        self.pat = re.compile(
            special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )
        self.vocab_size = len(self.encoder)
        self.all_special_ids = [self.encoder[t] for t in special_tokens]
        self.sot_token_id = self.all_special_ids[0]
        self.eot_token_id = self.all_special_ids[1]
        self.context_length = context_length
        self.clean_fn = get_clean_fn(clean)
        self.reduction_fn = (
            get_reduction_mask_fn(reduction_mask) if reduction_mask else None
        )

    def bpe(self, token):
        """Apply BPE merges to one token; return space-separated sub-tokens.

        Results are memoized in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        # The last symbol carries the end-of-word marker.
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Merge the adjacent pair with the lowest (best) rank first.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # tuple.index found no further occurrence of `first`:
                    # copy the remainder unchanged.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = self.clean_fn(text)
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(
                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
            )
        return bpe_tokens

    def decode(self, tokens):
        """Convert token ids back to text; ``</w>`` markers become spaces."""
        text = "".join([self.decoder[token] for token in tokens])
        text = (
            bytearray([self.byte_decoder[c] for c in text])
            .decode("utf-8", errors="replace")
            .replace("</w>", " ")
        )
        return text

    def __call__(
        self, texts: Union[str, List[str]], context_length: Optional[int] = None
    ) -> torch.LongTensor:
        """Returns the tokenized representation of given input string(s)

        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all CLIP models use 77 as the context length

        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
        """
        if isinstance(texts, str):
            texts = [texts]

        context_length = context_length or self.context_length
        assert context_length, "Please set a valid context length"

        if self.reduction_fn is not None:
            # use reduction strategy for tokenize if set, otherwise default to truncation below
            return self.reduction_fn(
                texts,
                context_length=context_length,
                sot_token_id=self.sot_token_id,
                eot_token_id=self.eot_token_id,
                encode_fn=self.encode,
            )

        all_tokens = [
            [self.sot_token_id] + self.encode(text) + [self.eot_token_id]
            for text in texts
        ]
        # Rows are zero-padded on the right.
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                tokens = tokens[:context_length]  # Truncate
                # Keep the end-of-text marker as the final token.
                tokens[-1] = self.eot_token_id
            result[i, : len(tokens)] = torch.tensor(tokens)

        return result

Methods:¶

call ¶

__call__(texts, context_length=None)

Returns the tokenized representation of given input string(s)

Parameters¶

texts : Union[str, List[str]] — An input string or a list of input strings to tokenize.

context_length : int — The context length to use; all CLIP models use 77 as the context length.

Returns¶

A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]

Source code in inference/models/perception_encoder/vision_encoder/tokenizer.py

def __call__(
    self, texts: Union[str, List[str]], context_length: Optional[int] = None
) -> torch.LongTensor:
    """Tokenize one string or a list of strings into a padded id tensor.

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; falls back to ``self.context_length``
        when not given

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    context_length = context_length or self.context_length
    assert context_length, "Please set a valid context length"

    reduce_fn = self.reduction_fn
    if reduce_fn is not None:
        # A configured reduction strategy replaces the truncation below.
        return reduce_fn(
            texts,
            context_length=context_length,
            sot_token_id=self.sot_token_id,
            eot_token_id=self.eot_token_id,
            encode_fn=self.encode,
        )

    start_id, end_id = [self.sot_token_id], [self.eot_token_id]
    all_tokens = []
    for text in texts:
        all_tokens.append(start_id + self.encode(text) + end_id)

    # Rows are zero-padded on the right.
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
    for row, ids in zip(result, all_tokens):
        if len(ids) > context_length:
            # Truncate, keeping the end-of-text marker as the final token.
            ids = ids[:context_length]
            ids[-1] = self.eot_token_id
        row[: len(ids)] = torch.tensor(ids)

    return result

Functions:¶

bytes_to_unicode `cached` ¶

bytes_to_unicode()

Returns a mapping from utf-8 byte values to corresponding unicode strings. The reversible bpe codes work on unicode strings. This means you need a large number of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. The mapping also avoids whitespace/control characters, which the bpe code barfs on.

Source code in inference/models/perception_encoder/vision_encoder/tokenizer.py

@lru_cache()
def bytes_to_unicode():
    """
    Return a dict mapping each of the 256 byte values to a unicode character.

    The reversible bpe codes work on unicode strings, so every byte needs a
    printable stand-in. Bytes in the three ranges below map to the character
    with the same code point; every other byte is assigned, in increasing
    byte order, the next code point starting at 256. This avoids mapping to
    whitespace/control characters the bpe code barfs on.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    already_mapped = frozenset(printable)
    remapped = [byte for byte in range(256) if byte not in already_mapped]

    # Key order matches insertion order: printable bytes first, then the rest.
    byte_values = printable + remapped
    code_points = printable + [256 + offset for offset in range(len(remapped))]
    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

canonicalize_text ¶

canonicalize_text(
    text, *, keep_punctuation_exact_string=None
)

Returns canonicalized text (lowercase and punctuation removed).

From: https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94

Parameters:

Name	Type	Description	Default
`text`		string to be canonicalized.	required
`keep_punctuation_exact_string`		If provided, then this exact string kept. For example providing '{}' will keep any occurrences of '{}' (but will still remove '{' and '}' that appear separately).	`None`

Source code in inference/models/perception_encoder/vision_encoder/tokenizer.py

def canonicalize_text(text, *, keep_punctuation_exact_string=None):
    """Returns canonicalized `text` (lowercase and punctuation removed).

    Underscores become spaces, ASCII punctuation is removed, the text is
    lowercased, whitespace runs collapse to one space, and the ends are
    stripped.

    From: https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94

    Args:
      text: string to be canonicalized.
      keep_punctuation_exact_string: If provided, then this exact string kept.
        For example providing '{}' will keep any occurrences of '{}' (but will
        still remove '{' and '}' that appear separately).
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    spaced = text.replace("_", " ")

    if not keep_punctuation_exact_string:
        cleaned = spaced.translate(drop_punctuation)
    else:
        # Strip punctuation between occurrences of the protected string, then
        # stitch the pieces back together with it.
        pieces = spaced.split(keep_punctuation_exact_string)
        cleaned = keep_punctuation_exact_string.join(
            piece.translate(drop_punctuation) for piece in pieces
        )

    collapsed = re.sub(r"\s+", " ", cleaned.lower())
    return collapsed.strip()

get_pairs ¶

get_pairs(word)

Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings).

Source code in inference/models/perception_encoder/vision_encoder/tokenizer.py

def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).

    Args:
        word: Sequence of symbols.

    Returns:
        Set of ``(left, right)`` tuples for every pair of adjacent symbols.
        Empty for a word with fewer than two symbols, including an empty
        word (which previously raised ``IndexError``).
    """
    # zip stops at the shorter argument, so empty and one-symbol words
    # naturally produce no pairs.
    return set(zip(word, word[1:]))

get_reduction_mask_fn ¶

get_reduction_mask_fn(type)

Choose strategy for dropping (masking) tokens to achieve target context length

Source code in inference/models/perception_encoder/vision_encoder/tokenizer.py

def get_reduction_mask_fn(type: str):
    """Choose strategy for dropping (masking) tokens to achieve target context length

    Args:
        type: One of ``"simple"``, ``"random"`` or ``"shuffle"``.

    Returns:
        The tokenize function implementing the selected strategy.
    """
    assert type in ("simple", "random", "shuffle")
    if type == "shuffle":
        # randomly drop tokens (shuffle order)
        return partial(random_mask_tokenize, shuffle=True)
    if type == "random":
        # randomly drop tokens (keep order)
        return random_mask_tokenize
    if type == "simple":
        # randomly select block [start:end]
        return simple_mask_tokenize

`models/qwen25vl`¶

inference.models.qwen25vl.qwen25vl ¶

Classes¶

`models/resnet`¶

inference.models.resnet.resnet_classification ¶

Classes¶

ResNetClassification ¶

Bases: ClassificationBaseOnnxRoboflowInferenceModel

ResNetClassification handles classification inference for ResNet models using ONNX.

Inherits from ClassificationBaseOnnxRoboflowInferenceModel.

Attributes:

Name	Type	Description
`multiclass`	`bool`	A flag that specifies if the model should handle multiclass classification.

Source code in inference/models/resnet/resnet_classification.py

class ResNetClassification(ClassificationBaseOnnxRoboflowInferenceModel):
    """ResNetClassification handles classification inference
    for ResNet models using ONNX.

    Inherits:
        ClassificationBaseOnnxRoboflowInferenceModel: Base class for ONNX Roboflow Inference.

    Attributes:
        multiclass (bool): Value of the ``MULTICLASS`` entry of the model
            environment (``False`` when the entry is absent).
    """

    # Per-channel normalization statistics used instead of the base class
    # values.
    preprocess_stds = [0.229, 0.224, 0.225]
    preprocess_means = [0.485, 0.456, 0.406]

    def __init__(self, *args, **kwargs):
        """Initializes the ResNetClassification instance.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        super().__init__(*args, **kwargs)
        multiclass_flag = self.environment.get("MULTICLASS", False)
        self.multiclass = multiclass_flag

    @property
    def weights_file(self) -> str:
        """Determines the name of the weights file to be used.

        Returns 'weights.onnx' when AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
        and LAMBDA are all truthy; otherwise returns 'best.onnx'.

        Returns:
            str: Name of the weights file.
        """
        use_lambda_weights = AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and LAMBDA
        return "weights.onnx" if use_lambda_weights else "best.onnx"

Attributes¶

weights_file `property` ¶

weights_file

Determines the weights file to be used based on the availability of AWS keys.

If AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and LAMBDA are all set, it returns the path to 'weights.onnx'. Otherwise, it returns the path to 'best.onnx'.

Returns:

Name	Type	Description
`str`	`str`	Path to the weights file.

Methods:¶

init ¶

__init__(*args, **kwargs)

Initializes the ResNetClassification instance.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/resnet/resnet_classification.py

def __init__(self, *args, **kwargs):
    """Initializes the ResNetClassification instance.

    Forwards all arguments to the base class and then reads the
    ``MULTICLASS`` entry of the model environment (``False`` when absent).

    Args:
        *args: Variable length argument list.
        **kwargs: Arbitrary keyword arguments.
    """
    super().__init__(*args, **kwargs)
    multiclass_flag = self.environment.get("MULTICLASS", False)
    self.multiclass = multiclass_flag

`models/rfdetr`¶

inference.models.rfdetr.rfdetr ¶

Classes¶

RFDETRInstanceSegmentation ¶

Bases: RFDETRObjectDetection, InstanceSegmentationBaseOnnxRoboflowInferenceModel

Source code in inference/models/rfdetr/rfdetr.py

class RFDETRInstanceSegmentation(
    RFDETRObjectDetection, InstanceSegmentationBaseOnnxRoboflowInferenceModel
):
    """RFDETR instance segmentation model served through ONNX runtime.

    Reuses the RFDETR detection pipeline and additionally consumes a third
    model output holding one mask per query, which is decoded into polygons.
    """

    task_type = "instance-segmentation"

    def initialize_model(self, **kwargs) -> None:
        """Initializes the ONNX session and records the mask output shape."""
        super().initialize_model(**kwargs)
        mask_shape = self.onnx_session.get_outputs()[2].shape
        # Keep everything after the first two dimensions of the third output's shape.
        self.mask_shape = mask_shape[2:]

    def predict(
        self, img_in: ImageMetaType, **kwargs
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Runs the ONNX session with the RFDETR segmentation model.

        Args:
            img_in (np.ndarray): Preprocessed input batch.

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray]: The first three session
            outputs, in order: boxes, class logits and masks.
        """
        # The session is guarded by a lock so only one inference runs at a time.
        with self._session_lock:
            predictions = run_session_via_iobinding(
                self.onnx_session, self.input_name, img_in
            )
        bboxes = predictions[0]
        logits = predictions[1]
        masks = predictions[2]

        return (bboxes, logits, masks)

    @staticmethod
    def _resize_mask_for_decoding(
        mask: np.ndarray,
        orig_w: int,
        orig_h: int,
        mask_decode_mode: str,
        tradeoff_factor: float,
    ) -> np.ndarray:
        """Resizes a single mask according to the requested decode mode.

        "accurate" resizes to the original image size, "tradeoff" resizes to a
        size linearly interpolated between the mask size and the original image
        size, and any other mode returns the mask untouched.
        """
        if mask_decode_mode == "accurate":
            target_res = (orig_w, orig_h)
        elif mask_decode_mode == "tradeoff":
            target_res = (
                int(mask.shape[1] * (1 - tradeoff_factor) + orig_w * tradeoff_factor),
                int(mask.shape[0] * (1 - tradeoff_factor) + orig_h * tradeoff_factor),
            )
        else:
            return mask
        # cv2.resize expects the target size as (width, height).
        if mask.shape[1] != target_res[0] or mask.shape[0] != target_res[1]:
            mask = cv2.resize(
                mask.astype(np.float32),
                target_res,
                interpolation=cv2.INTER_LINEAR,
            )
        return mask

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, ...],
        preproc_return_metadata: PreprocessReturnMetadata,
        confidence: float = DEFAULT_CONFIDENCE,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        **kwargs,
    ) -> List[InstanceSegmentationInferenceResponse]:
        """Decodes raw model outputs into instance segmentation responses.

        Args:
            predictions: ``(bboxes, logits, masks)`` as returned by ``predict``.
            preproc_return_metadata: Preprocessing metadata; its ``"img_dims"``
                entry supplies ``(height, width)`` of every original image.
            confidence: Candidates must score strictly above this value.
            max_detections: Number of top-scoring (query, class) candidates
                kept per image before the confidence filter is applied.
            **kwargs: ``class_filter``, ``mask_decode_mode`` (default
                ``"accurate"``) and ``tradeoff_factor`` (default ``0.0``) are
                read here; everything is also forwarded to ``make_response``.

        Returns:
            List[InstanceSegmentationInferenceResponse]: One response per image.
        """
        bboxes, logits, masks = predictions
        bboxes = bboxes.astype(np.float32)
        logits = logits.astype(np.float32)

        batch_size, num_queries, num_classes = logits.shape
        logits_sigmoid = self.sigmoid_stable(logits)

        img_dims = preproc_return_metadata["img_dims"]
        # The model batch can be larger than the number of real images (padded
        # slots); only slots that have recorded image dimensions are decoded.
        num_images = min(batch_size, len(img_dims))

        # Request-level options do not change between predictions.
        class_filter_local = kwargs.get("class_filter")
        mask_decode_mode = kwargs.get("mask_decode_mode", "accurate")
        tradeoff_factor = kwargs.get("tradeoff_factor", 0.0)

        processed_predictions = []
        processed_masks = []

        for batch_idx in range(num_images):
            orig_h, orig_w = img_dims[batch_idx]

            # Flatten (query, class) scores so one index encodes both.
            logits_flat = logits_sigmoid[batch_idx].reshape(-1)

            # Use argpartition for better performance when max_detections is smaller than logits_flat
            if len(logits_flat) > max_detections:
                partition_indices = np.argpartition(-logits_flat, max_detections)[
                    :max_detections
                ]
                sorted_indices = partition_indices[
                    np.argsort(-logits_flat[partition_indices])
                ]
            else:
                sorted_indices = np.argsort(-logits_flat)
            topk_scores = logits_flat[sorted_indices]

            conf_mask = topk_scores > confidence
            sorted_indices = sorted_indices[conf_mask]
            topk_scores = topk_scores[conf_mask]

            # Recover the query index and the class index from the flat index.
            topk_boxes = sorted_indices // num_classes
            topk_labels = sorted_indices % num_classes

            if self.is_one_indexed:
                # Drop the background class and shift the labels above it down by one.
                class_filter_mask = topk_labels != self.background_class_index

                topk_labels[topk_labels > self.background_class_index] -= 1
                topk_scores = topk_scores[class_filter_mask]
                topk_labels = topk_labels[class_filter_mask]
                topk_boxes = topk_boxes[class_filter_mask]

            selected_boxes = bboxes[batch_idx, topk_boxes]
            selected_masks = masks[batch_idx, topk_boxes]

            # Boxes are handled as (cx, cy, w, h) and converted to corner form.
            cxcy = selected_boxes[:, :2]
            wh = selected_boxes[:, 2:]
            xy_min = cxcy - 0.5 * wh
            xy_max = cxcy + 0.5 * wh
            boxes_xyxy = np.concatenate([xy_min, xy_max], axis=1)

            # Crop window (x1, y1, x2, y2) removing letterbox padding from the
            # masks; stays None when the image was stretched.
            crop_bounds = None
            if self.resize_method == "Stretch to":
                scale_fct = np.array([orig_w, orig_h, orig_w, orig_h], dtype=np.float32)
                boxes_xyxy *= scale_fct
            else:
                if self._needs_nonsquare_preproc:
                    input_h, input_w = self._preproc_resize_h, self._preproc_resize_w
                else:
                    input_h, input_w = self.img_size_h, self.img_size_w

                scale = min(input_w / orig_w, input_h / orig_h)
                scaled_w = int(orig_w * scale)
                scaled_h = int(orig_h * scale)

                pad_x = (input_w - scaled_w) / 2
                pad_y = (input_h - scaled_h) / 2

                boxes_input = boxes_xyxy * np.array(
                    [input_w, input_h, input_w, input_h], dtype=np.float32
                )

                # Undo the letterbox padding, then the letterbox scaling.
                boxes_input[:, 0] -= pad_x
                boxes_input[:, 1] -= pad_y
                boxes_input[:, 2] -= pad_x
                boxes_input[:, 3] -= pad_y

                boxes_xyxy = boxes_input / scale

                if len(selected_masks):
                    # All masks of one image share a shape, so the crop window
                    # is computed once instead of once per prediction.
                    mask_h, mask_w = selected_masks.shape[1], selected_masks.shape[2]
                    crop_bounds = (
                        int(round(pad_x * mask_w / input_w)),
                        int(round(pad_y * mask_h / input_h)),
                        int(round((pad_x + scaled_w) * mask_w / input_w)),
                        int(round((pad_y + scaled_h) * mask_h / input_h)),
                    )

            np.clip(
                boxes_xyxy,
                [0, 0, 0, 0],
                [orig_w, orig_h, orig_w, orig_h],
                out=boxes_xyxy,
            )

            # Row layout: x1, y1, x2, y2, score, zero placeholder, class index.
            batch_predictions = np.column_stack(
                (
                    boxes_xyxy,
                    topk_scores,
                    np.zeros((len(topk_scores), 1), dtype=np.float32),
                    topk_labels,
                )
            )
            valid_pred_mask = batch_predictions[:, 6] < len(self.class_names)

            outputs_predictions = []
            outputs_polygons = []
            for i, pred in enumerate(batch_predictions):
                if not valid_pred_mask[i]:
                    continue
                # Early class filtering to avoid unnecessary mask processing
                if class_filter_local:
                    try:
                        pred_class_name = self.class_names[int(pred[6])]
                    except Exception:
                        continue
                    if pred_class_name not in class_filter_local:
                        continue
                mask = selected_masks[i]

                if crop_bounds is not None:
                    crop_x1, crop_y1, crop_x2, crop_y2 = crop_bounds
                    mask = mask[crop_y1:crop_y2, crop_x1:crop_x2]

                mask = self._resize_mask_for_decoding(
                    mask, orig_w, orig_h, mask_decode_mode, tradeoff_factor
                )

                # Positive mask values are treated as foreground.
                mask_bin = (mask > 0).astype(np.uint8)
                points = mask2poly(mask_bin)

                # After letterbox cropping, both paths reduce to a simple
                # linear rescale from prediction dims to original dims.
                prediction_h, prediction_w = mask_bin.shape[0], mask_bin.shape[1]
                new_points = [
                    np.array(
                        [
                            point[0] * (orig_w / prediction_w),
                            point[1] * (orig_h / prediction_h),
                        ]
                    )
                    for point in points
                ]
                outputs_polygons.append(new_points)
                outputs_predictions.append(list(pred))

            processed_predictions.append(outputs_predictions)
            processed_masks.append(outputs_polygons)

        res = self.make_response(
            processed_predictions, processed_masks, img_dims, **kwargs
        )
        return res

    def make_response(
        self,
        predictions: List[List[List[float]]],
        masks: List[List[List[np.ndarray]]],
        img_dims: List[Tuple[int, int]],
        class_filter: Optional[List[str]] = None,
        *args,
        **kwargs,
    ) -> List[InstanceSegmentationInferenceResponse]:
        """Constructs instance segmentation response objects from preprocessed predictions and polygons.

        Args:
            predictions: Per image, rows of
                ``[x1, y1, x2, y2, confidence, <unused>, class_index]``.
            masks: Per image, one polygon (sequence of ``(x, y)`` points) per
                prediction row.
            img_dims: ``(height, width)`` per image, or a dict holding that
                list under the ``"img_dims"`` key.
            class_filter: When non-empty, only predictions whose class name is
                listed are kept.

        Returns:
            List[InstanceSegmentationInferenceResponse]: One response per image.
        """
        # Align to actual number of real images; predictions/masks may include padded slots
        if isinstance(img_dims, dict) and "img_dims" in img_dims:
            img_dims = img_dims["img_dims"]
        effective_len = min(len(img_dims), len(predictions), len(masks))

        responses = []
        for ind in range(effective_len):
            batch_predictions = predictions[ind]
            batch_masks = masks[ind]
            preds_out = []
            for pred, mask in zip(batch_predictions, batch_masks):
                class_id = int(pred[6])
                class_name = self.class_names[class_id]
                if class_filter and class_name not in class_filter:
                    continue
                preds_out.append(
                    InstanceSegmentationPrediction(
                        **{
                            # Corner-form box converted to centre / size form.
                            "x": (pred[0] + pred[2]) / 2,
                            "y": (pred[1] + pred[3]) / 2,
                            "width": pred[2] - pred[0],
                            "height": pred[3] - pred[1],
                            "confidence": pred[4],
                            "class": class_name,
                            "class_id": class_id,
                            "points": [Point(x=point[0], y=point[1]) for point in mask],
                        }
                    )
                )
            responses.append(
                InstanceSegmentationInferenceResponse(
                    predictions=preds_out,
                    image=InferenceResponseImage(
                        width=img_dims[ind][1], height=img_dims[ind][0]
                    ),
                )
            )
        return responses

Methods:¶

make_response ¶

make_response(
    predictions,
    masks,
    img_dims,
    class_filter=None,
    *args,
    **kwargs
)

Constructs instance segmentation response objects from preprocessed predictions and polygons.

Source code in inference/models/rfdetr/rfdetr.py

def make_response(
    self,
    predictions: List[List[List[float]]],
    masks: List[List[List[np.ndarray]]],
    img_dims: List[Tuple[int, int]],
    class_filter: Optional[List[str]] = None,
    *args,
    **kwargs,
) -> List[InstanceSegmentationInferenceResponse]:
    """Builds one instance segmentation response per image.

    Each prediction row is read as ``[x1, y1, x2, y2, confidence, <unused>,
    class_index]`` and is paired with the polygon at the same position in
    ``masks``. When ``class_filter`` is non-empty, predictions whose class
    name is not listed are dropped.
    """
    # img_dims may arrive wrapped in a metadata dict; unwrap it first.
    if isinstance(img_dims, dict) and "img_dims" in img_dims:
        img_dims = img_dims["img_dims"]
    # Predictions/masks may contain padded slots, so stop at the shortest input.
    image_count = min(len(img_dims), len(predictions), len(masks))

    responses = []
    for ind in range(image_count):
        kept_predictions = [
            InstanceSegmentationPrediction(
                **{
                    "x": (pred[0] + pred[2]) / 2,
                    "y": (pred[1] + pred[3]) / 2,
                    "width": pred[2] - pred[0],
                    "height": pred[3] - pred[1],
                    "confidence": pred[4],
                    "class": self.class_names[int(pred[6])],
                    "class_id": int(pred[6]),
                    "points": [Point(x=point[0], y=point[1]) for point in polygon],
                }
            )
            for pred, polygon in zip(predictions[ind], masks[ind])
            if not class_filter or self.class_names[int(pred[6])] in class_filter
        ]
        # Image dims are stored as (height, width).
        image_info = InferenceResponseImage(
            width=img_dims[ind][1], height=img_dims[ind][0]
        )
        responses.append(
            InstanceSegmentationInferenceResponse(
                predictions=kept_predictions,
                image=image_info,
            )
        )
    return responses

predict ¶

predict(img_in, **kwargs)

Runs the ONNX session with the RFDETR instance segmentation model on the given image and returns the raw model outputs.

Parameters:

Name	Type	Description	Default
`img_in`	`ndarray`	Input image as a NumPy array.	required

Returns:

Type	Description
`Tuple[ndarray]`	Tuple[np.ndarray]: The raw model outputs as a tuple of (boxes, class logits, masks).

Source code in inference/models/rfdetr/rfdetr.py

def predict(self, img_in: ImageMetaType, **kwargs) -> Tuple[np.ndarray]:
    """Runs the ONNX session with the RFDETR segmentation model on the given input.

    Args:
        img_in (np.ndarray): Preprocessed input batch.

    Returns:
        Tuple[np.ndarray]: The first three session outputs, in order: boxes,
        class logits and masks.
    """
    # Hold the session lock for the duration of the inference call.
    with self._session_lock:
        session_outputs = run_session_via_iobinding(
            self.onnx_session, self.input_name, img_in
        )
    return (session_outputs[0], session_outputs[1], session_outputs[2])

RFDETRObjectDetection ¶

Bases: ObjectDetectionBaseOnnxRoboflowInferenceModel

Roboflow ONNX Object detection with the RFDETR model.

This class is responsible for performing object detection using the RFDETR model with ONNX runtime.

Attributes:

Name	Type	Description
`weights_file`	`str`	Path to the ONNX weights file.

Methods:

Name	Description
`predict`	Performs object detection on the given image using the ONNX session.

Source code in inference/models/rfdetr/rfdetr.py

class RFDETRObjectDetection(ObjectDetectionBaseOnnxRoboflowInferenceModel):
    """Roboflow ONNX Object detection with the RFDETR model.

    This class is responsible for performing object detection using the RFDETR model
    with ONNX runtime.

    Attributes:
        weights_file (str): Path to the ONNX weights file.

    Methods:
        predict: Performs object detection on the given image using the ONNX session.
    """

    preprocess_means = [0.485, 0.456, 0.406]
    preprocess_stds = [0.229, 0.224, 0.225]

    @property
    def weights_file(self) -> str:
        """Gets the weights file for the RFDETR model.

        Returns:
            str: Path to the ONNX weights file.
        """
        return "weights.onnx"

    def preproc_image(
        self,
        image: Union[Any, InferenceRequestImage],
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
    ) -> Tuple[np.ndarray, Tuple[int, int]]:
        """
        Preprocesses an inference request image by loading it, then applying any pre-processing specified by the Roboflow platform, then normalizing it with the class mean/std values and scaling it to the inference input dimensions.

        Args:
            image (Union[Any, InferenceRequestImage]): An object containing information necessary to load the image for inference.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
            disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

        Returns:
            Tuple[np.ndarray, Tuple[int, int]]: A tuple containing the preprocessed image pixel data (a float numpy array, or a float torch tensor when USE_PYTORCH_FOR_PREPROCESSING is set) and the image dimensions reported by `preprocess_image`.

        Raises:
            ValueError: If the intermediate image is neither a numpy array nor, with USE_PYTORCH_FOR_PREPROCESSING enabled, a tensor.
        """
        # Fast path: with torch preprocessing enabled a PIL image is wrapped as a
        # tensor directly and is treated as RGB (is_bgr = False).
        if isinstance(image, Image.Image) and USE_PYTORCH_FOR_PREPROCESSING:
            if CUDA_IS_AVAILABLE:
                np_image = torch.from_numpy(np.asarray(image, copy=False)).cuda()
            else:
                np_image = torch.from_numpy(np.asarray(image, copy=False))
            is_bgr = False
        else:
            # Auto-orient is skipped when disabled for this call, when the model's
            # preprocessing config has no "auto-orient" entry, or when disabled globally.
            np_image, is_bgr = load_image(
                image,
                disable_preproc_auto_orient=disable_preproc_auto_orient
                or "auto-orient" not in self.preproc.keys()
                or DISABLE_PREPROC_AUTO_ORIENT,
            )
        # Make sure the torch path always works on a tensor, on GPU when available.
        if USE_PYTORCH_FOR_PREPROCESSING:
            if not isinstance(np_image, torch.Tensor):
                np_image = torch.from_numpy(np_image)
            if torch.cuda.is_available():
                np_image = np_image.cuda()

        preprocessed_image, img_dims = self.preprocess_image(
            np_image,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )

        # Normalization: divide by 255, then apply the per-channel mean/std.
        # NOTE(review): the statistics are applied in the loaded channel order; the
        # BGR -> RGB swap only happens near the end of this method — confirm intended.
        if USE_PYTORCH_FOR_PREPROCESSING:
            # Move the last axis to the front and add a leading batch axis
            # (assumes an HWC layout here — TODO confirm).
            preprocessed_image = (
                preprocessed_image.permute(2, 0, 1).unsqueeze(0).contiguous()
            )
            preprocessed_image = preprocessed_image.float()

            preprocessed_image /= 255.0

            # Shape (3, 1, 1) so the statistics broadcast over the spatial axes.
            means = torch.tensor(
                self.preprocess_means, device=preprocessed_image.device
            ).view(3, 1, 1)
            stds = torch.tensor(
                self.preprocess_stds, device=preprocessed_image.device
            ).view(3, 1, 1)
            preprocessed_image = (preprocessed_image - means) / stds
        else:
            preprocessed_image = preprocessed_image.astype(np.float32)
            preprocessed_image /= 255.0

            # Channel-last numpy layout: normalize each of the three channels in place.
            preprocessed_image[:, :, 0] = (
                preprocessed_image[:, :, 0] - self.preprocess_means[0]
            ) / self.preprocess_stds[0]
            preprocessed_image[:, :, 1] = (
                preprocessed_image[:, :, 1] - self.preprocess_means[1]
            ) / self.preprocess_stds[1]
            preprocessed_image[:, :, 2] = (
                preprocessed_image[:, :, 2] - self.preprocess_means[2]
            ) / self.preprocess_stds[2]

        # When non-square preprocessing is needed, letterboxing targets this
        # intermediate (width, height) first and the result is resized afterwards.
        if self._needs_nonsquare_preproc:
            intermediate_size = (self._preproc_resize_w, self._preproc_resize_h)
        else:
            intermediate_size = None

        # NOTE(review): `resized` is only assigned for the four resize methods
        # handled below; any other value leaves it unbound for the code that follows.
        if self.resize_method == "Stretch to":
            if isinstance(preprocessed_image, np.ndarray):
                preprocessed_image = preprocessed_image.astype(np.float32)
                # cv2.resize takes the target size as (width, height).
                resized = cv2.resize(
                    preprocessed_image,
                    (self.img_size_w, self.img_size_h),
                )
            elif USE_PYTORCH_FOR_PREPROCESSING:
                # torch interpolate takes the target size as (height, width).
                resized = torch.nn.functional.interpolate(
                    preprocessed_image,
                    size=(self.img_size_h, self.img_size_w),
                    mode="bilinear",
                )
            else:
                raise ValueError(
                    f"Received an image of unknown type, {type(preprocessed_image)}; "
                    "This is most likely a bug. Contact Roboflow team through github issues "
                    "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                )

        # NOTE(review): the letterbox colours below are passed after the image has
        # already been normalized — confirm how letterbox_image interprets them.
        elif self.resize_method == "Fit (black edges) in":
            resized = letterbox_image(
                preprocessed_image,
                intermediate_size or (self.img_size_w, self.img_size_h),
            )
        elif self.resize_method == "Fit (white edges) in":
            resized = letterbox_image(
                preprocessed_image,
                intermediate_size or (self.img_size_w, self.img_size_h),
                color=(255, 255, 255),
            )
        elif self.resize_method == "Fit (grey edges) in":
            resized = letterbox_image(
                preprocessed_image,
                intermediate_size or (self.img_size_w, self.img_size_h),
                color=(114, 114, 114),
            )

        # Second stage for non-square preprocessing: bring the intermediate result
        # to the model input size.
        if intermediate_size is not None:
            if isinstance(resized, np.ndarray):
                resized = cv2.resize(
                    resized.astype(np.float32),
                    (self.img_size_w, self.img_size_h),
                )
            elif USE_PYTORCH_FOR_PREPROCESSING:
                resized = torch.nn.functional.interpolate(
                    resized,
                    size=(self.img_size_h, self.img_size_w),
                    mode="bilinear",
                )
            else:
                raise ValueError(
                    f"Received an image of unknown type, {type(resized)}; "
                    "This is most likely a bug. Contact Roboflow team through github issues "
                    "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                )

        # Swap channel order when the loader reported BGR data.
        if is_bgr:
            if isinstance(resized, np.ndarray):
                resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
            else:
                # Tensor path: reverse the entries along axis 1.
                resized = resized[:, [2, 1, 0], :, :]

        if isinstance(resized, np.ndarray):
            # Channel-last -> channel-first, then add the leading batch axis.
            img_in = np.transpose(resized, (2, 0, 1))
            img_in = img_in.astype(np.float32)
            img_in = np.expand_dims(img_in, axis=0)
        elif USE_PYTORCH_FOR_PREPROCESSING:
            img_in = resized.float()
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(resized)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )
        return img_in, img_dims

    def preprocess(
        self,
        image: Any,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        fix_batch_size: bool = False,
        **kwargs,
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Loads and preprocesses the input and optionally pads it to a fixed batch size.

        Args:
            image: Input accepted by ``self.load_image``.
            disable_preproc_auto_orient: Disable auto-orient for this call.
            disable_preproc_contrast: Disable contrast preprocessing for this call.
            disable_preproc_grayscale: Disable grayscale preprocessing for this call.
            disable_preproc_static_crop: Disable static crop for this call.
            fix_batch_size: Pad the batch up to ``MAX_BATCH_SIZE`` (also enabled
                globally through ``FIX_BATCH_SIZE``). Only honoured when
                batching is enabled for the model.

        Returns:
            Tuple[np.ndarray, PreprocessReturnMetadata]: The float input batch
            and metadata holding ``img_dims`` and ``disable_preproc_static_crop``.

        Raises:
            ValueError: If more than ``MAX_BATCH_SIZE`` images are passed while
                a fixed batch size is requested, or if the batch has an
                unexpected type when padding is required.
        """
        img_in, img_dims = self.load_image(
            image,
            disable_preproc_auto_orient=disable_preproc_auto_orient,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )
        if not USE_PYTORCH_FOR_PREPROCESSING:
            img_in = img_in.astype(np.float32)
        else:
            img_in = img_in.float()

        if self.batching_enabled:
            batch_padding = 0
            if FIX_BATCH_SIZE or fix_batch_size:
                if MAX_BATCH_SIZE == float("inf"):
                    logger.warning(
                        "Requested fix_batch_size but MAX_BATCH_SIZE is not set. Using dynamic batching."
                    )
                else:
                    batch_padding = MAX_BATCH_SIZE - img_in.shape[0]
            if batch_padding < 0:
                raise ValueError(
                    f"Requested fix_batch_size but passed in {img_in.shape[0]} images "
                    f"when the model's batch size is {MAX_BATCH_SIZE}\n"
                    f"Consider turning off fix_batch_size, changing `MAX_BATCH_SIZE` in"
                    f"your inference server config, or passing at most {MAX_BATCH_SIZE} images at a time"
                )
            if batch_padding > 0:
                # Append zero-filled slots along the leading (batch) axis only.
                # A batch that already has the right size passes through unchanged
                # instead of hitting the "unknown type" error unconditionally.
                if isinstance(img_in, np.ndarray):
                    img_in = np.pad(
                        img_in,
                        [(0, batch_padding)] + [(0, 0)] * (img_in.ndim - 1),
                        "constant",
                    )
                elif USE_PYTORCH_FOR_PREPROCESSING:
                    # torch pads from the last axis backwards, so the pair for
                    # the leading axis comes last.
                    img_in = torch.nn.functional.pad(
                        img_in,
                        (0, 0) * (img_in.dim() - 1) + (0, batch_padding),
                        "constant",
                    )
                else:
                    raise ValueError(
                        f"Received an image of unknown type, {type(img_in)}; "
                        "This is most likely a bug. Contact Roboflow team through github issues "
                        "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                    )

        return img_in, PreprocessReturnMetadata(
            {
                "img_dims": img_dims,
                "disable_preproc_static_crop": disable_preproc_static_crop,
            }
        )

    def predict(self, img_in: ImageMetaType, **kwargs) -> Tuple[np.ndarray]:
        """Runs the ONNX session with the RFDETR model on the given input.

        Args:
            img_in (np.ndarray): Preprocessed input batch.

        Returns:
            Tuple[np.ndarray]: The first two session outputs, in order: boxes
            and class logits.
        """
        # Hold the session lock for the duration of the inference call.
        with self._session_lock:
            session_outputs = run_session_via_iobinding(
                self.onnx_session, self.input_name, img_in
            )
        return (session_outputs[0], session_outputs[1])

    def sigmoid_stable(self, x):
        """Element-wise sigmoid that never exponentiates a positive number.

        exp(-|x|) always lies in (0, 1], so neither branch can overflow:
        1 / (1 + e) is used for x >= 0 and e / (1 + e) for x < 0.
        """
        decay = np.exp(-np.abs(x))
        positive_branch = 1 / (1 + decay)
        negative_branch = decay / (1 + decay)
        return np.where(x >= 0, positive_branch, negative_branch)

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, ...],
        preproc_return_metadata: PreprocessReturnMetadata,
        confidence: float = DEFAULT_CONFIDENCE,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        **kwargs,
    ) -> List[ObjectDetectionInferenceResponse]:
        """Decodes raw model outputs into object detection responses.

        Args:
            predictions: ``(bboxes, logits)`` as returned by ``predict``.
            preproc_return_metadata: Preprocessing metadata; its ``"img_dims"``
                entry supplies ``(height, width)`` of every original image.
            confidence: Candidates must score strictly above this value.
            max_detections: Number of top-scoring (query, class) candidates
                kept per image before the confidence filter is applied.
            **kwargs: Forwarded to ``make_response``.

        Returns:
            List[ObjectDetectionInferenceResponse]: One response per image.
        """
        bboxes, logits = predictions
        bboxes = bboxes.astype(np.float32)
        logits = logits.astype(np.float32)

        batch_size, num_queries, num_classes = logits.shape
        logits_sigmoid = self.sigmoid_stable(logits)

        img_dims = preproc_return_metadata["img_dims"]
        # The model batch can be larger than the number of real images (padded
        # slots); only slots that have recorded image dimensions are decoded.
        num_images = min(batch_size, len(img_dims))

        processed_predictions = []

        for batch_idx in range(num_images):
            orig_h, orig_w = img_dims[batch_idx]

            # Flatten (query, class) scores so one index encodes both.
            logits_flat = logits_sigmoid[batch_idx].reshape(-1)

            # Use argpartition for better performance when max_detections is smaller than logits_flat
            if len(logits_flat) > max_detections:
                partition_indices = np.argpartition(-logits_flat, max_detections)[
                    :max_detections
                ]
                sorted_indices = partition_indices[
                    np.argsort(-logits_flat[partition_indices])
                ]
            else:
                sorted_indices = np.argsort(-logits_flat)
            topk_scores = logits_flat[sorted_indices]

            conf_mask = topk_scores > confidence
            sorted_indices = sorted_indices[conf_mask]
            topk_scores = topk_scores[conf_mask]

            # Recover the query index and the class index from the flat index.
            topk_boxes = sorted_indices // num_classes
            topk_labels = sorted_indices % num_classes

            if self.is_one_indexed:
                # Drop the background class and shift the labels above it down by one.
                class_filter_mask = topk_labels != self.background_class_index

                topk_labels[topk_labels > self.background_class_index] -= 1
                topk_scores = topk_scores[class_filter_mask]
                topk_labels = topk_labels[class_filter_mask]
                topk_boxes = topk_boxes[class_filter_mask]

            selected_boxes = bboxes[batch_idx, topk_boxes]

            # Boxes are handled as (cx, cy, w, h) and converted to corner form.
            cxcy = selected_boxes[:, :2]
            wh = selected_boxes[:, 2:]
            xy_min = cxcy - 0.5 * wh
            xy_max = cxcy + 0.5 * wh
            boxes_xyxy = np.concatenate([xy_min, xy_max], axis=1)

            if self.resize_method == "Stretch to":
                scale_fct = np.array([orig_w, orig_h, orig_w, orig_h], dtype=np.float32)
                boxes_xyxy *= scale_fct
            else:
                if self._needs_nonsquare_preproc:
                    input_h, input_w = self._preproc_resize_h, self._preproc_resize_w
                else:
                    input_h, input_w = self.img_size_h, self.img_size_w

                scale = min(input_w / orig_w, input_h / orig_h)
                scaled_w = int(orig_w * scale)
                scaled_h = int(orig_h * scale)

                pad_x = (input_w - scaled_w) / 2
                pad_y = (input_h - scaled_h) / 2

                boxes_input = boxes_xyxy * np.array(
                    [input_w, input_h, input_w, input_h], dtype=np.float32
                )

                # Undo the letterbox padding, then the letterbox scaling.
                boxes_input[:, 0] -= pad_x
                boxes_input[:, 1] -= pad_y
                boxes_input[:, 2] -= pad_x
                boxes_input[:, 3] -= pad_y

                boxes_xyxy = boxes_input / scale

            np.clip(
                boxes_xyxy,
                [0, 0, 0, 0],
                [orig_w, orig_h, orig_w, orig_h],
                out=boxes_xyxy,
            )

            # Row layout: x1, y1, x2, y2, score, zero placeholder, class index.
            batch_predictions = np.column_stack(
                (
                    boxes_xyxy,
                    topk_scores,
                    np.zeros((len(topk_scores), 1), dtype=np.float32),
                    topk_labels,
                )
            )
            # Discard rows whose class index has no entry in class_names.
            batch_predictions = batch_predictions[
                batch_predictions[:, 6] < len(self.class_names)
            ]

            processed_predictions.append(batch_predictions)

        res = self.make_response(processed_predictions, img_dims, **kwargs)
        return res

    def initialize_model(self, **kwargs) -> None:
        """Initializes the ONNX model, setting up the inference session and other necessary properties.

        The method proceeds in these stages:

        1. Fetches the model artefacts via ``get_model_artifacts``.
        2. Resolves the input resolution (``RESOLUTION`` from the environment,
           falling back to the ``resize`` preprocessing width) and refuses to
           load when it is greater than or equal to ``RFDETR_ONNX_MAX_RESOLUTION``.
        3. Either creates an ONNX Runtime session and reads batch size and
           input height/width from its first input (writing them to memcache),
           or reads those values back from memcache.
        4. Detects the "non-square preprocessing" case and records the
           intermediate resize dimensions.
        5. Removes ``ROBOFLOW_BACKGROUND_CLASS`` from ``class_names`` when present.

        Args:
            **kwargs: Forwarded unchanged to ``get_model_artifacts``.

        Raises:
            CannotInitialiseModelDueToInputSizeError: If the resolved input
                resolution is at or above ``RFDETR_ONNX_MAX_RESOLUTION``.
            ModelArtefactError: If the ONNX Runtime session cannot be created.
            ValueError: If the metadata branch is reached without metadata
                being available (guard for a state that should not occur).
        """
        logger.debug("Getting model artefacts")
        self.get_model_artifacts(**kwargs)

        # Prefer the explicit environment value; otherwise use the width of the
        # "resize" preprocessing step.
        input_resolution = self.environment.get("RESOLUTION")
        if input_resolution is None:
            input_resolution = self.preproc.get("resize", {}).get("width")
        if isinstance(input_resolution, (list, tuple)):
            input_resolution = input_resolution[0]
        try:
            input_resolution = int(input_resolution)
        except (TypeError, ValueError):
            # Missing or non-numeric resolution: the size guard below is skipped.
            input_resolution = None
        if (
            input_resolution is not None
            and input_resolution >= RFDETR_ONNX_MAX_RESOLUTION
        ):
            logger.error(
                "NOT loading '%s' model, input resolution is '%s', ONNX max resolution limit set to '%s' (limit can be increased via RFDETR_ONNX_MAX_RESOLUTION env variable)",
                self.endpoint,
                input_resolution,
                RFDETR_ONNX_MAX_RESOLUTION,
            )
            raise CannotInitialiseModelDueToInputSizeError(
                "Resolution too high for RFDETR"
            )

        logger.debug("Creating inference session")
        if self.load_weights or not self.has_model_metadata:
            t1_session = perf_counter()
            providers = get_onnxruntime_execution_providers(
                ONNXRUNTIME_EXECUTION_PROVIDERS
            )

            if not self.load_weights:
                providers = [
                    "CPUExecutionProvider"
                ]  # "OpenVINOExecutionProvider" dropped until further investigation is done

            try:
                session_options = onnxruntime.SessionOptions()
                session_options.log_severity_level = 3
                # TensorRT does better graph optimization for its EP than onnx
                if has_trt(providers):
                    session_options.graph_optimization_level = (
                        onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
                    )
                # When weights are not going to be kept, the session exists only
                # to read input metadata and is deleted below, so it is built
                # from the CPU-only list chosen above. Previously that override
                # was computed but the session still used the instance providers.
                if self.load_weights:
                    session_providers = self.onnxruntime_execution_providers
                else:
                    session_providers = providers
                expanded_execution_providers = []
                for ep in session_providers:
                    if ep == "TensorrtExecutionProvider":
                        # Replace the bare provider name with a
                        # (name, options) pair carrying the TensorRT settings.
                        ep = (
                            "TensorrtExecutionProvider",
                            {
                                "trt_max_workspace_size": str(1 << 30),
                                "trt_engine_cache_enable": True,
                                "trt_engine_cache_path": os.path.join(
                                    TENSORRT_CACHE_PATH, self.endpoint
                                ),
                                "trt_fp16_enable": True,
                                "trt_dump_subgraphs": False,
                                "trt_force_sequential_engine_build": False,
                                "trt_dla_enable": False,
                            },
                        )
                    expanded_execution_providers.append(ep)

                if "OpenVINOExecutionProvider" in expanded_execution_providers:
                    expanded_execution_providers.remove("OpenVINOExecutionProvider")

                self.onnx_session = onnxruntime.InferenceSession(
                    self.cache_file(self.weights_file),
                    providers=expanded_execution_providers,
                    sess_options=session_options,
                )
            except Exception as e:
                # Clear the cache before surfacing the failure.
                self.clear_cache(delete_from_disk=DISK_CACHE_CLEANUP)
                raise ModelArtefactError(
                    f"Unable to load ONNX session. Cause: {e}"
                ) from e
            logger.debug(f"Session created in {perf_counter() - t1_session} seconds")

            # Batch size and spatial size are read from the first model input;
            # indices 0, 2 and 3 of its shape are used.
            inputs = self.onnx_session.get_inputs()[0]
            input_shape = inputs.shape
            self.batch_size = input_shape[0]
            self.img_size_h = input_shape[2]
            self.img_size_w = input_shape[3]
            self.input_name = inputs.name
            if isinstance(self.img_size_h, str) or isinstance(self.img_size_w, str):
                # A string dimension cannot be used as a size: fall back to the
                # "resize" preprocessing dimensions, or 640x640 without one.
                if "resize" in self.preproc:
                    self.img_size_h = int(self.preproc["resize"]["height"])
                    self.img_size_w = int(self.preproc["resize"]["width"])
                else:
                    self.img_size_h = 640
                    self.img_size_w = 640

            # A string batch dimension is treated as dynamic batching.
            if isinstance(self.batch_size, str):
                self.batching_enabled = True
                logger.debug(
                    f"Model {self.endpoint} is loaded with dynamic batching enabled"
                )
            else:
                self.batching_enabled = False
                logger.debug(
                    f"Model {self.endpoint} is loaded with dynamic batching disabled"
                )

            model_metadata = {
                "batch_size": self.batch_size,
                "img_size_h": self.img_size_h,
                "img_size_w": self.img_size_w,
            }
            logger.debug("Writing model metadata to memcache")
            self.write_model_metadata_to_memcache(model_metadata)
            if not self.load_weights:  # had to load weights to get metadata
                del self.onnx_session
        else:
            if not self.has_model_metadata:
                raise ValueError(
                    "This should be unreachable, should get weights if we don't have model metadata"
                )
            logger.debug("Loading model metadata from memcache")
            metadata = self.model_metadata_from_memcache()
            self.batch_size = metadata["batch_size"]
            self.img_size_h = metadata["img_size_h"]
            self.img_size_w = metadata["img_size_w"]
            if isinstance(self.batch_size, str):
                self.batching_enabled = True
                logger.debug(
                    f"Model {self.endpoint} is loaded with dynamic batching enabled"
                )
            else:
                self.batching_enabled = False
                logger.debug(
                    f"Model {self.endpoint} is loaded with dynamic batching disabled"
                )

        # Non-square preprocessing: the configured resize is non-square, the
        # model input is square, and the resize method is not "Stretch to".
        self._needs_nonsquare_preproc = False
        if self.preproc.get("resize"):
            preproc_w = int(self.preproc["resize"].get("width", self.img_size_w))
            preproc_h = int(self.preproc["resize"].get("height", self.img_size_h))
            self._needs_nonsquare_preproc = (
                self.resize_method != "Stretch to"
                and preproc_w != preproc_h
                and self.img_size_h == self.img_size_w
            )
            if self._needs_nonsquare_preproc:
                self._preproc_resize_w = preproc_w
                self._preproc_resize_h = preproc_h
                logger.debug(
                    "Non-square preprocessing detected: resize to %dx%d then stretch to %dx%d",
                    preproc_w,
                    preproc_h,
                    self.img_size_w,
                    self.img_size_h,
                )

        # Drop the background class from class_names, remembering its index.
        if ROBOFLOW_BACKGROUND_CLASS in self.class_names:
            self.is_one_indexed = True
            self.background_class_index = self.class_names.index(
                ROBOFLOW_BACKGROUND_CLASS
            )
            self.class_names = (
                self.class_names[: self.background_class_index]
                + self.class_names[self.background_class_index + 1 :]
            )
        else:
            self.is_one_indexed = False
        logger.debug("Model initialisation finished.")

    def validate_model_classes(self) -> None:
        """No-op: performs no validation of the model's classes."""
        return None

Attributes¶

weights_file `property` ¶

weights_file

Gets the weights file for the RFDETR model.

Returns:

Name	Type	Description
`str`	`str`	Path to the ONNX weights file.

Methods:¶

initialize_model ¶

initialize_model(**kwargs)

Initializes the ONNX model, setting up the inference session and other necessary properties.

Source code in inference/models/rfdetr/rfdetr.py

def initialize_model(self, **kwargs) -> None:
    """Initializes the ONNX model, setting up the inference session and other necessary properties.

    The method proceeds in these stages:

    1. Fetches the model artefacts via ``get_model_artifacts``.
    2. Resolves the input resolution (``RESOLUTION`` from the environment,
       falling back to the ``resize`` preprocessing width) and refuses to
       load when it is greater than or equal to ``RFDETR_ONNX_MAX_RESOLUTION``.
    3. Either creates an ONNX Runtime session and reads batch size and
       input height/width from its first input (writing them to memcache),
       or reads those values back from memcache.
    4. Detects the "non-square preprocessing" case and records the
       intermediate resize dimensions.
    5. Removes ``ROBOFLOW_BACKGROUND_CLASS`` from ``class_names`` when present.

    Args:
        **kwargs: Forwarded unchanged to ``get_model_artifacts``.

    Raises:
        CannotInitialiseModelDueToInputSizeError: If the resolved input
            resolution is at or above ``RFDETR_ONNX_MAX_RESOLUTION``.
        ModelArtefactError: If the ONNX Runtime session cannot be created.
        ValueError: If the metadata branch is reached without metadata
            being available (guard for a state that should not occur).
    """
    logger.debug("Getting model artefacts")
    self.get_model_artifacts(**kwargs)

    # Prefer the explicit environment value; otherwise use the width of the
    # "resize" preprocessing step.
    input_resolution = self.environment.get("RESOLUTION")
    if input_resolution is None:
        input_resolution = self.preproc.get("resize", {}).get("width")
    if isinstance(input_resolution, (list, tuple)):
        input_resolution = input_resolution[0]
    try:
        input_resolution = int(input_resolution)
    except (TypeError, ValueError):
        # Missing or non-numeric resolution: the size guard below is skipped.
        input_resolution = None
    if (
        input_resolution is not None
        and input_resolution >= RFDETR_ONNX_MAX_RESOLUTION
    ):
        logger.error(
            "NOT loading '%s' model, input resolution is '%s', ONNX max resolution limit set to '%s' (limit can be increased via RFDETR_ONNX_MAX_RESOLUTION env variable)",
            self.endpoint,
            input_resolution,
            RFDETR_ONNX_MAX_RESOLUTION,
        )
        raise CannotInitialiseModelDueToInputSizeError(
            "Resolution too high for RFDETR"
        )

    logger.debug("Creating inference session")
    if self.load_weights or not self.has_model_metadata:
        t1_session = perf_counter()
        providers = get_onnxruntime_execution_providers(
            ONNXRUNTIME_EXECUTION_PROVIDERS
        )

        if not self.load_weights:
            providers = [
                "CPUExecutionProvider"
            ]  # "OpenVINOExecutionProvider" dropped until further investigation is done

        try:
            session_options = onnxruntime.SessionOptions()
            session_options.log_severity_level = 3
            # TensorRT does better graph optimization for its EP than onnx
            if has_trt(providers):
                session_options.graph_optimization_level = (
                    onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
                )
            # When weights are not going to be kept, the session exists only
            # to read input metadata and is deleted below, so it is built
            # from the CPU-only list chosen above. Previously that override
            # was computed but the session still used the instance providers.
            if self.load_weights:
                session_providers = self.onnxruntime_execution_providers
            else:
                session_providers = providers
            expanded_execution_providers = []
            for ep in session_providers:
                if ep == "TensorrtExecutionProvider":
                    # Replace the bare provider name with a
                    # (name, options) pair carrying the TensorRT settings.
                    ep = (
                        "TensorrtExecutionProvider",
                        {
                            "trt_max_workspace_size": str(1 << 30),
                            "trt_engine_cache_enable": True,
                            "trt_engine_cache_path": os.path.join(
                                TENSORRT_CACHE_PATH, self.endpoint
                            ),
                            "trt_fp16_enable": True,
                            "trt_dump_subgraphs": False,
                            "trt_force_sequential_engine_build": False,
                            "trt_dla_enable": False,
                        },
                    )
                expanded_execution_providers.append(ep)

            if "OpenVINOExecutionProvider" in expanded_execution_providers:
                expanded_execution_providers.remove("OpenVINOExecutionProvider")

            self.onnx_session = onnxruntime.InferenceSession(
                self.cache_file(self.weights_file),
                providers=expanded_execution_providers,
                sess_options=session_options,
            )
        except Exception as e:
            # Clear the cache before surfacing the failure.
            self.clear_cache(delete_from_disk=DISK_CACHE_CLEANUP)
            raise ModelArtefactError(
                f"Unable to load ONNX session. Cause: {e}"
            ) from e
        logger.debug(f"Session created in {perf_counter() - t1_session} seconds")

        # Batch size and spatial size are read from the first model input;
        # indices 0, 2 and 3 of its shape are used.
        inputs = self.onnx_session.get_inputs()[0]
        input_shape = inputs.shape
        self.batch_size = input_shape[0]
        self.img_size_h = input_shape[2]
        self.img_size_w = input_shape[3]
        self.input_name = inputs.name
        if isinstance(self.img_size_h, str) or isinstance(self.img_size_w, str):
            # A string dimension cannot be used as a size: fall back to the
            # "resize" preprocessing dimensions, or 640x640 without one.
            if "resize" in self.preproc:
                self.img_size_h = int(self.preproc["resize"]["height"])
                self.img_size_w = int(self.preproc["resize"]["width"])
            else:
                self.img_size_h = 640
                self.img_size_w = 640

        # A string batch dimension is treated as dynamic batching.
        if isinstance(self.batch_size, str):
            self.batching_enabled = True
            logger.debug(
                f"Model {self.endpoint} is loaded with dynamic batching enabled"
            )
        else:
            self.batching_enabled = False
            logger.debug(
                f"Model {self.endpoint} is loaded with dynamic batching disabled"
            )

        model_metadata = {
            "batch_size": self.batch_size,
            "img_size_h": self.img_size_h,
            "img_size_w": self.img_size_w,
        }
        logger.debug("Writing model metadata to memcache")
        self.write_model_metadata_to_memcache(model_metadata)
        if not self.load_weights:  # had to load weights to get metadata
            del self.onnx_session
    else:
        if not self.has_model_metadata:
            raise ValueError(
                "This should be unreachable, should get weights if we don't have model metadata"
            )
        logger.debug("Loading model metadata from memcache")
        metadata = self.model_metadata_from_memcache()
        self.batch_size = metadata["batch_size"]
        self.img_size_h = metadata["img_size_h"]
        self.img_size_w = metadata["img_size_w"]
        if isinstance(self.batch_size, str):
            self.batching_enabled = True
            logger.debug(
                f"Model {self.endpoint} is loaded with dynamic batching enabled"
            )
        else:
            self.batching_enabled = False
            logger.debug(
                f"Model {self.endpoint} is loaded with dynamic batching disabled"
            )

    # Non-square preprocessing: the configured resize is non-square, the
    # model input is square, and the resize method is not "Stretch to".
    self._needs_nonsquare_preproc = False
    if self.preproc.get("resize"):
        preproc_w = int(self.preproc["resize"].get("width", self.img_size_w))
        preproc_h = int(self.preproc["resize"].get("height", self.img_size_h))
        self._needs_nonsquare_preproc = (
            self.resize_method != "Stretch to"
            and preproc_w != preproc_h
            and self.img_size_h == self.img_size_w
        )
        if self._needs_nonsquare_preproc:
            self._preproc_resize_w = preproc_w
            self._preproc_resize_h = preproc_h
            logger.debug(
                "Non-square preprocessing detected: resize to %dx%d then stretch to %dx%d",
                preproc_w,
                preproc_h,
                self.img_size_w,
                self.img_size_h,
            )

    # Drop the background class from class_names, remembering its index.
    if ROBOFLOW_BACKGROUND_CLASS in self.class_names:
        self.is_one_indexed = True
        self.background_class_index = self.class_names.index(
            ROBOFLOW_BACKGROUND_CLASS
        )
        self.class_names = (
            self.class_names[: self.background_class_index]
            + self.class_names[self.background_class_index + 1 :]
        )
    else:
        self.is_one_indexed = False
    logger.debug("Model initialisation finished.")

predict ¶

predict(img_in, **kwargs)

Performs object detection on the given image using the ONNX session with the RFDETR model.

Parameters:

Name	Type	Description	Default
`img_in`	`ndarray`	Input image as a NumPy array.	required

Returns:

Type	Description
`Tuple[ndarray]`	Tuple[np.ndarray]: NumPy array representing the predictions, including boxes, confidence scores, and class IDs.

Source code in inference/models/rfdetr/rfdetr.py

def predict(self, img_in: ImageMetaType, **kwargs) -> Tuple[np.ndarray]:
    """Runs the RFDETR ONNX session on an already preprocessed input.

    The session call is made while holding ``self._session_lock``.

    Args:
        img_in (np.ndarray): Input passed to the session under ``self.input_name``.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        Tuple[np.ndarray]: A two-element tuple ``(bboxes, logits)`` built from the
            first and second outputs of the session.
    """
    with self._session_lock:
        session_outputs = run_session_via_iobinding(
            self.onnx_session,
            self.input_name,
            img_in,
        )
    # Only the first two outputs are used: boxes first, then logits.
    bboxes, logits = session_outputs[0], session_outputs[1]
    return bboxes, logits

preproc_image ¶

preproc_image(
    image,
    disable_preproc_auto_orient=False,
    disable_preproc_contrast=False,
    disable_preproc_grayscale=False,
    disable_preproc_static_crop=False,
)

Preprocesses an inference request image by loading it, then applying any pre-processing specified by the Roboflow platform, then scaling it to the inference input dimensions.

Parameters:

Name	Type	Description	Default
`image`	`Union[Any, InferenceRequestImage]`	An object containing information necessary to load the image for inference.	required
`disable_preproc_auto_orient`	`bool`	If true, the auto orient preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_contrast`	`bool`	If true, the contrast preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_grayscale`	`bool`	If true, the grayscale preprocessing step is disabled for this call. Default is False.	`False`
`disable_preproc_static_crop`	`bool`	If true, the static crop preprocessing step is disabled for this call. Default is False.	`False`

Returns:

Type	Description
`Tuple[ndarray, Tuple[int, int]]`	Tuple[np.ndarray, Tuple[int, int]]: A tuple containing a numpy array of the preprocessed image pixel data and a tuple of the image's original size.

Source code in inference/models/rfdetr/rfdetr.py

def preproc_image(
    self,
    image: Union[Any, InferenceRequestImage],
    disable_preproc_auto_orient: bool = False,
    disable_preproc_contrast: bool = False,
    disable_preproc_grayscale: bool = False,
    disable_preproc_static_crop: bool = False,
) -> Tuple[np.ndarray, Tuple[int, int]]:
    """
    Preprocesses an inference request image by loading it, then applying any pre-processing specified by the Roboflow platform, then scaling it to the inference input dimensions.

    The pipeline is: load the image, run ``self.preprocess_image``, scale to
    [0, 1] and normalise with ``self.preprocess_means`` / ``self.preprocess_stds``,
    resize according to ``self.resize_method`` (with an extra stretch when
    non-square preprocessing is active), swap BGR to RGB when the loader
    reported BGR, and finally lay the data out as a batch of one in
    channel-first order.

    Args:
        image (Union[Any, InferenceRequestImage]): An object containing information necessary to load the image for inference.
        disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
        disable_preproc_contrast (bool, optional): If true, the contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.

    Returns:
        Tuple[np.ndarray, Tuple[int, int]]: A tuple containing a numpy array of the preprocessed image pixel data and a tuple of the image's original size.

    Raises:
        ValueError: If ``self.resize_method`` is not one of the supported
            methods, or if an intermediate image is neither a NumPy array nor
            (with PyTorch preprocessing enabled) a tensor.
    """
    if isinstance(image, Image.Image) and USE_PYTORCH_FOR_PREPROCESSING:
        # PIL input with PyTorch preprocessing: wrap the pixels as a tensor
        # directly; this path treats the data as not being BGR.
        if CUDA_IS_AVAILABLE:
            np_image = torch.from_numpy(np.asarray(image, copy=False)).cuda()
        else:
            np_image = torch.from_numpy(np.asarray(image, copy=False))
        is_bgr = False
    else:
        # Auto-orient is disabled when requested by the caller, when the
        # model's preprocessing config has no "auto-orient" entry, or globally.
        np_image, is_bgr = load_image(
            image,
            disable_preproc_auto_orient=disable_preproc_auto_orient
            or "auto-orient" not in self.preproc.keys()
            or DISABLE_PREPROC_AUTO_ORIENT,
        )
    if USE_PYTORCH_FOR_PREPROCESSING:
        if not isinstance(np_image, torch.Tensor):
            np_image = torch.from_numpy(np_image)
        if torch.cuda.is_available():
            np_image = np_image.cuda()

    preprocessed_image, img_dims = self.preprocess_image(
        np_image,
        disable_preproc_contrast=disable_preproc_contrast,
        disable_preproc_grayscale=disable_preproc_grayscale,
        disable_preproc_static_crop=disable_preproc_static_crop,
    )

    # NOTE(review): normalisation below is applied by channel index before the
    # BGR->RGB swap further down; confirm that the ordering of
    # preprocess_means / preprocess_stds matches the channel order at this point.
    if USE_PYTORCH_FOR_PREPROCESSING:
        # Move channels first and add a batch axis before normalising.
        preprocessed_image = (
            preprocessed_image.permute(2, 0, 1).unsqueeze(0).contiguous()
        )
        preprocessed_image = preprocessed_image.float()

        preprocessed_image /= 255.0

        means = torch.tensor(
            self.preprocess_means, device=preprocessed_image.device
        ).view(3, 1, 1)
        stds = torch.tensor(
            self.preprocess_stds, device=preprocessed_image.device
        ).view(3, 1, 1)
        preprocessed_image = (preprocessed_image - means) / stds
    else:
        preprocessed_image = preprocessed_image.astype(np.float32)
        preprocessed_image /= 255.0

        preprocessed_image[:, :, 0] = (
            preprocessed_image[:, :, 0] - self.preprocess_means[0]
        ) / self.preprocess_stds[0]
        preprocessed_image[:, :, 1] = (
            preprocessed_image[:, :, 1] - self.preprocess_means[1]
        ) / self.preprocess_stds[1]
        preprocessed_image[:, :, 2] = (
            preprocessed_image[:, :, 2] - self.preprocess_means[2]
        ) / self.preprocess_stds[2]

    # With non-square preprocessing the image is first letterboxed to the
    # configured resize dimensions and stretched to the model input afterwards.
    if self._needs_nonsquare_preproc:
        intermediate_size = (self._preproc_resize_w, self._preproc_resize_h)
    else:
        intermediate_size = None

    if self.resize_method == "Stretch to":
        if isinstance(preprocessed_image, np.ndarray):
            preprocessed_image = preprocessed_image.astype(np.float32)
            resized = cv2.resize(
                preprocessed_image,
                (self.img_size_w, self.img_size_h),
            )
        elif USE_PYTORCH_FOR_PREPROCESSING:
            resized = torch.nn.functional.interpolate(
                preprocessed_image,
                size=(self.img_size_h, self.img_size_w),
                mode="bilinear",
            )
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(preprocessed_image)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

    elif self.resize_method == "Fit (black edges) in":
        resized = letterbox_image(
            preprocessed_image,
            intermediate_size or (self.img_size_w, self.img_size_h),
        )
    elif self.resize_method == "Fit (white edges) in":
        resized = letterbox_image(
            preprocessed_image,
            intermediate_size or (self.img_size_w, self.img_size_h),
            color=(255, 255, 255),
        )
    elif self.resize_method == "Fit (grey edges) in":
        resized = letterbox_image(
            preprocessed_image,
            intermediate_size or (self.img_size_w, self.img_size_h),
            color=(114, 114, 114),
        )
    else:
        # Without this branch `resized` would stay unbound and the code below
        # would fail with an UnboundLocalError that hides the real cause.
        raise ValueError(f"Unsupported resize method: {self.resize_method!r}")

    if intermediate_size is not None:
        # Second stage of non-square preprocessing: stretch to the model input.
        if isinstance(resized, np.ndarray):
            resized = cv2.resize(
                resized.astype(np.float32),
                (self.img_size_w, self.img_size_h),
            )
        elif USE_PYTORCH_FOR_PREPROCESSING:
            resized = torch.nn.functional.interpolate(
                resized,
                size=(self.img_size_h, self.img_size_w),
                mode="bilinear",
            )
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(resized)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

    if is_bgr:
        if isinstance(resized, np.ndarray):
            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        else:
            # Tensor path: reverse the channel axis (axis 1).
            resized = resized[:, [2, 1, 0], :, :]

    if isinstance(resized, np.ndarray):
        # Channels first, float32, with a leading batch axis of size one.
        img_in = np.transpose(resized, (2, 0, 1))
        img_in = img_in.astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
    elif USE_PYTORCH_FOR_PREPROCESSING:
        img_in = resized.float()
    else:
        raise ValueError(
            f"Received an image of unknown type, {type(resized)}; "
            "This is most likely a bug. Contact Roboflow team through github issues "
            "(https://github.com/roboflow/inference/issues) providing full context of the problem"
        )
    return img_in, img_dims

Functions:¶

`models/sam`¶

inference.models.sam.segment_anything ¶

Classes¶

SegmentAnything ¶

Bases: RoboflowCoreModel

SegmentAnything class for handling segmentation tasks.

Attributes:

Name	Type	Description
`sam`		The segmentation model.
`predictor`		The predictor for the segmentation model.
`ort_session`		ONNX runtime inference session.
`embedding_cache`		Cache for embeddings.
`image_size_cache`		Cache for image sizes.
`embedding_cache_keys`		Keys for the embedding cache.
`low_res_logits_cache`		Cache for low resolution logits.
`segmentation_cache_keys`		Keys for the segmentation cache.

Source code in inference/models/sam/segment_anything.py

class SegmentAnything(RoboflowCoreModel):
    """SegmentAnything class for handling segmentation tasks.

    Attributes:
        sam: The segmentation model.
        predictor: The predictor for the segmentation model.
        ort_session: ONNX runtime inference session.
        embedding_cache: Cache for embeddings.
        image_size_cache: Cache for image sizes.
        embedding_cache_keys: Keys for the embedding cache.
        low_res_logits_cache: Cache for low resolution logits.
        segmentation_cache_keys: Keys for the segmentation cache.
    """

    def __init__(self, *args, model_id: str = f"sam/{SAM_VERSION_ID}", **kwargs):
        """Builds the SAM encoder, its predictor, the ONNX decoder session and empty caches.

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            model_id (str): Model identifier forwarded to the parent
                initializer. Defaults to ``sam/{SAM_VERSION_ID}``.
            **kwargs: Keyword arguments forwarded to the parent initializer.
        """
        super().__init__(*args, model_id=model_id, **kwargs)

        # Encoder: look up the builder for this version and load the checkpoint.
        build_sam = sam_model_registry[self.version_id]
        self.sam = build_sam(checkpoint=self.cache_file("encoder.pth"))
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        self.sam.to(device=target_device)
        self.predictor = SamPredictor(self.sam)

        # Decoder: ONNX Runtime session with the providers listed in this order.
        decoder_providers = [
            "CUDAExecutionProvider",
            "OpenVINOExecutionProvider",
            "CPUExecutionProvider",
        ]
        self.ort_session = onnxruntime.InferenceSession(
            self.cache_file("decoder.onnx"),
            providers=decoder_providers,
        )

        self._state_lock = Lock()

        # Embedding caches and the list of their keys in insertion order.
        self.embedding_cache = {}
        self.image_size_cache = {}
        self.embedding_cache_keys = []

        # Low-resolution logits cache and its key list.
        self.low_res_logits_cache = {}
        self.segmentation_cache_keys = []

        self.task_type = "unsupervised-segmentation"

    def get_infer_bucket_file_list(self) -> List[str]:
        """Lists the files this model needs for inference.

        Returns:
            List[str]: The encoder checkpoint and decoder file names, in that order.
        """
        required_files = ["encoder.pth", "decoder.onnx"]
        return required_files

    def embed_image(self, image: Any, image_id: Optional[str] = None, **kwargs):
        """
        Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached,
        the cached result will be returned.

        Args:
            image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
            image_id (Optional[str]): An identifier for the image. If provided, the embedding result will be cached
                                      with this ID. Defaults to None.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image
                                               and the second element is the shape (height, width) of the processed image.

        Notes:
            - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
            - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
              the oldest entries are removed.

        Example:
            >>> img_array = ... # some image array
            >>> embed_image(img_array, image_id="sample123")
            (array([...]), (224, 224))
        """
        if image_id and image_id in self.embedding_cache:
            return (
                self.embedding_cache[image_id],
                self.image_size_cache[image_id],
            )
        img_in = self.preproc_image(image)
        self.predictor.set_image(img_in)
        embedding = self.predictor.get_image_embedding().cpu().numpy()
        if image_id:
            self.embedding_cache[image_id] = embedding
            self.image_size_cache[image_id] = img_in.shape[:2]
            self.embedding_cache_keys.append(image_id)
            if len(self.embedding_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
                cache_key = self.embedding_cache_keys.pop(0)
                del self.embedding_cache[cache_key]
                del self.image_size_cache[cache_key]
        return (embedding, img_in.shape[:2])

    def infer_from_request(self, request: SamInferenceRequest):
        """Performs inference based on the request type.

        The whole call runs while holding ``self._state_lock``.

        Args:
            request (SamInferenceRequest): The inference request. Embedding and
                segmentation requests are handled; ``request.format`` selects
                between ``"json"`` and ``"binary"`` output.

        Returns:
            Union[SamEmbeddingResponse, SamSegmentationResponse, bytes, None]:
                For embedding requests, a ``SamEmbeddingResponse``. For
                segmentation requests, a ``SamSegmentationResponse`` when the
                format is ``"json"`` or the raw compressed ``.npz`` bytes when
                it is ``"binary"``. ``None`` when the request is of neither
                handled type.

        Raises:
            ValueError: If ``request.format`` is neither ``"json"`` nor
                ``"binary"``.
        """
        with self._state_lock:
            t1 = perf_counter()
            if isinstance(request, SamEmbeddingRequest):
                embedding, _ = self.embed_image(**request.dict())
                inference_time = perf_counter() - t1
                if request.format == "json":
                    return SamEmbeddingResponse(
                        embeddings=embedding.tolist(), time=inference_time
                    )
                elif request.format == "binary":
                    # Serialise the array in NumPy's .npy format.
                    binary_vector = BytesIO()
                    np.save(binary_vector, embedding)
                    binary_vector.seek(0)
                    return SamEmbeddingResponse(
                        embeddings=binary_vector.getvalue(), time=inference_time
                    )
                else:
                    # Match the segmentation branch instead of silently
                    # returning None for an unknown format.
                    raise ValueError(f"Invalid format {request.format}")
            elif isinstance(request, SamSegmentationRequest):
                masks, low_res_masks = self.segment_image(**request.dict())
                if request.format == "json":
                    # Threshold the masks, then convert them to polygons.
                    masks = masks > self.predictor.model.mask_threshold
                    masks = masks2poly(masks)
                    low_res_masks = low_res_masks > self.predictor.model.mask_threshold
                    low_res_masks = masks2poly(low_res_masks)
                elif request.format == "binary":
                    # Binary output returns the unthresholded masks as .npz bytes.
                    binary_vector = BytesIO()
                    np.savez_compressed(
                        binary_vector, masks=masks, low_res_masks=low_res_masks
                    )
                    binary_vector.seek(0)
                    binary_data = binary_vector.getvalue()
                    return binary_data
                else:
                    raise ValueError(f"Invalid format {request.format}")

                response = SamSegmentationResponse(
                    masks=[m.tolist() for m in masks],
                    low_res_masks=[m.tolist() for m in low_res_masks],
                    time=perf_counter() - t1,
                )
                return response

    def preproc_image(self, image: InferenceRequestImage):
        """Loads the request image via ``load_image_rgb``.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            np.array: The result of ``load_image_rgb`` for the given image.
        """
        return load_image_rgb(image)

    def segment_image(
        self,
        image: Any,
        embeddings: Optional[Union[np.ndarray, List[List[float]]]] = None,
        embeddings_format: Optional[str] = "json",
        has_mask_input: Optional[bool] = False,
        image_id: Optional[str] = None,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        mask_input_format: Optional[str] = "json",
        orig_im_size: Optional[List[int]] = None,
        point_coords: Optional[List[List[float]]] = None,
        point_labels: Optional[List[int]] = None,
        use_mask_input_cache: Optional[bool] = True,
        **kwargs,
    ):
        """
        Segments an image based on provided embeddings, points, masks, or cached results.
        If embeddings are not directly provided, the function can derive them from the input image or cache.

        Args:
            image (Any): The image to be segmented.
            embeddings (Optional[Union[np.ndarray, List[List[float]]]]): The embeddings of the image.
                Defaults to None, in which case the image is used to compute embeddings.
            embeddings_format (Optional[str]): Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.
            has_mask_input (Optional[bool]): Specifies whether mask input is provided. Defaults to False.
            image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
            mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input mask for the image.
            mask_input_format (Optional[str]): Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.
            orig_im_size (Optional[List[int]]): Original size of the image when providing embeddings directly.
            point_coords (Optional[List[List[float]]]): Coordinates of points in the image. Defaults to None (no points).
            point_labels (Optional[List[int]]): Labels associated with the provided points. Defaults to None (no labels).
            use_mask_input_cache (Optional[bool]): Flag to determine if cached mask input should be used. Defaults to True.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image
                                          and the second element is the low resolution segmentation masks.

        Raises:
            ValueError: If necessary inputs are missing or inconsistent.

        Notes:
            - Embeddings, segmentations, and low-resolution logits can be cached to improve performance
              on repeated requests for the same image.
            - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
              the oldest entries are removed.
        """
        if not embeddings:
            if not image and not image_id:
                raise ValueError(
                    "Must provide either image, cached image_id, or embeddings"
                )
            elif image_id and not image and image_id not in self.embedding_cache:
                raise ValueError(
                    f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
                )
            embedding, original_image_size = self.embed_image(
                image=image, image_id=image_id
            )
        else:
            if not orig_im_size:
                raise ValueError(
                    "Must provide original image size if providing embeddings"
                )
            original_image_size = orig_im_size
            if embeddings_format == "json":
                embedding = np.array(embeddings)
            elif embeddings_format == "binary":
                embedding = np.load(BytesIO(embeddings))

        point_coords = list(point_coords) if point_coords is not None else []
        point_coords.append([0, 0])
        point_coords = np.array(point_coords, dtype=np.float32)
        point_coords = np.expand_dims(point_coords, axis=0)
        point_coords = self.predictor.transform.apply_coords(
            point_coords,
            original_image_size,
        )

        point_labels = list(point_labels) if point_labels is not None else []
        point_labels.append(-1)
        point_labels = np.array(point_labels, dtype=np.float32)
        point_labels = np.expand_dims(point_labels, axis=0)

        if has_mask_input:
            if (
                image_id
                and image_id in self.low_res_logits_cache
                and use_mask_input_cache
            ):
                mask_input = self.low_res_logits_cache[image_id]
            elif not mask_input and (
                not image_id or image_id not in self.low_res_logits_cache
            ):
                raise ValueError("Must provide either mask_input or cached image_id")
            else:
                if mask_input_format == "json":
                    polys = mask_input
                    mask_input = np.zeros((1, len(polys), 256, 256), dtype=np.uint8)
                    for i, poly in enumerate(polys):
                        poly = ShapelyPolygon(poly)
                        raster = rasterio.features.rasterize(
                            [poly], out_shape=(256, 256)
                        )
                        mask_input[0, i, :, :] = raster
                elif mask_input_format == "binary":
                    binary_data = base64.b64decode(mask_input)
                    mask_input = np.load(BytesIO(binary_data))
        else:
            mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)

        ort_inputs = {
            "image_embeddings": embedding.astype(np.float32),
            "point_coords": point_coords.astype(np.float32),
            "point_labels": point_labels,
            "mask_input": mask_input.astype(np.float32),
            "has_mask_input": (
                np.zeros(1, dtype=np.float32)
                if not has_mask_input
                else np.ones(1, dtype=np.float32)
            ),
            "orig_im_size": np.array(original_image_size, dtype=np.float32),
        }
        masks, _, low_res_logits = self.ort_session.run(None, ort_inputs)
        if image_id:
            self.low_res_logits_cache[image_id] = low_res_logits
            if image_id not in self.segmentation_cache_keys:
                self.segmentation_cache_keys.append(image_id)
            if len(self.segmentation_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
                cache_key = self.segmentation_cache_keys.pop(0)
                del self.low_res_logits_cache[cache_key]
        masks = masks[0]
        low_res_masks = low_res_logits[0]

        return masks, low_res_masks

Methods:¶

init ¶

__init__(*args, model_id=f'sam/{SAM_VERSION_ID}', **kwargs)

Initializes the SegmentAnything.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/sam/segment_anything.py

def __init__(self, *args, model_id: str = f"sam/{SAM_VERSION_ID}", **kwargs):
    """Set up the SAM encoder, the ONNX decoder session and the caches.

    Args:
        *args: Positional arguments forwarded to the parent constructor.
        model_id (str): Model identifier forwarded to the parent constructor.
            Defaults to ``f"sam/{SAM_VERSION_ID}"``.
        **kwargs: Keyword arguments forwarded to the parent constructor.
    """
    super().__init__(*args, model_id=model_id, **kwargs)

    # Encoder: built from the registry entry for this version, using the
    # cached "encoder.pth" checkpoint, and moved to CUDA when available.
    build_encoder = sam_model_registry[self.version_id]
    self.sam = build_encoder(checkpoint=self.cache_file("encoder.pth"))
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    self.sam.to(device=target_device)
    self.predictor = SamPredictor(self.sam)

    # Decoder: ONNX runtime session over the cached "decoder.onnx" file.
    execution_providers = [
        "CUDAExecutionProvider",
        "OpenVINOExecutionProvider",
        "CPUExecutionProvider",
    ]
    self.ort_session = onnxruntime.InferenceSession(
        self.cache_file("decoder.onnx"),
        providers=execution_providers,
    )

    # Lock taken by infer_from_request around each request.
    self._state_lock = Lock()

    # Embedding cache: embeddings and image sizes keyed by image_id, plus
    # the insertion-ordered key list used for eviction.
    self.embedding_cache = {}
    self.image_size_cache = {}
    self.embedding_cache_keys = []

    # Low-resolution logits cache and its insertion-ordered key list.
    self.low_res_logits_cache = {}
    self.segmentation_cache_keys = []

    self.task_type = "unsupervised-segmentation"

embed_image ¶

embed_image(image, image_id=None, **kwargs)

Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached, the cached result will be returned.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be embedded. The format should be compatible with the preproc_image method.	required
`image_id`	`Optional[str]`	An identifier for the image. If provided, the embedding result will be cached with this ID. Defaults to None.	`None`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
	Tuple[np.ndarray, Tuple[int, int]]: A tuple where the first element is the embedding of the image and the second element is the shape (height, width) of the processed image.

Notes

Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.

Example

>>> img_array = ...  # some image array
>>> embed_image(img_array, image_id="sample123")
(array([...]), (224, 224))

Source code in inference/models/sam/segment_anything.py

def embed_image(self, image: Any, image_id: Optional[str] = None, **kwargs):
    """
    Compute the embedding of an image, reusing and populating the cache when
    an image_id is given.

    Args:
        image (Any): The image to embed; it is passed to ``preproc_image``.
        image_id (Optional[str]): Cache key for the image. When it is already
            present in the embedding cache the cached values are returned and
            ``image`` is not processed. When given and not cached, the new
            result is stored under this key. Defaults to None (no caching).
        **kwargs: Additional keyword arguments (unused).

    Returns:
        Tuple[np.ndarray, Tuple[int, int]]: The image embedding and the first
        two dimensions of the preprocessed image's shape.

    Notes:
        - The cache holds at most SAM_MAX_EMBEDDING_CACHE_SIZE entries; once
          exceeded, the entry that was inserted first is dropped.

    Example:
        >>> img_array = ... # some image array
        >>> embed_image(img_array, image_id="sample123")
        (array([...]), (224, 224))
    """
    # Cache hit: nothing to compute.
    if image_id and image_id in self.embedding_cache:
        cached_embedding = self.embedding_cache[image_id]
        cached_size = self.image_size_cache[image_id]
        return (cached_embedding, cached_size)

    prepared = self.preproc_image(image)
    self.predictor.set_image(prepared)
    embedding = self.predictor.get_image_embedding().cpu().numpy()
    image_size = prepared.shape[:2]

    # Without an id there is nothing to cache under.
    if not image_id:
        return (embedding, image_size)

    self.embedding_cache[image_id] = embedding
    self.image_size_cache[image_id] = image_size
    self.embedding_cache_keys.append(image_id)
    if len(self.embedding_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
        # Evict the key that was inserted first.
        evicted_key = self.embedding_cache_keys.pop(0)
        del self.embedding_cache[evicted_key]
        del self.image_size_cache[evicted_key]
    return (embedding, image_size)

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Gets the list of files required for inference.

Returns:

Type	Description
`List[str]`	List[str]: List of file names.

Source code in inference/models/sam/segment_anything.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Return the names of the files this model needs for inference.

    Returns:
        List[str]: The encoder checkpoint and decoder ONNX file names.
    """
    required_files = ["encoder.pth", "decoder.onnx"]
    return required_files

infer_from_request ¶

infer_from_request(request)

Performs inference based on the request type.

Parameters:

Name	Type	Description	Default
`request`	`SamInferenceRequest`	The inference request.	required

Returns:

Type	Description
	Union[SamEmbeddingResponse, SamSegmentationResponse]: The inference response.

Source code in inference/models/sam/segment_anything.py

def infer_from_request(self, request: SamInferenceRequest):
    """Performs inference based on the request type.

    The whole request is handled while holding ``self._state_lock``.

    Args:
        request (SamInferenceRequest): The inference request; either a
            SamEmbeddingRequest or a SamSegmentationRequest.

    Returns:
        Union[SamEmbeddingResponse, SamSegmentationResponse, bytes]:
            - SamEmbeddingResponse for embedding requests (embeddings as a
              nested list for "json", or as ``np.save`` bytes for "binary").
            - SamSegmentationResponse with polygon masks for "json"
              segmentation requests.
            - Raw ``np.savez_compressed`` bytes (arrays ``masks`` and
              ``low_res_masks``) for "binary" segmentation requests.

    Raises:
        ValueError: If the request format or the request type is not supported.
    """
    with self._state_lock:
        t1 = perf_counter()
        if isinstance(request, SamEmbeddingRequest):
            embedding, _ = self.embed_image(**request.dict())
            inference_time = perf_counter() - t1
            if request.format == "json":
                return SamEmbeddingResponse(
                    embeddings=embedding.tolist(), time=inference_time
                )
            if request.format == "binary":
                binary_vector = BytesIO()
                np.save(binary_vector, embedding)
                # getvalue() returns the full buffer regardless of position.
                return SamEmbeddingResponse(
                    embeddings=binary_vector.getvalue(), time=inference_time
                )
            # Previously an unknown format silently returned None.
            raise ValueError(f"Invalid format {request.format}")

        if isinstance(request, SamSegmentationRequest):
            masks, low_res_masks = self.segment_image(**request.dict())
            if request.format == "json":
                # Binarize the logits, then convert each mask to polygons.
                mask_threshold = self.predictor.model.mask_threshold
                masks = masks2poly(masks > mask_threshold)
                low_res_masks = masks2poly(low_res_masks > mask_threshold)
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.savez_compressed(
                    binary_vector, masks=masks, low_res_masks=low_res_masks
                )
                return binary_vector.getvalue()
            else:
                raise ValueError(f"Invalid format {request.format}")

            return SamSegmentationResponse(
                masks=[m.tolist() for m in masks],
                low_res_masks=[m.tolist() for m in low_res_masks],
                time=perf_counter() - t1,
            )

        # Previously an unknown request type silently returned None.
        raise ValueError(f"Invalid request type {type(request)}")

preproc_image ¶

preproc_image(image)

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
	np.array: The preprocessed image.

Source code in inference/models/sam/segment_anything.py

def preproc_image(self, image: InferenceRequestImage):
    """Load the request image for the model.

    Args:
        image (InferenceRequestImage): The image to preprocess.

    Returns:
        np.array: The image as returned by ``load_image_rgb``.
    """
    return load_image_rgb(image)

segment_image ¶

segment_image(
    image,
    embeddings=None,
    embeddings_format="json",
    has_mask_input=False,
    image_id=None,
    mask_input=None,
    mask_input_format="json",
    orig_im_size=None,
    point_coords=None,
    point_labels=None,
    use_mask_input_cache=True,
    **kwargs
)

Segments an image based on provided embeddings, points, masks, or cached results. If embeddings are not directly provided, the function can derive them from the input image or cache.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be segmented.	required
`embeddings`	`Optional[Union[ndarray, List[List[float]]]]`	The embeddings of the image. Defaults to None, in which case the image is used to compute embeddings.	`None`
`embeddings_format`	`Optional[str]`	Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.	`'json'`
`has_mask_input`	`Optional[bool]`	Specifies whether mask input is provided. Defaults to False.	`False`
`image_id`	`Optional[str]`	A cached identifier for the image. Useful for accessing cached embeddings or masks.	`None`
`mask_input`	`Optional[Union[ndarray, List[List[List[float]]]]]`	Input mask for the image.	`None`
`mask_input_format`	`Optional[str]`	Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.	`'json'`
`orig_im_size`	`Optional[List[int]]`	Original size of the image when providing embeddings directly.	`None`
`point_coords`	`Optional[List[List[float]]]`	Coordinates of points in the image. Defaults to None (no points).	`None`
`point_labels`	`Optional[List[int]]`	Labels associated with the provided points. Defaults to None (no labels).	`None`
`use_mask_input_cache`	`Optional[bool]`	Flag to determine if cached mask input should be used. Defaults to True.	`True`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
	Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image and the second element is the low resolution segmentation masks.

Raises:

Type	Description
`ValueError`	If necessary inputs are missing or inconsistent.

Notes

Embeddings, segmentations, and low-resolution logits can be cached to improve performance on repeated requests for the same image.
The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.

Source code in inference/models/sam/segment_anything.py

def segment_image(
    self,
    image: Any,
    embeddings: Optional[Union[np.ndarray, List[List[float]]]] = None,
    embeddings_format: Optional[str] = "json",
    has_mask_input: Optional[bool] = False,
    image_id: Optional[str] = None,
    mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
    mask_input_format: Optional[str] = "json",
    orig_im_size: Optional[List[int]] = None,
    point_coords: Optional[List[List[float]]] = None,
    point_labels: Optional[List[int]] = None,
    use_mask_input_cache: Optional[bool] = True,
    **kwargs,
):
    """
    Segments an image based on provided embeddings, points, masks, or cached results.
    If embeddings are not directly provided, they are obtained from ``embed_image``
    (which computes them from the image or reads them from the cache).

    Args:
        image (Any): The image to be segmented.
        embeddings (Optional[Union[np.ndarray, List[List[float]]]]): The embeddings of the image.
            Defaults to None, in which case the image is used to compute embeddings.
            For the 'binary' format this is the byte content of an ``np.save`` file.
        embeddings_format (Optional[str]): Format of the provided embeddings; either 'json' or 'binary'. Defaults to 'json'.
        has_mask_input (Optional[bool]): Specifies whether mask input is provided. Defaults to False.
        image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
        mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input mask for the image.
            For 'json' it is a list of polygons rasterized to 256x256; for 'binary' it is
            base64-encoded ``np.save`` content.
        mask_input_format (Optional[str]): Format of the provided mask input; either 'json' or 'binary'. Defaults to 'json'.
        orig_im_size (Optional[List[int]]): Original size of the image when providing embeddings directly.
        point_coords (Optional[List[List[float]]]): Coordinates of points in the image. Defaults to None (no points).
        point_labels (Optional[List[int]]): Labels associated with the provided points. Defaults to None (no labels).
        use_mask_input_cache (Optional[bool]): Flag to determine if cached mask input should be used. Defaults to True.
        **kwargs: Additional keyword arguments.

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the segmentation masks of the image
                                      and the second element is the low resolution segmentation masks.

    Raises:
        ValueError: If necessary inputs are missing or inconsistent, or if
            ``embeddings_format`` / ``mask_input_format`` is not 'json' or 'binary'.

    Notes:
        - Low-resolution logits are cached per image_id so they can be reused as mask input
          on later requests for the same image.
        - The cache has a maximum size defined by SAM_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size,
          the oldest entries are removed.
    """

    def _provided(value) -> bool:
        # Truthiness test that also works for numpy arrays, whose bool() is
        # ambiguous and raises for more than one element.
        if value is None:
            return False
        if isinstance(value, np.ndarray):
            return value.size > 0
        try:
            return len(value) > 0
        except TypeError:
            return bool(value)

    if not _provided(embeddings):
        has_image = _provided(image)
        if not has_image and not image_id:
            raise ValueError(
                "Must provide either image, cached image_id, or embeddings"
            )
        if image_id and not has_image and image_id not in self.embedding_cache:
            raise ValueError(
                f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
            )
        embedding, original_image_size = self.embed_image(
            image=image, image_id=image_id
        )
    else:
        if not _provided(orig_im_size):
            raise ValueError(
                "Must provide original image size if providing embeddings"
            )
        original_image_size = orig_im_size
        if embeddings_format == "json":
            embedding = np.array(embeddings)
        elif embeddings_format == "binary":
            embedding = np.load(BytesIO(embeddings))
        else:
            # Without this, `embedding` would be unbound further down.
            raise ValueError(f"Invalid embeddings_format {embeddings_format}")

    # A padding point ([0, 0] with label -1) is always appended to the prompts.
    point_coords = list(point_coords) if point_coords is not None else []
    point_coords.append([0, 0])
    point_coords = np.array(point_coords, dtype=np.float32)
    point_coords = np.expand_dims(point_coords, axis=0)
    point_coords = self.predictor.transform.apply_coords(
        point_coords,
        original_image_size,
    )

    point_labels = list(point_labels) if point_labels is not None else []
    point_labels.append(-1)
    point_labels = np.array(point_labels, dtype=np.float32)
    point_labels = np.expand_dims(point_labels, axis=0)

    if has_mask_input:
        logits_cached = bool(image_id) and image_id in self.low_res_logits_cache
        if logits_cached and use_mask_input_cache:
            mask_input = self.low_res_logits_cache[image_id]
        elif not _provided(mask_input):
            if logits_cached:
                # Cached logits exist but the caller disabled the cache, so
                # there is no mask to use.
                raise ValueError(
                    "Must provide mask_input when use_mask_input_cache is disabled"
                )
            raise ValueError("Must provide either mask_input or cached image_id")
        elif mask_input_format == "json":
            # Rasterize every polygon into its own 256x256 channel.
            polys = mask_input
            mask_input = np.zeros((1, len(polys), 256, 256), dtype=np.uint8)
            for i, poly in enumerate(polys):
                poly = ShapelyPolygon(poly)
                raster = rasterio.features.rasterize([poly], out_shape=(256, 256))
                mask_input[0, i, :, :] = raster
        elif mask_input_format == "binary":
            binary_data = base64.b64decode(mask_input)
            mask_input = np.load(BytesIO(binary_data))
        else:
            raise ValueError(f"Invalid mask_input_format {mask_input_format}")
    else:
        mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)

    ort_inputs = {
        "image_embeddings": embedding.astype(np.float32),
        "point_coords": point_coords.astype(np.float32),
        "point_labels": point_labels,
        "mask_input": mask_input.astype(np.float32),
        "has_mask_input": (
            np.ones(1, dtype=np.float32)
            if has_mask_input
            else np.zeros(1, dtype=np.float32)
        ),
        "orig_im_size": np.array(original_image_size, dtype=np.float32),
    }
    masks, _, low_res_logits = self.ort_session.run(None, ort_inputs)
    if image_id:
        # Remember the logits so a follow-up request can use them as mask input.
        self.low_res_logits_cache[image_id] = low_res_logits
        if image_id not in self.segmentation_cache_keys:
            self.segmentation_cache_keys.append(image_id)
        if len(self.segmentation_cache_keys) > SAM_MAX_EMBEDDING_CACHE_SIZE:
            cache_key = self.segmentation_cache_keys.pop(0)
            del self.low_res_logits_cache[cache_key]
    masks = masks[0]
    low_res_masks = low_res_logits[0]

    return masks, low_res_masks

Functions:¶

`models/sam2`¶

inference.models.sam2.segment_anything2 ¶

Classes¶

SegmentAnything2 ¶

Bases: RoboflowCoreModel

SegmentAnything class for handling segmentation tasks.

Attributes:

Name	Type	Description
`sam`		The segmentation model.
`embedding_cache`		Cache for embeddings.
`image_size_cache`		Cache for image sizes.
`embedding_cache_keys`		Keys for the embedding cache.

Source code in inference/models/sam2/segment_anything2.py

class SegmentAnything2(RoboflowCoreModel):
    """SegmentAnything2 class for handling segmentation tasks.

    Attributes:
        sam: The segmentation model.
        embedding_cache: Cache for embeddings, keyed by image_id.
        image_size_cache: Cache for image sizes, keyed by image_id.
        embedding_cache_keys: Keys for the embedding cache, in insertion order.
        embedding_cache_size: Maximum number of cached embeddings.
        low_res_logits_cache: Cache for low resolution logits, keyed by the
            value returned by ``hash_prompt_set``.
        low_res_logits_cache_keys: Keys for the logits cache, in insertion order.
        low_res_logits_cache_size: Maximum number of cached logits entries.

    """

    def __init__(
        self,
        *args,
        model_id: str = f"sam2/{SAM2_VERSION_ID}",
        low_res_logits_cache_size: int = SAM2_MAX_LOGITS_CACHE_SIZE,
        embedding_cache_size: int = SAM2_MAX_EMBEDDING_CACHE_SIZE,
        **kwargs,
    ):
        """Initializes the SegmentAnything2 model and its caches.

        Args:
            *args: Variable length argument list, forwarded to the parent class.
            model_id (str): Model identifier, forwarded to the parent class.
            low_res_logits_cache_size (int): Maximum number of entries kept in
                the low resolution logits cache.
            embedding_cache_size (int): Maximum number of entries kept in the
                embedding cache.
            **kwargs: Arbitrary keyword arguments, forwarded to the parent class.

        Raises:
            KeyError: If ``self.version_id`` is not one of the supported
                hiera variants.
        """
        super().__init__(*args, model_id=model_id, **kwargs)
        checkpoint = self.cache_file("weights.pt")
        model_cfg = {
            "hiera_large": "sam2_hiera_l.yaml",
            "hiera_small": "sam2_hiera_s.yaml",
            "hiera_tiny": "sam2_hiera_t.yaml",
            "hiera_b_plus": "sam2_hiera_b+.yaml",
        }[self.version_id]

        self.sam = build_sam2(model_cfg, checkpoint, device=DEVICE)
        self.low_res_logits_cache_size = low_res_logits_cache_size
        self.embedding_cache_size = embedding_cache_size

        self.embedding_cache = {}
        self.image_size_cache = {}
        self.embedding_cache_keys = []
        self.low_res_logits_cache: Dict[Tuple[str, str], LogitsCacheType] = {}
        self.low_res_logits_cache_keys = []
        # Guards every mutation of the caches above.
        self._state_lock = RLock()
        self.task_type = "unsupervised-segmentation"

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: List of file names.
        """
        return ["weights.pt"]

    def _get_cached_embedding(self, image_id: str):
        """Return ``(embedding, image_size, image_id)`` when both caches hold ``image_id``, else None."""
        embedding_cache_content = self.embedding_cache.get(image_id)
        image_size_content = self.image_size_cache.get(image_id)
        if embedding_cache_content is not None and image_size_content is not None:
            return embedding_cache_content, image_size_content, image_id
        return None

    def embed_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        **kwargs,
    ):
        """
        Embeds an image and caches the result. If the image has been embedded before and cached,
        the cached result will be returned.

        Args:
            image (Optional[InferenceRequestImage]): The image to be embedded. The format should be
                compatible with the preproc_image method. May be None only when ``image_id``
                refers to a cached embedding.
            image_id (Optional[str]): An identifier for the image, used as the cache key. When it is
                missing or empty, an id is derived from an MD5 hash of the preprocessed image bytes.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[dict, Tuple[int, int], str]: A tuple where the first element is the predictor
                features for the image, the second element is the shape (height, width) of the
                processed image and the third element is the image_id the result is cached under.

        Notes:
            - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
            - The cache holds at most ``self.embedding_cache_size`` entries. When the cache exceeds this size,
              an entry is evicted via ``safe_pop_from_list``.

        Example:
            >>> img_array = ... # some image array
            >>> embed_image(img_array, image_id="sample123")
            ({...}, (224, 224), 'sample123')
        """
        if image_id:
            cached = self._get_cached_embedding(image_id)
            if cached is not None:
                return cached

        img_in = self.preproc_image(image)
        if not image_id:
            # Also covers image_id == "": otherwise every image submitted with
            # an empty id would share one cache slot and get a stale embedding.
            image_id = hashlib.md5(img_in.tobytes()).hexdigest()[:12]

        # Look up again: the id may have just been derived from the content.
        cached = self._get_cached_embedding(image_id)
        if cached is not None:
            return cached

        with torch.inference_mode():
            with _temporarily_disable_torch_jit_script():
                predictor = SAM2ImagePredictor(self.sam)
            predictor.set_image(img_in)
            embedding_dict = predictor._features

        with self._state_lock:
            self.embedding_cache[image_id] = embedding_dict
            self.image_size_cache[image_id] = img_in.shape[:2]
            # Re-append so the key list holds each id once, most recent last.
            safe_remove_from_list(values=self.embedding_cache_keys, element=image_id)
            self.embedding_cache_keys.append(image_id)
            if len(self.embedding_cache_keys) > self.embedding_cache_size:
                cache_key = safe_pop_from_list(values=self.embedding_cache_keys)
                if cache_key is not None:
                    safe_remove_from_dict(values=self.embedding_cache, key=cache_key)
                    safe_remove_from_dict(values=self.image_size_cache, key=cache_key)
            return embedding_dict, img_in.shape[:2], image_id

    @usage_collector("model")
    def infer_from_request(self, request: Sam2InferenceRequest):
        """Performs inference based on the request type.

        Args:
            request (Sam2InferenceRequest): The inference request; either a
                Sam2EmbeddingRequest or a Sam2SegmentationRequest.

        Returns:
            Sam2EmbeddingResponse for embedding requests. For segmentation
            requests: the result of ``turn_segmentation_results_into_api_response``
            ("json"), of ``turn_segmentation_results_into_rle_response`` ("rle"),
            or raw ``np.savez_compressed`` bytes ("binary").

        Raises:
            ValueError: If the request type or the requested format is not supported.
        """
        t1 = perf_counter()
        if isinstance(request, Sam2EmbeddingRequest):
            _, _, image_id = self.embed_image(**request.dict())
            inference_time = perf_counter() - t1
            return Sam2EmbeddingResponse(time=inference_time, image_id=image_id)
        elif isinstance(request, Sam2SegmentationRequest):
            masks, scores, low_resolution_logits = self.segment_image(**request.dict())

            if request.format == "json":
                return turn_segmentation_results_into_api_response(
                    masks=masks,
                    scores=scores,
                    mask_threshold=0.0,
                    inference_start_timestamp=t1,
                )
            elif request.format == "rle":
                return turn_segmentation_results_into_rle_response(
                    masks=masks,
                    scores=scores,
                    mask_threshold=0.0,
                    inference_start_timestamp=t1,
                )
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.savez_compressed(
                    binary_vector, masks=masks, low_res_masks=low_resolution_logits
                )
                binary_vector.seek(0)
                binary_data = binary_vector.getvalue()
                return binary_data
            else:
                raise ValueError(f"Invalid format {request.format}")

        else:
            raise ValueError(f"Invalid request type {type(request)}")

    def preproc_image(self, image: InferenceRequestImage):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            np.array: The preprocessed image.
        """
        np_image = load_image_rgb(image)
        return np_image

    def segment_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        prompts: Optional[Union[Sam2PromptSet, dict]] = None,
        multimask_output: Optional[bool] = True,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        save_logits_to_cache: bool = False,
        load_logits_from_cache: bool = False,
        **kwargs,
    ):
        """
        Segments an image based on prompts, an optional mask input, or cached results.
        Embeddings are obtained through ``embed_image`` (computed from the image or read from cache).

        Args:
            image (Optional[InferenceRequestImage]): The image to be segmented.
            image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
            prompts (Optional[Union[Sam2PromptSet, dict]]): Prompts to use for segmentation, either as a
                Sam2PromptSet or as a dict of its fields. Defaults to None (empty prompt set).
            multimask_output (bool): Flag to decide if multiple mask proposals are to be predicted (among which
                the most promising will be returned).
            mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input low_res_logits for the image.
            save_logits_to_cache (bool): Flag to decide whether the resulting low resolution logits are stored
                in the logits cache. Ignored when DISABLE_SAM2_LOGITS_CACHE is set.
            load_logits_from_cache (bool): Flag to decide whether cached logits from prior prompting are used
                as ``mask_input`` when none is given. Ignored when DISABLE_SAM2_LOGITS_CACHE is set.
            **kwargs: Additional keyword arguments.

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
                - first element is of size (prompt_set_size, h, w) and represent mask with the highest confidence
                    for each prompt element
                - second element is of size (prompt_set_size, ) and represents the score for most confident mask
                    of each prompt element
                - third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits
                    for most confident mask of each prompt element

        Raises:
            ValueError: If necessary inputs are missing or inconsistent.

        Notes:
            - Embeddings and low-resolution logits can be cached to improve performance
              on repeated requests for the same image.
            - The caches are bounded by ``self.embedding_cache_size`` and
              ``self.low_res_logits_cache_size`` respectively.
        """
        load_logits_from_cache = (
            load_logits_from_cache and not DISABLE_SAM2_LOGITS_CACHE
        )
        save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM2_LOGITS_CACHE
        with torch.inference_mode():
            if image is None and not image_id:
                raise ValueError("Must provide either image or  cached image_id")
            elif image_id and image is None and image_id not in self.embedding_cache:
                raise ValueError(
                    f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
                )
            embedding, original_image_size, image_id = self.embed_image(
                image=image, image_id=image_id
            )
            with _temporarily_disable_torch_jit_script():
                predictor = SAM2ImagePredictor(self.sam)
            # Inject the (possibly cached) features instead of calling
            # set_image again.
            predictor._is_image_set = True
            predictor._features = embedding
            predictor._orig_hw = [original_image_size]
            predictor._is_batch = False
            args = dict()
            prompt_set: Sam2PromptSet
            if prompts:
                if isinstance(prompts, dict):
                    prompt_set = Sam2PromptSet(**prompts)
                    args = prompt_set.to_sam2_inputs()
                else:
                    prompt_set = prompts
                    args = prompts.to_sam2_inputs()
            else:
                prompt_set = Sam2PromptSet()

            if mask_input is None and load_logits_from_cache:
                mask_input = maybe_load_low_res_logits_from_cache(
                    image_id, prompt_set, self.low_res_logits_cache
                )

            args = pad_points(args)
            if not any(args.values()):
                # No usable prompt: fall back to a single padding point.
                args = {"point_coords": [[0, 0]], "point_labels": [-1], "box": None}
            masks, scores, low_resolution_logits = predictor.predict(
                mask_input=mask_input,
                multimask_output=multimask_output,
                return_logits=True,
                normalize_coords=True,
                **args,
            )
            masks, scores, low_resolution_logits = choose_most_confident_sam_prediction(
                masks=masks,
                scores=scores,
                low_resolution_logits=low_resolution_logits,
            )

            if save_logits_to_cache:
                self.add_low_res_logits_to_cache(
                    low_resolution_logits, image_id, prompt_set
                )

            return masks, scores, low_resolution_logits

    def add_low_res_logits_to_cache(
        self, logits: np.ndarray, image_id: str, prompt_set: Sam2PromptSet
    ) -> None:
        """Stores low resolution logits for an image and prompt set in the logits cache.

        Args:
            logits (np.ndarray): Low resolution logits; a new axis is inserted
                at position 1 before they are stored.
            image_id (str): Identifier of the image the logits belong to.
            prompt_set (Sam2PromptSet): Prompts that produced the logits; stored
                alongside them.

        Notes:
            - The cache key is ``hash_prompt_set(image_id, prompt_set)``.
            - The cache holds at most ``self.low_res_logits_cache_size`` entries; when exceeded,
              an entry is evicted via ``safe_pop_from_list``.
        """
        logits = logits[:, None, :, :]
        prompt_id = hash_prompt_set(image_id, prompt_set)
        with self._state_lock:
            self.low_res_logits_cache[prompt_id] = {
                "logits": logits,
                "prompt_set": prompt_set,
            }
            # Re-append so the key list holds each id once, most recent last.
            safe_remove_from_list(
                values=self.low_res_logits_cache_keys, element=prompt_id
            )
            self.low_res_logits_cache_keys.append(prompt_id)
            if len(self.low_res_logits_cache_keys) > self.low_res_logits_cache_size:
                cache_key = safe_pop_from_list(values=self.low_res_logits_cache_keys)
                if cache_key is not None:
                    safe_remove_from_dict(
                        values=self.low_res_logits_cache, key=cache_key
                    )

Methods:¶

init ¶

__init__(
    *args,
    model_id=f"sam2/{SAM2_VERSION_ID}",
    low_res_logits_cache_size=SAM2_MAX_LOGITS_CACHE_SIZE,
    embedding_cache_size=SAM2_MAX_EMBEDDING_CACHE_SIZE,
    **kwargs,
)

Initializes the SegmentAnything.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/sam2/segment_anything2.py

def __init__(
    self,
    *args,
    model_id: str = f"sam2/{SAM2_VERSION_ID}",
    low_res_logits_cache_size: int = SAM2_MAX_LOGITS_CACHE_SIZE,
    embedding_cache_size: int = SAM2_MAX_EMBEDDING_CACHE_SIZE,
    **kwargs,
):
    """Build the SAM2 model and set up empty embedding / logits caches.

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        model_id: Model identifier forwarded to the parent initializer.
        low_res_logits_cache_size: Maximum number of keys kept in the
            low-resolution logits cache.
        embedding_cache_size: Maximum number of keys kept in the embedding cache.
        **kwargs: Keyword arguments forwarded to the parent initializer.

    Raises:
        KeyError: If ``self.version_id`` is not one of the known SAM2 variants.
    """
    super().__init__(*args, model_id=model_id, **kwargs)
    weights_path = self.cache_file("weights.pt")
    config_by_version = {
        "hiera_large": "sam2_hiera_l.yaml",
        "hiera_small": "sam2_hiera_s.yaml",
        "hiera_tiny": "sam2_hiera_t.yaml",
        "hiera_b_plus": "sam2_hiera_b+.yaml",
    }
    config_name = config_by_version[self.version_id]

    self.sam = build_sam2(config_name, weights_path, device=DEVICE)
    self.task_type = "unsupervised-segmentation"
    # Guards the cache mutations performed by the caching methods.
    self._state_lock = RLock()

    # Embedding cache: embeddings and image sizes share the same keys.
    self.embedding_cache_size = embedding_cache_size
    self.embedding_cache = {}
    self.image_size_cache = {}
    self.embedding_cache_keys = []

    # Low-resolution logits cache.
    self.low_res_logits_cache_size = low_res_logits_cache_size
    self.low_res_logits_cache: Dict[Tuple[str, str], LogitsCacheType] = {}
    self.low_res_logits_cache_keys = []

embed_image ¶

embed_image(image, image_id=None, **kwargs)

Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached, the cached result will be returned.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be embedded. The format should be compatible with the preproc_image method.	required
`image_id`	`Optional[str]`	An identifier for the image. If provided, the embedding result will be cached with this ID. Defaults to None.	`None`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
	Tuple[Any, Tuple[int, int], str]: A tuple where the first element is the embedding of the image, the second element is the shape (height, width) of the processed image, and the third element is the image ID under which the result is cached.

Notes

Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
The cache has a maximum size defined by the `embedding_cache_size` constructor argument (default: SAM2_MAX_EMBEDDING_CACHE_SIZE). When the cache exceeds this size, the oldest entries are removed.

Example

>>> img_array = ...  # some image array
>>> embed_image(img_array, image_id="sample123")
(<embedding>, (224, 224), 'sample123')

Source code in inference/models/sam2/segment_anything2.py

def embed_image(
    self,
    image: Optional[InferenceRequestImage],
    image_id: Optional[str] = None,
    **kwargs,
):
    """Compute (or fetch from cache) the SAM2 embedding of an image.

    Args:
        image: The image to embed, in a format accepted by ``preproc_image``.
        image_id: Cache key for the image. When ``None``, the first 12 hex
            characters of the MD5 digest of the preprocessed image bytes are used.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A 3-tuple ``(embedding, image_size, image_id)``: the predictor features
        for the image, the first two dimensions of the preprocessed image, and
        the key under which both are cached.

    Notes:
        - The cache is consulted before the image is loaded (when a truthy
          ``image_id`` is given) and again once the final ``image_id`` is known.
        - The key list is bounded by ``self.embedding_cache_size``; when it is
          exceeded, one key is popped and its embedding and size are dropped.
    """

    def cached_result(key):
        # A hit requires both the embedding and the image size to be present.
        cached_embedding = self.embedding_cache.get(key)
        cached_size = self.image_size_cache.get(key)
        if cached_embedding is None or cached_size is None:
            return None
        return cached_embedding, cached_size, key

    if image_id:
        early_hit = cached_result(image_id)
        if early_hit is not None:
            return early_hit

    img_in = self.preproc_image(image)
    if image_id is None:
        image_id = hashlib.md5(img_in.tobytes()).hexdigest()[:12]

    late_hit = cached_result(image_id)
    if late_hit is not None:
        return late_hit

    with torch.inference_mode():
        with _temporarily_disable_torch_jit_script():
            predictor = SAM2ImagePredictor(self.sam)
        predictor.set_image(img_in)
        embedding_dict = predictor._features

    image_size = img_in.shape[:2]
    with self._state_lock:
        self.embedding_cache[image_id] = embedding_dict
        self.image_size_cache[image_id] = image_size
        # Re-append the key so the key list reflects the latest write.
        safe_remove_from_list(values=self.embedding_cache_keys, element=image_id)
        self.embedding_cache_keys.append(image_id)
        if len(self.embedding_cache_keys) > self.embedding_cache_size:
            evicted_key = safe_pop_from_list(values=self.embedding_cache_keys)
            if evicted_key is not None:
                safe_remove_from_dict(values=self.embedding_cache, key=evicted_key)
                safe_remove_from_dict(values=self.image_size_cache, key=evicted_key)
    return embedding_dict, image_size, image_id

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Gets the list of files required for inference.

Returns:

Type	Description
`List[str]`	List[str]: List of file names.

Source code in inference/models/sam2/segment_anything2.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Return the names of the files required for inference.

    Returns:
        List[str]: A single-element list containing the weights file name.
    """
    required_files = ["weights.pt"]
    return required_files

infer_from_request ¶

infer_from_request(request)

Performs inference based on the request type.

Parameters:

Name	Type	Description	Default
`request`	`SamInferenceRequest`	The inference request.	required

Returns:

Type	Description
	Union[SamEmbeddingResponse, SamSegmentationResponse]: The inference response.

Source code in inference/models/sam2/segment_anything2.py

@usage_collector("model")
def infer_from_request(self, request: Sam2InferenceRequest):
    """Dispatch a SAM2 request to embedding or segmentation.

    Args:
        request (Sam2InferenceRequest): Either a ``Sam2EmbeddingRequest`` or a
            ``Sam2SegmentationRequest``.

    Returns:
        A ``Sam2EmbeddingResponse`` for embedding requests. For segmentation
        requests: the JSON or RLE response object when ``request.format`` is
        ``"json"`` / ``"rle"``, or the raw bytes of a compressed ``.npz``
        archive (arrays ``masks`` and ``low_res_masks``) when it is ``"binary"``.

    Raises:
        ValueError: If the request type or the requested format is not supported.
    """
    started_at = perf_counter()
    if isinstance(request, Sam2EmbeddingRequest):
        _, _, image_id = self.embed_image(**request.dict())
        elapsed = perf_counter() - started_at
        return Sam2EmbeddingResponse(time=elapsed, image_id=image_id)
    if not isinstance(request, Sam2SegmentationRequest):
        raise ValueError(f"Invalid request type {type(request)}")

    masks, scores, low_resolution_logits = self.segment_image(**request.dict())

    output_format = request.format
    if output_format in ("json", "rle"):
        # Both structured formats take the same arguments; only the builder differs.
        build_response = (
            turn_segmentation_results_into_api_response
            if output_format == "json"
            else turn_segmentation_results_into_rle_response
        )
        return build_response(
            masks=masks,
            scores=scores,
            mask_threshold=0.0,
            inference_start_timestamp=started_at,
        )
    if output_format == "binary":
        buffer = BytesIO()
        np.savez_compressed(buffer, masks=masks, low_res_masks=low_resolution_logits)
        buffer.seek(0)
        return buffer.getvalue()
    raise ValueError(f"Invalid format {request.format}")

preproc_image ¶

preproc_image(image)

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
	np.array: The preprocessed image.

Source code in inference/models/sam2/segment_anything2.py

def preproc_image(self, image: InferenceRequestImage):
    """Load the request image via ``load_image_rgb``.

    Args:
        image (InferenceRequestImage): The image to preprocess.

    Returns:
        The value returned by ``load_image_rgb`` for the given image.
    """
    return load_image_rgb(image)

segment_image ¶

segment_image(
    image,
    image_id=None,
    prompts=None,
    multimask_output=True,
    mask_input=None,
    save_logits_to_cache=False,
    load_logits_from_cache=False,
    **kwargs
)

Segments an image based on provided embeddings, points, masks, or cached results. If embeddings are not directly provided, the function can derive them from the input image or cache.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be segmented.	required
`image_id`	`Optional[str]`	A cached identifier for the image. Useful for accessing cached embeddings or masks.	`None`
`prompts`	`Optional[List[Sam2Prompt]]`	List of prompts to use for segmentation. Defaults to None.	`None`
`mask_input`	`Optional[Union[ndarray, List[List[List[float]]]]]`	Input low_res_logits for the image.	`None`
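`multimask_output`	`Optional[bool]`	Flag to decide if multiple mask proposals are to be predicted (among which the most promising will be returned).	`True`
`save_logits_to_cache`	`bool`	Flag to decide whether the resulting low-resolution logits are stored in the logits cache.	`False`
`load_logits_from_cache`	`bool`	Flag to decide whether to use cached logits from prior prompting when `mask_input` is not provided.	`False`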
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type

Description

Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
- the first element is of size (prompt_set_size, h, w) and represents the mask with the highest confidence for each prompt element
- the second element is of size (prompt_set_size, ) and represents the score for the most confident mask of each prompt element
- the third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits for the most confident mask of each prompt element

Raises:

Type	Description
`ValueError`	If necessary inputs are missing or inconsistent.

Notes

Embeddings, segmentations, and low-resolution logits can be cached to improve performance on repeated requests for the same image.
The embedding cache has a maximum size defined by SAM2_MAX_EMBEDDING_CACHE_SIZE. When the cache exceeds this size, the oldest entries are removed.

Source code in inference/models/sam2/segment_anything2.py

def segment_image(
    self,
    image: Optional[InferenceRequestImage],
    image_id: Optional[str] = None,
    prompts: Optional[Union[Sam2PromptSet, dict]] = None,
    multimask_output: Optional[bool] = True,
    mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
    save_logits_to_cache: bool = False,
    load_logits_from_cache: bool = False,
    **kwargs,
):
    """
    Segments an image using the given prompts, an optional mask input and cached results.
    The image embedding is obtained through `embed_image`, i.e. computed from the image
    or taken from the embedding cache.

    Args:
        image (Any): The image to be segmented. May be None when `image_id` refers to a cached embedding.
        image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
        prompts (Optional[Union[Sam2PromptSet, dict]]): Prompts to use for segmentation, either as
            a `Sam2PromptSet` or as a dict of its constructor arguments. Defaults to None.
        multimask_output (Optional[bool]): Flag to decide if multiple mask proposals are to be predicted
            (among which the most promising will be returned).
        mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input low_res_logits for the image.
        save_logits_to_cache (bool): Flag to decide whether the resulting low resolution logits are
            stored in the logits cache. Ignored when DISABLE_SAM2_LOGITS_CACHE is set.
        load_logits_from_cache (bool): Flag to decide whether cached logits from prior prompting are
            used as mask input when `mask_input` is None. Ignored when DISABLE_SAM2_LOGITS_CACHE is set.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
            - first element is of size (prompt_set_size, h, w) and represents the mask with the highest
                confidence for each prompt element
            - second element is of size (prompt_set_size, ) and represents the score for the most
                confident mask of each prompt element
            - third element is of size (prompt_set_size, 256, 256) and represents the low resolution
                logits for the most confident mask of each prompt element

    Raises:
        ValueError: If neither an image nor an image_id is given, or if only an image_id is given
            and it is not present in the embedding cache.

    Notes:
        - Embeddings and low-resolution logits can be cached to improve performance
          on repeated requests for the same image.
        - The embedding cache size is bounded by `self.embedding_cache_size` and the logits cache
          by `self.low_res_logits_cache_size`.
    """
    # The global kill-switch overrides both per-call cache flags.
    load_logits_from_cache = (
        load_logits_from_cache and not DISABLE_SAM2_LOGITS_CACHE
    )
    save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM2_LOGITS_CACHE
    with torch.inference_mode():
        if image is None and not image_id:
            raise ValueError("Must provide either image or  cached image_id")
        elif image_id and image is None and image_id not in self.embedding_cache:
            raise ValueError(
                f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
            )
        embedding, original_image_size, image_id = self.embed_image(
            image=image, image_id=image_id
        )
        with _temporarily_disable_torch_jit_script():
            predictor = SAM2ImagePredictor(self.sam)
        # Inject the (possibly cached) embedding directly into the predictor's
        # private state instead of calling set_image() again.
        predictor._is_image_set = True
        predictor._features = embedding
        predictor._orig_hw = [original_image_size]
        predictor._is_batch = False
        args = dict()
        prompt_set: Sam2PromptSet
        if prompts:
            if type(prompts) is dict:
                prompt_set = Sam2PromptSet(**prompts)
                args = prompt_set.to_sam2_inputs()
            else:
                prompt_set = prompts
                args = prompts.to_sam2_inputs()
        else:
            prompt_set = Sam2PromptSet()

        # An explicitly provided mask_input takes precedence over cached logits.
        if mask_input is None and load_logits_from_cache:
            mask_input = maybe_load_low_res_logits_from_cache(
                image_id, prompt_set, self.low_res_logits_cache
            )

        # NOTE(review): when `prompts` is falsy, `args` is still the empty dict
        # at this point — confirm pad_points tolerates missing keys.
        args = pad_points(args)
        if not any(args.values()):
            # No usable prompt: fall back to a single point labelled -1.
            args = {"point_coords": [[0, 0]], "point_labels": [-1], "box": None}
        masks, scores, low_resolution_logits = predictor.predict(
            mask_input=mask_input,
            multimask_output=multimask_output,
            return_logits=True,
            normalize_coords=True,
            **args,
        )
        masks, scores, low_resolution_logits = choose_most_confident_sam_prediction(
            masks=masks,
            scores=scores,
            low_resolution_logits=low_resolution_logits,
        )

        if save_logits_to_cache:
            self.add_low_res_logits_to_cache(
                low_resolution_logits, image_id, prompt_set
            )

        return masks, scores, low_resolution_logits

Functions:¶

choose_most_confident_sam_prediction ¶

choose_most_confident_sam_prediction(
    masks, scores, low_resolution_logits
)

This function post-processes SAM2 inference output and chooses the most confident mask for each prompt element, regardless of the `multimask_output` parameter value.

Args:
- masks: np array with values 0.0 and 1.0 representing predicted masks, of size (prompt_set_size, proposed_masks, h, w) or (proposed_masks, h, w) depending on the prompt set size. Unfortunately, prompt_set_size=1 causes a squeeze operation in the SAM2 library, so to handle inference uniformly this function compensates for it.
- scores: array of size (prompt_set_size, proposed_masks) or (proposed_masks, ) depending on the prompt set size. This array gives the confidence score for each mask proposal.
- low_resolution_logits: array of size (prompt_set_size, proposed_masks, 256, 256) or (proposed_masks, 256, 256) depending on the prompt set size. These low resolution logits can be passed to a subsequent iteration as mask input.

Returns: Tuple of np.array, where:
- the first element is of size (prompt_set_size, h, w) and represents the mask with the highest confidence for each prompt element
- the second element is of size (prompt_set_size, ) and represents the score for the most confident mask of each prompt element
- the third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits for the most confident mask of each prompt element

Source code in inference/models/sam2/segment_anything2.py

def choose_most_confident_sam_prediction(
    masks: np.ndarray,
    scores: np.ndarray,
    low_resolution_logits: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Pick the most confident mask proposal for every element of a prompt set.

    When ``masks`` is 3-dimensional, a leading axis is added to all three
    inputs, so that a single-element prompt set is handled like a batch.

    Args:
        masks: Predicted masks of size (prompt_set_size, proposed_masks, h, w)
            or (proposed_masks, h, w).
        scores: Confidence scores of the proposals, of size
            (prompt_set_size, proposed_masks) or (proposed_masks, ).
        low_resolution_logits: Low resolution logits of the proposals, of size
            (prompt_set_size, proposed_masks, 256, 256) or (proposed_masks, 256, 256).
            They can be passed to a subsequent iteration as mask input.
    Returns:
        Tuple of np.array with one entry per prompt element, in this order:
        the selected masks, the selected scores and the selected low
        resolution logits.
    """
    if masks.ndim == 3:
        masks, scores, low_resolution_logits = (
            np.expand_dims(array, axis=0)
            for array in (masks, scores, low_resolution_logits)
        )
    best_per_prompt = [
        choose_most_confident_prompt_set_element_prediction(
            mask=mask,
            score=score,
            low_resolution_logit=low_resolution_logit,
        )
        for mask, score, low_resolution_logit in zip(
            masks, scores, low_resolution_logits
        )
    ]
    return (
        np.asarray([best[0] for best in best_per_prompt]),
        np.asarray([best[1] for best in best_per_prompt]),
        np.asarray([best[2] for best in best_per_prompt]),
    )

find_prior_prompt_in_cache ¶

find_prior_prompt_in_cache(
    initial_prompt_set, image_id, cache
)

Searches the cache for previously used prompt sets that are strict subsets of this one.

Source code in inference/models/sam2/segment_anything2.py

def find_prior_prompt_in_cache(
    initial_prompt_set: Sam2PromptSet,
    image_id: str,
    cache: Dict[Tuple[str, str], LogitsCacheType],
) -> Optional[np.ndarray]:
    """
    Look for cached logits whose prompt set is a strict subset of the given one.

    Only entries whose key starts with ``image_id`` are considered, visited in
    reverse cache order. An entry with exactly one point fewer than
    ``initial_prompt_set`` is returned immediately; otherwise the logits of
    the matching entry with the most points are returned, or ``None`` when no
    entry matches.
    """
    entries_for_image = [
        entry for key, entry in cache.items() if key[0] == image_id
    ]
    largest_size = 0
    best_match: Optional[np.ndarray] = None
    target_size = initial_prompt_set.num_points() - 1
    for entry in reversed(entries_for_image):
        candidate_logits = entry["logits"]
        candidate_prompts: Sam2PromptSet = entry["prompt_set"]
        if not is_prompt_strict_subset(candidate_prompts, initial_prompt_set):
            continue

        candidate_size = candidate_prompts.num_points()
        # One point fewer is the closest possible predecessor: stop searching.
        if candidate_size == target_size:
            return candidate_logits
        if candidate_size >= largest_size:
            largest_size, best_match = candidate_size, candidate_logits

    return best_match

hash_prompt_set ¶

hash_prompt_set(image_id, prompt_set)

Computes unique hash from a prompt set.

Source code in inference/models/sam2/segment_anything2.py

def hash_prompt_set(image_id: str, prompt_set: Sam2PromptSet) -> Tuple[str, str]:
    """Build the cache key for a prompt set.

    Returns:
        ``(image_id, digest)`` where ``digest`` is the first 12 hex characters
        of the MD5 of ``str(prompt_set)`` encoded as UTF-8.
    """
    encoded_prompts = str(prompt_set).encode("utf-8")
    prompt_digest = hashlib.md5(encoded_prompts).hexdigest()
    return image_id, prompt_digest[:12]

maybe_load_low_res_logits_from_cache ¶

maybe_load_low_res_logits_from_cache(
    image_id, prompt_set, cache
)

Loads prior masks from the cache by searching over possible prior prompts.

Source code in inference/models/sam2/segment_anything2.py

def maybe_load_low_res_logits_from_cache(
    image_id: str,
    prompt_set: Sam2PromptSet,
    cache: Dict[Tuple[str, str], LogitsCacheType],
) -> Optional[np.ndarray]:
    """Load prior masks from the cache by searching over possible prior prompts.

    Returns ``None`` without searching when ``prompt_set.prompts`` is empty
    or ``None``; otherwise returns the result of ``find_prior_prompt_in_cache``.
    """
    if prompt_set.prompts:
        return find_prior_prompt_in_cache(prompt_set, image_id, cache)
    return None

pad_points ¶

pad_points(args)

Pad arguments to be passed to sam2 model with not_a_point label (-1). This is necessary when there are multiple prompts per image so that a tensor can be created.

Also pads empty point lists with a dummy non-point entry.

Source code in inference/models/sam2/segment_anything2.py

def pad_points(args: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pad arguments to be passed to sam2 model with not_a_point label (-1).
    This is necessary when there are multiple prompts per image so that a tensor can be created.

    Also pads empty point lists with a dummy non-point entry.

    The input is never mutated; a deep copy is padded and returned. Missing
    ``"point_coords"`` / ``"point_labels"`` keys are treated like ``None``, so
    an empty dict is returned unchanged.

    Args:
        args: Keyword arguments for the SAM2 predictor. ``"point_coords"`` is
            either ``None`` or a list with one list of ``[x, y]`` points per
            prompt; ``"point_labels"`` is either ``None`` or a list with one
            list of labels per prompt.

    Returns:
        A copy of ``args`` in which every prompt has the same number of points
        (at least one). Added points are ``[0, 0]`` with label ``-1``.

    Raises:
        ValueError: If only one of point coordinates / point labels is given.
    """
    args = copy.deepcopy(args)
    # .get() keeps an empty (or partial) argument dict from raising KeyError.
    point_coords = args.get("point_coords")
    point_labels = args.get("point_labels")
    if point_coords is None:
        if point_labels is not None:
            raise ValueError(
                "Can't have point labels without corresponding point coordinates"
            )
        return args
    if point_labels is None:
        raise ValueError(
            "Can't have point coordinates without corresponding point labels"
        )

    # default=0 covers an empty outer list; the outer max() guarantees at
    # least one (dummy) point per prompt.
    max_len = max(max((len(prompt) for prompt in point_coords), default=0), 1)
    for prompt in point_coords:
        prompt.extend([0, 0] for _ in range(max_len - len(prompt)))
    for label in point_labels:
        label.extend([-1] * (max_len - len(label)))
    return args

inference.models.sam2.segment_anything2_inference_models ¶

Classes¶

InferenceModelsSAM2Adapter ¶

Bases: Model

SegmentAnything class for handling segmentation tasks.

Attributes:

Name	Type	Description
`sam`		The segmentation model.
`embedding_cache`		Cache for embeddings.
`image_size_cache`		Cache for image sizes.
`embedding_cache_keys`		Keys for the embedding cache.

Source code in inference/models/sam2/segment_anything2_inference_models.py

class InferenceModelsSAM2Adapter(Model):
    """Adapter exposing a SAM2 model loaded through `AutoModel` via the SegmentAnything2 interface.

    Attributes:
        metrics: Dictionary with the "num_inferences" and "avg_inference_time" counters.
        api_key: API key used to load the model (falls back to `API_KEY`).
        task_type: Always "unsupervised-segmentation".
        _model: The wrapped model returned by `AutoModel.from_pretrained`.

    """

    def __init__(
        self,
        *args,
        model_id: str = f"sam2/{SAM2_VERSION_ID}",
        api_key: Optional[str] = None,
        low_res_logits_cache_size: int = SAM2_MAX_LOGITS_CACHE_SIZE,
        embedding_cache_size: int = SAM2_MAX_EMBEDDING_CACHE_SIZE,
        **kwargs,
    ):
        """Initializes the adapter and loads the wrapped SAM2 model.

        Args:
            *args: Variable length argument list (accepted but not used).
            model_id: Identifier passed to `AutoModel.from_pretrained`.
            api_key: API key for loading the model; `API_KEY` is used when empty.
            low_res_logits_cache_size: Size limit of the low resolution masks cache.
            embedding_cache_size: Size limit of the image embeddings cache.
            **kwargs: Forwarded to `AutoModel.from_pretrained`; "countinference" and
                "service_secret" are additionally used to build extra weights provider headers.
        """
        super().__init__()

        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}

        self.api_key = api_key if api_key else API_KEY

        self.task_type = "unsupervised-segmentation"

        # Both caches are created here and handed over to the wrapped model.
        sam2_image_embeddings_cache = Sam2ImageEmbeddingsInMemoryCache.init(
            size_limit=embedding_cache_size,
            send_to_cpu=True,
        )
        sam2_low_resolution_masks_cache = Sam2LowResolutionMasksInMemoryCache.init(
            size_limit=low_res_logits_cache_size,
            send_to_cpu=True,
        )
        extra_weights_provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Every valid backend that has not been explicitly disabled.
        backend = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: SAM2Torch = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            sam2_image_embeddings_cache=sam2_image_embeddings_cache,
            sam2_low_resolution_masks_cache=sam2_low_resolution_masks_cache,
            sam2_allow_client_generated_hash_ids=True,
            weights_provider_extra_headers=extra_weights_provider_headers,
            backend=backend,
            **kwargs,
        )

    @usage_collector("model")
    def infer_from_request(self, request: Sam2InferenceRequest):
        """Performs inference based on the request type.

        Args:
            request (Sam2InferenceRequest): Either a `Sam2EmbeddingRequest` or a
                `Sam2SegmentationRequest`.

        Returns:
            A `Sam2EmbeddingResponse` for embedding requests. For segmentation requests:
            the JSON or RLE response object when `request.format` is "json" / "rle", or the
            raw bytes of a compressed .npz archive (arrays `masks` and `low_res_masks`)
            when it is "binary".

        Raises:
            ValueError: If the request type or the requested format is not supported.
        """
        t1 = perf_counter()
        if isinstance(request, Sam2EmbeddingRequest):
            _, _, image_id = self.embed_image(**request.dict())
            inference_time = perf_counter() - t1
            return Sam2EmbeddingResponse(time=inference_time, image_id=image_id)
        elif isinstance(request, Sam2SegmentationRequest):
            masks, scores, low_resolution_logits = self.segment_image(**request.dict())

            # NOTE(review): the "json" branch thresholds at MASK_THRESHOLD while the
            # "rle" branch uses 0.0 — confirm this asymmetry is intended.
            if request.format == "json":
                return turn_segmentation_results_into_api_response(
                    masks=masks,
                    scores=scores,
                    mask_threshold=MASK_THRESHOLD,
                    inference_start_timestamp=t1,
                )
            elif request.format == "rle":
                return turn_segmentation_results_into_rle_response(
                    masks=masks,
                    scores=scores,
                    mask_threshold=0.0,
                    inference_start_timestamp=t1,
                )
            elif request.format == "binary":
                binary_vector = BytesIO()
                np.savez_compressed(
                    binary_vector, masks=masks, low_res_masks=low_resolution_logits
                )
                binary_vector.seek(0)
                binary_data = binary_vector.getvalue()
                return binary_data
            else:
                raise ValueError(f"Invalid format {request.format}")

        else:
            raise ValueError(f"Invalid request type {type(request)}")

    def embed_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        **kwargs,
    ):
        """
        Embeds an image using the wrapped model.

        Args:
            image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
            image_id (Optional[str]): An identifier for the image, passed to the model as `image_hashes`.
                Defaults to None.
            **kwargs: Additional keyword arguments forwarded to `self._model.embed_images`.

        Returns:
            Tuple[dict, Any, Any]: A tuple of three elements:
                - a dict with the keys "image_embed" (embeddings as a numpy array) and
                  "high_res_feats" (list of numpy arrays with the high resolution features)
                - the `image_size_hw` reported by the model for the image
                - the `image_hash` reported by the model for the image

        Raises:
            ValueError: If no image is provided.

        Notes:
            - Caching is presumably handled inside the wrapped model through the caches created
              in `__init__` — this method itself does not read or write any cache.

        Example:
            >>> img_array = ... # some image array
            >>> embed_image(img_array, image_id="sample123")
            ({"image_embed": array([...]), "high_res_feats": [...]}, (224, 224), "sample123")
        """
        loaded_image = self.preproc_image(image)
        if loaded_image is None:
            raise ValueError("Image must be provided to handle this request.")
        # Only the first result of the returned sequence is used.
        embeddings = self._model.embed_images(
            images=loaded_image, image_hashes=image_id, **kwargs
        )[0]
        embedding_dict = {
            "image_embed": embeddings.embeddings.cpu().numpy(),
            "high_res_feats": [
                f.cpu().numpy() for f in embeddings.high_resolution_features
            ],
        }
        return embedding_dict, embeddings.image_size_hw, embeddings.image_hash

    def preproc_image(self, image: InferenceRequestImage):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess. May be None.

        Returns:
            The result of `load_image_bgr(image)`, or None when no image was given.
        """
        if image is not None:
            return load_image_bgr(image)
        return None

    def segment_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        prompts: Optional[Union[Sam2PromptSet, dict]] = None,
        multimask_output: Optional[bool] = True,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        save_logits_to_cache: bool = False,
        load_logits_from_cache: bool = False,
        **kwargs,
    ):
        """
        Segments an image using the given prompts, an optional mask input and cached results.
        When an image_id is given, segmentation is first attempted from cached embeddings; the image
        is only loaded when that attempt is not possible.

        Args:
            image (Any): The image to be segmented.
            image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
            prompts (Optional[Union[Sam2PromptSet, dict]]): Prompts to use for segmentation, either as
                a `Sam2PromptSet` or as a dict of its constructor arguments. Defaults to None.
            multimask_output (Optional[bool]): Flag to decide if multiple mask proposals are to be predicted
                (among which the most promising will be returned).
            mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input low_res_logits for the image.
            save_logits_to_cache (bool): Passed to the model as `save_to_mask_input_cache`.
                Ignored when DISABLE_SAM2_LOGITS_CACHE is set.
            load_logits_from_cache (bool): Passed to the model as `load_from_mask_input_cache`.
                Ignored when DISABLE_SAM2_LOGITS_CACHE is set.
            **kwargs: Additional keyword arguments (not used by this method).

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
                - first element is of size (prompt_set_size, h, w) and represents the mask with the highest
                    confidence for each prompt element
                - second element is of size (prompt_set_size, ) and represents the score for the most
                    confident mask of each prompt element
                - third element is of size (prompt_set_size, 256, 256) and represents the low resolution
                    logits for the most confident mask of each prompt element

        Raises:
            ModelInputError: Re-raised from the cache-only attempt unless its message reports
                that no embeddings were found in the cache.
            ValueError: Presumably raised by helpers when inputs are missing or inconsistent
                (e.g. `pad_points`) — this method does not raise it directly.
        """
        # The global kill-switch overrides both per-call cache flags.
        load_logits_from_cache = (
            load_logits_from_cache and not DISABLE_SAM2_LOGITS_CACHE
        )
        save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM2_LOGITS_CACHE
        if prompts is not None:
            if type(prompts) is dict:
                prompts = Sam2PromptSet(**prompts)
        else:
            prompts = Sam2PromptSet()
        args = prompts.to_sam2_inputs()
        args = pad_points(args)
        if not any(args.values()):
            # No usable prompt: fall back to a single point labelled -1.
            args = {"point_coords": [[0, 0]], "point_labels": [-1], "box": None}
        # The wrapped model is called with numpy arrays rather than nested lists.
        if args["point_coords"] is not None:
            args["point_coords"] = np.array(args["point_coords"])
        if args["point_labels"] is not None:
            args["point_labels"] = np.array(args["point_labels"])
        if args["box"] is not None:
            args["box"] = np.array(args["box"])
        if mask_input is not None and isinstance(mask_input, list):
            mask_input = np.array(mask_input)

        segment_kwargs = dict(
            point_coordinates=args["point_coords"],
            point_labels=args["point_labels"],
            boxes=args["box"],
            mask_input=mask_input,
            multi_mask_output=multimask_output,
            threshold=MASK_THRESHOLD,
            load_from_mask_input_cache=load_logits_from_cache,
            save_to_mask_input_cache=save_logits_to_cache,
            use_embeddings_cache=True,
            return_logits=True,
        )

        prediction = None
        if image_id is not None:
            # First try to segment from cached embeddings, identified by the hash alone.
            try:
                prediction = self._model.segment_images(
                    images=None, image_hashes=image_id, **segment_kwargs
                )[0]
            except ModelInputError as error:
                # Only a cache miss triggers the fallback below; anything else propagates.
                if "no embeddings were found in the cache" not in str(error):
                    raise
                prediction = None
        if prediction is None:
            loaded_image = self.preproc_image(image)
            prediction = self._model.segment_images(
                images=loaded_image, image_hashes=image_id, **segment_kwargs
            )[0]
        return choose_most_confident_sam_prediction(
            masks=prediction.masks.cpu().numpy(),
            scores=prediction.scores.cpu().numpy(),
            low_resolution_logits=prediction.logits.cpu().numpy(),
        )

Methods:¶

init ¶

__init__(
    *args,
    model_id=f"sam2/{SAM2_VERSION_ID}",
    api_key=None,
    low_res_logits_cache_size=SAM2_MAX_LOGITS_CACHE_SIZE,
    embedding_cache_size=SAM2_MAX_EMBEDDING_CACHE_SIZE,
    **kwargs,
)

Initializes the SegmentAnything.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/sam2/segment_anything2_inference_models.py

def __init__(
    self,
    *args,
    model_id: str = f"sam2/{SAM2_VERSION_ID}",
    api_key: Optional[str] = None,
    low_res_logits_cache_size: int = SAM2_MAX_LOGITS_CACHE_SIZE,
    embedding_cache_size: int = SAM2_MAX_EMBEDDING_CACHE_SIZE,
    **kwargs,
):
    """Set up the SAM2 adapter and load the underlying model.

    Args:
        *args: Accepted for signature compatibility; not used by this method.
        model_id: Identifier passed to ``AutoModel.from_pretrained`` as
            ``model_id_or_path``.
        api_key: API key used when loading the model. Falls back to the
            module-level ``API_KEY`` when falsy.
        low_res_logits_cache_size: Size limit of the in-memory cache of
            low-resolution masks.
        embedding_cache_size: Size limit of the in-memory cache of image
            embeddings.
        **kwargs: Forwarded verbatim to ``AutoModel.from_pretrained``. The
            ``countinference`` and ``service_secret`` entries are additionally
            read to build the extra weights-provider headers.
    """
    super().__init__()

    self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
    self.api_key = api_key or API_KEY
    self.task_type = "unsupervised-segmentation"

    # Both caches are created with send_to_cpu=True.
    embeddings_cache = Sam2ImageEmbeddingsInMemoryCache.init(
        size_limit=embedding_cache_size,
        send_to_cpu=True,
    )
    low_res_masks_cache = Sam2LowResolutionMasksInMemoryCache.init(
        size_limit=low_res_logits_cache_size,
        send_to_cpu=True,
    )
    provider_headers = get_extra_weights_provider_headers(
        countinference=kwargs.get("countinference"),
        service_secret=kwargs.get("service_secret"),
    )
    # Every valid backend that has not been explicitly disabled.
    enabled_backends = list(
        VALID_INFERENCE_MODELS_BACKENDS.difference(
            DISABLED_INFERENCE_MODELS_BACKENDS
        )
    )
    self._model: SAM2Torch = AutoModel.from_pretrained(
        model_id_or_path=model_id,
        api_key=self.api_key,
        allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
        allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
        sam2_image_embeddings_cache=embeddings_cache,
        sam2_low_resolution_masks_cache=low_res_masks_cache,
        sam2_allow_client_generated_hash_ids=True,
        weights_provider_extra_headers=provider_headers,
        backend=enabled_backends,
        **kwargs,
    )

embed_image ¶

embed_image(image, image_id=None, **kwargs)

Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached, the cached result will be returned.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be embedded. The format should be compatible with the preproc_image method.	required
`image_id`	`Optional[str]`	An identifier for the image. If provided, the embedding result will be cached with this ID. Defaults to None.	`None`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
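	Tuple[dict, image size, image hash]: A 3-tuple whose first element is a dict with the numpy embeddings (`image_embed` and the list `high_res_feats`), whose second element is the image size reported by the model (`image_size_hw`), and whose third element is the image hash reported by the model.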

Notes

Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
The embeddings cache size is set by the `embedding_cache_size` constructor argument (default `SAM2_MAX_EMBEDDING_CACHE_SIZE`). When the cache exceeds this size, the oldest entries are removed.

Example

>>> img_array = ...  # some image array
>>> embedding_dict, image_size_hw, image_hash = model.embed_image(img_array, image_id="sample123")

Source code in inference/models/sam2/segment_anything2_inference_models.py

def embed_image(
    self,
    image: Optional[InferenceRequestImage],
    image_id: Optional[str] = None,
    **kwargs,
):
    """
    Compute SAM2 embeddings for an image and return them as numpy arrays.

    The image is loaded with ``preproc_image`` and passed to the underlying
    model's ``embed_images`` together with ``image_id`` (as ``image_hashes``).
    Any caching of the embeddings is handled by the underlying model, not by
    this method.

    Args:
        image (Optional[InferenceRequestImage]): The image to be embedded, in a
            format accepted by ``preproc_image``. Must not be None.
        image_id (Optional[str]): Identifier passed to the model as the image
            hash. Defaults to None.
        **kwargs: Additional keyword arguments forwarded to ``embed_images``.

    Returns:
        tuple: A 3-tuple ``(embedding_dict, image_size_hw, image_hash)`` where
            - ``embedding_dict`` has the keys ``"image_embed"`` (numpy array)
              and ``"high_res_feats"`` (list of numpy arrays),
            - ``image_size_hw`` is the image size reported by the model
              (presumably ``(height, width)`` - confirm against the model),
            - ``image_hash`` is the image hash reported by the model.

    Raises:
        ValueError: If no image is provided.

    Example:
        >>> img_array = ...  # some image array
        >>> embedding_dict, image_size_hw, image_hash = model.embed_image(
        ...     img_array, image_id="sample123"
        ... )
    """
    loaded_image = self.preproc_image(image)
    if loaded_image is None:
        raise ValueError("Image must be provided to handle this request.")
    # embed_images returns one result per image; only one image is sent.
    embeddings = self._model.embed_images(
        images=loaded_image, image_hashes=image_id, **kwargs
    )[0]
    # Move tensors to host memory so callers receive plain numpy arrays.
    embedding_dict = {
        "image_embed": embeddings.embeddings.cpu().numpy(),
        "high_res_feats": [
            f.cpu().numpy() for f in embeddings.high_resolution_features
        ],
    }
    return embedding_dict, embeddings.image_size_hw, embeddings.image_hash

infer_from_request ¶

infer_from_request(request)

Performs inference based on the request type.

Parameters:

Name	Type	Description	Default
`request`	`Sam2InferenceRequest`	The inference request (a `Sam2EmbeddingRequest` or a `Sam2SegmentationRequest`).	required

Returns:

Type	Description
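	A `Sam2EmbeddingResponse` for embedding requests. For segmentation requests, the JSON or RLE response object, or raw `bytes` (a compressed `.npz` archive) when the request format is `binary`.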

Source code in inference/models/sam2/segment_anything2_inference_models.py

@usage_collector("model")
def infer_from_request(self, request: Sam2InferenceRequest):
    """Dispatch a SAM2 request to embedding or segmentation and build the response.

    Args:
        request (Sam2InferenceRequest): Either a ``Sam2EmbeddingRequest`` or a
            ``Sam2SegmentationRequest``. Its fields (``request.dict()``) are
            passed as keyword arguments to ``embed_image`` / ``segment_image``.

    Returns:
        - ``Sam2EmbeddingResponse`` (elapsed time and image id) for embedding
          requests.
        - For segmentation requests, depending on ``request.format``:
            - ``"json"``: the result of
              ``turn_segmentation_results_into_api_response``,
            - ``"rle"``: the result of
              ``turn_segmentation_results_into_rle_response``,
            - ``"binary"``: ``bytes`` of a compressed ``.npz`` archive holding
              the arrays ``masks`` and ``low_res_masks``.

    Raises:
        ValueError: If the request type or the requested format is not supported.
    """
    t1 = perf_counter()
    if isinstance(request, Sam2EmbeddingRequest):
        _, _, image_id = self.embed_image(**request.dict())
        inference_time = perf_counter() - t1
        return Sam2EmbeddingResponse(time=inference_time, image_id=image_id)
    if not isinstance(request, Sam2SegmentationRequest):
        raise ValueError(f"Invalid request type {type(request)}")

    masks, scores, low_resolution_logits = self.segment_image(**request.dict())

    if request.format == "json":
        return turn_segmentation_results_into_api_response(
            masks=masks,
            scores=scores,
            mask_threshold=MASK_THRESHOLD,
            inference_start_timestamp=t1,
        )
    if request.format == "rle":
        # NOTE(review): the RLE path thresholds at 0.0 while the JSON path uses
        # MASK_THRESHOLD - confirm the difference is intended.
        return turn_segmentation_results_into_rle_response(
            masks=masks,
            scores=scores,
            mask_threshold=0.0,
            inference_start_timestamp=t1,
        )
    if request.format == "binary":
        binary_vector = BytesIO()
        np.savez_compressed(
            binary_vector, masks=masks, low_res_masks=low_resolution_logits
        )
        # getvalue() returns the whole buffer regardless of stream position.
        return binary_vector.getvalue()
    raise ValueError(f"Invalid format {request.format}")

preproc_image ¶

preproc_image(image)

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
	np.array: The preprocessed image.

Source code in inference/models/sam2/segment_anything2_inference_models.py

def preproc_image(self, image: InferenceRequestImage):
    """Load the request image, passing a missing image through as None.

    Args:
        image (InferenceRequestImage): The image to load; may be None.

    Returns:
        The result of ``load_image_bgr(image)``, or None when ``image`` is None.
    """
    return None if image is None else load_image_bgr(image)

segment_image ¶

segment_image(
    image,
    image_id=None,
    prompts=None,
    multimask_output=True,
    mask_input=None,
    save_logits_to_cache=False,
    load_logits_from_cache=False,
    **kwargs
)

Segments an image based on provided embeddings, points, masks, or cached results. If embeddings are not directly provided, the function can derive them from the input image or cache.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be segmented.	required
`image_id`	`Optional[str]`	A cached identifier for the image. Useful for accessing cached embeddings or masks.	`None`
`prompts`	`Optional[List[Sam2Prompt]]`	List of prompts to use for segmentation. Defaults to None.	`None`
`mask_input`	`Optional[Union[ndarray, List[List[List[float]]]]]`	Input low_res_logits for the image.	`None`
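`multimask_output`	`Optional[bool]`	Whether multiple mask proposals should be predicted per prompt; the most confident one is returned either way.	`True`
`save_logits_to_cache`	`bool`	Whether to save the low-resolution logits to the mask-input cache. Ignored when `DISABLE_SAM2_LOGITS_CACHE` is set.	`False`
`load_logits_from_cache`	`bool`	Whether to load cached low-resolution logits from prior prompting. Ignored when `DISABLE_SAM2_LOGITS_CACHE` is set.	`False`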
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type

Description

Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where: - the first element is of size (prompt_set_size, h, w) and represents the mask with the highest confidence for each prompt element - the second element is of size (prompt_set_size, ) and represents the score of the most confident mask of each prompt element - the third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits of the most confident mask of each prompt element

Raises:

Type	Description
`ValueError`	If necessary inputs are missing or inconsistent.

Notes

Embeddings, segmentations, and low-resolution logits can be cached to improve performance on repeated requests for the same image.
By default, the embeddings cache has a maximum size defined by SAM2_MAX_EMBEDDING_CACHE_SIZE and the logits cache one defined by SAM2_MAX_LOGITS_CACHE_SIZE. When a cache exceeds its size, the oldest entries are removed.

Source code in inference/models/sam2/segment_anything2_inference_models.py

def segment_image(
    self,
    image: Optional[InferenceRequestImage],
    image_id: Optional[str] = None,
    prompts: Optional[Union[Sam2PromptSet, dict]] = None,
    multimask_output: Optional[bool] = True,
    mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
    save_logits_to_cache: bool = False,
    load_logits_from_cache: bool = False,
    **kwargs,
):
    """
    Segments an image with SAM2 using the given prompts and returns the most
    confident mask for each prompt.

    When ``image_id`` is given, segmentation is first attempted from cached
    embeddings only; if the model reports that no embeddings were found in the
    cache, the image is loaded and segmentation is retried with it.

    Args:
        image (Optional[InferenceRequestImage]): The image to be segmented.
            Only loaded when segmentation from cached embeddings is not possible.
        image_id (Optional[str]): Identifier of the image, passed to the model
            as the image hash. Used to look up cached embeddings.
        prompts (Optional[Union[Sam2PromptSet, dict]]): Prompt set to use for
            segmentation; a dict is converted with ``Sam2PromptSet(**prompts)``.
            Defaults to an empty prompt set.
        multimask_output (Optional[bool]): Whether multiple mask proposals
            should be predicted (the most confident one is returned either way).
        mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input
            low_res_logits for the image. Lists are converted to numpy arrays.
        save_logits_to_cache (bool): Whether to save low-resolution logits to
            the mask-input cache. Ignored when ``DISABLE_SAM2_LOGITS_CACHE`` is set.
        load_logits_from_cache (bool): Whether to load cached logits from prior
            prompting. Ignored when ``DISABLE_SAM2_LOGITS_CACHE`` is set.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
            - first element is of size (prompt_set_size, h, w) and represents the mask with the highest
                confidence for each prompt element
            - second element is of size (prompt_set_size, ) and represents the score of the most confident
                mask of each prompt element
            - third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits
                of the most confident mask of each prompt element

    Raises:
        ValueError: If point labels are given without point coordinates.
        ModelInputError: Re-raised from the model for any input error other
            than missing cached embeddings.
    """
    load_logits_from_cache = (
        load_logits_from_cache and not DISABLE_SAM2_LOGITS_CACHE
    )
    save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM2_LOGITS_CACHE

    if prompts is None:
        prompts = Sam2PromptSet()
    elif isinstance(prompts, dict):
        prompts = Sam2PromptSet(**prompts)

    args = pad_points(prompts.to_sam2_inputs())
    if not any(args.values()):
        # No usable prompt: fall back to a single dummy "not a point" entry.
        args = {"point_coords": [[0, 0]], "point_labels": [-1], "box": None}
    for key in ("point_coords", "point_labels", "box"):
        if args[key] is not None:
            args[key] = np.array(args[key])
    if isinstance(mask_input, list):
        mask_input = np.array(mask_input)

    segment_kwargs = dict(
        point_coordinates=args["point_coords"],
        point_labels=args["point_labels"],
        boxes=args["box"],
        mask_input=mask_input,
        multi_mask_output=multimask_output,
        threshold=MASK_THRESHOLD,
        load_from_mask_input_cache=load_logits_from_cache,
        save_to_mask_input_cache=save_logits_to_cache,
        use_embeddings_cache=True,
        return_logits=True,
    )

    prediction = None
    if image_id is not None:
        # Try the cached embeddings first so the image need not be decoded.
        try:
            prediction = self._model.segment_images(
                images=None, image_hashes=image_id, **segment_kwargs
            )[0]
        except ModelInputError as error:
            # Only a cache miss is recoverable; anything else propagates.
            if "no embeddings were found in the cache" not in str(error):
                raise
    if prediction is None:
        loaded_image = self.preproc_image(image)
        prediction = self._model.segment_images(
            images=loaded_image, image_hashes=image_id, **segment_kwargs
        )[0]
    return choose_most_confident_sam_prediction(
        masks=prediction.masks.cpu().numpy(),
        scores=prediction.scores.cpu().numpy(),
        low_resolution_logits=prediction.logits.cpu().numpy(),
    )

Functions:¶

choose_most_confident_sam_prediction ¶

choose_most_confident_sam_prediction(
    masks, scores, low_resolution_logits
)

This function post-processes SAM2 inference output and chooses the most confident mask, regardless of the `multimask_output` parameter value.

Args:

- masks: np array with values 0.0 and 1.0 representing predicted masks, of size (prompt_set_size, proposed_masks, h, w) or (proposed_masks, h, w) depending on prompt set size. Unfortunately, prompt_set_size=1 causes a squeeze operation in the SAM2 library, so to handle inference uniformly this function compensates for it.
- scores: array of size (prompt_set_size, proposed_masks) or (proposed_masks, ) depending on prompt set size. This array gives the confidence score for each mask proposal.
- low_resolution_logits: array of size (prompt_set_size, proposed_masks, 256, 256) or (proposed_masks, 256, 256) depending on prompt set size. These low resolution logits can be passed to a subsequent iteration as mask input.

Returns:

Tuple of np.array, where:

- the first element is of size (prompt_set_size, h, w) and represents the mask with the highest confidence for each prompt element
- the second element is of size (prompt_set_size, ) and represents the score of the most confident mask of each prompt element
- the third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits of the most confident mask of each prompt element

Source code in inference/models/sam2/segment_anything2_inference_models.py

def choose_most_confident_sam_prediction(
    masks: np.ndarray,
    scores: np.ndarray,
    low_resolution_logits: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Pick the most confident mask proposal for every element of the prompt set,
    whatever the value of `multimask_output` was.

    Args:
        masks: predicted masks of size (prompt_set_size, proposed_masks, h, w), or
            (proposed_masks, h, w) when the prompt-set axis is missing. A 3-D input
            is given a leading axis of length 1 so both layouts are handled the
            same way.
        scores: confidence of each mask proposal, of size
            (prompt_set_size, proposed_masks) or (proposed_masks, ).
        low_resolution_logits: logits of size (prompt_set_size, proposed_masks, 256, 256)
            or (proposed_masks, 256, 256); they can be fed to a later iteration
            as mask input.
    Returns:
        Tuple of np.array, where:
            - first element holds, per prompt element, the mask selected by
                `choose_most_confident_prompt_set_element_prediction`
            - second element holds the score of that mask
            - third element holds the low resolution logits of that mask
    """
    if masks.ndim == 3:
        # Restore the leading prompt-set axis on all three arrays.
        masks = masks[np.newaxis, ...]
        scores = scores[np.newaxis, ...]
        low_resolution_logits = low_resolution_logits[np.newaxis, ...]

    best_masks, best_scores, best_logits = [], [], []
    for prompt_masks, prompt_scores, prompt_logits in zip(
        masks, scores, low_resolution_logits
    ):
        best_mask, best_score, best_logit = (
            choose_most_confident_prompt_set_element_prediction(
                mask=prompt_masks,
                score=prompt_scores,
                low_resolution_logit=prompt_logits,
            )
        )
        best_masks.append(best_mask)
        best_scores.append(best_score)
        best_logits.append(best_logit)

    return np.asarray(best_masks), np.asarray(best_scores), np.asarray(best_logits)

pad_points ¶

pad_points(args)

Pad arguments to be passed to sam2 model with not_a_point label (-1). This is necessary when there are multiple prompts per image so that a tensor can be created.

Also pads empty point lists with a dummy non-point entry.

Source code in inference/models/sam2/segment_anything2_inference_models.py

def pad_points(args: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pad arguments to be passed to sam2 model with not_a_point label (-1).
    This is necessary when there are multiple prompts per image so that a tensor can be created.

    Also pads empty point lists with a dummy non-point entry.

    Args:
        args: Dict with the keys ``"point_coords"`` and ``"point_labels"``
            (each a list with one list per prompt, or None). Other keys are
            copied through untouched. The input is never mutated.

    Returns:
        A deep copy of ``args`` in which every prompt's coordinate list is
        padded with ``[0, 0]`` and every label list with ``-1`` up to the
        longest coordinate list (at least one entry). An empty list of prompts
        is returned unchanged.

    Raises:
        ValueError: If point labels are given without point coordinates.
    """
    args = copy.deepcopy(args)
    point_coords = args["point_coords"]
    point_labels = args["point_labels"]

    if point_coords is None:
        if point_labels is not None:
            raise ValueError(
                "Can't have point labels without corresponding point coordinates"
            )
        return args

    # default=0 keeps an empty list of prompts from crashing max().
    max_len = max(max((len(prompt) for prompt in point_coords), default=0), 1)
    for prompt in point_coords:
        # A fresh [0, 0] per slot, so padded entries never alias each other.
        prompt.extend([0, 0] for _ in range(max_len - len(prompt)))
    for label in point_labels:
        label.extend([-1] * (max_len - len(label)))
    return args

`models/sam3`¶

inference.models.sam3.segment_anything3 ¶

Classes¶

SegmentAnything3 ¶

Bases: RoboflowCoreModel

SAM3 wrapper with a similar interface to SAM2 in this codebase.

Source code in inference/models/sam3/segment_anything3.py

class SegmentAnything3(RoboflowCoreModel):
    """SAM3 wrapper with a similar interface to SAM2 in this codebase.

    Builds the SAM3 image model from two cached artifacts (``weights.pt`` and
    the BPE vocabulary file) and serves ``Sam3SegmentationRequest`` objects
    whose prompts carry text and/or boxes.
    """

    def __init__(
        self,
        *args,
        model_id: str = "sam3/sam3_final",
        **kwargs,
    ):
        """Initialize the base model, then build the SAM3 image model.

        Args:
            *args: Positional arguments forwarded to the base class.
            model_id: Model identifier forwarded to the base class.
            **kwargs: Keyword arguments forwarded to the base class.
        """
        super().__init__(*args, model_id=model_id, **kwargs)

        # Lazy import SAM3 to avoid hard dependency when disabled
        from sam3 import build_sam3_image_model

        checkpoint = self.cache_file("weights.pt")
        bpe_path = self.cache_file("bpe_simple_vocab_16e6.txt.gz")

        # NOTE(review): the lock is created but not acquired anywhere in this class.
        self.sam3_lock = threading.RLock()

        self.model = build_sam3_image_model(
            bpe_path=bpe_path,
            checkpoint_path=checkpoint,
            device="cuda" if torch.cuda.is_available() else "cpu",
            load_from_HF=False,
            compile=False,
        )

        # Preprocessing and postprocessing for PCS image path
        self.transform = ComposeAPI(
            transforms=[
                RandomResizeAPI(
                    sizes=SAM3_IMAGE_SIZE,
                    max_size=SAM3_IMAGE_SIZE,
                    square=True,
                    consistent_transform=False,
                ),
                ToTensorAPI(),
                NormalizeAPI(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )

        self.image_size = SAM3_IMAGE_SIZE
        self.task_type = "unsupervised-segmentation"

    def _is_core_sam3_endpoint(self) -> bool:
        """Return True when ``self.endpoint`` is a string starting with ``"sam3/"``."""
        return isinstance(self.endpoint, str) and self.endpoint.startswith("sam3/")

    @property
    def model_artifact_bucket(self):
        """Bucket holding the model artifacts for the current endpoint."""
        # Use CORE bucket for base SAM3, standard INFER bucket for fine-tuned models
        return CORE_MODEL_BUCKET if self._is_core_sam3_endpoint() else INFER_BUCKET

    def download_weights(self) -> None:
        """Make sure all required artifacts are present in the model cache.

        Order of operations: optional API-key access check, early exit when all
        files are already cached, download from S3 when the artefacts bucket is
        available, otherwise fall back to the Roboflow API (core-model endpoint
        for base SAM3, ORT endpoint with a ``weights`` map for fine-tuned models).

        Raises:
            RoboflowAPINotAuthorizedError: If auth is enabled and the API key
                has no access to the model.
            ModelArtefactError: If the ORT response for a fine-tuned model is
                malformed or carries no ``weights`` map.
        """
        infer_bucket_files = self.get_infer_bucket_file_list()

        # Auth check aligned with chosen endpoint type
        if MODELS_CACHE_AUTH_ENABLED:
            endpoint_type = (
                ModelEndpointType.CORE_MODEL
                if self._is_core_sam3_endpoint()
                else ModelEndpointType.ORT
            )
            if not _check_if_api_key_has_access_to_model(
                api_key=self.api_key,
                model_id=self.endpoint,
                endpoint_type=endpoint_type,
                countinference=self.countinference,
                service_secret=self.service_secret,
            ):
                # NOTE(review): this message embeds the API key; consider
                # whether it can end up in logs or client-facing errors.
                raise RoboflowAPINotAuthorizedError(
                    f"API key {self.api_key} does not have access to model {self.endpoint}"
                )

        # Already cached
        if are_all_files_cached(files=infer_bucket_files, model_id=self.endpoint):
            return None

        # S3 path works for both; keys are {endpoint}/<file>
        if is_model_artefacts_bucket_available():
            self.download_model_artefacts_from_s3()
            return None

        # API fallback
        if self._is_core_sam3_endpoint():
            # Base SAM3 from core_model endpoint; preserves filenames
            return super().download_model_from_roboflow_api()

        # Fine-tuned SAM3: use ORT endpoint to fetch the weights map
        api_data = get_roboflow_model_data(
            api_key=self.api_key,
            model_id=self.endpoint,
            endpoint_type=ModelEndpointType.ORT,
            device_id=self.device_id,
            countinference=self.countinference,
            service_secret=self.service_secret,
        )

        ort = api_data.get("ort") if isinstance(api_data, dict) else None
        if not isinstance(ort, dict):
            raise ModelArtefactError("ORT response malformed for fine-tuned SAM3")

        # Explicit weights map of filename -> URL
        weights_map = ort.get("weights")
        if isinstance(weights_map, dict) and len(weights_map) > 0:
            for filename, url in weights_map.items():
                resp = get_from_url(url, json_response=False)
                save_bytes_in_cache(
                    content=resp.content,
                    file=str(filename),
                    model_id=self.endpoint,
                )
            return None

        raise ModelArtefactError("ORT response missing 'weights' for fine-tuned SAM3")

    def get_infer_bucket_file_list(self) -> List[str]:
        """Return the names of the artifact files this model needs in the cache."""
        return [
            "weights.pt",
            "bpe_simple_vocab_16e6.txt.gz",
        ]

    def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
        """Load the request image with ``load_image_rgb`` and return the array."""
        np_image = load_image_rgb(image)
        return np_image

    @usage_collector("model")
    def infer_from_request(self, request: Sam3InferenceRequest):
        """Run segmentation for a ``Sam3SegmentationRequest``.

        Missing ``output_prob_thresh`` / ``format`` on the request fall back to
        ``0.5`` / ``"polygon"``.

        Args:
            request: The inference request.

        Returns:
            Whatever ``segment_image`` returns for the request.

        Raises:
            ValueError: If the request is not a ``Sam3SegmentationRequest``.
        """
        if not isinstance(request, Sam3SegmentationRequest):
            raise ValueError(f"Invalid request type {type(request)}")
        # Pass strongly-typed fields to preserve Sam3Prompt objects
        return self.segment_image(
            image=request.image,
            image_id=request.image_id,
            prompts=request.prompts,
            output_prob_thresh=request.output_prob_thresh or 0.5,
            format=request.format or "polygon",
            nms_iou_threshold=request.nms_iou_threshold,
        )

    @staticmethod
    def _build_prompt_echo(prompt_index: int, prompt: Any) -> Any:
        """Build the ``Sam3PromptEcho`` that describes one prompt in the response."""
        has_visual = bool(getattr(prompt, "boxes", None))
        num_boxes = len(prompt.boxes or []) if has_visual else 0
        return Sam3PromptEcho(
            prompt_index=prompt_index,
            type=("visual" if has_visual else "text"),
            text=prompt.text,
            num_boxes=num_boxes,
        )

    def segment_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        prompts: Optional[List[Sam3Prompt]] = None,
        output_prob_thresh: float = 0.5,
        format: Optional[str] = "polygon",
        nms_iou_threshold: Optional[float] = None,
        **kwargs,
    ):
        """Segment an image for a list of text and/or box prompts.

        Args:
            image: The image to segment; loaded with ``load_image_rgb``.
            image_id: Accepted for interface compatibility; not used here.
            prompts: Prompts to evaluate. A prompt with non-empty ``boxes`` is
                sent as a visual query, otherwise as a text query. A prompt may
                carry its own ``output_prob_thresh``.
            output_prob_thresh: Default score threshold for prompts without
                their own threshold.
            format: Output format passed to ``_masks_to_predictions``.
            nms_iou_threshold: When not None, cross-prompt NMS is applied with
                this IoU threshold.
            **kwargs: Ignored.

        Returns:
            Sam3SegmentationResponse: One ``Sam3PromptResult`` per prompt, in
            prompt order. The reported time starts after the image is loaded.
        """
        np_image = load_image_rgb(image)
        h, w = np_image.shape[:2]
        pil_image = Image.fromarray(np_image)
        prompts = prompts or []

        # Inference-only path; disable autograd throughout
        with torch.inference_mode():
            # NOTE(review): autocast is hard-coded to CUDA although the model
            # may have been built on CPU - confirm the CPU behaviour is intended.
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                start_ts = perf_counter()

                # TODO this can also take tensor directly instead of PIL image, so we want to avoid double conversion
                # TODO: this also supports multiple images for multi batch inference
                datapoint = Sam3Datapoint(
                    find_queries=[],
                    images=[Sam3ImageDP(data=pil_image, objects=[], size=(h, w))],
                )

                # Map prompt_index -> prompt_id to retrieve results later
                prompt_ids: List[int] = []
                for idx, p in enumerate(prompts):
                    if getattr(p, "boxes", None):
                        q = _build_visual_query(
                            coco_id=idx,
                            h=h,
                            w=w,
                            boxes=p.boxes,
                            labels=p.box_labels or [],
                            text=p.text,
                        )
                    else:
                        q = _build_text_query(
                            coco_id=idx,
                            h=h,
                            w=w,
                            text=p.text,
                        )
                    datapoint.find_queries.append(q)
                    prompt_ids.append(idx)

                # Transform and collate to BatchedDatapoint
                datapoint = self.transform(datapoint)
                batch = collate_fn_api(batch=[datapoint], dict_key="dummy")["dummy"]
                batch = copy_data_to_device(
                    batch,
                    torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                    non_blocking=True,
                )

                # Forward
                output = self.model(batch)

                # The initial filter uses the lowest threshold of any prompt;
                # the per-prompt thresholds are applied afterwards.
                min_threshold = output_prob_thresh
                for p in prompts:
                    prompt_thresh = getattr(p, "output_prob_thresh", None)
                    if prompt_thresh is not None:
                        min_threshold = min(min_threshold, prompt_thresh)

                # Postprocess to original size and build per-prompt results
                post = PostProcessImage(
                    max_dets_per_img=-1,
                    iou_type="segm",
                    use_original_sizes_box=True,
                    use_original_sizes_mask=True,
                    convert_mask_to_rle=False,
                    detection_threshold=float(
                        min_threshold if min_threshold is not None else 0.35
                    ),
                    to_cpu=True,
                )
                processed = post.process_results(output, batch.find_metadatas)

        # With cross-prompt NMS, masks are thresholded per prompt, suppressed
        # across prompts and then regrouped by prompt index.
        regrouped = None
        if nms_iou_threshold is not None and len(prompts) > 0:
            all_masks = _collect_masks_with_per_prompt_threshold(
                processed=processed,
                prompt_ids=prompt_ids,
                prompts=prompts,
                default_threshold=output_prob_thresh,
            )
            if len(all_masks) > 0:
                all_masks = _apply_nms_cross_prompt(all_masks, nms_iou_threshold)
            regrouped = _regroup_masks_by_prompt(all_masks, len(prompts))

        prompt_results: List[Sam3PromptResult] = []
        for idx, coco_id in enumerate(prompt_ids):
            prompt = prompts[idx]
            echo = self._build_prompt_echo(idx, prompt)

            if regrouped is not None:
                prompt_masks = regrouped.get(idx, [])
                if prompt_masks:
                    masks_np = np.stack([m for m, _ in prompt_masks], axis=0)
                    scores = [s for _, s in prompt_masks]
                else:
                    masks_np = np.zeros((0, 0, 0), dtype=np.uint8)
                    scores = []
            else:
                masks_np = _to_numpy_masks(processed[coco_id].get("masks"))
                scores = list(processed[coco_id].get("scores", []))
                # The initial filter used the minimum threshold of all prompts,
                # so a prompt without its own threshold must still be filtered
                # at the default one (same rule as the NMS path).
                prompt_thresh = getattr(prompt, "output_prob_thresh", None)
                threshold = (
                    prompt_thresh if prompt_thresh is not None else output_prob_thresh
                )
                if threshold is not None:
                    masks_np, scores = _filter_by_threshold(
                        masks_np, scores, threshold
                    )

            preds = _masks_to_predictions(masks_np, scores, format)
            prompt_results.append(
                Sam3PromptResult(prompt_index=idx, echo=echo, predictions=preds)
            )

        return Sam3SegmentationResponse(
            time=perf_counter() - start_ts, prompt_results=prompt_results
        )

Functions:¶

inference.models.sam3.segment_anything3_inference_models ¶

Classes¶

InferenceModelsSAM3Adapter ¶

Bases: Model

Adapter wrapping inference_models SAM3Torch for open-vocabulary segmentation.

Replaces inference.models.sam3.segment_anything3.SegmentAnything3. Handles Sam3SegmentationRequest with text and/or visual (box) prompts via SAM3Torch.segment_with_text_prompts.

Source code in inference/models/sam3/segment_anything3_inference_models.py

class InferenceModelsSAM3Adapter(Model):
    """Adapter wrapping inference_models SAM3Torch for open-vocabulary segmentation.

    Replaces inference.models.sam3.segment_anything3.SegmentAnything3.
    Handles Sam3SegmentationRequest with text and/or visual (box) prompts via
    SAM3Torch.segment_with_text_prompts.
    """

    def __init__(
        self,
        *args,
        model_id: str = "sam3/sam3_final",
        api_key: Optional[str] = None,
        **kwargs,
    ):
        """Load the SAM3 model through ``AutoModel.from_pretrained``.

        Args:
            *args: Accepted for signature compatibility; not used here.
            model_id: Identifier (or path) of the model package to load.
            api_key: API key used when loading the model; falls back to the
                module-level ``API_KEY`` when empty.
            **kwargs: Forwarded to ``AutoModel.from_pretrained``. The
                ``countinference`` and ``service_secret`` entries are also
                used to build the extra weights-provider headers.
        """
        super().__init__()
        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.api_key = api_key if api_key else API_KEY
        self.task_type = "unsupervised-segmentation"

        extra_weights_provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Offer every valid backend that has not been explicitly disabled.
        backend = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: SAM3Torch = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            weights_provider_extra_headers=extra_weights_provider_headers,
            backend=backend,
            **kwargs,
        )

    @usage_collector("model")
    def infer_from_request(self, request: Sam3InferenceRequest):
        """Dispatch a SAM3 request to the matching handler.

        Args:
            request: The incoming request; only ``Sam3SegmentationRequest``
                is supported.

        Returns:
            Sam3SegmentationResponse: Result of ``segment_image``.

        Raises:
            ValueError: If ``request`` is not a ``Sam3SegmentationRequest``.
        """
        t1 = perf_counter()
        if isinstance(request, Sam3SegmentationRequest):
            # NOTE(review): ``or`` treats an explicit 0.0 threshold (and an
            # empty format string) as unset and substitutes the default.
            return self.segment_image(
                image=request.image,
                prompts=request.prompts,
                output_prob_thresh=request.output_prob_thresh or 0.5,
                format=request.format or "polygon",
                nms_iou_threshold=request.nms_iou_threshold,
                inference_start_timestamp=t1,
            )
        raise ValueError(f"Invalid request type {type(request)}")

    def segment_image(
        self,
        image: InferenceRequestImage,
        prompts: List[Sam3Prompt],
        output_prob_thresh: float = 0.5,
        format: str = "polygon",
        nms_iou_threshold: Optional[float] = None,
        inference_start_timestamp: Optional[float] = None,
    ) -> Sam3SegmentationResponse:
        """Segment one image for every prompt and build the response.

        Args:
            image: Image to segment; loaded with ``load_image_rgb``.
            prompts: Prompts to evaluate. A prompt may carry its own
                ``output_prob_thresh`` that overrides the request-level one.
            output_prob_thresh: Request-level score threshold, used for every
                prompt that does not define its own.
            format: Output format passed to ``_masks_to_predictions``.
            nms_iou_threshold: When not None (and there is at least one
                prompt), masks from all prompts are pooled, passed through
                cross-prompt NMS and regrouped per prompt.
            inference_start_timestamp: ``perf_counter()`` value marking the
                start of the request; taken now when None.

        Returns:
            Sam3SegmentationResponse: One ``Sam3PromptResult`` per prompt plus
            the elapsed time since ``inference_start_timestamp``.
        """
        if inference_start_timestamp is None:
            inference_start_timestamp = perf_counter()
        np_image = load_image_rgb(image)

        # The backend applies a single threshold floor; use the min so per-prompt
        # thresholds applied below can still refine higher values.
        min_threshold = output_prob_thresh
        for p in prompts:
            prompt_thresh = getattr(p, "output_prob_thresh", None)
            if prompt_thresh is not None:
                min_threshold = min(min_threshold, prompt_thresh)

        prompt_dicts = [_sam3_prompt_to_dict(p) for p in prompts]

        # segment_with_text_prompts returns List[per-image] of List[per-prompt] dicts
        # with keys: prompt_index, masks (N,H,W ndarray), scores (list).
        per_image_results = self._model.segment_with_text_prompts(
            images=[np_image],
            prompts=prompt_dicts,
            output_prob_thresh=float(min_threshold),
            max_detections=SAM3_MAX_DETECTIONS,
        )
        per_prompt = per_image_results[0]

        # processed: prompt_idx -> {"masks": ndarray, "scores": list}
        # NOTE(review): results are keyed by position, which assumes the backend
        # returns them in prompt order — confirm against "prompt_index".
        processed: Dict[int, Dict[str, Any]] = {}
        for idx, r in enumerate(per_prompt):
            processed[idx] = {
                "masks": r.get("masks"),
                "scores": list(r.get("scores", [])),
            }

        if nms_iou_threshold is not None and len(prompts) > 0:
            all_masks = _collect_masks_with_per_prompt_threshold(
                processed=processed,
                prompts=prompts,
                default_threshold=output_prob_thresh,
            )
            if len(all_masks) > 0:
                all_masks = _apply_nms_cross_prompt(all_masks, nms_iou_threshold)
            regrouped = _regroup_masks_by_prompt(all_masks, len(prompts))

            prompt_results: List[Sam3PromptResult] = []
            for idx, p in enumerate(prompts):
                echo = _build_echo(idx, p)
                bucket = regrouped.get(idx, [])
                if bucket:
                    masks_np = np.stack([m for m, _ in bucket], axis=0)
                    scores = [s for _, s in bucket]
                else:
                    # No surviving masks for this prompt: empty placeholder.
                    masks_np = np.zeros((0, 0, 0), dtype=np.uint8)
                    scores = []
                preds = _masks_to_predictions(masks_np, scores, format)
                prompt_results.append(
                    Sam3PromptResult(prompt_index=idx, echo=echo, predictions=preds)
                )
        else:
            prompt_results = []
            for idx, p in enumerate(prompts):
                masks_np = _to_numpy_masks(processed[idx]["masks"])
                scores = processed[idx]["scores"]
                prompt_thresh = getattr(p, "output_prob_thresh", None)
                if prompt_thresh is None and output_prob_thresh > min_threshold:
                    # Another prompt lowered the backend floor below the
                    # request-level threshold. Re-apply the request-level value
                    # so this prompt does not leak low-score detections; this
                    # matches the NMS branch, which passes output_prob_thresh
                    # as the default threshold.
                    prompt_thresh = output_prob_thresh
                if prompt_thresh is not None:
                    masks_np, scores = _filter_by_threshold(
                        masks_np, scores, prompt_thresh
                    )
                preds = _masks_to_predictions(masks_np, scores, format)
                prompt_results.append(
                    Sam3PromptResult(
                        prompt_index=idx,
                        echo=_build_echo(idx, p),
                        predictions=preds,
                    )
                )

        return Sam3SegmentationResponse(
            time=perf_counter() - inference_start_timestamp,
            prompt_results=prompt_results,
        )

Functions:¶

inference.models.sam3.visual_segmentation ¶

Classes¶

Sam3ForInteractiveImageSegmentation ¶

Bases: RoboflowCoreModel

SegmentAnything3 class for handling segmentation tasks on images with box prompting and point prompting, the same way SAM2 did.

Source code in inference/models/sam3/visual_segmentation.py

class Sam3ForInteractiveImageSegmentation(RoboflowCoreModel):
    """
    SegmentAnything3 class for handling segmentation tasks on images with
    box prompting and point prompting, the same way SAM2 did.
    """

    def __init__(
        self,
        *args,
        model_id: str = "sam3/sam3_final",
        low_res_logits_cache_size: int = SAM3_MAX_LOGITS_CACHE_SIZE,
        embedding_cache_size: int = SAM3_MAX_EMBEDDING_CACHE_SIZE,
        **kwargs,
    ):
        """Initializes the SAM3 interactive segmentation model and its caches.

        Args:
            *args: Positional arguments forwarded to the parent constructor.
            model_id: Model identifier forwarded to the parent constructor.
            low_res_logits_cache_size: Maximum number of keys kept for the
                low-resolution logits cache.
            embedding_cache_size: Maximum number of keys kept for the image
                embedding cache.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(*args, model_id=model_id, **kwargs)
        checkpoint = self.cache_file("weights.pt")
        bpe_path = self.cache_file("bpe_simple_vocab_16e6.txt.gz")

        self.sam_model = build_sam3_image_model(
            bpe_path=bpe_path,
            checkpoint_path=checkpoint,
            device="cuda" if torch.cuda.is_available() else "cpu",
            load_from_HF=False,
            compile=False,
            enable_inst_interactivity=True,
        )
        self.low_res_logits_cache_size = low_res_logits_cache_size
        self.embedding_cache_size = embedding_cache_size
        # image_id -> embedding state / image (height, width); the *_keys lists
        # track insertion order so entries can be evicted.
        self.embedding_cache = {}
        self.image_size_cache = {}
        self.embedding_cache_keys = []
        self.low_res_logits_cache: Dict[Tuple[str, str], LogitsCacheType] = {}
        self.low_res_logits_cache_keys = []
        # Re-entrant lock guarding cache mutation and predictor calls.
        self._state_lock = RLock()
        self.task_type = "unsupervised-segmentation"

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: List of file names.
        """
        # NOTE(review): __init__ also reads "bpe_simple_vocab_16e6.txt.gz" via
        # cache_file, which is not listed here — confirm how it is provisioned.
        return ["weights.pt"]

    @torch.inference_mode()
    def embed_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        **kwargs,
    ):
        """
        Embeds an image and caches the result. If an embedding for the same
        image_id is already cached, the cached result is returned.

        Args:
            image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
            image_id (Optional[str]): An identifier for the image. When None, an ID is derived from
                                      the MD5 hash of the preprocessed image bytes. Defaults to None.
            **kwargs: Additional keyword arguments (ignored).

        Returns:
            Tuple[Any, Tuple[int, int], str]: A 3-tuple of
                - the embedding state returned by Sam3Processor.set_image,
                - the first two dimensions of the preprocessed image (height, width),
                - the image_id under which the result is cached.

        Notes:
            - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
            - The cache holds at most self.embedding_cache_size keys. When that size is exceeded, one key is
              popped via safe_pop_from_list (presumably the oldest — confirm) and its entries are removed.

        Example:
            >>> img_array = ... # some image array
            >>> embedding, image_size, image_id = model.embed_image(img_array, image_id="sample123")
        """
        # Fast path: a caller-supplied ID may already be cached, in which case
        # the image does not need to be loaded at all.
        if image_id:
            embedding_cache_content = self.embedding_cache.get(image_id)
            image_size_content = self.image_size_cache.get(image_id)
            if embedding_cache_content is not None and image_size_content is not None:
                return embedding_cache_content, image_size_content, image_id

        img_in = self.preproc_image(image)
        if image_id is None:
            # Content-derived ID so identical images share one cache entry.
            image_id = hashlib.md5(img_in.tobytes()).hexdigest()[:12]

        # Second lookup covers the content-derived ID computed just above.
        embedding_cache_content = self.embedding_cache.get(image_id)
        image_size_content = self.image_size_cache.get(image_id)
        if embedding_cache_content is not None and image_size_content is not None:
            return (
                embedding_cache_content,
                image_size_content,
                image_id,
            )

        # NOTE(review): device_type is hard-coded to "cuda" although __init__
        # falls back to "cpu" when CUDA is unavailable — confirm the intended
        # behaviour on CPU-only hosts.
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            with _temporarily_disable_torch_jit_script():
                processor = Sam3Processor(self.sam_model)
            # permute(2, 0, 1) moves the last axis first (HWC -> CHW, assuming
            # preproc_image yields a height x width x channel array).
            state = processor.set_image(torch.from_numpy(img_in).permute(2, 0, 1))
            embedding_dict = state

        with self._state_lock:
            self.embedding_cache[image_id] = embedding_dict
            self.image_size_cache[image_id] = img_in.shape[:2]
            # Move the key to the end of the order list before checking size.
            safe_remove_from_list(values=self.embedding_cache_keys, element=image_id)
            self.embedding_cache_keys.append(image_id)
            if len(self.embedding_cache_keys) > self.embedding_cache_size:
                cache_key = safe_pop_from_list(values=self.embedding_cache_keys)
                if cache_key is not None:
                    safe_remove_from_dict(values=self.embedding_cache, key=cache_key)
                    safe_remove_from_dict(values=self.image_size_cache, key=cache_key)
            return embedding_dict, img_in.shape[:2], image_id

    @usage_collector("model")
    def infer_from_request(self, request: Sam2InferenceRequest):
        """Performs inference based on the request type.

        Args:
            request (Sam2InferenceRequest): The inference request; either a
                Sam2EmbeddingRequest or a Sam2SegmentationRequest.

        Returns:
            Union[Sam2EmbeddingResponse, Sam2SegmentationResponse]: The inference response.

        Raises:
            ValueError: If the request is of neither supported type.
        """
        t1 = perf_counter()
        if isinstance(request, Sam2EmbeddingRequest):
            _, _, image_id = self.embed_image(**request.dict())
            inference_time = perf_counter() - t1
            return Sam2EmbeddingResponse(time=inference_time, image_id=image_id)
        elif isinstance(request, Sam2SegmentationRequest):
            # The low-resolution logits are not part of the response.
            masks, scores, low_resolution_logits = self.segment_image(**request.dict())
            predictions = _masks_to_predictions(masks, scores, request.format)
            return Sam2SegmentationResponse(
                time=perf_counter() - t1,
                predictions=predictions,
            )
        else:
            raise ValueError(f"Invalid request type {type(request)}")

    def preproc_image(self, image: InferenceRequestImage):
        """Preprocesses an image.

        Args:
            image (InferenceRequestImage): The image to preprocess.

        Returns:
            np.array: The image as returned by load_image_rgb.
        """
        np_image = load_image_rgb(image)
        return np_image

    def segment_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        prompts: Optional[Union[Sam2PromptSet, dict]] = None,
        multimask_output: Optional[bool] = True,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        save_logits_to_cache: bool = False,
        load_logits_from_cache: bool = False,
        **kwargs,
    ):
        """
        Segments an image based on the provided prompts, optional mask input, and cached results.
        The image embedding is computed by embed_image or taken from its cache.

        Args:
            image (Any): The image to be segmented. May be None when image_id refers to a cached embedding.
            image_id (Optional[str]): A cached identifier for the image. Useful for accessing cached embeddings or masks.
            prompts (Optional[Union[Sam2PromptSet, dict]]): Prompt set (or a dict used to build one) to use
                for segmentation. Defaults to None, in which case an empty prompt set is used.
            multimask_output (bool): Flag to decide if multiple mask proposals are to be predicted (among which
                the most promising will be returned).
            mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input low_res_logits for the image.
            save_logits_to_cache (bool): Whether to store the resulting low-resolution logits in the cache.
                Ignored when DISABLE_SAM3_LOGITS_CACHE is set.
            load_logits_from_cache (bool): Whether to look up cached logits from prior prompting when
                mask_input is None. Ignored when DISABLE_SAM3_LOGITS_CACHE is set.
            **kwargs: Additional keyword arguments (ignored).

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
                - first element is of size (prompt_set_size, h, w) and represents the mask with the highest
                    confidence for each prompt element
                - second element is of size (prompt_set_size, ) and represents the score for the most confident
                    mask of each prompt element
                - third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits
                    for the most confident mask of each prompt element

        Raises:
            ValueError: If neither image nor image_id is given, or if only an image_id is given and it is
                not present in the embedding cache.

        Notes:
            - Embeddings and low-resolution logits can be cached to improve performance
              on repeated requests for the same image.
            - The caches are bounded by self.embedding_cache_size and self.low_res_logits_cache_size.
        """
        # The global switch overrides both per-call logits-cache flags.
        load_logits_from_cache = (
            load_logits_from_cache and not DISABLE_SAM3_LOGITS_CACHE
        )
        save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM3_LOGITS_CACHE
        with torch.inference_mode():
            if image is None and not image_id:
                raise ValueError("Must provide either image or  cached image_id")
            elif image_id and image is None and image_id not in self.embedding_cache:
                raise ValueError(
                    f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
                )
            embedding, original_image_size, image_id = self.embed_image(
                image=image, image_id=image_id
            )

            args = dict()
            prompt_set: Sam2PromptSet
            if prompts:
                # Requests arrive as plain dicts (via request.dict()); direct
                # callers may pass a Sam2PromptSet instance instead.
                if type(prompts) is dict:
                    prompt_set = Sam2PromptSet(**prompts)
                    args = prompt_set.to_sam2_inputs()
                else:
                    prompt_set = prompts
                    args = prompts.to_sam2_inputs()
            else:
                prompt_set = Sam2PromptSet()

            if mask_input is None and load_logits_from_cache:
                mask_input = maybe_load_low_res_logits_from_cache(
                    image_id, prompt_set, self.low_res_logits_cache
                )

            args = pad_points(args)
            if not any(args.values()):
                # No usable prompt: fall back to a single point at (0, 0) with
                # label -1 and no box.
                args = {"point_coords": [[0, 0]], "point_labels": [-1], "box": None}

            # predict_inst internally sets/clears _is_image_set and _features
            # on the shared inst_interactive_predictor (see sam3_image.py:627-635).
            # Without a lock, concurrent requests race: one thread's cleanup
            # (setting _is_image_set=False) can hit between another thread's
            # set (_is_image_set=True) and its _predict() check, causing
            # "An image must be set with .set_image(...)".
            with self._state_lock:
                masks, scores, low_resolution_logits = self.sam_model.predict_inst(
                    embedding,
                    mask_input=mask_input,
                    multimask_output=multimask_output,
                    return_logits=True,
                    normalize_coords=True,
                    **args,
                )
            masks, scores, low_resolution_logits = choose_most_confident_sam_prediction(
                masks=masks,
                scores=scores,
                low_resolution_logits=low_resolution_logits,
            )

            if save_logits_to_cache:
                self.add_low_res_logits_to_cache(
                    low_resolution_logits, image_id, prompt_set
                )

            return masks, scores, low_resolution_logits

    def add_low_res_logits_to_cache(
        self, logits: np.ndarray, image_id: str, prompt_set: Sam2PromptSet
    ) -> None:
        """Stores low-resolution logits in the cache, evicting one entry when full.

        Args:
            logits (np.ndarray): Logits to cache; a singleton axis is inserted at
                position 1 before storing.
            image_id (str): Identifier of the image the logits belong to.
            prompt_set (Sam2PromptSet): Prompt set that produced the logits; it is
                hashed together with image_id to form the cache key.
        """
        logits = logits[:, None, :, :]
        prompt_id = hash_prompt_set(image_id, prompt_set)
        with self._state_lock:
            self.low_res_logits_cache[prompt_id] = {
                "logits": logits,
                "prompt_set": prompt_set,
            }
            # Move the key to the end of the order list before checking size.
            safe_remove_from_list(
                values=self.low_res_logits_cache_keys, element=prompt_id
            )
            self.low_res_logits_cache_keys.append(prompt_id)
            if len(self.low_res_logits_cache_keys) > self.low_res_logits_cache_size:
                cache_key = safe_pop_from_list(values=self.low_res_logits_cache_keys)
                if cache_key is not None:
                    safe_remove_from_dict(
                        values=self.low_res_logits_cache, key=cache_key
                    )

    @property
    def model_artifact_bucket(self):
        """Bucket holding the model artefacts for the current endpoint."""
        # Use CORE bucket for base SAM3, standard INFER bucket for fine-tuned models
        return CORE_MODEL_BUCKET if self._is_core_sam3_endpoint() else INFER_BUCKET

    def _is_core_sam3_endpoint(self) -> bool:
        """Returns True when self.endpoint is a string starting with "sam3/"."""
        return isinstance(self.endpoint, str) and self.endpoint.startswith("sam3/")

    def download_weights(self) -> None:
        """Makes sure the files required for inference are in the local cache.

        Order of operations:
            1. When MODELS_CACHE_AUTH_ENABLED is set, verify the API key has
               access to the model (core-model or ORT endpoint type).
            2. Return early if every required file is already cached.
            3. Download from the artefacts bucket when it is available.
            4. Otherwise use the API: the parent's core-model download for
               "sam3/..." endpoints, or the ORT endpoint's "weights" map
               (filename -> URL) for fine-tuned models.

        Raises:
            RoboflowAPINotAuthorizedError: If the access check fails.
            ModelArtefactError: If the ORT response is malformed or has no
                usable "weights" map.
        """
        infer_bucket_files = self.get_infer_bucket_file_list()

        # Auth check aligned with chosen endpoint type
        if MODELS_CACHE_AUTH_ENABLED:
            endpoint_type = (
                ModelEndpointType.CORE_MODEL
                if self._is_core_sam3_endpoint()
                else ModelEndpointType.ORT
            )
            if not _check_if_api_key_has_access_to_model(
                api_key=self.api_key,
                model_id=self.endpoint,
                endpoint_type=endpoint_type,
                countinference=self.countinference,
                service_secret=self.service_secret,
            ):
                # NOTE(review): the API key is embedded in the error message —
                # confirm this never reaches logs or client responses.
                raise RoboflowAPINotAuthorizedError(
                    f"API key {self.api_key} does not have access to model {self.endpoint}"
                )
        # Already cached
        if are_all_files_cached(files=infer_bucket_files, model_id=self.endpoint):
            return None
        # S3 path works for both; keys are {endpoint}/<file>
        if is_model_artefacts_bucket_available():
            self.download_model_artefacts_from_s3()
            return None
        # API fallback
        if self._is_core_sam3_endpoint():
            # Base SAM3 from core_model endpoint; preserves filenames
            return super().download_model_from_roboflow_api()

        # Fine-tuned SAM3: use ORT endpoint to fetch weights map or model url
        api_data = get_roboflow_model_data(
            api_key=self.api_key,
            model_id=self.endpoint,
            endpoint_type=ModelEndpointType.ORT,
            device_id=self.device_id,
            countinference=self.countinference,
            service_secret=self.service_secret,
        )

        ort = api_data.get("ort") if isinstance(api_data, dict) else None
        if not isinstance(ort, dict):
            raise ModelArtefactError("ORT response malformed for fine-tuned SAM3")

        # Preferred: explicit weights map of filename -> URL
        weights_map = ort.get("weights")
        if isinstance(weights_map, dict) and len(weights_map) > 0:
            for filename, url in weights_map.items():
                resp = get_from_url(url, json_response=False)
                save_bytes_in_cache(
                    content=resp.content,
                    file=str(filename),
                    model_id=self.endpoint,
                )
            return None
        # Only the "weights" map is supported; anything else is an error.
        raise ModelArtefactError(
            "ORT response missing both 'weights' for fine-tuned SAM3"
        )

Methods:¶

init ¶

__init__(
    *args,
    model_id="sam3/sam3_final",
    low_res_logits_cache_size=SAM3_MAX_LOGITS_CACHE_SIZE,
    embedding_cache_size=SAM3_MAX_EMBEDDING_CACHE_SIZE,
    **kwargs
)

Initializes the SAM3 interactive segmentation model and its embedding and logits caches.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/sam3/visual_segmentation.py

def __init__(
    self,
    *args,
    model_id: str = "sam3/sam3_final",
    low_res_logits_cache_size: int = SAM3_MAX_LOGITS_CACHE_SIZE,
    embedding_cache_size: int = SAM3_MAX_EMBEDDING_CACHE_SIZE,
    **kwargs,
):
    """Build the SAM3 image model and set up empty caches.

    Args:
        *args: Positional arguments forwarded to the parent constructor.
        model_id: Model identifier forwarded to the parent constructor.
        low_res_logits_cache_size: Upper bound on tracked logits-cache keys.
        embedding_cache_size: Upper bound on tracked embedding-cache keys.
        **kwargs: Keyword arguments forwarded to the parent constructor.
    """
    super().__init__(*args, model_id=model_id, **kwargs)

    # Resolve the cached artefacts and the target device up front.
    weights_path = self.cache_file("weights.pt")
    vocab_path = self.cache_file("bpe_simple_vocab_16e6.txt.gz")
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    self.sam_model = build_sam3_image_model(
        bpe_path=vocab_path,
        checkpoint_path=weights_path,
        device=target_device,
        load_from_HF=False,
        compile=False,
        enable_inst_interactivity=True,
    )

    # Cache capacity limits.
    self.low_res_logits_cache_size = low_res_logits_cache_size
    self.embedding_cache_size = embedding_cache_size

    # Embedding cache: payloads keyed by image id, plus insertion order.
    self.embedding_cache = {}
    self.image_size_cache = {}
    self.embedding_cache_keys = []

    # Low-resolution logits cache and its insertion order.
    self.low_res_logits_cache: Dict[Tuple[str, str], LogitsCacheType] = {}
    self.low_res_logits_cache_keys = []

    # Re-entrant lock protecting the shared state above.
    self._state_lock = RLock()
    self.task_type = "unsupervised-segmentation"

embed_image ¶

embed_image(image, image_id=None, **kwargs)

Embeds an image and caches the result if an image_id is provided. If the image has been embedded before and cached, the cached result will be returned.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be embedded. The format should be compatible with the preproc_image method.	required
`image_id`	`Optional[str]`	An identifier for the image. If provided, the embedding result will be cached with this ID. Defaults to None.	`None`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type	Description
	Tuple[Any, Tuple[int, int], str]: A 3-tuple where the first element is the embedding of the image, the second element is the shape (height, width) of the processed image, and the third element is the image_id under which the result is cached.

Notes

Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
The cache has a maximum size defined by the `embedding_cache_size` constructor argument (default `SAM3_MAX_EMBEDDING_CACHE_SIZE`). When the cache exceeds this size, the oldest entries are removed.

Example

>>> img_array = ...  # some image array
>>> embedding, image_size, image_id = model.embed_image(img_array, image_id="sample123")
>>> image_size, image_id
((224, 224), 'sample123')

Source code in inference/models/sam3/visual_segmentation.py

@torch.inference_mode()
def embed_image(
    self,
    image: Optional[InferenceRequestImage],
    image_id: Optional[str] = None,
    **kwargs,
):
    """
    Embeds an image and caches the result. If an embedding for the same
    image_id is already cached, the cached result is returned.

    Args:
        image (Any): The image to be embedded. The format should be compatible with the preproc_image method.
        image_id (Optional[str]): An identifier for the image. When None, an ID is derived from
                                  the MD5 hash of the preprocessed image bytes. Defaults to None.
        **kwargs: Additional keyword arguments (ignored).

    Returns:
        Tuple[Any, Tuple[int, int], str]: A 3-tuple of
            - the embedding state returned by Sam3Processor.set_image,
            - the first two dimensions of the preprocessed image (height, width),
            - the image_id under which the result is cached.

    Notes:
        - Embeddings and image sizes are cached to improve performance on repeated requests for the same image.
        - The cache holds at most self.embedding_cache_size keys. When that size is exceeded, one key is
          popped via safe_pop_from_list (presumably the oldest — confirm) and its entries are removed.

    Example:
        >>> img_array = ... # some image array
        >>> embedding, image_size, image_id = model.embed_image(img_array, image_id="sample123")
    """
    # Fast path: a caller-supplied ID may already be cached, in which case
    # the image does not need to be loaded at all.
    if image_id:
        embedding_cache_content = self.embedding_cache.get(image_id)
        image_size_content = self.image_size_cache.get(image_id)
        if embedding_cache_content is not None and image_size_content is not None:
            return embedding_cache_content, image_size_content, image_id

    img_in = self.preproc_image(image)
    if image_id is None:
        # Content-derived ID so identical images share one cache entry.
        image_id = hashlib.md5(img_in.tobytes()).hexdigest()[:12]

    # Second lookup covers the content-derived ID computed just above.
    embedding_cache_content = self.embedding_cache.get(image_id)
    image_size_content = self.image_size_cache.get(image_id)
    if embedding_cache_content is not None and image_size_content is not None:
        return (
            embedding_cache_content,
            image_size_content,
            image_id,
        )

    # NOTE(review): device_type is hard-coded to "cuda" — confirm the intended
    # behaviour on hosts where the model was built for CPU.
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        with _temporarily_disable_torch_jit_script():
            processor = Sam3Processor(self.sam_model)
        # permute(2, 0, 1) moves the last axis first (HWC -> CHW, assuming
        # preproc_image yields a height x width x channel array).
        state = processor.set_image(torch.from_numpy(img_in).permute(2, 0, 1))
        embedding_dict = state

    with self._state_lock:
        self.embedding_cache[image_id] = embedding_dict
        self.image_size_cache[image_id] = img_in.shape[:2]
        # Move the key to the end of the order list before checking size.
        safe_remove_from_list(values=self.embedding_cache_keys, element=image_id)
        self.embedding_cache_keys.append(image_id)
        if len(self.embedding_cache_keys) > self.embedding_cache_size:
            cache_key = safe_pop_from_list(values=self.embedding_cache_keys)
            if cache_key is not None:
                safe_remove_from_dict(values=self.embedding_cache, key=cache_key)
                safe_remove_from_dict(values=self.image_size_cache, key=cache_key)
        return embedding_dict, img_in.shape[:2], image_id

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Gets the list of files required for inference.

Returns:

Type	Description
`List[str]`	List[str]: List of file names.

Source code in inference/models/sam3/visual_segmentation.py

def get_infer_bucket_file_list(self) -> List[str]:
    """Return the names of the files that must be present for inference.

    Returns:
        List[str]: A new list containing the required file names.
    """
    required_files = ["weights.pt"]
    return required_files

infer_from_request ¶

infer_from_request(request)

Performs inference based on the request type.

Parameters:

Name	Type	Description	Default
`request`	`Sam2InferenceRequest`	The inference request (a `Sam2EmbeddingRequest` or a `Sam2SegmentationRequest`).	required

Returns:

Type	Description
	Union[Sam2EmbeddingResponse, Sam2SegmentationResponse]: The inference response.

Source code in inference/models/sam3/visual_segmentation.py

@usage_collector("model")
def infer_from_request(self, request: Sam2InferenceRequest):
    """Route a request to embedding or segmentation and wrap the result.

    Args:
        request (Sam2InferenceRequest): Either a Sam2EmbeddingRequest or a
            Sam2SegmentationRequest.

    Returns:
        Union[Sam2EmbeddingResponse, Sam2SegmentationResponse]: The response
        matching the request type, including the elapsed time.

    Raises:
        ValueError: If the request is of neither supported type.
    """
    started_at = perf_counter()

    if isinstance(request, Sam2EmbeddingRequest):
        _, _, image_id = self.embed_image(**request.dict())
        elapsed = perf_counter() - started_at
        return Sam2EmbeddingResponse(time=elapsed, image_id=image_id)

    if isinstance(request, Sam2SegmentationRequest):
        # The low-resolution logits are not part of the response.
        masks, scores, _logits = self.segment_image(**request.dict())
        # Convert before reading the clock so the reported time includes it.
        predictions = _masks_to_predictions(masks, scores, request.format)
        elapsed = perf_counter() - started_at
        return Sam2SegmentationResponse(time=elapsed, predictions=predictions)

    raise ValueError(f"Invalid request type {type(request)}")

preproc_image ¶

preproc_image(image)

Preprocesses an image.

Parameters:

Name	Type	Description	Default
`image`	`InferenceRequestImage`	The image to preprocess.	required

Returns:

Type	Description
	np.array: The preprocessed image.

Source code in inference/models/sam3/visual_segmentation.py

def preproc_image(self, image: InferenceRequestImage):
    """Load the request image via ``load_image_rgb``.

    Args:
        image (InferenceRequestImage): The image to preprocess.

    Returns:
        The value returned by ``load_image_rgb(image)``; no further
        processing is applied.
    """
    return load_image_rgb(image)

segment_image ¶

segment_image(
    image,
    image_id=None,
    prompts=None,
    multimask_output=True,
    mask_input=None,
    save_logits_to_cache=False,
    load_logits_from_cache=False,
    **kwargs
)

Segments an image based on provided embeddings, points, masks, or cached results. If embeddings are not directly provided, the function can derive them from the input image or cache.

Parameters:

Name	Type	Description	Default
`image`	`Any`	The image to be segmented.	required
`image_id`	`Optional[str]`	A cached identifier for the image. Useful for accessing cached embeddings or masks.	`None`
`prompts`	`Optional[Union[Sam2PromptSet, dict]]`	Prompt set (or a dict used to build one) to use for segmentation. Defaults to None.	`None`
`mask_input`	`Optional[Union[ndarray, List[List[List[float]]]]]`	Input low_res_logits for the image.	`None`
`multimask_output`	`Optional[bool]`	Flag to decide if multiple mask proposals are to be predicted (among which the most promising will be returned).	`True`
`save_logits_to_cache`	`bool`	Flag to decide whether the resulting low-resolution logits are stored in the cache.	`False`
`load_logits_from_cache`	`bool`	Flag to decide whether to use cached logits from prior prompting when `mask_input` is not given.	`False`
`**kwargs`		Additional keyword arguments.	`{}`

Returns:

Type

Description

Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
- the first element is of size (prompt_set_size, h, w) and represents the mask with the highest confidence for each prompt element;
- the second element is of size (prompt_set_size, ) and represents the score for the most confident mask of each prompt element;
- the third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits for the most confident mask of each prompt element.

Raises:

Type	Description
`ValueError`	If necessary inputs are missing or inconsistent.

Notes

Embeddings, segmentations, and low-resolution logits can be cached to improve performance on repeated requests for the same image.
The caches have maximum sizes defined by the `embedding_cache_size` and `low_res_logits_cache_size` constructor arguments. When a cache exceeds its size, the oldest entries are removed.

Source code in inference/models/sam3/visual_segmentation.py

def segment_image(
    self,
    image: Optional[InferenceRequestImage],
    image_id: Optional[str] = None,
    prompts: Optional[Union[Sam2PromptSet, dict]] = None,
    multimask_output: Optional[bool] = True,
    mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
    save_logits_to_cache: bool = False,
    load_logits_from_cache: bool = False,
    **kwargs,
):
    """
    Segments an image using the given prompts, an optional mask input and,
    when enabled, low-resolution logits cached from earlier prompting.

    The image embedding is obtained from `self.embed_image(...)`, which receives
    both `image` and `image_id`.

    Args:
        image (Optional[InferenceRequestImage]): The image to be segmented. May be
            None only when `image_id` is present in `self.embedding_cache`.
        image_id (Optional[str]): Identifier of the image, used to look up cached
            embeddings and cached logits.
        prompts (Optional[Union[Sam2PromptSet, dict]]): Prompt set, or a dict of
            `Sam2PromptSet` constructor arguments. When missing or falsy, a single
            dummy "not a point" prompt (label -1) is used instead.
        multimask_output (Optional[bool]): Flag to decide whether multiple mask
            proposals are predicted (among which the most promising is returned).
        mask_input (Optional[Union[np.ndarray, List[List[List[float]]]]]): Input
            low_res_logits for the image.
        save_logits_to_cache (bool): Store the resulting low-resolution logits in the
            logits cache. Ignored when DISABLE_SAM3_LOGITS_CACHE is set.
        load_logits_from_cache (bool): When `mask_input` is None, try to use logits
            cached from prior prompting. Ignored when DISABLE_SAM3_LOGITS_CACHE is set.
        **kwargs: Additional keyword arguments (not used by this method).

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple of np.array, where:
            - first element is of size (prompt_set_size, h, w) and represents the mask
                with the highest confidence for each prompt element
            - second element is of size (prompt_set_size, ) and represents the score of
                the most confident mask for each prompt element
            - third element is of size (prompt_set_size, 256, 256) and represents the
                low resolution logits of the most confident mask for each prompt element

    Raises:
        ValueError: If neither `image` nor `image_id` is provided, or if only an
            `image_id` that is missing from the embedding cache is provided.
    """
    load_logits_from_cache = (
        load_logits_from_cache and not DISABLE_SAM3_LOGITS_CACHE
    )
    save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM3_LOGITS_CACHE
    with torch.inference_mode():
        if image is None and not image_id:
            raise ValueError("Must provide either image or  cached image_id")
        elif image_id and image is None and image_id not in self.embedding_cache:
            raise ValueError(
                f"Image ID {image_id} not in embedding cache, must provide the image or embeddings"
            )
        embedding, _, image_id = self.embed_image(image=image, image_id=image_id)

        args = dict()
        if prompts:
            if isinstance(prompts, dict):
                prompt_set = Sam2PromptSet(**prompts)
            else:
                prompt_set = prompts
            args = prompt_set.to_sam2_inputs()
        else:
            prompt_set = Sam2PromptSet()

        if mask_input is None and load_logits_from_cache:
            mask_input = maybe_load_low_res_logits_from_cache(
                image_id, prompt_set, self.low_res_logits_cache
            )

        # Without prompts `args` is still an empty dict; padding is only meaningful
        # (and only safe to index by key) once the prompt set produced its inputs.
        if args:
            args = pad_points(args)
        if not any(args.values()):
            # No usable prompt at all: fall back to a single dummy "not a point" entry.
            args = {"point_coords": [[0, 0]], "point_labels": [-1], "box": None}

        # predict_inst internally sets/clears _is_image_set and _features
        # on the shared inst_interactive_predictor (see sam3_image.py:627-635).
        # Without a lock, concurrent requests race: one thread's cleanup
        # (setting _is_image_set=False) can hit between another thread's
        # set (_is_image_set=True) and its _predict() check, causing
        # "An image must be set with .set_image(...)".
        with self._state_lock:
            masks, scores, low_resolution_logits = self.sam_model.predict_inst(
                embedding,
                mask_input=mask_input,
                multimask_output=multimask_output,
                return_logits=True,
                normalize_coords=True,
                **args,
            )
        masks, scores, low_resolution_logits = choose_most_confident_sam_prediction(
            masks=masks,
            scores=scores,
            low_resolution_logits=low_resolution_logits,
        )

        if save_logits_to_cache:
            self.add_low_res_logits_to_cache(
                low_resolution_logits, image_id, prompt_set
            )

        return masks, scores, low_resolution_logits

Functions:¶

choose_most_confident_sam_prediction ¶

choose_most_confident_sam_prediction(
    masks, scores, low_resolution_logits
)

This function post-processes SAM2 inference and chooses the most confident mask regardless of the multimask_output parameter value. Args: masks: np array with values 0.0 and 1.0 representing predicted masks of size (prompt_set_size, proposed_masks, h, w) or (proposed_masks, h, w) - depending on prompt set size - unfortunately, prompt_set_size=1 causes a squeeze operation in the SAM2 library, so to handle inference uniformly, we need to compensate with this function. scores: array of size (prompt_set_size, proposed_masks) or (proposed_masks, ) depending on prompt set size - this array gives the confidence score for each mask proposal. low_resolution_logits: array of size (prompt_set_size, proposed_masks, 256, 256) or (proposed_masks, 256, 256) - depending on prompt set size. These low resolution logits can be passed to a subsequent iteration as mask input. Returns: Tuple of np.array, where: - the first element is of size (prompt_set_size, h, w) and represents the mask with the highest confidence for each prompt element - the second element is of size (prompt_set_size, ) and represents the score of the most confident mask for each prompt element - the third element is of size (prompt_set_size, 256, 256) and represents the low resolution logits of the most confident mask for each prompt element

Source code in inference/models/sam3/visual_segmentation.py

def choose_most_confident_sam_prediction(
    masks: np.ndarray,
    scores: np.ndarray,
    low_resolution_logits: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Post-processes SAM2 inference output, keeping only the most confident mask
    proposal per prompt element, whatever `multimask_output` was set to.

    Args:
        masks: np array with values 0.0 and 1.0 holding predicted masks, of size
            (prompt_set_size, proposed_masks, h, w) or (proposed_masks, h, w). The
            3-dimensional form appears because prompt_set_size=1 causes a squeeze
            operation in the SAM2 library; it is expanded back here so both cases
            are handled uniformly.
        scores: array of size (prompt_set_size, proposed_masks) or (proposed_masks, )
            with the confidence score of every mask proposal.
        low_resolution_logits: array of size (prompt_set_size, proposed_masks, 256, 256)
            or (proposed_masks, 256, 256). These logits can be passed to a subsequent
            iteration as mask input.

    Returns:
        Tuple of np.array, where:
            - first element is of size (prompt_set_size, h, w) - the most confident
                mask for each prompt element
            - second element is of size (prompt_set_size, ) - the score of that mask
            - third element is of size (prompt_set_size, 256, 256) - the low resolution
                logits of that mask
    """
    if masks.ndim == 3:
        # Single prompt element: restore the leading prompt-set axis on all inputs.
        masks, scores, low_resolution_logits = (
            np.expand_dims(array, axis=0)
            for array in (masks, scores, low_resolution_logits)
        )

    best_per_prompt = []
    for prompt_masks, prompt_scores, prompt_logits in zip(
        masks, scores, low_resolution_logits
    ):
        best_mask, best_score, best_logits = (
            choose_most_confident_prompt_set_element_prediction(
                mask=prompt_masks,
                score=prompt_scores,
                low_resolution_logit=prompt_logits,
            )
        )
        best_per_prompt.append((best_mask, best_score, best_logits))

    return (
        np.asarray([best[0] for best in best_per_prompt]),
        np.asarray([best[1] for best in best_per_prompt]),
        np.asarray([best[2] for best in best_per_prompt]),
    )

find_prior_prompt_in_cache ¶

find_prior_prompt_in_cache(
    initial_prompt_set, image_id, cache
)

Searches the cache for previously used prompt sets that are a strict subset of this one.

Source code in inference/models/sam3/visual_segmentation.py

def find_prior_prompt_in_cache(
    initial_prompt_set: Sam2PromptSet,
    image_id: str,
    cache: Dict[Tuple[str, str], LogitsCacheType],
) -> Optional[np.ndarray]:
    """
    Searches the cache for logits produced by an earlier prompt set that is a
    strict subset of `initial_prompt_set` for the same image.

    Entries are scanned from the last cache entry to the first. An entry whose
    prompt set has exactly one point fewer than `initial_prompt_set` is returned
    immediately; otherwise the logits of a viable entry with the largest number
    of points are returned, or None when nothing viable exists.
    """
    total_points = initial_prompt_set.num_points()
    if total_points <= 1:
        # With at most one point there cannot be a smaller prior prompt.
        return None

    entries_for_image = [
        entry for key, entry in cache.items() if key[0] == image_id
    ]
    target_size = total_points - 1
    largest_seen = 0
    fallback: Optional[np.ndarray] = None
    for entry in reversed(entries_for_image):
        candidate_logits = entry["logits"]
        candidate: Sam2PromptSet = entry["prompt_set"]
        if is_prompt_strict_subset(candidate, initial_prompt_set):
            candidate_size = candidate.num_points()
            if candidate_size == target_size:
                # One point fewer is the closest possible prior prompt - stop here.
                return candidate_logits
            if candidate_size >= largest_seen:
                largest_seen = candidate_size
                fallback = candidate_logits

    return fallback

hash_prompt_set ¶

hash_prompt_set(image_id, prompt_set)

Computes unique hash from a prompt set.

Source code in inference/models/sam3/visual_segmentation.py

def hash_prompt_set(image_id: str, prompt_set: Sam2PromptSet) -> Tuple[str, str]:
    """Builds a cache key: the image id paired with a 12-character MD5 digest of `str(prompt_set)`."""
    serialized_prompts = str(prompt_set).encode("utf-8")
    digest = hashlib.md5(serialized_prompts).hexdigest()
    return image_id, digest[:12]

maybe_load_low_res_logits_from_cache ¶

maybe_load_low_res_logits_from_cache(
    image_id, prompt_set, cache
)

Loads prior masks from the cache by searching over possible prior prompts.

Source code in inference/models/sam3/visual_segmentation.py

def maybe_load_low_res_logits_from_cache(
    image_id: str,
    prompt_set: Sam2PromptSet,
    cache: Dict[Tuple[str, str], LogitsCacheType],
) -> Optional[np.ndarray]:
    """
    Loads prior masks from the cache by searching over possible prior prompts.

    Returns None without touching the cache when the prompt set has no prompts.
    """
    if prompt_set.prompts:
        return find_prior_prompt_in_cache(prompt_set, image_id, cache)
    return None

pad_points ¶

pad_points(args)

Pad arguments to be passed to sam2 model with not_a_point label (-1). This is necessary when there are multiple prompts per image so that a tensor can be created.

Also pads empty point lists with a dummy non-point entry.

Source code in inference/models/sam3/visual_segmentation.py

def pad_points(args: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pad arguments to be passed to sam2 model with not_a_point label (-1).
    This is necessary when there are multiple prompts per image so that a tensor can be created.

    Also pads empty point lists with a dummy non-point entry.

    Args:
        args: Model inputs. The optional "point_coords" entry is a list with one list
            of [x, y] points per prompt, and the optional "point_labels" entry is a
            list with one list of labels per prompt. A missing key is treated like
            None, so an empty dict is returned unchanged.

    Returns:
        A deep copy of `args` in which every prompt has the same number of points
        (at least one), padded with [0, 0] coordinates and -1 labels. The input
        is never modified.

    Raises:
        ValueError: If point labels are given without point coordinates.
    """
    args = copy.deepcopy(args)
    point_coords = args.get("point_coords")
    point_labels = args.get("point_labels")
    if point_coords is None:
        if point_labels is not None:
            raise ValueError(
                "Can't have point labels without corresponding point coordinates"
            )
        return args

    # `default=0` keeps an empty list of prompts from crashing max(); the outer
    # max(..., 1) guarantees that every existing prompt ends up with one entry.
    max_len = max(max((len(prompt) for prompt in point_coords), default=0), 1)
    for prompt in point_coords:
        prompt.extend([0, 0] for _ in range(max_len - len(prompt)))
    for label in point_labels:
        label.extend([-1] * (max_len - len(label)))
    return args

inference.models.sam3.visual_segmentation_inference_models ¶

Classes¶

InferenceModelsSAM3InteractiveAdapter ¶

Bases: Model

Adapter wrapping inference_models SAM3Torch for SAM-style interactive segmentation.

Replaces inference.models.sam3.visual_segmentation.Sam3ForInteractiveImageSegmentation. Handles Sam2EmbeddingRequest / Sam2SegmentationRequest with point/box prompts via SAM3Torch.embed_images and SAM3Torch.segment_with_visual_prompts (sharing Sam2 request/response schemas, as the legacy class did).

Source code in inference/models/sam3/visual_segmentation_inference_models.py

class InferenceModelsSAM3InteractiveAdapter(Model):
    """Serves SAM-style interactive segmentation with an inference_models SAM3Torch model.

    Stands in for inference.models.sam3.visual_segmentation.Sam3ForInteractiveImageSegmentation.
    Sam2EmbeddingRequest / Sam2SegmentationRequest objects with point/box prompts are routed to
    SAM3Torch.embed_images and SAM3Torch.segment_with_visual_prompts, reusing the Sam2
    request/response schemas exactly as the legacy class did.
    """

    def __init__(
        self,
        *args,
        model_id: str = "sam3/sam3_final",
        api_key: Optional[str] = None,
        low_res_logits_cache_size: int = SAM3_MAX_LOGITS_CACHE_SIZE,
        embedding_cache_size: int = SAM3_MAX_EMBEDDING_CACHE_SIZE,
        **kwargs,
    ):
        """Create the in-memory caches and load the SAM3 model through AutoModel.

        Args:
            model_id: Model identifier handed to `AutoModel.from_pretrained`.
            api_key: API key for the weights provider; falls back to API_KEY when falsy.
            low_res_logits_cache_size: Size limit of the low-resolution masks cache.
            embedding_cache_size: Size limit of the image embeddings cache.
            **kwargs: Forwarded to `AutoModel.from_pretrained`; "countinference" and
                "service_secret" are additionally used to build the extra headers.
        """
        super().__init__()
        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.api_key = api_key or API_KEY
        self.task_type = "unsupervised-segmentation"

        embeddings_cache = Sam3ImageEmbeddingsInMemoryCache.init(
            size_limit=embedding_cache_size,
            send_to_cpu=SAM3_INTERACTIVE_CACHE_SEND_TO_CPU,
        )
        low_res_masks_cache = Sam3LowResolutionMasksInMemoryCache.init(
            size_limit=low_res_logits_cache_size,
            send_to_cpu=SAM3_INTERACTIVE_CACHE_SEND_TO_CPU,
        )
        weights_provider_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        # Every valid backend that has not been explicitly disabled.
        enabled_backends = list(
            VALID_INFERENCE_MODELS_BACKENDS.difference(
                DISABLED_INFERENCE_MODELS_BACKENDS
            )
        )
        self._model: SAM3Torch = AutoModel.from_pretrained(
            model_id_or_path=model_id,
            api_key=self.api_key,
            allow_untrusted_packages=ALLOW_INFERENCE_MODELS_UNTRUSTED_PACKAGES,
            allow_direct_local_storage_loading=ALLOW_INFERENCE_MODELS_DIRECTLY_ACCESS_LOCAL_PACKAGES,
            sam3_image_embeddings_cache=embeddings_cache,
            sam3_low_resolution_masks_cache=low_res_masks_cache,
            sam3_allow_client_generated_hash_ids=True,
            weights_provider_extra_headers=weights_provider_headers,
            backend=enabled_backends,
            **kwargs,
        )

    @usage_collector("model")
    def infer_from_request(self, request: Sam2InferenceRequest):
        """Dispatch an embedding or segmentation request and build its response.

        Returns a Sam2EmbeddingResponse for embedding requests. For segmentation
        requests the result depends on `request.format`: a polygon response for
        "json"/"polygon", an RLE response for "rle", or the raw bytes of a
        compressed .npz archive (arrays `masks` and `low_res_masks`) for "binary".

        Raises:
            ValueError: On an unsupported request type or response format.
        """
        started_at = perf_counter()
        if isinstance(request, Sam2EmbeddingRequest):
            _, _, image_id = self.embed_image(**request.dict())
            return Sam2EmbeddingResponse(
                time=perf_counter() - started_at, image_id=image_id
            )
        if not isinstance(request, Sam2SegmentationRequest):
            raise ValueError(f"Invalid request type {type(request)}")

        masks, scores, low_res_logits = self.segment_image(**request.dict())
        if request.format in ("json", "polygon"):
            return _build_polygon_response(
                masks=masks,
                scores=scores,
                inference_start_timestamp=started_at,
            )
        if request.format == "rle":
            return _build_rle_response(
                masks=masks,
                scores=scores,
                inference_start_timestamp=started_at,
            )
        if request.format == "binary":
            archive = BytesIO()
            np.savez_compressed(archive, masks=masks, low_res_masks=low_res_logits)
            archive.seek(0)
            return archive.getvalue()
        raise ValueError(f"Invalid format {request.format}")

    def preproc_image(self, image: InferenceRequestImage):
        """Load the request image as RGB, or return None when no image was given."""
        return None if image is None else load_image_rgb(image)

    def embed_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        **kwargs,
    ):
        """Embed an image with the underlying model.

        Returns:
            A tuple of (embedding dict, image size as (h, w), image hash).

        Raises:
            ValueError: If `image` is None.
        """
        rgb_image = self.preproc_image(image)
        if rgb_image is None:
            raise ValueError("Image must be provided to handle this request.")
        image_embeddings = self._model.embed_images(
            images=rgb_image, image_hashes=image_id, **kwargs
        )[0]
        # The interactive backend hands back opaque processor state rather than array
        # embeddings, so the legacy "image_embed" / "high_res_feats" keys are kept
        # (as None) and the state is exposed under "state".
        legacy_shaped_embedding = {
            "image_embed": None,
            "high_res_feats": None,
            "state": image_embeddings.embeddings,
        }
        return (
            legacy_shaped_embedding,
            image_embeddings.image_size_hw,
            image_embeddings.image_hash,
        )

    def segment_image(
        self,
        image: Optional[InferenceRequestImage],
        image_id: Optional[str] = None,
        prompts: Optional[Union[Sam2PromptSet, dict]] = None,
        multimask_output: Optional[bool] = True,
        mask_input: Optional[Union[np.ndarray, List[List[List[float]]]]] = None,
        save_logits_to_cache: bool = False,
        load_logits_from_cache: bool = False,
        **kwargs,
    ):
        """Segment an image with point/box prompts.

        Returns:
            A tuple of numpy arrays (masks, scores, logits) taken from the model's
            first prediction.
        """
        # Both cache flags are forced off when the logits cache is globally disabled.
        load_logits_from_cache = (
            load_logits_from_cache and not DISABLE_SAM3_LOGITS_CACHE
        )
        save_logits_to_cache = save_logits_to_cache and not DISABLE_SAM3_LOGITS_CACHE

        if prompts is None:
            prompt_set = Sam2PromptSet()
        elif isinstance(prompts, dict):
            prompt_set = Sam2PromptSet(**prompts)
        else:
            prompt_set = prompts

        args = _pad_points(prompt_set.to_sam2_inputs())
        for input_name in ("point_coords", "point_labels", "box"):
            if args[input_name] is not None:
                args[input_name] = np.array(args[input_name])
        if isinstance(mask_input, list):
            mask_input = np.array(mask_input)

        segment_kwargs = {
            "point_coordinates": args["point_coords"],
            "point_labels": args["point_labels"],
            "boxes": args["box"],
            "mask_input": mask_input,
            "multi_mask_output": multimask_output,
            "return_logits": True,
            "load_from_mask_input_cache": load_logits_from_cache,
            "save_to_mask_input_cache": save_logits_to_cache,
            "use_embeddings_cache": True,
        }

        prediction = None
        if image_id is not None:
            # Fast path: when embeddings are already cached there is no need to decode
            # or preprocess the image. Only the cache-miss message is swallowed, so
            # other ModelInputErrors (bad prompt shape, invalid hash usage) propagate
            # instead of silently triggering a re-decode.
            try:
                prediction = self._model.segment_with_visual_prompts(
                    images=None, image_hashes=image_id, **segment_kwargs
                )[0]
            except ModelInputError as cache_miss:
                if "no embeddings were found in the cache" not in str(cache_miss):
                    raise
                prediction = None
        if prediction is None:
            rgb_image = self.preproc_image(image)
            prediction = self._model.segment_with_visual_prompts(
                images=rgb_image, image_hashes=image_id, **segment_kwargs
            )[0]
        # SAM3Torch has already picked the most confident multimask proposal for each
        # prompt, so there is exactly one mask/score/logit entry per prompt. Reducing
        # again here would collapse a multi-prompt request into a single prediction.
        return (
            prediction.masks.cpu().numpy(),
            prediction.scores.cpu().numpy(),
            prediction.logits.cpu().numpy(),
        )

Functions:¶

`models/sam3_3d`¶

inference.models.sam3_3d.segment_anything_3d ¶

Classes¶

Sam3_3D_ObjectsPipelineSingleton ¶

Singleton to cache the heavy 3D pipeline initialization.

Source code in inference/models/sam3_3d/segment_anything_3d.py

class Sam3_3D_ObjectsPipelineSingleton:
    """Singleton to cache the heavy 3D pipeline initialization.

    One instance exists per `config_key`. Instances are kept in a
    `weakref.WeakValueDictionary`, so an entry lives only as long as some caller
    holds a strong reference to the returned instance.
    """

    # config_key -> instance, held weakly.
    _instances = weakref.WeakValueDictionary()
    # Guards lookup/creation so one key never gets two instances.
    _lock = Lock()

    def __new__(cls, config_key: str):
        """Return the instance registered for `config_key`, creating it on first use."""
        with cls._lock:
            # A single .get() yields a strong reference right away. Checking
            # membership and indexing separately could raise KeyError if the
            # last outside reference is dropped between the two steps.
            instance = cls._instances.get(config_key)
            if instance is None:
                instance = super().__new__(cls)
                instance.config_key = config_key
                cls._instances[config_key] = instance
            return instance

SegmentAnything3_3D_Objects ¶

Bases: RoboflowCoreModel

Source code in inference/models/sam3_3d/segment_anything_3d.py

class SegmentAnything3_3D_Objects(RoboflowCoreModel):
    """SAM3 3D-objects model: generates 3D outputs from an image and object mask(s).

    The pipeline object is shared through `Sam3_3D_ObjectsPipelineSingleton`, keyed
    by the device and the pipeline config path.
    """

    task_type = "3d-reconstruction"

    def __init__(
        self,
        *args,
        model_id: str = "sam3-3d-objects",
        torch_compile: bool = False,
        compile_res: int = 518,
        **kwargs,
    ):
        """Resolve checkpoint paths in the model cache and build (or reuse) the pipeline.

        Args:
            model_id: Model identifier passed to the parent constructor.
            torch_compile: Written to the pipeline config as "compile_model".
            compile_res: Written to the pipeline config as "compile_res".
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(model_id=model_id, **kwargs)

        self.cache_dir = Path(get_cache_dir(model_id=self.endpoint))

        tdfy_dir = files(tdfy.sam3d_v1)
        pipeline_config_path = tdfy_dir / "checkpoints_configs" / "pipeline.yaml"

        config_key = f"{DEVICE}_{pipeline_config_path}"
        singleton = Sam3_3D_ObjectsPipelineSingleton(config_key)
        # The singleton registry only holds weak references. Without this strong
        # reference the singleton would be dropped as soon as __init__ returns,
        # and the next model instance would rebuild the heavy pipeline.
        self._pipeline_singleton = singleton

        if not hasattr(singleton, "pipeline"):
            # Pipeline config field -> checkpoint file name inside the model cache dir.
            checkpoint_files = {
                "ss_generator_ckpt_path": "ss_generator.ckpt",
                "slat_generator_ckpt_path": "slat_generator.ckpt",
                "ss_decoder_ckpt_path": "ss_decoder.ckpt",
                "slat_decoder_gs_ckpt_path": "slat_decoder_gs.ckpt",
                "slat_decoder_gs_4_ckpt_path": "slat_decoder_gs_4.ckpt",
                "slat_decoder_mesh_ckpt_path": "slat_decoder_mesh.pt",
                "dinov2_ckpt_path": "dinov2_vitl14_reg4_pretrain.pth",
            }
            self.pipeline_config = OmegaConf.load(str(pipeline_config_path))
            self.pipeline_config["device"] = DEVICE
            self.pipeline_config["workspace_dir"] = str(tdfy_dir)
            self.pipeline_config["compile_model"] = torch_compile
            self.pipeline_config["compile_res"] = compile_res
            self.pipeline_config["depth_model"]["model"][
                "pretrained_model_name_or_path"
            ] = str(self.cache_dir / "moge-vitl.pth")
            for config_field, filename in checkpoint_files.items():
                self.pipeline_config[config_field] = str(self.cache_dir / filename)
            singleton.pipeline = instantiate(self.pipeline_config)

        # Reference the singleton's pipeline
        self.pipeline = singleton.pipeline
        # Serialises infer_from_request calls made on this model instance.
        self._state_lock = Lock()

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: File names of the checkpoints required for inference.
        """
        return [
            "moge-vitl.pth",
            "ss_generator.ckpt",
            "slat_generator.ckpt",
            "ss_decoder.ckpt",
            "slat_decoder_gs.ckpt",
            "slat_decoder_gs_4.ckpt",
            "slat_decoder_mesh.pt",
        ]

    def download_model_from_roboflow_api(self) -> None:
        """Override parent method to use streaming downloads for large SAM3_3D model files.

        Downloads run under a file lock (120 s timeout) named after the cache directory.

        Raises:
            ModelArtefactError: If the API response does not contain a `weights` key.
        """
        lock_dir = os.path.join(MODEL_CACHE_DIR, "_file_locks")
        os.makedirs(lock_dir, exist_ok=True)
        lock_file = os.path.join(lock_dir, f"{os.path.basename(self.cache_dir)}.lock")
        lock = FileLock(lock_file, timeout=120)
        with lock:
            api_data = get_roboflow_model_data(
                api_key=self.api_key,
                model_id="sam3-3d-weights-vc6vz/1",
                endpoint_type=ModelEndpointType.ORT,
                device_id=self.device_id,
                countinference=self.countinference,
                service_secret=self.service_secret,
            )["ort"]
            if "weights" not in api_data:
                raise ModelArtefactError(
                    "`weights` key not available in Roboflow API response while downloading model weights."
                )
            for weights_url in api_data["weights"].values():
                # File name is the last path segment of the URL, query string removed.
                filename = weights_url.split("?")[0].split("/")[-1]
                stream_url_to_cache(
                    url=weights_url,
                    filename=filename,
                    model_id=self.endpoint,
                )

    def infer_from_request(
        self, request: Sam3_3D_Objects_InferenceRequest
    ) -> Sam3_3D_Objects_Response:
        """Run `create_3d` for the request under the instance lock and wrap the timed result."""
        with self._state_lock:
            t1 = perf_counter()
            raw_result = self.create_3d(**request.model_dump())
            inference_time = perf_counter() - t1
            return convert_3d_objects_result_to_api_response(
                raw_result=raw_result,
                inference_time=inference_time,
            )

    def create_3d(
        self,
        image: Optional[InferenceRequestImage],
        mask_input: Optional[Any] = None,
        *,
        output_meshes: bool = True,
        output_scene: bool = True,
        with_mesh_postprocess: bool = True,
        with_texture_baking: bool = True,
        use_distillations: bool = False,
        **kwargs,
    ):
        """
        Generate 3D from image and mask(s).

        Args:
            image: Input image
            mask_input: Mask in any supported format:
                - np.ndarray (H,W) or (N,H,W): Binary mask(s)
                - List[float]: COCO polygon [x1,y1,x2,y2,...]
                - List[List[float]]: Multiple polygons
                - Dict with 'counts'/'size': RLE mask
                - List[Dict]: Multiple RLE masks
            output_meshes: Also decode the "mesh" format and return GLB output.
            output_scene: Build the combined scene outputs.
            with_mesh_postprocess: Forwarded to the pipeline run.
            with_texture_baking: Forwarded to the pipeline run.
            use_distillations: Forwarded as both the stage-1 and stage-2
                distillation flags of the pipeline run.
            **kwargs: Additional keyword arguments (not used by this method).

        Returns:
            Dict with keys "gs" and "glb" (scene-level outputs, possibly None) and
            "objects" (list of raw pipeline results, one per mask).

        Raises:
            ValueError: If `image` or `mask_input` is None.
        """
        with torch.inference_mode():
            if image is None or mask_input is None:
                raise ValueError("Must provide image and mask!")

            image_np = load_image_rgb(image)
            if image_np.dtype != np.uint8:
                # Values within [0, 1] are rescaled to 0-255; anything else is only cast.
                if image_np.max() <= 1:
                    image_np = (image_np * 255).astype(np.uint8)
                else:
                    image_np = image_np.astype(np.uint8)
            image_shape = (image_np.shape[0], image_np.shape[1])

            if _is_single_mask_input(mask_input):
                masks = [convert_mask_to_binary(mask_input, image_shape)]
            else:
                # Stacked (N,H,W) arrays and lists of masks are both converted per item.
                masks = [convert_mask_to_binary(m, image_shape) for m in mask_input]

            # NOTE: mesh depends on gaussian, so we always decode gaussian
            decode_formats = ["gaussian"]
            if output_meshes:
                decode_formats.append("mesh")

            outputs = []
            for mask in masks:
                result = self.pipeline.run(
                    image=image_np,
                    mask=mask,
                    decode_formats=decode_formats,
                    with_mesh_postprocess=with_mesh_postprocess,
                    with_texture_baking=with_texture_baking,
                    use_stage1_distillation=use_distillations,
                    use_stage2_distillation=use_distillations,
                )
                outputs.append(result)

            if len(outputs) == 1:
                result = outputs[0]
                scene_gs = (
                    ready_gaussian_for_video_rendering(result["gs"])
                    if output_scene
                    else None
                )
                glb = result["glb"] if output_meshes else None
                return {
                    "gs": scene_gs,
                    "glb": glb,
                    "objects": outputs,
                }
            else:
                if output_scene:
                    scene_gs = make_scene(*outputs)
                    scene_gs = ready_gaussian_for_video_rendering(scene_gs)
                    scene_gs = apply_gaussian_view_correction(scene_gs)
                    scene_glb = make_scene_glb(*outputs) if output_meshes else None
                else:
                    scene_gs = None
                    scene_glb = None
                return {
                    "gs": scene_gs,
                    "glb": scene_glb,
                    "objects": outputs,
                }

Methods:¶

create_3d ¶

create_3d(
    image,
    mask_input=None,
    *,
    output_meshes=True,
    output_scene=True,
    with_mesh_postprocess=True,
    with_texture_baking=True,
    use_distillations=False,
    **kwargs
)

Generate 3D from image and mask(s).

Parameters:

Name	Type	Description	Default
`image`	`Optional[InferenceRequestImage]`	Input image	required
`mask_input`	`Optional[Any]`	Mask in any supported format: - np.ndarray (H,W) or (N,H,W): Binary mask(s) - List[float]: COCO polygon [x1,y1,x2,y2,...] - List[List[float]]: Multiple polygons - Dict with 'counts'/'size': RLE mask - List[Dict]: Multiple RLE masks	`None`

Source code in inference/models/sam3_3d/segment_anything_3d.py

def create_3d(
    self,
    image: Optional[InferenceRequestImage],
    mask_input: Optional[Any] = None,
    *,
    output_meshes: bool = True,
    output_scene: bool = True,
    with_mesh_postprocess: bool = True,
    with_texture_baking: bool = True,
    use_distillations: bool = False,
    **kwargs,
):
    """
    Generate 3D from image and mask(s).

    The pipeline is run once per mask; the per-mask results are returned
    under "objects" together with an optional combined scene.

    Args:
        image: Input image
        mask_input: Mask in any supported format:
            - np.ndarray (H,W) or (N,H,W): Binary mask(s)
            - List[float]: COCO polygon [x1,y1,x2,y2,...]
            - List[List[float]]: Multiple polygons
            - Dict with 'counts'/'size': RLE mask
            - List[Dict]: Multiple RLE masks
        output_meshes: When True, "mesh" is decoded in addition to
            "gaussian" and a GLB is included in the result.
        output_scene: When True, a scene-level Gaussian is prepared for the
            result. With several masks, no scene GLB is built when this is
            False, even if `output_meshes` is True.
        with_mesh_postprocess: Forwarded unchanged to `self.pipeline.run`.
        with_texture_baking: Forwarded unchanged to `self.pipeline.run`.
        use_distillations: Forwarded to `self.pipeline.run` as both
            `use_stage1_distillation` and `use_stage2_distillation`.
        **kwargs: Accepted for call compatibility; not used by this method.

    Returns:
        dict with keys "gs" (scene Gaussian or None), "glb" (mesh / scene
        GLB or None) and "objects" (list of raw per-mask pipeline results).

    Raises:
        ValueError: If `image` or `mask_input` is None.
    """
    with torch.inference_mode():
        if image is None or mask_input is None:
            raise ValueError("Must provide image and mask!")

        image_np = load_image_rgb(image)
        if image_np.dtype != np.uint8:
            # Non-uint8 input whose maximum is <= 1 is rescaled by 255
            # before the cast; anything else is cast directly.
            if image_np.max() <= 1:
                image_np = image_np * 255
            image_np = image_np.astype(np.uint8)
        image_shape = (image_np.shape[0], image_np.shape[1])

        # A single mask is wrapped in a list; every other input (a 3-D
        # array or a list of masks) is iterated element by element.
        mask_items = [mask_input] if _is_single_mask_input(mask_input) else mask_input
        masks = [convert_mask_to_binary(item, image_shape) for item in mask_items]

        # NOTE: mesh depends on gaussian, so we always decode gaussian
        decode_formats = ["gaussian", "mesh"] if output_meshes else ["gaussian"]

        outputs = [
            self.pipeline.run(
                image=image_np,
                mask=mask,
                decode_formats=decode_formats,
                with_mesh_postprocess=with_mesh_postprocess,
                with_texture_baking=with_texture_baking,
                use_stage1_distillation=use_distillations,
                use_stage2_distillation=use_distillations,
            )
            for mask in masks
        ]

        if len(outputs) == 1:
            # Single object: its own Gaussian / GLB double as the scene.
            only_result = outputs[0]
            scene_gs = None
            if output_scene:
                scene_gs = ready_gaussian_for_video_rendering(only_result["gs"])
            scene_glb = only_result["glb"] if output_meshes else None
        elif output_scene:
            # Several objects: merge them, then apply the view correction
            # to the combined Gaussian scene.
            scene_gs = apply_gaussian_view_correction(
                ready_gaussian_for_video_rendering(make_scene(*outputs))
            )
            scene_glb = make_scene_glb(*outputs) if output_meshes else None
        else:
            scene_gs = None
            scene_glb = None

        return {
            "gs": scene_gs,
            "glb": scene_glb,
            "objects": outputs,
        }

download_model_from_roboflow_api ¶

download_model_from_roboflow_api()

Override parent method to use streaming downloads for large SAM3_3D model files.

Source code in inference/models/sam3_3d/segment_anything_3d.py

def download_model_from_roboflow_api(self) -> None:
    """Fetch the SAM3_3D weight files using streaming downloads.

    Overrides the parent method because the SAM3_3D model files are large.
    The metadata lookup and all downloads run while holding a file lock
    named after the basename of `self.cache_dir`, stored under
    `MODEL_CACHE_DIR/_file_locks` (lock timeout: 120 seconds).

    Raises:
        ModelArtefactError: If the API response has no `weights` entry.
    """
    locks_root = MODEL_CACHE_DIR + "/_file_locks"
    os.makedirs(locks_root, exist_ok=True)
    lock_path = os.path.join(locks_root, f"{os.path.basename(self.cache_dir)}.lock")
    with FileLock(lock_path, timeout=120):
        response = get_roboflow_model_data(
            api_key=self.api_key,
            model_id="sam3-3d-weights-vc6vz/1",
            endpoint_type=ModelEndpointType.ORT,
            device_id=self.device_id,
            countinference=self.countinference,
            service_secret=self.service_secret,
        )
        api_data = response["ort"]
        if "weights" not in api_data:
            raise ModelArtefactError(
                "`weights` key not available in Roboflow API response while downloading model weights."
            )
        for weights_url in api_data["weights"].values():
            # File name = last path segment of the URL, query string dropped.
            url_without_query = weights_url.partition("?")[0]
            stream_url_to_cache(
                url=weights_url,
                filename=url_without_query.rsplit("/", 1)[-1],
                model_id=self.endpoint,
            )

get_infer_bucket_file_list ¶

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name	Type	Description
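`list`	`list`	A list of the weight files required for inference: "moge-vitl.pth", "ss_generator.ckpt", "slat_generator.ckpt", "ss_decoder.ckpt", "slat_decoder_gs.ckpt", "slat_decoder_gs_4.ckpt" and "slat_decoder_mesh.pt".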

Source code in inference/models/sam3_3d/segment_anything_3d.py

def get_infer_bucket_file_list(self) -> list:
    """Get the list of required files for inference.

    Returns:
        list: A new list with the seven weight/checkpoint file names, e.g.
            "moge-vitl.pth" and "ss_generator.ckpt".
    """
    required_files = (
        "moge-vitl.pth",
        "ss_generator.ckpt",
        "slat_generator.ckpt",
        "ss_decoder.ckpt",
        "slat_decoder_gs.ckpt",
        "slat_decoder_gs_4.ckpt",
        "slat_decoder_mesh.pt",
    )
    # Hand back a fresh list on every call so callers may mutate it freely.
    return list(required_files)

Functions:¶

apply_gaussian_view_correction ¶

apply_gaussian_view_correction(scene_gs)

Apply view correction to Gaussian scene to match GLB orientation. Used for combined scene PLY.

Source code in inference/models/sam3_3d/segment_anything_3d.py

def apply_gaussian_view_correction(scene_gs):
    """
    Apply view correction to Gaussian scene to match GLB orientation.
    Used for combined scene PLY.

    The scene is updated in place: positions are right-multiplied by a fixed
    matrix that negates the x and z coordinates, and the quaternion of that
    same matrix is composed with every stored rotation. The same `scene_gs`
    object is returned.
    """
    positions = scene_gs.get_xyz

    # Built on the positions' device/dtype so the matmul below needs no casts.
    view_fix = torch.tensor(
        [[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]],
        device=positions.device,
        dtype=positions.dtype,
    )
    scene_gs.from_xyz(positions @ view_fix)

    fix_quat = matrix_to_quaternion(view_fix.unsqueeze(0)).squeeze(0)
    current_rotations = scene_gs.get_rotation
    # One copy of the correction quaternion per stored rotation.
    tiled_fix = fix_quat.unsqueeze(0).expand(current_rotations.shape[0], -1)
    scene_gs.from_rotation(quaternion_multiply(tiled_fix, current_rotations))

    return scene_gs

convert_mask_to_binary ¶

convert_mask_to_binary(mask_input, image_shape)

Convert polygon, RLE, or binary mask to binary mask (H, W) with values 0/255.

Source code in inference/models/sam3_3d/segment_anything_3d.py

def convert_mask_to_binary(mask_input: Any, image_shape: Tuple[int, int]) -> np.ndarray:
    """Convert polygon, RLE, or binary mask to binary mask (H, W) with values 0/255.

    Supported inputs are tried in this order: numpy array, PIL image
    (converted to mode "L"), dict with a "counts" key (RLE, decoded with
    pycocotools) and list (polygon). Array-like results are passed through
    `_normalize_binary_mask`; polygons are rasterized directly.

    Raises:
        ImportError: For RLE input when pycocotools is not available.
        TypeError: For any other input type.
    """
    height, width = image_shape

    if isinstance(mask_input, np.ndarray):
        return _normalize_binary_mask(mask_input, image_shape)

    if isinstance(mask_input, Image.Image):
        grayscale = np.array(mask_input.convert("L"))
        return _normalize_binary_mask(grayscale, image_shape)

    if isinstance(mask_input, dict) and "counts" in mask_input:
        if not PYCOCOTOOLS_AVAILABLE:
            raise ImportError(
                "pycocotools required for RLE. Install: pip install pycocotools"
            )
        # Work on a shallow copy so the caller's dict is left untouched.
        rle = dict(mask_input)
        counts = rle.get("counts")
        if isinstance(counts, str):
            rle["counts"] = counts.encode("utf-8")
        return _normalize_binary_mask(mask_utils.decode(rle), image_shape)

    if not isinstance(mask_input, list):
        raise TypeError(f"Unsupported mask type: {type(mask_input)}")

    points = _parse_polygon_to_points(mask_input)
    if not points or len(points) < 3:
        # Fewer than three points cannot enclose an area: return an empty mask.
        return np.zeros((height, width), dtype=np.uint8)
    canvas = Image.new("L", (width, height), 0)
    ImageDraw.Draw(canvas).polygon(points, outline=255, fill=255)
    return np.array(canvas, dtype=np.uint8)

make_scene_glb ¶

make_scene_glb(*outputs)

Combine multiple GLB meshes into a single scene. Applies layout transforms and a final view correction rotation.

Source code in inference/models/sam3_3d/segment_anything_3d.py

def make_scene_glb(*outputs):
    """
    Combine multiple GLB meshes into a single scene.
    Applies layout transforms and a final view correction rotation.

    Each output must provide "glb", "rotation", "translation" and "scale".
    Every GLB is copied before being transformed and is added to the scene
    under the node name "object_<index>".
    """
    scene = trimesh.Scene()

    for index, item in enumerate(outputs):
        placed = transform_glb_to_world(
            item["glb"].copy(),
            item["rotation"],
            item["translation"],
            item["scale"],
        )
        scene.add_geometry(placed, node_name=f"object_{index}")

    view_rotation = np.array([[-1, 0, 0], [0, 0, -1], [0, -1, 0]], dtype=np.float32)
    for mesh in scene.geometry.values():
        mesh.vertices = mesh.vertices.astype(np.float32) @ view_rotation

        # Normals are rotated as well, but only when the mesh carries any.
        if not hasattr(mesh, "vertex_normals"):
            continue
        normals = mesh.vertex_normals
        if normals is None or len(normals) == 0:
            continue
        mesh.vertex_normals = normals.astype(np.float32) @ view_rotation

    return scene

prepare_individual_object_for_export ¶

prepare_individual_object_for_export(gs)

Prepare an individual object Gaussian for PLY export.

Source code in inference/models/sam3_3d/segment_anything_3d.py

def prepare_individual_object_for_export(gs):
    """
    Prepare an individual object Gaussian for PLY export.

    Works on a deep copy of `gs`: the copy is passed through
    `ready_gaussian_for_video_rendering`, its positions are right-multiplied
    by a fixed rotation matrix, and the matching quaternion is composed with
    every stored rotation. The prepared copy is returned.
    """
    from copy import deepcopy

    export_gs = ready_gaussian_for_video_rendering(deepcopy(gs))

    positions = export_gs.get_xyz
    # Created on the positions' device/dtype so the matmul needs no casts.
    axis_fix = torch.tensor(
        [[1.0, 0.0, 0.0], [0.0, 0.0, -1.0], [0.0, 1.0, 0.0]],
        device=positions.device,
        dtype=positions.dtype,
    )
    export_gs.from_xyz(positions @ axis_fix)

    fix_quat = matrix_to_quaternion(axis_fix.unsqueeze(0)).squeeze(0)
    current_rotations = export_gs.get_rotation
    # One copy of the correction quaternion per stored rotation.
    tiled_fix = fix_quat.unsqueeze(0).expand(current_rotations.shape[0], -1)
    export_gs.from_rotation(quaternion_multiply(tiled_fix, current_rotations))

    return export_gs

transform_glb_to_world ¶

transform_glb_to_world(
    glb_mesh, rotation, translation, scale
)

Transform a GLB mesh from local to world coordinates.

Source code in inference/models/sam3_3d/segment_anything_3d.py

def transform_glb_to_world(glb_mesh, rotation, translation, scale):
    """
    Transform a GLB mesh from local to world coordinates.

    Vertices are converted from Y-up to Z-up, scaled, rotated by the
    normalized `rotation` quaternion, translated, and converted back to
    Y-up. Vertex normals, when present, get the same axis conversions and
    rotation but no scaling or translation. `glb_mesh` is modified in place
    and returned.

    Only the first component of `scale` is used (presumably the scale is
    uniform - confirm against the pipeline output).
    """
    unit_quat = rotation.squeeze()
    unit_quat = unit_quat / unit_quat.norm()
    layout_rotation = quaternion_to_matrix(unit_quat).cpu().numpy()
    offset = translation.squeeze().cpu().numpy()
    scale_factor = scale.squeeze().cpu().numpy()[0]

    to_y_up = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]], dtype=np.float32)
    to_z_up = np.array([[1, 0, 0], [0, 0, 1], [0, -1, 0]], dtype=np.float32)

    points = glb_mesh.vertices.copy().astype(np.float32)
    points = (points @ to_z_up) * scale_factor
    points = points @ layout_rotation + offset
    glb_mesh.vertices = points @ to_y_up

    has_normals = (
        hasattr(glb_mesh, "vertex_normals")
        and glb_mesh.vertex_normals is not None
        and len(glb_mesh.vertex_normals) > 0
    )
    if has_normals:
        normals = glb_mesh.vertex_normals.copy().astype(np.float32)
        # Directions only rotate: no scale, no translation.
        glb_mesh.vertex_normals = normals @ to_z_up @ layout_rotation @ to_y_up

    return glb_mesh

`models/vit`¶

inference.models.vit.vit_classification ¶

Classes¶

VitClassification ¶

Bases: ClassificationBaseOnnxRoboflowInferenceModel

VitClassification handles classification inference for Vision Transformer (ViT) models using ONNX.

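Inherits: ClassificationBaseOnnxRoboflowInferenceModel (base class for ONNX Roboflow classification inference).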

Attributes:

Name	Type	Description
`multiclass`	`bool`	A flag that specifies if the model should handle multiclass classification.

Source code in inference/models/vit/vit_classification.py

class VitClassification(ClassificationBaseOnnxRoboflowInferenceModel):
    """ONNX-based classification inference for Vision Transformer (ViT) models.

    Inherits:
        ClassificationBaseOnnxRoboflowInferenceModel: Base class for ONNX
            Roboflow classification inference.

    Attributes:
        multiclass (bool): A flag that specifies if the model should handle multiclass classification.
        preprocess_means (list): Per-channel means, 0.5 for each of three channels.
        preprocess_stds (list): Per-channel standard deviations, 0.5 for each of three channels.
    """

    preprocess_means = [0.5, 0.5, 0.5]
    preprocess_stds = [0.5, 0.5, 0.5]

    def __init__(self, *args, **kwargs):
        """Initializes the VitClassification instance.

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            **kwargs: Keyword arguments forwarded to the parent initializer.
        """
        super().__init__(*args, **kwargs)
        # A missing "MULTICLASS" entry leaves the flag at False.
        environment = self.environment
        self.multiclass = environment.get("MULTICLASS", False)

    @property
    def weights_file(self) -> str:
        """Name of the weights file to load.

        Returns 'weights.onnx' only when AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY and LAMBDA are all truthy; otherwise returns
        'best.onnx'.

        Returns:
            str: The weights file name.
        """
        use_lambda_weights = AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and LAMBDA
        return "weights.onnx" if use_lambda_weights else "best.onnx"

Attributes¶

weights_file `property` ¶

weights_file

Determines the weights file to be used based on the availability of AWS keys.

If AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and LAMBDA are all set, it returns 'weights.onnx'. Otherwise, it returns 'best.onnx'.

Returns:

Name	Type	Description
`str`	`str`	Path to the weights file.

Methods:¶

`__init__` ¶

__init__(*args, **kwargs)

Initializes the VitClassification instance.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list.	`()`
`**kwargs`		Arbitrary keyword arguments.	`{}`

Source code in inference/models/vit/vit_classification.py

def __init__(self, *args, **kwargs):
    """Initializes the VitClassification instance.

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        **kwargs: Keyword arguments forwarded to the parent initializer.
    """
    super().__init__(*args, **kwargs)
    # A missing "MULTICLASS" entry leaves the flag at False.
    environment = self.environment
    self.multiclass = environment.get("MULTICLASS", False)

`models/vllm_proxy`¶

inference.models.vllm_proxy.adapter_manager ¶

Resolution + registration of Roboflow models against the vLLM sidecar.

Maps a Roboflow model_id to the name it is served under in vLLM:

Base model ids (matching VLLM_SERVED_BASE_VARIANT or VLLM_SERVED_BASE_NAME) require no registration - vLLM already serves the base model.
Registry metadata is ADVISORY for fine-tunes: modelArchitecture gates pre-download (clearly-unsupported families like florence never download), but modelVariant is sometimes misregistered (e.g. image-text/223 was recorded as 0.8b-peft while its adapter_config.json declared qwen/qwen3_5-2b), so a variant mismatch against VLLM_SERVED_BASE_VARIANT only logs a warning. The AUTHORITATIVE accept/reject is the adapter's own adapter_config.json base_model_name_or_path, cross-checked against the served base inside patch_adapter (cross_check_base_model).
Fine-tunes resolve their model package via the Roboflow weights provider (the api_key is passed through so registry-side access control applies), download ONLY the adapter artifacts, run patch_adapter, and register the patched adapter with vLLM's dynamic LoRA endpoint.

Cache keys combine model_id + package_id + a content digest because package ids are NOT unique per model version.

Registration semantics (multi-process): several gunicorn workers each hold their own AdapterManager but instruct ONE shared vLLM engine, so the per-process map can go stale (worker recycles, vLLM restarts). The map is therefore only trusted to skip the expensive download/patch work; the cheap, idempotent load_lora_adapter call is ALWAYS re-issued so vLLM remains the source of truth. The manager never unloads adapters: vLLM's own --max-cpu-loras LRU bounds host memory and refills from disk, and disk growth is handled outside (pod recycle / future GC). VLLM_MAX_REGISTERED_ADAPTERS is a warn-only threshold.

Classes¶

AdapterManager ¶

Thread-safe, idempotent adapter registration (registration-only).

Never unloads adapters from vLLM - the engine's own --max-cpu-loras LRU bounds memory. The local map only short-circuits download/patch work; the idempotent vLLM registration call is always re-issued.

Source code in inference/models/vllm_proxy/adapter_manager.py

class AdapterManager:
    """Thread-safe, idempotent adapter registration (registration-only).

    Never unloads adapters from vLLM - the engine's own `--max-cpu-loras`
    LRU bounds memory. The local map only short-circuits download/patch
    work; the idempotent vLLM registration call is always re-issued.

    Locking: `self._lock` guards the in-process registration map, and a
    per-adapter `FileLock` inside the adapter's cache directory guards the
    on-disk download/patch/load sequence.
    """

    def __init__(self, client: Optional[VLLMClient] = None):
        """Creates a manager with an empty registration map.

        Args:
            client: vLLM client to use; a new `VLLMClient()` is created when
                this is None (or otherwise falsy).
        """
        self._client = client or VLLMClient()
        # Guards every read and write of `self._registered`.
        self._lock = threading.Lock()
        # Served name (slug) -> registration record for this process only.
        self._registered: Dict[str, AdapterRegistration] = {}

    @property
    def client(self) -> VLLMClient:
        """The vLLM client this manager issues adapter load calls through."""
        return self._client

    def resolve_and_register(
        self,
        model_id: str,
        api_key: Optional[str] = None,
        weights_provider_extra_headers: Optional[Dict[str, str]] = None,
    ) -> str:
        """Resolves `model_id` to the name it is served under in vLLM.

        Base model ids return the served base name without registration.
        Fine-tunes are downloaded, patched and registered. Idempotent, with
        a multi-process twist: when the slug is already in the local map and
        its patched dir is still on disk, the expensive download/patch work
        is skipped but the cheap `load_lora_adapter` call is ALWAYS
        re-issued - the per-process map may be stale relative to the shared
        vLLM engine (another worker's actions, vLLM restarts), and vLLM
        treats re-registration as success.

        Registry `modelArchitecture` gates pre-download (unsupported families
        are rejected before any artifact download); registry `modelVariant`
        is advisory only - a mismatch against the served base logs a warning
        and defers to the adapter's own `adapter_config.json`
        `base_model_name_or_path`, cross-checked inside `patch_adapter`.

        Args:
            model_id: Roboflow model id to resolve.
            api_key: Passed through to the weights provider lookup.
            weights_provider_extra_headers: Passed through to the weights
                provider lookup.

        Returns:
            The served base name for base models (and for packages without
            an adapter config file), otherwise the adapter slug.

        Raises:
            NotServableOnVLLMError: If the model's architecture is not
                supported by the vLLM proxy, or no HF package is exposed.
            AdapterNotServableError: If the adapter's declared base does not
                match the served base, or it cannot be patched into a
                vLLM-servable form.
        """
        served_base_variant = get_vllm_served_base_variant()
        served_base_name = get_vllm_served_base_name()
        # Base-model ids short-circuit: ids equal to the configured variant
        # (qwen3_5-0.8b) or to the served base name (qwen3vl-2b-instruct)
        # require no registration - vLLM already serves the base model.
        if model_id.lower() in {
            served_base_variant.lower(),
            served_base_name.lower(),
        }:
            return served_base_name
        metadata = get_model_from_provider(
            model_id=model_id,
            provider="roboflow",
            api_key=api_key,
            weights_provider_extra_headers=weights_provider_extra_headers,
        )
        # A missing architecture is treated as "" and therefore rejected too.
        if (
            not (metadata.model_architecture or "")
            .lower()
            .startswith(SUPPORTED_MODEL_ARCHITECTURES)
        ):
            raise NotServableOnVLLMError(
                f"Model {model_id} has architecture "
                f"{metadata.model_architecture!r} which is not servable by the "
                f"vLLM proxy (expected one of {SUPPORTED_MODEL_ARCHITECTURES!r})."
            )
        normalized_variant = normalize_base_variant(
            model_architecture=metadata.model_architecture,
            model_variant=metadata.model_variant,
        )
        registry_variant_matches = normalized_variant == served_base_variant.lower()
        if not registry_variant_matches:
            # ADVISORY ONLY: registry modelVariant is sometimes misregistered
            # (e.g. image-text/223 recorded as 0.8b-peft while its
            # adapter_config.json declared qwen/qwen3_5-2b). The adapter's
            # own adapter_config.json is authoritative - the cross-check in
            # `patch_adapter` accepts/rejects after download.
            logger.warning(
                "Registry variant %r for model %s (architecture %r, variant "
                "%r) does not match served base %r - deferring to "
                "adapter_config.json.",
                normalized_variant,
                model_id,
                metadata.model_architecture,
                metadata.model_variant,
                served_base_variant,
            )
        package = self._select_model_package(model_id=model_id, metadata=metadata)
        # Keep only adapter artifacts: anything under the base-package
        # directory prefix is left out (and so never downloaded).
        adapter_files = [
            artefact
            for artefact in package.package_artefacts
            if not artefact.file_handle.startswith(BASE_PACKAGE_DIR_PREFIX)
        ]
        if not any(
            artefact.file_handle == ADAPTER_CONFIG_FILE for artefact in adapter_files
        ):
            # No adapter artifacts - this is a base-model package of the
            # served variant.
            return get_vllm_served_base_name()
        content_digest = self._compute_content_digest(adapter_files=adapter_files)
        slug = self._build_slug(
            model_id=metadata.model_id,
            package_id=package.package_id,
            content_digest=content_digest,
        )
        # The in-process lock is held for the whole check + download/patch/
        # load sequence, so registrations in this process are serialized.
        with self._lock:
            existing = self._registered.get(slug)
            if existing is not None and self._try_load_existing_registration(existing):
                # Only the expensive download/patch work was skipped here.
                # The vLLM registration call has already been re-issued
                # inside `_try_load_existing_registration`: this process's
                # map may be stale (shared engine, NUM_WORKERS>1) and the
                # call is idempotent and ~ms with files already on disk.
                return slug
            registration = self._download_patch_and_load(
                slug=slug,
                model_id=metadata.model_id,
                package_id=package.package_id,
                content_digest=content_digest,
                adapter_files=adapter_files,
                registry_variant=metadata.model_variant,
                registry_variant_matches=registry_variant_matches,
            )
            self._registered[slug] = registration
            self._warn_if_over_max_registered()
        return slug

    def invalidate(self, served_name: str) -> None:
        """Drops `served_name` from the local registration map.

        Used by the request-path self-heal when vLLM reports the adapter
        unknown despite local bookkeeping (vLLM restart, desync across
        gunicorn workers): the next `resolve_and_register` re-runs the full
        path (files already on disk make it near-instant).

        Unknown names are ignored; nothing is unloaded from vLLM and nothing
        is removed from disk.
        """
        with self._lock:
            self._registered.pop(served_name, None)

    def get_registration(self, served_name: str) -> Optional[AdapterRegistration]:
        """Returns the local registration for `served_name`, or None if absent."""
        with self._lock:
            return self._registered.get(served_name)

    def _select_model_package(self, model_id: str, metadata):
        """Returns the first model package whose backend is `BackendType.HF`.

        Args:
            model_id: Used only in the error message.
            metadata: Provider metadata exposing `model_packages`.

        Raises:
            NotServableOnVLLMError: If the model exposes no HF package.
        """
        hf_packages = [
            package
            for package in metadata.model_packages
            if package.backend is BackendType.HF
        ]
        if not hf_packages:
            raise NotServableOnVLLMError(
                f"Model {model_id} exposes no HF model package - the vLLM "
                "proxy requires HF (PEFT adapter) packages."
            )
        return hf_packages[0]

    @staticmethod
    def _compute_content_digest(adapter_files) -> str:
        """Digest over the adapter artifact hashes (no download required).

        Package ids are not unique per model version, so the digest
        disambiguates cache entries when package content changes.

        Artifacts are processed in `file_handle` order so the result does
        not depend on listing order. For each one, the file handle and its
        md5 hash (or its download URL when no hash is set) are fed into
        SHA-256; the first 8 hex characters are returned.
        """
        digest = hashlib.sha256()
        for artefact in sorted(adapter_files, key=lambda a: a.file_handle):
            digest.update(artefact.file_handle.encode("utf-8"))
            digest.update((artefact.md5_hash or artefact.download_url).encode("utf-8"))
        return digest.hexdigest()[:8]

    @staticmethod
    def _build_slug(model_id: str, package_id: str, content_digest: str) -> str:
        """Builds the served name `<model_id>-<package_id>-<content_digest>`.

        The model id and package id are passed through `sanitize_for_slug`;
        the digest is appended as given.
        """
        return (
            f"{sanitize_for_slug(model_id)}-{sanitize_for_slug(package_id)}"
            f"-{content_digest}"
        )

    def _download_patch_and_load(
        self,
        slug: str,
        model_id: str,
        package_id: str,
        content_digest: str,
        adapter_files,
        registry_variant: Optional[str] = None,
        registry_variant_matches: bool = True,
    ) -> AdapterRegistration:
        """Ensures the patched adapter exists on disk, then loads it into vLLM.

        Everything runs under the adapter's cache-directory file lock. If
        the patched directory is not ready, the adapter artifacts are
        downloaded into `<cache>/src` and patched into `<cache>/patched`.
        The vLLM load call is issued in every case, including when the
        patched files were already present.

        Args:
            slug: Served name; also the cache sub-directory name.
            model_id: Model id passed to `patch_adapter` and used in logs.
            package_id: Recorded in the registration and logs.
            content_digest: Recorded in the registration.
            adapter_files: Artifacts to download (file handle, URL, md5).
            registry_variant: Registry variant passed to `patch_adapter`.
            registry_variant_matches: Whether the registry variant matched
                the served base; a False value combined with a matching
                base-model check triggers the drift warning.

        Returns:
            The `AdapterRegistration` describing the loaded adapter.

        Raises:
            NotServableOnVLLMError: If patching is needed and the DoRA
                policy is "svd".
        """
        adapter_cache_dir = os.path.join(
            get_model_cache_dir(), ADAPTERS_CACHE_SUBDIR, slug
        )
        source_dir = os.path.join(adapter_cache_dir, "src")
        patched_dir = os.path.join(adapter_cache_dir, "patched")
        # The directory must exist before the lock file can be created in it.
        os.makedirs(adapter_cache_dir, exist_ok=True)
        with self._adapter_cache_lock(adapter_cache_dir):
            if not self._patched_adapter_ready(patched_dir):
                dora_policy = get_vllm_dora_policy()
                if dora_policy == "svd":
                    raise NotServableOnVLLMError(
                        "VLLM_DORA_POLICY=svd is not supported in the runtime "
                        "adapter manager: the manager downloads adapter-only "
                        "artifacts and intentionally prunes base/ weights. Use "
                        "VLLM_DORA_POLICY=strip or reject at runtime, or run "
                        "offline SVD conversion with an explicit base_dir."
                    )
                os.makedirs(source_dir, exist_ok=True)
                download_files_to_directory(
                    target_dir=source_dir,
                    files_specs=[
                        (artefact.file_handle, artefact.download_url, artefact.md5_hash)
                        for artefact in adapter_files
                    ],
                    verbose=False,
                )
                report = patch_adapter(
                    src_dir=source_dir,
                    dst_dir=patched_dir,
                    policy=dora_policy,
                    model_id=model_id,
                    registry_variant=registry_variant,
                )
                if (
                    not registry_variant_matches
                    and report.base_model_check == BASE_MODEL_CHECK_MATCH
                ):
                    # Drift audit: the adapter is servable here (its own config
                    # matches the served base) but the registry disagrees - flag the
                    # registry record for correction.
                    logger.warning(
                        "Registry modelVariant misregistered for %s: registry says "
                        "%r, adapter declares %r.",
                        model_id,
                        registry_variant,
                        report.base_model_name_or_path,
                    )
            # Issued whether or not the patch step ran, still under the file lock.
            self._load_adapter_into_vllm(
                slug=slug, model_id=model_id, patched_dir=patched_dir
            )
        logger.info(
            "Registered LoRA adapter %s (model_id=%s, package_id=%s) with vLLM",
            slug,
            model_id,
            package_id,
        )
        return AdapterRegistration(
            served_name=slug,
            model_id=model_id,
            package_id=package_id,
            content_digest=content_digest,
            source_dir=source_dir,
            patched_dir=patched_dir,
        )

    def _try_load_existing_registration(
        self, registration: AdapterRegistration
    ) -> bool:
        """Re-issues the vLLM load for an already-known registration.

        Runs under the adapter's cache-directory file lock.

        Returns:
            False, without contacting vLLM, when the patched adapter files
            are no longer complete on disk; True after the load call has
            been issued.
        """
        adapter_cache_dir = os.path.dirname(registration.patched_dir)
        # Recreate the directory if it vanished so the lock file can be created.
        os.makedirs(adapter_cache_dir, exist_ok=True)
        with self._adapter_cache_lock(adapter_cache_dir):
            if not self._patched_adapter_ready(registration.patched_dir):
                return False
            self._load_adapter_into_vllm(
                slug=registration.served_name,
                model_id=registration.model_id,
                patched_dir=registration.patched_dir,
            )
            return True

    @staticmethod
    def _adapter_cache_lock(adapter_cache_dir: str) -> FileLock:
        """Returns a `FileLock` on the lock file inside `adapter_cache_dir`.

        The lock is only constructed here; callers acquire it with `with`.
        """
        return FileLock(
            os.path.join(adapter_cache_dir, ADAPTER_CACHE_LOCK_FILE),
            timeout=ADAPTER_CACHE_LOCK_TIMEOUT_SECONDS,
        )

    @staticmethod
    def _patched_adapter_ready(patched_dir: str) -> bool:
        """Returns True when `patched_dir` exists and holds every required file.

        The required file names come from `PATCHED_ADAPTER_REQUIRED_FILES`;
        only their presence is checked, not their content.
        """
        return os.path.isdir(patched_dir) and all(
            os.path.isfile(os.path.join(patched_dir, file_name))
            for file_name in PATCHED_ADAPTER_REQUIRED_FILES
        )

    def _load_adapter_into_vllm(
        self, slug: str, model_id: str, patched_dir: str
    ) -> None:
        """Loads the patched adapter, surfacing 5xx load failures as typed errors.

        vLLM returns HTTP 500 when an adapter passes local validation but is
        rejected at load time (e.g. tensor-shape mismatch against the served
        base) - re-raised as `AdapterNotServableError` naming the adapter and
        excerpting vLLM's response so on-call sees the real cause instead of
        an opaque proxy 500. Connection errors (`VLLMConnectionError`) and
        non-5xx HTTP errors propagate unchanged: they indicate sidecar /
        request problems, not a broken adapter, and stay retryable.

        Raises:
            AdapterNotServableError: On a `VLLMHTTPError` with status >= 500.
            VLLMHTTPError: Re-raised unchanged for status codes below 500.
        """
        try:
            self._client.load_lora_adapter(name=slug, path=patched_dir)
        except VLLMHTTPError as error:
            if error.status_code < 500:
                raise
            # Cap the quoted response at 500 characters to keep the message readable.
            body_excerpt = (error.response_body or "").strip()[:500]
            raise AdapterNotServableError(
                f"vLLM rejected LoRA adapter {slug} (model_id={model_id}) at "
                f"load time with HTTP {error.status_code}. The adapter passed "
                f"local validation but could not be loaded into the served "
                f"base model - vLLM said: {body_excerpt!r}"
            ) from error

    def _warn_if_over_max_registered(self) -> None:
        """Warn-only threshold - the manager NEVER unloads adapters.

        With NUM_WORKERS>1 every gunicorn worker instructs the same shared
        vLLM engine; unloading from one worker's bookkeeping would yank
        adapters other workers still serve. vLLM's own `--max-cpu-loras`
        LRU bounds host memory and refills from disk; disk growth is
        handled outside (pod recycle / future GC).

        Reads `self._registered` without taking `self._lock`; the only call
        site in this class already holds it.
        """
        max_registered = get_vllm_max_registered_adapters()
        if len(self._registered) > max_registered:
            logger.warning(
                "Registered LoRA adapter count %d exceeds "
                "VLLM_MAX_REGISTERED_ADAPTERS=%d. No adapter is unloaded "
                "(vLLM's --max-cpu-loras LRU bounds memory); consider "
                "recycling the pod if disk growth becomes a concern.",
                len(self._registered),
                max_registered,
            )

Methods:¶

invalidate ¶

invalidate(served_name)

Drops served_name from the local registration map.

Used by the request-path self-heal when vLLM reports the adapter unknown despite local bookkeeping (vLLM restart, desync across gunicorn workers): the next resolve_and_register re-runs the full path (files already on disk make it near-instant).

Source code in inference/models/vllm_proxy/adapter_manager.py

def invalidate(self, served_name: str) -> None:
    """Forget the local registration recorded under `served_name`.

    Called by the request-path self-heal when vLLM reports an adapter as
    unknown although this process believes it is registered (vLLM restart,
    desync across gunicorn workers). After the entry is removed, the next
    `resolve_and_register` call re-runs the full path; with the files
    already on disk that is near-instant.

    Unknown names are ignored.
    """
    with self._lock:
        # The membership test and the delete both happen under the lock.
        if served_name in self._registered:
            del self._registered[served_name]

resolve_and_register ¶

resolve_and_register(
    model_id,
    api_key=None,
    weights_provider_extra_headers=None,
)

Resolves model_id to the name it is served under in vLLM.

Base model ids return the served base name without registration. Fine-tunes are downloaded, patched and registered. Idempotent, with a multi-process twist: when the slug is already in the local map and its patched dir is still on disk, the expensive download/patch work is skipped but the cheap load_lora_adapter call is ALWAYS re-issued - the per-process map may be stale relative to the shared vLLM engine (another worker's actions, vLLM restarts), and vLLM treats re-registration as success.

Registry modelArchitecture gates pre-download (unsupported families are rejected before any artifact download); registry modelVariant is advisory only - a mismatch against the served base logs a warning and defers to the adapter's own adapter_config.json base_model_name_or_path, cross-checked inside patch_adapter.

Raises:

Type	Description
`NotServableOnVLLMError`	If the model's architecture is not supported by the vLLM proxy, or no HF package is exposed.
`AdapterNotServableError`	If the adapter's declared base does not match the served base, or it cannot be patched into a vLLM-servable form.

Source code in inference/models/vllm_proxy/adapter_manager.py

def resolve_and_register(
    self,
    model_id: str,
    api_key: Optional[str] = None,
    weights_provider_extra_headers: Optional[Dict[str, str]] = None,
) -> str:
    """Return the name `model_id` is served under in vLLM, registering it if needed.

    Base model ids resolve straight to the served base name and trigger no
    registration. Fine-tunes are downloaded, patched and registered. The
    call is idempotent, with a multi-process caveat: if the slug is already
    in the local map and its patched directory still exists, the expensive
    download/patch step is skipped, but the cheap `load_lora_adapter` call
    is ALWAYS issued again. The per-process map can be stale relative to
    the shared vLLM engine (other workers' actions, vLLM restarts), and
    vLLM treats re-registration as success.

    The registry `modelArchitecture` is a hard gate applied before any
    artifact download. The registry `modelVariant` is advisory only: a
    mismatch against the served base is logged and the decision is deferred
    to the adapter's own `adapter_config.json` `base_model_name_or_path`,
    which is cross-checked inside `patch_adapter`.

    Args:
        model_id: Roboflow model id, or a base-model id.
        api_key: Forwarded to the weights provider.
        weights_provider_extra_headers: Forwarded to the weights provider.

    Returns:
        The served base name for base models and base-only packages,
        otherwise the slug the adapter is registered under.

    Raises:
        NotServableOnVLLMError: If the model's architecture is not
            supported by the vLLM proxy, or no HF package is exposed.
        AdapterNotServableError: If the adapter's declared base does not
            match the served base, or it cannot be patched into a
            vLLM-servable form.
    """
    served_base_variant = get_vllm_served_base_variant()
    served_base_name = get_vllm_served_base_name()

    # Ids equal to the configured variant (qwen3_5-0.8b) or to the served
    # base name (qwen3vl-2b-instruct) are already served by vLLM as the
    # base model, so there is nothing to register.
    base_model_ids = {served_base_variant.lower(), served_base_name.lower()}
    if model_id.lower() in base_model_ids:
        return served_base_name

    metadata = get_model_from_provider(
        model_id=model_id,
        provider="roboflow",
        api_key=api_key,
        weights_provider_extra_headers=weights_provider_extra_headers,
    )

    # Hard gate: unsupported architecture families are rejected before
    # anything is downloaded.
    architecture = (metadata.model_architecture or "").lower()
    if not architecture.startswith(SUPPORTED_MODEL_ARCHITECTURES):
        raise NotServableOnVLLMError(
            f"Model {model_id} has architecture "
            f"{metadata.model_architecture!r} which is not servable by the "
            f"vLLM proxy (expected one of {SUPPORTED_MODEL_ARCHITECTURES!r})."
        )

    normalized_variant = normalize_base_variant(
        model_architecture=metadata.model_architecture,
        model_variant=metadata.model_variant,
    )
    registry_variant_matches = normalized_variant == served_base_variant.lower()
    if not registry_variant_matches:
        # ADVISORY ONLY: the registry modelVariant is sometimes wrong (e.g.
        # image-text/223 was recorded as 0.8b-peft while its
        # adapter_config.json declared qwen/qwen3_5-2b). The adapter's own
        # adapter_config.json is authoritative; `patch_adapter` performs the
        # accept/reject cross-check after download.
        logger.warning(
            "Registry variant %r for model %s (architecture %r, variant "
            "%r) does not match served base %r - deferring to "
            "adapter_config.json.",
            normalized_variant,
            model_id,
            metadata.model_architecture,
            metadata.model_variant,
            served_base_variant,
        )

    package = self._select_model_package(model_id=model_id, metadata=metadata)
    # Everything outside the base-weights directory counts as adapter material.
    adapter_files = [
        artefact
        for artefact in package.package_artefacts
        if not artefact.file_handle.startswith(BASE_PACKAGE_DIR_PREFIX)
    ]
    has_adapter_config = any(
        artefact.file_handle == ADAPTER_CONFIG_FILE for artefact in adapter_files
    )
    if not has_adapter_config:
        # No adapter artifacts: this is a base-model package of the served
        # variant.
        return get_vllm_served_base_name()

    content_digest = self._compute_content_digest(adapter_files=adapter_files)
    slug = self._build_slug(
        model_id=metadata.model_id,
        package_id=package.package_id,
        content_digest=content_digest,
    )
    with self._lock:
        known = self._registered.get(slug)
        # A known registration only lets us skip the expensive download and
        # patch work. `_try_load_existing_registration` still issues the
        # vLLM registration call, because this process's map may be stale
        # (shared engine, NUM_WORKERS>1); that call is idempotent and takes
        # ~ms with the files already on disk.
        if known is None or not self._try_load_existing_registration(known):
            self._registered[slug] = self._download_patch_and_load(
                slug=slug,
                model_id=metadata.model_id,
                package_id=package.package_id,
                content_digest=content_digest,
                adapter_files=adapter_files,
                registry_variant=metadata.model_variant,
                registry_variant_matches=registry_variant_matches,
            )
            self._warn_if_over_max_registered()
    return slug

AdapterRegistration `dataclass` ¶

Bookkeeping record for an adapter registered with vLLM.

Source code in inference/models/vllm_proxy/adapter_manager.py

@dataclass
class AdapterRegistration:
    """Bookkeeping record for an adapter registered with vLLM.

    Instances are the values of the adapter manager's local registration
    map. The code that constructs them is not shown alongside this class, so
    the per-field notes below are inferred from the field names and should
    be confirmed against that construction site.
    """

    # presumably the name (slug) the adapter is served under in vLLM - TODO confirm
    served_name: str
    # presumably the Roboflow model id the adapter belongs to - TODO confirm
    model_id: str
    # presumably the id of the model package the adapter files came from - TODO confirm
    package_id: str
    # presumably the digest computed over the adapter artefacts - TODO confirm
    content_digest: str
    # presumably the directory holding the downloaded (unpatched) adapter - TODO confirm
    source_dir: str
    # presumably the directory holding the patched adapter handed to vLLM - TODO confirm
    patched_dir: str

Functions:¶

get_adapter_manager ¶

get_adapter_manager()

Returns the process-wide AdapterManager singleton.

Source code in inference/models/vllm_proxy/adapter_manager.py

def get_adapter_manager() -> AdapterManager:
    """Return the process-wide `AdapterManager`, creating it on first use.

    The lookup and the lazy construction both happen while holding
    `_ADAPTER_MANAGER_LOCK`.
    """
    global _ADAPTER_MANAGER
    with _ADAPTER_MANAGER_LOCK:
        manager = _ADAPTER_MANAGER
        if manager is None:
            manager = AdapterManager()
            _ADAPTER_MANAGER = manager
    return manager

normalize_base_variant ¶

normalize_base_variant(model_architecture, model_variant)

Maps registry metadata to the canonical <architecture>-<variant> form.

VLLM_SERVED_BASE_VARIANT is configured as <architecture>-<variant-with-peft-suffix-stripped> (e.g. qwen3_5-0.8b, qwen3vl-2b). Registry metadata is inconsistent across families: qwen3_5 models report variants that already carry the architecture prefix (qwen3_5-0.8b), while qwen3vl fine-tunes report bare variants (2b-peft). Normalisation: lowercase, strip a trailing -peft (a fine-tune of base X is servable on the pool serving X), then prefix with <architecture>- unless the variant already starts with it.

Returns None when either field is missing.

Source code in inference/models/vllm_proxy/adapter_manager.py

def normalize_base_variant(
    model_architecture: Optional[str], model_variant: Optional[str]
) -> Optional[str]:
    """Build the canonical `<architecture>-<variant>` string from registry metadata.

    `VLLM_SERVED_BASE_VARIANT` is configured as
    `<architecture>-<variant-with-peft-suffix-stripped>` (e.g.
    `qwen3_5-0.8b`, `qwen3vl-2b`), but registry metadata is not uniform
    across families: qwen3_5 models report variants that already include
    the architecture prefix (`qwen3_5-0.8b`), whereas qwen3vl fine-tunes
    report bare variants (`2b-peft`).

    Steps: strip surrounding whitespace and lowercase both fields; remove a
    trailing `PEFT_VARIANT_SUFFIX` from the variant (a fine-tune of base X
    is servable on the pool serving X); then prepend `<architecture>-`
    unless the variant already equals the architecture or starts with
    `<architecture>-`.

    Returns:
        The normalised string, or None when either field is missing/empty.
    """
    if not (model_architecture and model_variant):
        return None
    arch = model_architecture.strip().lower()
    normalized = model_variant.strip().lower()
    if normalized.endswith(PEFT_VARIANT_SUFFIX):
        normalized = normalized[: -len(PEFT_VARIANT_SUFFIX)]
    already_prefixed = normalized == arch or normalized.startswith(f"{arch}-")
    return normalized if already_prefixed else f"{arch}-{normalized}"

sanitize_for_slug ¶

sanitize_for_slug(value)

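Lowercases a value and reduces it to the characters [a-z0-9_-] for cache/served names: any run of other characters becomes a single hyphen, any run of two or more underscores/hyphens collapses to a single hyphen, and leading/trailing hyphens are stripped (single underscores are kept).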

Source code in inference/models/vllm_proxy/adapter_manager.py

def sanitize_for_slug(value: str) -> str:
    """Lowercase `value` and reduce it to `[a-z0-9_-]` for cache/served names.

    Every run of characters outside `[a-z0-9_-]` becomes a single hyphen,
    every run of two or more underscores/hyphens collapses to one hyphen,
    and leading/trailing hyphens are stripped. A single underscore is kept
    as-is, including at either end of the result.
    """
    lowered = value.lower()
    replaced = re.sub(r"[^a-z0-9_-]+", "-", lowered)
    collapsed = re.sub(r"[_-]{2,}", "-", replaced)
    return collapsed.strip("-")

inference.models.vllm_proxy.adapter_patch ¶

Deterministic transform of Roboflow LoRA adapter packages into a form vLLM accepts.

Roboflow model packages for qwen3_5 fine-tunes contain adapter_config.json and adapter_model.safetensors produced by PEFT during training. Two issues prevent serving them directly with vLLM's dynamic LoRA loading:

Weight-key layout - Roboflow adapters store language-model keys like ...model.layers.N..., while the qwen3_5 VL architecture names them ...model.language_model.layers.N... (see refactor_adapter_weights_key in inference_models/models/qwen3_5/qwen3_5_hf.py, which performs the equivalent remap when loading via PEFT in-process). The exact layout vLLM v0.22.1 accepts is empirically unconfirmed, so the remap target is a configurable template (VLLM_ADAPTER_KEY_TEMPLATE).
DoRA - production adapters use use_dora: true; stock vLLM may reject DoRA adapters. The policy parameter controls handling: reject (default), strip (drop magnitude vectors), or svd (convert DoRA to a plain LoRA via the real merge math + truncated SVD - requires base weights).

This module is intentionally limited to pure file/tensor operations (safetensors + json + torch CPU). It performs no network access.

Classes¶

PatchReport `dataclass` ¶

Summary of a patch_adapter run, persisted as patch_report.json.

Source code in inference/models/vllm_proxy/adapter_patch.py

@dataclass
class PatchReport:
    """Summary of a `patch_adapter` run, persisted as `patch_report.json`.

    `patch_adapter` creates the report right after loading the source
    weights, updates it as the pipeline progresses, writes it next to the
    patched adapter via `dataclasses.asdict`, and returns it.
    """

    # Absolute path of the directory the source adapter was read from.
    source_dir: str
    # Absolute path of the directory the patched adapter is written to.
    dst_dir: str
    # DoRA handling policy the run was invoked with.
    policy: str
    # Truthiness of `use_dora` in the source adapter_config.json.
    source_use_dora: bool
    # `r` from the source adapter_config.json, as recorded when the report is
    # created (patch_adapter does not rewrite it after an svd conversion).
    lora_rank: int
    # Remap template used for language-layer weight keys.
    key_template: str
    # `target_modules` of the rewritten adapter config.
    target_modules: List[str] = field(default_factory=list)
    # Number of tensors in the source weights file, before any filtering.
    total_source_tensors: int = 0
    # Number of tensors whose key was changed by the remap step.
    remapped_keys: int = 0
    # presumably filled by the vision-tensor drop step - TODO confirm in _drop_vision_tensors
    dropped_vision_keys: List[str] = field(default_factory=list)
    # presumably filled by the magnitude-strip step - TODO confirm in _strip_magnitude_vectors
    dropped_magnitude_keys: List[str] = field(default_factory=list)
    # Rank used for the SVD conversion; stays None unless the svd policy ran.
    svd_rank: Optional[int] = None
    # Digest of the source weights file, from `_sha256_of_file`.
    source_weights_digest: Optional[str] = None
    # Digest of the patched weights file, from `_sha256_of_file`.
    patched_weights_digest: Optional[str] = None
    # Base declared by the adapter config, as returned by cross_check_base_model
    # (None when the adapter declares no base).
    base_model_name_or_path: Optional[str] = None
    # Outcome of the served-base cross-check.
    base_model_check: str = BASE_MODEL_CHECK_SKIPPED
    # Registry modelVariant as recorded by the weights provider - advisory
    # only (sometimes misregistered); kept alongside base_model_name_or_path
    # so registry/adapter drift can be audited from the patch report.
    registry_variant: Optional[str] = None
    # Free-form remarks appended during the run.
    notes: List[str] = field(default_factory=list)

Functions:¶

cross_check_base_model ¶

cross_check_base_model(config, model_id=None)

Cross-checks the adapter's declared base against the served base.

Registry variant metadata occasionally contradicts the adapter's own adapter_config.json (incident 2026-06-10: image-text/223 was recorded as a 0.8b-peft fine-tune but its adapter config declared qwen/qwen3_5-2b). Without this pre-flight check the mismatch only surfaces as an opaque tensor-shape RuntimeError inside vLLM's /v1/load_lora_adapter.

Returns (declared_base, check_result) where check_result is one of BASE_MODEL_CHECK_MATCH / BASE_MODEL_CHECK_SKIPPED. The check is skipped (with a warning) when base_model_name_or_path is missing/empty.

Raises:

Type	Description
`AdapterNotServableError`	When the declared base matches neither `VLLM_SERVED_BASE_VARIANT` nor `VLLM_SERVED_BASE_NAME`.

Source code in inference/models/vllm_proxy/adapter_patch.py

def cross_check_base_model(
    config: dict, model_id: Optional[str] = None
) -> Tuple[Optional[str], str]:
    """Verify that the adapter's declared base is the base this deployment serves.

    Registry variant metadata sometimes disagrees with the adapter's own
    `adapter_config.json` (incident 2026-06-10: `image-text/223` was
    recorded as a `0.8b-peft` fine-tune while its adapter config declared
    `qwen/qwen3_5-2b`). Without this pre-flight check such a mismatch only
    shows up as an opaque tensor-shape `RuntimeError` inside vLLM's
    `/v1/load_lora_adapter`.

    Args:
        config: Parsed `adapter_config.json`.
        model_id: Model id used only to make log and error messages
            actionable.

    Returns:
        `(declared_base, check_result)`. `check_result` is
        `BASE_MODEL_CHECK_MATCH` on success, or `BASE_MODEL_CHECK_SKIPPED`
        (with `declared_base` set to None and a warning logged) when
        `base_model_name_or_path` is missing or empty.

    Raises:
        AdapterNotServableError: When the declared base matches neither
            `VLLM_SERVED_BASE_VARIANT` nor `VLLM_SERVED_BASE_NAME`.
    """
    raw_base = config.get("base_model_name_or_path") or ""
    declared_base = raw_base.strip()
    if not declared_base:
        logger.warning(
            "Adapter config for model %s declares no base_model_name_or_path "
            "- skipping the served-base cross-check.",
            model_id or "<unknown>",
        )
        return None, BASE_MODEL_CHECK_SKIPPED

    served_base_variant = get_vllm_served_base_variant()
    served_base_name = get_vllm_served_base_name()
    normalized_declared = normalize_base_model_reference(declared_base)
    # Either spelling of the served base is an acceptable declaration.
    accepted_bases = {
        normalize_base_model_reference(served_base_variant),
        normalize_base_model_reference(served_base_name),
    }
    if normalized_declared not in accepted_bases:
        raise AdapterNotServableError(
            f"Adapter for model {model_id or '<unknown>'} declares "
            f"base_model_name_or_path={declared_base!r} in its "
            f"adapter_config.json, but this vLLM deployment serves base "
            f"{served_base_variant!r} (VLLM_SERVED_BASE_VARIANT; served name "
            f"{served_base_name!r}). The model's registry variant metadata "
            f"contradicts the adapter's own config - this is a registry data "
            f"bug: fix the recorded modelVariant for "
            f"{model_id or 'this model'} so it matches the adapter's true base "
            f"{declared_base!r}."
        )
    return declared_base, BASE_MODEL_CHECK_MATCH

extract_module_path ¶

extract_module_path(key)

Returns the module path of a LoRA tensor key, or None for other keys.

E.g. base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight -> base_model.model.model.layers.0.self_attn.q_proj.

Source code in inference/models/vllm_proxy/adapter_patch.py

def extract_module_path(key: str) -> Optional[str]:
    """Return the module-path prefix of a LoRA tensor key, or None for other keys.

    The key is cut at the first occurrence of the first marker that is
    present: the `_LORA_TENSOR_MARKERS` are tried in order, then
    `.<_MAGNITUDE_MARKER>`.

    E.g. `base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight`
    -> `base_model.model.model.layers.0.self_attn.q_proj`.
    """
    # The magnitude marker goes last so the LoRA markers keep precedence.
    ordered_markers = (*_LORA_TENSOR_MARKERS, f".{_MAGNITUDE_MARKER}")
    for marker in ordered_markers:
        cut = key.find(marker)
        if cut != -1:
            return key[:cut]
    return None

is_vision_tensor_key ¶

is_vision_tensor_key(key)

True if the tensor's module path belongs to the vision encoder.

Source code in inference/models/vllm_proxy/adapter_patch.py

def is_vision_tensor_key(key: str) -> bool:
    """Tell whether a tensor key's module path belongs to the vision encoder.

    The check looks for any of `VISION_MODULE_MARKERS` as a substring of the
    module path; when no module path can be extracted from the key, the
    whole key is searched instead.
    """
    module_path = extract_module_path(key)
    searched = module_path if module_path else key
    for marker in VISION_MODULE_MARKERS:
        if marker in searched:
            return True
    return False

normalize_base_model_reference ¶

normalize_base_model_reference(value)

Normalises a base-model reference for the served-base cross-check.

Lowercases, strips any org prefix (qwen/qwen3_5-2b -> qwen3_5-2b) and drops separator characters, so qwen/qwen3_5-0.8b, Qwen3.5-0.8B and qwen3_5-0.8b all compare equal, while genuinely different bases (qwen3_5-2b vs qwen3_5-0.8b) stay distinct.

Source code in inference/models/vllm_proxy/adapter_patch.py

def normalize_base_model_reference(value: str) -> str:
    """Normalises a base-model reference for the served-base cross-check.

    Lowercases, strips any org/path prefix (`qwen/qwen3_5-2b` ->
    `qwen3_5-2b`) and drops separator characters, so `qwen/qwen3_5-0.8b`,
    `Qwen3.5-0.8B` and `qwen3_5-0.8b` all compare equal, while genuinely
    different bases (`qwen3_5-2b` vs `qwen3_5-0.8b`) stay distinct.

    Trailing slashes are ignored, so a directory-style reference such as
    `/models/Qwen/Qwen3.5-0.8B/` normalises like `Qwen3.5-0.8B` instead of
    collapsing to an empty string.

    Args:
        value: Base-model reference (hub id, served name or path).

    Returns:
        The last path segment, lowercased, with every character outside
        `[a-z0-9]` removed.
    """
    # Drop trailing slashes first: otherwise the last "/"-separated segment
    # of "org/name/" is empty and the whole reference normalises to "".
    value = value.strip().lower().rstrip("/")
    if "/" in value:
        value = value.rsplit("/", 1)[-1]
    return re.sub(r"[^a-z0-9]+", "", value)

patch_adapter ¶

patch_adapter(
    src_dir,
    dst_dir,
    policy="reject",
    base_dir=None,
    max_lora_rank=None,
    vision_norm_threshold=None,
    key_template=None,
    model_id=None,
    registry_variant=None,
)

Transforms the adapter in src_dir into a vLLM-servable one in dst_dir.

Pipeline:

1. Validate adapter_config.json (modules_to_save empty, rank within VLLM_MAX_LORA_RANK), and cross-check the adapter's own base_model_name_or_path against the served base (VLLM_SERVED_BASE_VARIANT / VLLM_SERVED_BASE_NAME) - registry variant metadata can be wrong, and loading an adapter trained on a different base fails deep inside vLLM with an opaque tensor-shape error.
2. Drop vision-tower tensors; raise if any dropped vision lora_B tensor has a norm above VLLM_VISION_LORA_NORM_THRESHOLD (a meaningfully trained vision adapter cannot be approximated by a language-only LoRA).
3. Apply the DoRA policy (reject / strip / svd).
4. Remap weight keys to the configured vLLM/PEFT layout.
5. Rewrite adapter_config.json (intersect target_modules with the supported language-module set, strip unsupported fields).
6. Write patched weights + config + patch_report.json into dst_dir.

Parameters:

Name	Type	Description	Default
`src_dir`	`str`	Directory with the downloaded Roboflow adapter artifacts.	required
`dst_dir`	`str`	Output directory for the patched adapter.	required
`policy`	`str`	DoRA handling policy, one of `reject` / `strip` / `svd`.	`'reject'`
`base_dir`	`Optional[str]`	Directory holding base model safetensors - required for the `svd` policy. Runtime AdapterManager calls intentionally download adapter-only artifacts, so `svd` is reserved for offline/lab conversion paths that pass base weights explicitly.	`None`
`max_lora_rank`	`Optional[int]`	Maximum accepted LoRA rank (defaults to `VLLM_MAX_LORA_RANK`).	`None`
`vision_norm_threshold`	`Optional[float]`	Norm threshold above which a vision `lora_B` tensor marks the adapter as not servable (defaults to `VLLM_VISION_LORA_NORM_THRESHOLD`).	`None`
`key_template`	`Optional[str]`	Remap target template with a `{suffix}` placeholder (defaults to `VLLM_ADAPTER_KEY_TEMPLATE`).	`None`
`model_id`	`Optional[str]`	Roboflow model id the adapter belongs to - only used to make error/log messages actionable.	`None`
`registry_variant`	`Optional[str]`	Registry `modelVariant` as recorded by the weights provider - advisory only, recorded in the patch report for registry/adapter drift auditing.	`None`

Raises:

Type	Description
`AdapterNotServableError`	If the adapter cannot be made servable.

Source code in inference/models/vllm_proxy/adapter_patch.py

def patch_adapter(
    src_dir: str,
    dst_dir: str,
    policy: str = "reject",
    base_dir: Optional[str] = None,
    max_lora_rank: Optional[int] = None,
    vision_norm_threshold: Optional[float] = None,
    key_template: Optional[str] = None,
    model_id: Optional[str] = None,
    registry_variant: Optional[str] = None,
) -> PatchReport:
    """Transforms the adapter in `src_dir` into a vLLM-servable one in `dst_dir`.

    Pipeline:
        1. Validate `adapter_config.json` (`modules_to_save` empty, rank within
           `VLLM_MAX_LORA_RANK`), and cross-check the adapter's own
           `base_model_name_or_path` against the served base
           (`VLLM_SERVED_BASE_VARIANT` / `VLLM_SERVED_BASE_NAME`) - registry
           variant metadata can be wrong, and loading an adapter trained on a
           different base fails deep inside vLLM with an opaque tensor-shape
           error.
        2. Drop vision-tower tensors; raise if any dropped vision `lora_B`
           tensor has a norm above `VLLM_VISION_LORA_NORM_THRESHOLD` (a
           meaningfully trained vision adapter cannot be approximated by a
           language-only LoRA).
        3. Apply the DoRA `policy` (`reject` / `strip` / `svd`).
        4. Remap weight keys to the configured vLLM/PEFT layout.
        5. Rewrite `adapter_config.json` (intersect `target_modules` with the
           supported language-module set, strip unsupported fields).
        6. Write patched weights + config + `patch_report.json` into `dst_dir`.

    The output is first written to a temporary staging directory created
    next to `dst_dir` and only then moved into place; the staging directory
    is removed if anything fails before that move.

    Args:
        src_dir: Directory with the downloaded Roboflow adapter artifacts.
        dst_dir: Output directory for the patched adapter.
        policy: DoRA handling policy, one of `reject` / `strip` / `svd`.
        base_dir: Directory holding base model safetensors - required for the
            `svd` policy. Runtime AdapterManager calls intentionally download
            adapter-only artifacts, so `svd` is reserved for offline/lab
            conversion paths that pass base weights explicitly.
        max_lora_rank: Maximum accepted LoRA rank (defaults to
            `VLLM_MAX_LORA_RANK`).
        vision_norm_threshold: Norm threshold above which a vision `lora_B`
            tensor marks the adapter as not servable (defaults to
            `VLLM_VISION_LORA_NORM_THRESHOLD`).
        key_template: Remap target template with a `{suffix}` placeholder
            (defaults to `VLLM_ADAPTER_KEY_TEMPLATE`).
        model_id: Roboflow model id the adapter belongs to - only used to
            make error/log messages actionable.
        registry_variant: Registry `modelVariant` as recorded by the weights
            provider - advisory only, recorded in the patch report for
            registry/adapter drift auditing.

    Returns:
        The `PatchReport` describing the run; the same data is written to
        `patch_report.json` inside `dst_dir`.

    Raises:
        ValueError: If `policy` is not one of `DORA_POLICIES`.
        AdapterNotServableError: If the adapter cannot be made servable.
    """
    if policy not in DORA_POLICIES:
        raise ValueError(
            f"Unknown DoRA policy: {policy!r} - expected one of {DORA_POLICIES}."
        )
    # Unset tunables fall back to the deployment configuration.
    if max_lora_rank is None:
        max_lora_rank = get_vllm_max_lora_rank()
    if vision_norm_threshold is None:
        vision_norm_threshold = get_vllm_vision_lora_norm_threshold()
    if key_template is None:
        key_template = get_vllm_adapter_key_template()

    # Step 1: config validation and served-base cross-check, both done before
    # the weights file is touched.
    config = _load_adapter_config(adapter_dir=src_dir)
    _validate_adapter_config(config=config, max_lora_rank=max_lora_rank)
    declared_base, base_model_check = cross_check_base_model(
        config=config, model_id=model_id
    )
    source_use_dora = bool(config.get("use_dora", False))
    lora_rank = int(config["r"])

    weights_path = os.path.join(src_dir, ADAPTER_WEIGHTS_FILE)
    if not os.path.isfile(weights_path):
        raise AdapterNotServableError(
            f"Adapter package in {src_dir} is missing {ADAPTER_WEIGHTS_FILE}."
        )
    tensors = load_file(weights_path)

    # The report is created up front so the later steps can record what they
    # drop or change.
    report = PatchReport(
        source_dir=os.path.abspath(src_dir),
        dst_dir=os.path.abspath(dst_dir),
        policy=policy,
        source_use_dora=source_use_dora,
        lora_rank=lora_rank,
        key_template=key_template,
        total_source_tensors=len(tensors),
        source_weights_digest=_sha256_of_file(weights_path),
        base_model_name_or_path=declared_base,
        base_model_check=base_model_check,
        registry_variant=registry_variant,
    )

    # Step 2: drop vision-tower tensors.
    tensors = _drop_vision_tensors(
        tensors=tensors,
        vision_norm_threshold=vision_norm_threshold,
        report=report,
    )

    # Step 3: apply the DoRA policy.
    if source_use_dora:
        if policy == "reject":
            raise AdapterNotServableError(
                "Adapter uses DoRA (`use_dora: true`), which is rejected under "
                "the configured `VLLM_DORA_POLICY=reject`. Set the policy to "
                "`strip` at runtime, or run offline `svd` conversion with "
                "base weights."
            )
        if policy == "strip":
            tensors = _strip_magnitude_vectors(tensors=tensors, report=report)
            report.notes.append(
                "DoRA magnitude vectors stripped - served adapter approximates "
                "the trained DoRA adapter."
            )
        elif policy == "svd":
            if base_dir is None:
                raise AdapterNotServableError(
                    "DoRA policy `svd` requires base model weights "
                    "(`base_dir` was not provided). Runtime AdapterManager "
                    "downloads adapter-only artifacts; reserve `svd` for "
                    "offline conversion paths that pass base weights explicitly."
                )
            # The emitted rank never exceeds the configured maximum.
            svd_rank = min(lora_rank, max_lora_rank)
            base_weight_lookup = _build_base_weight_lookup(base_dir=base_dir)
            tensors = _convert_dora_tensors_to_plain_lora(
                tensors=tensors,
                config=config,
                base_weight_lookup=base_weight_lookup,
                rank=svd_rank,
            )
            report.svd_rank = svd_rank
            report.notes.append(
                "DoRA adapter converted to plain LoRA via merged-weight SVD "
                f"truncation (rank={svd_rank})."
            )
            config = _rewrite_config_for_svd(config=config, rank=svd_rank)
            # NOTE(review): `lora_rank` is not read again below and
            # `report.lora_rank` keeps the source rank - confirm that is intended.
            lora_rank = svd_rank
    else:
        # Plain-LoRA adapters may still carry stray magnitude keys - drop them.
        tensors = _strip_magnitude_vectors(tensors=tensors, report=report)

    # Step 4: remap weight keys, counting only the keys that actually changed.
    remapped_tensors = {}
    for key, tensor in tensors.items():
        new_key = remap_adapter_weight_key(key=key, key_template=key_template)
        if new_key != key:
            report.remapped_keys += 1
        remapped_tensors[new_key] = tensor

    # Step 5: rewrite the adapter config.
    config = _rewrite_adapter_config(config=config, policy=policy)
    report.target_modules = list(config["target_modules"])
    if not remapped_tensors:
        raise AdapterNotServableError(
            "Adapter contains no servable LoRA tensors after filtering."
        )

    # Step 6: write everything into a staging directory that is a sibling of
    # `dst_dir`, then move it into place.
    dst_dir = os.path.abspath(dst_dir)
    dst_parent = os.path.dirname(dst_dir)
    os.makedirs(dst_parent, exist_ok=True)
    staging_dir = tempfile.mkdtemp(
        prefix=f".{os.path.basename(dst_dir)}-", dir=dst_parent
    )
    try:
        patched_weights_path = os.path.join(staging_dir, ADAPTER_WEIGHTS_FILE)
        save_file(remapped_tensors, patched_weights_path)
        report.patched_weights_digest = _sha256_of_file(patched_weights_path)
        with open(os.path.join(staging_dir, ADAPTER_CONFIG_FILE), "w") as f:
            json.dump(config, f, indent=2, sort_keys=True)
        report.notes.append(
            "The exact adapter key layout accepted by vLLM v0.22.1 is empirically "
            "unconfirmed - adjust VLLM_ADAPTER_KEY_TEMPLATE if vLLM rejects the "
            "patched adapter."
        )
        with open(os.path.join(staging_dir, PATCH_REPORT_FILE), "w") as f:
            json.dump(dataclasses.asdict(report), f, indent=2)
        _replace_directory(src_dir=staging_dir, dst_dir=dst_dir)
        # The staging directory has been handed over; clearing the variable
        # tells the `finally` block there is nothing left to clean up.
        staging_dir = None
    finally:
        if staging_dir is not None:
            _remove_path(staging_dir)
    return report

remap_adapter_weight_key ¶

remap_adapter_weight_key(key, key_template)

Remaps a Roboflow adapter weight key to the vLLM/PEFT expected layout.

Roboflow qwen3_5 adapters store language-model keys as base_model.model.model.layers.N... while the qwen3_5 VL architecture names those modules model.language_model.layers.N... This function rewrites any key under model.layers. / model.language_model.layers. (with or without the PEFT base_model.model. prefix) using key_template, which receives the part after the layers prefix as {suffix}. The default template preserves the standard PEFT base_model.model. prefix convention:

base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight
    -> base_model.model.model.language_model.layers.0.self_attn.q_proj.lora_A.weight

Keys that do not point into the language-model layers are returned unchanged.

Source code in inference/models/vllm_proxy/adapter_patch.py

def remap_adapter_weight_key(key: str, key_template: str) -> str:
    """Rewrite a Roboflow adapter weight key into the vLLM/PEFT expected layout.

    Roboflow qwen3_5 adapters store language-model keys as
    `base_model.model.model.layers.N...`, whereas the qwen3_5 VL
    architecture names those modules `model.language_model.layers.N...`.
    Any key that, once an optional leading `PEFT_KEY_PREFIX` is removed,
    starts with one of `_LANGUAGE_LAYERS_PREFIXES` is rebuilt from
    `key_template`, which receives everything after the matched layers
    prefix as `{suffix}`. The default template keeps the standard PEFT
    `base_model.model.` prefix convention:

        base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight
            -> base_model.model.model.language_model.layers.0.self_attn.q_proj.lora_A.weight

    Keys that do not point into the language-model layers are returned
    unchanged.
    """
    if key.startswith(PEFT_KEY_PREFIX):
        unprefixed = key[len(PEFT_KEY_PREFIX) :]
    else:
        unprefixed = key
    # The first matching layers prefix wins, in declaration order.
    matched_prefix = next(
        (
            candidate
            for candidate in _LANGUAGE_LAYERS_PREFIXES
            if unprefixed.startswith(candidate)
        ),
        None,
    )
    if matched_prefix is None:
        return key
    return key_template.format(suffix=unprefixed[len(matched_prefix) :])

svd_convert ¶

svd_convert(base_dir, adapter_dir, dst_dir, rank)

Converts a DoRA adapter into a plain LoRA via merged-weight SVD.

Math (mirrors PEFT's DoRA merge - peft/tuners/lora/dora.py / Linear.merge):

scale     = lora_alpha / r            (or lora_alpha / sqrt(r) with rsLoRA)
W         = W0 + scale * (B @ A)      # candidate direction matrix
col_norm  = ||W||_2 along dim=1       # one norm per output row, i.e.
                                      # "column-wise" norm of W^T as in
                                      # the DoRA paper
W_merged  = (m / col_norm).view(-1, 1) * W   # m = lora_magnitude_vector
dW        = W_merged - W0

dW is then SVD-truncated: U S V^T = svd(dW), B_new = U[:, :rank] sqrt(S[:rank]), A_new = sqrt(S[:rank]) V[:rank]^T, and the emitted config uses lora_alpha = rank so the effective scale is 1 and B_new @ A_new directly approximates dW. Note that dW is not low-rank in general (the per-row rescaling perturbs all of W0), so this is an approximation; it is exact when the magnitude vector equals the column norm (i.e. the DoRA rescaling is a no-op).

Runs on CPU; written for clarity rather than speed.

Parameters:

Name	Type	Description	Default
`base_dir`	`str`	Directory holding base model safetensors (the `base/` dir of a Roboflow model package).	required
`adapter_dir`	`str`	Directory holding the DoRA adapter (`adapter_config.json` + `adapter_model.safetensors`).	required
`dst_dir`	`str`	Output directory for the plain-LoRA adapter.	required
`rank`	`int`	Rank of the emitted LoRA.	required

Source code in inference/models/vllm_proxy/adapter_patch.py

def svd_convert(base_dir: str, adapter_dir: str, dst_dir: str, rank: int) -> None:
    """Rewrites a DoRA adapter as a plain LoRA adapter using a truncated SVD.

    The weight delta produced by merging the DoRA adapter is computed the same
    way PEFT merges DoRA (`peft/tuners/lora/dora.py` / `Linear.merge`):

        scale     = lora_alpha / r            (or lora_alpha / sqrt(r) with rsLoRA)
        W         = W0 + scale * (B @ A)      # candidate direction matrix
        col_norm  = ||W||_2 along dim=1       # one norm per output row, i.e.
                                              # "column-wise" norm of W^T as in
                                              # the DoRA paper
        W_merged  = (m / col_norm).view(-1, 1) * W   # m = lora_magnitude_vector
        dW        = W_merged - W0

    That delta is factorised and truncated: `U S V^T = svd(dW)`,
    `B_new = U[:, :rank] sqrt(S[:rank])`, `A_new = sqrt(S[:rank]) V[:rank]^T`.
    The written config sets `lora_alpha = rank`, making the effective scale 1
    so that `B_new @ A_new` approximates `dW` directly. Because the per-row
    rescaling perturbs all of `W0`, `dW` is generally not low-rank and the
    result is an approximation; it is exact when the magnitude vector equals
    the column norm (the DoRA rescaling is then a no-op).

    Runs on CPU; written for clarity rather than speed.

    Args:
        base_dir: Directory holding base model safetensors (the `base/` dir of
            a Roboflow model package).
        adapter_dir: Directory holding the DoRA adapter
            (`adapter_config.json` + `adapter_model.safetensors`).
        dst_dir: Output directory for the plain-LoRA adapter; created when
            missing.
        rank: Rank of the emitted LoRA.
    """
    # Read everything the conversion needs: adapter config, adapter tensors
    # and a lookup of the base weights.
    dora_config = _load_adapter_config(adapter_dir=adapter_dir)
    dora_tensors = load_file(os.path.join(adapter_dir, ADAPTER_WEIGHTS_FILE))
    base_weights = _build_base_weight_lookup(base_dir=base_dir)

    lora_tensors = _convert_dora_tensors_to_plain_lora(
        tensors=dora_tensors,
        config=dora_config,
        base_weight_lookup=base_weights,
        rank=rank,
    )
    lora_config = _rewrite_config_for_svd(config=dora_config, rank=rank)

    # Emit the converted adapter: weights first, then the rewritten config.
    os.makedirs(dst_dir, exist_ok=True)
    weights_path = os.path.join(dst_dir, ADAPTER_WEIGHTS_FILE)
    save_file(lora_tensors, weights_path)
    config_path = os.path.join(dst_dir, ADAPTER_CONFIG_FILE)
    with open(config_path, "w") as config_file:
        json.dump(lora_config, config_file, indent=2, sort_keys=True)

inference.models.vllm_proxy.config ¶

Environment configuration for the vLLM proxy backend.

All env vars specific to this package are read here with os.getenv (the package must not require changes to inference/core/env.py). Values that may be changed between requests/tests are exposed as functions reading the environment at call time; only the top-level enablement switch is resolved at import time (it controls model-class registration which also happens at import time).

Functions:¶

get_vllm_served_base_name ¶

get_vllm_served_base_name()

Name under which vLLM serves the base model (--served-model-name).

Defaults to the served base variant.

Source code in inference/models/vllm_proxy/config.py

def get_vllm_served_base_name() -> str:
    """Returns the name under which vLLM serves the base model.

    This corresponds to vLLM's `--served-model-name`. The value is read from
    the `VLLM_SERVED_BASE_NAME` environment variable at call time; when the
    variable is not set, the served base variant is returned instead.
    """
    fallback_name = get_vllm_served_base_variant()
    return os.getenv("VLLM_SERVED_BASE_NAME", fallback_name)

inference.models.vllm_proxy.errors ¶

Typed exceptions for the vLLM proxy backend.

NotServableOnVLLMError / AdapterNotServableError subclass ModelDeploymentNotSupportedError, which the existing HTTP error handlers (inference/core/interfaces/http/error_handlers.py) surface as a 501 response with str(error) as the message - the same convention used when a model type is not supported by a deployment.

Classes¶

AdapterNotServableError ¶

Bases: NotServableOnVLLMError

The model's LoRA adapter cannot be transformed into a form vLLM accepts.

Raised e.g. for adapters with modules_to_save, excessive rank, DoRA adapters under the reject policy, or adapters that meaningfully trained the vision tower.

Source code in inference/models/vllm_proxy/errors.py

class AdapterNotServableError(NotServableOnVLLMError):
    """The model's LoRA adapter cannot be transformed into a form vLLM accepts.

    Raised e.g. for adapters with `modules_to_save`, excessive rank, DoRA
    adapters under the `reject` policy, or adapters that meaningfully trained
    the vision tower.

    As a subclass of `NotServableOnVLLMError`, it is also caught by handlers
    written for that broader "cannot be served by vLLM" error.
    """

NotServableOnVLLMError ¶

Bases: ModelDeploymentNotSupportedError

The requested model cannot be served by the configured vLLM sidecar.

Raised e.g. when the model's base variant does not match the base model loaded in vLLM.

Source code in inference/models/vllm_proxy/errors.py

class NotServableOnVLLMError(ModelDeploymentNotSupportedError):
    """The requested model cannot be served by the configured vLLM sidecar.

    Raised e.g. when the model's base variant does not match the base model
    loaded in vLLM.

    Subclasses `ModelDeploymentNotSupportedError`, so it is handled the same
    way as any other "model not supported by this deployment" failure.
    """

VLLMConnectionError ¶

Bases: VLLMProxyError

Could not connect to (or timed out talking to) the vLLM sidecar.

Source code in inference/models/vllm_proxy/errors.py

class VLLMConnectionError(VLLMProxyError):
    """Could not connect to (or timed out talking to) the vLLM sidecar.

    Subclass of `VLLMProxyError`, so callers can catch every sidecar
    communication failure through that base class.
    """

VLLMHTTPError ¶

Bases: VLLMProxyError

The vLLM sidecar returned an HTTP error response.

Source code in inference/models/vllm_proxy/errors.py

class VLLMHTTPError(VLLMProxyError):
    """Error raised when the vLLM sidecar answers with an HTTP error response.

    Attributes:
        status_code: HTTP status code of the failed response.
        response_body: Body of the failed response, or None when there is none.
    """

    def __init__(self, message: str, status_code: int, response_body: Optional[str]):
        """Stores the response details next to the exception message.

        Args:
            message: Human-readable description passed to the base exception.
            status_code: HTTP status code returned by the sidecar.
            response_body: Response body returned by the sidecar, if any.
        """
        # Only `message` is handed to the base exception; the response details
        # are kept as plain attributes for callers to inspect.
        self.response_body = response_body
        self.status_code = status_code
        super().__init__(message)

VLLMProxyError ¶

Bases: Exception

Base class for errors talking to the vLLM sidecar.

Source code in inference/models/vllm_proxy/errors.py

class VLLMProxyError(Exception):
    """Base class for errors talking to the vLLM sidecar.

    `VLLMConnectionError` and `VLLMHTTPError` derive from it, so catching this
    class covers both connection-level and HTTP-level failures.
    """

inference.models.vllm_proxy.qwen3_5_vllm ¶

Qwen3.5 VL model class proxying generation to a vLLM sidecar.

Mirrors inference/models/qwen3_5vl/qwen3_5vl_inference_models.py (the in-process HF adapter): the inference server keeps performing auth/billing/model-resolution/image-preprocessing per request, while generation runs in the vLLM container (continuous batching + dynamic LoRA).

Preprocessing mirrors Qwen35HF.pre_process_generation (inference_models/models/qwen3_5/qwen3_5_hf.py): the same <system_prompt> split semantics and the same pixel budget the HF processor applies (min 16*32*32 / max 512*32*32 with patch factor 32). Postprocessing replicates Qwen35HF.post_process_generation think-tag parsing so responses are shape-identical to the HF path.

Shared proxy mechanics live in qwen_vllm_base.QwenVLLMProxyBase; this module holds only the qwen3_5-specific bits.

Classes¶

Qwen35VLLMProxy ¶

Bases: QwenVLLMProxyBase

Qwen3.5 VL served via a vLLM sidecar (base model + dynamic LoRA).

Source code in inference/models/vllm_proxy/qwen3_5_vllm.py

class Qwen35VLLMProxy(QwenVLLMProxyBase):
    """Qwen3.5 VL served via a vLLM sidecar (base model + dynamic LoRA)."""

    # Qwen3.5 has a thinking mode, so the base class forwards
    # `enable_thinking` to the chat template.
    supports_thinking = True

    # Prompting / generation defaults for this family.
    default_system_prompt = DEFAULT_SYSTEM_PROMPT
    default_max_new_tokens = INFERENCE_MODELS_QWEN3_5_DEFAULT_MAX_NEW_TOKENS

    # Pixel budget used by the base class when resizing the input image.
    image_patch_factor = IMAGE_PATCH_FACTOR
    min_pixels = MIN_PIXELS
    max_pixels = MAX_PIXELS

    def _get_adapter_manager(self) -> AdapterManager:
        """Returns the adapter manager via this module's `get_adapter_manager`.

        The lookup goes through the module-level name so tests can patch
        `get_adapter_manager` on this module.
        """
        return get_adapter_manager()

    def post_process_text(self, text: str, **kwargs) -> Union[str, Dict[str, str]]:
        """Cleans the decoded text, honouring the `enable_thinking` kwarg.

        `enable_thinking` defaults to False when it is not passed.
        """
        thinking_enabled = kwargs.get("enable_thinking", False)
        return post_process_generated_text(
            text=text,
            enable_thinking=thinking_enabled,
        )

Functions:¶

post_process_generated_text ¶

post_process_generated_text(text, enable_thinking=False)

Replicates Qwen35HF.post_process_generation for a single decoded text.

Cleans common artifacts and parses <think>...</think> blocks. When enable_thinking is set, the opening <think> tag is prepended when missing (the HF path's generation prompt ends with <think>\n, so the tag is absent from generated tokens; vLLM applies the same chat template and is expected to behave identically - the guard keeps parsing correct either way).

Source code in inference/models/vllm_proxy/qwen3_5_vllm.py

def post_process_generated_text(
    text: str, enable_thinking: bool = False
) -> Union[str, Dict[str, str]]:
    """Replicates Qwen35HF.post_process_generation for a single decoded text.

    Known artifacts (`<|im_end|>`, `<|endoftext|>`, `assistant\\n`,
    ` addCriterion\\n`) are removed first. What happens next depends on
    `enable_thinking`:

    * disabled: every `<think>...</think>` block is dropped and the stripped
      remainder is returned as a plain string;
    * enabled: a `{"thinking": ..., "answer": ...}` dict is returned. The
      opening `<think>` tag is prepended when missing (the HF path's
      generation prompt ends with `<think>\\n`, so the tag is absent from
      generated tokens; vLLM applies the same chat template and is expected
      to behave identically - the guard keeps parsing correct either way).
    """
    for artifact in ("<|im_end|>", "<|endoftext|>", "assistant\n", " addCriterion\n"):
        text = text.replace(artifact, "")

    if not enable_thinking:
        without_thoughts = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)
        return without_thoughts.strip()

    if not text.lstrip().startswith("<think>"):
        text = "<think>" + text
    first_block = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if first_block is None:
        # Model hit max tokens before producing </think>: everything that was
        # generated counts as thinking and there is no answer yet.
        return {"thinking": text.replace("<think>", "").strip(), "answer": ""}
    remainder = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)
    return {
        "thinking": first_block.group(1).strip(),
        "answer": remainder.strip(),
    }

smart_resize_dimensions ¶

smart_resize_dimensions(
    height,
    width,
    factor=IMAGE_PATCH_FACTOR,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

Computes the (height, width) the Qwen3.5 image processor would resize to.

Source code in inference/models/vllm_proxy/qwen3_5_vllm.py

def smart_resize_dimensions(
    height: int,
    width: int,
    factor: int = IMAGE_PATCH_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
) -> Tuple[int, int]:
    """Computes the (height, width) the Qwen3.5 image processor would resize to.

    Thin wrapper: forwards every argument by keyword to
    `_smart_resize_dimensions`, with this module's `IMAGE_PATCH_FACTOR`,
    `MIN_PIXELS` and `MAX_PIXELS` as the defaults.

    Args:
        height: Height of the input image.
        width: Width of the input image.
        factor: Patch factor forwarded to the helper.
        min_pixels: Lower pixel budget forwarded to the helper.
        max_pixels: Upper pixel budget forwarded to the helper.

    Returns:
        The `(height, width)` pair produced by `_smart_resize_dimensions`.
    """
    return _smart_resize_dimensions(
        height=height,
        width=width,
        factor=factor,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

split_prompt_and_system_prompt ¶

split_prompt_and_system_prompt(prompt)

Replicates the <system_prompt> split from Qwen35HF.pre_process_generation.

Source code in inference/models/vllm_proxy/qwen3_5_vllm.py

def split_prompt_and_system_prompt(prompt: Optional[str]) -> Tuple[str, str]:
    """Replicates the `<system_prompt>` split from Qwen35HF.pre_process_generation.

    Thin wrapper: delegates to `_split_prompt_and_system_prompt`, supplying
    this module's `DEFAULT_SYSTEM_PROMPT` and `DEFAULT_PROMPT` as fallbacks.

    Args:
        prompt: Raw prompt, or None.

    Returns:
        Whatever the helper returns - presumably a
        `(user_prompt, system_prompt)` pair (confirm against the helper).
    """
    return _split_prompt_and_system_prompt(
        prompt=prompt,
        default_system_prompt=DEFAULT_SYSTEM_PROMPT,
        default_prompt=DEFAULT_PROMPT,
    )

inference.models.vllm_proxy.qwen3vl_vllm ¶

Qwen3-VL (instruct) model class proxying generation to a vLLM sidecar.

Mirrors inference/models/qwen3vl/qwen3vl_inference_models.py (the in-process HF adapter): the inference server keeps performing auth/billing/model-resolution/image-preprocessing per request, while generation runs in the vLLM container (continuous batching + dynamic LoRA).

Preprocessing mirrors Qwen3VLHF.pre_process_generation (inference_models/models/qwen3vl/qwen3vl_hf.py): the same <system_prompt> split semantics and the same pixel budget the HF processor is configured with (min 256*28*28 / max 1280*28*28; the patch factor is 32 from the Qwen3-VL checkpoint's preprocessor config: patch_size 16 * merge_size 2). Postprocessing replicates Qwen3VLHF.post_process_generation: plain artifact cleanup only - qwen3vl-instruct has NO thinking mode, so there is no think-tag parsing and <think> is never prepended.

Shared proxy mechanics live in qwen_vllm_base.QwenVLLMProxyBase; this module holds only the qwen3vl-specific bits.

Classes¶

Qwen3VLVLLMProxy ¶

Bases: QwenVLLMProxyBase

Qwen3-VL served via a vLLM sidecar (base model + dynamic LoRA).

Source code in inference/models/vllm_proxy/qwen3vl_vllm.py

class Qwen3VLVLLMProxy(QwenVLLMProxyBase):
    """Qwen3-VL served via a vLLM sidecar (base model + dynamic LoRA)."""

    # qwen3vl-instruct has no thinking mode - `enable_thinking` is never
    # forwarded to the chat template.
    supports_thinking = False

    # Prompting / generation defaults for this family.
    default_system_prompt = DEFAULT_SYSTEM_PROMPT
    default_max_new_tokens = INFERENCE_MODELS_QWEN3_VL_DEFAULT_MAX_NEW_TOKENS

    # Pixel budget used by the base class when resizing the input image.
    image_patch_factor = IMAGE_PATCH_FACTOR
    min_pixels = MIN_PIXELS
    max_pixels = MAX_PIXELS

    def _get_adapter_manager(self) -> AdapterManager:
        """Returns the adapter manager via this module's `get_adapter_manager`.

        The lookup goes through the module-level name so tests can patch
        `get_adapter_manager` on this module.
        """
        return get_adapter_manager()

    def post_process_text(self, text: str, **kwargs) -> str:
        """Applies plain artifact cleanup; extra keyword arguments are ignored."""
        return post_process_generated_text(text=text)

Functions:¶

post_process_generated_text ¶

post_process_generated_text(text)

Replicates Qwen3VLHF.post_process_generation for a single decoded text.

The HF path decodes with skip_special_tokens=True and only cleans the assistant\n / addCriterion\n artifacts; the special-token replacements below mirror that decode behaviour for the vLLM response (which normally carries no special tokens either). There is NO thinking mode for qwen3vl-instruct: <think> tags are neither prepended nor parsed - any such text is returned verbatim, exactly like the HF path.

Source code in inference/models/vllm_proxy/qwen3vl_vllm.py

def post_process_generated_text(text: str) -> str:
    """Replicates Qwen3VLHF.post_process_generation for a single decoded text.

    The HF path decodes with `skip_special_tokens=True` and only cleans the
    `assistant\\n` / ` addCriterion\\n` artifacts. Removing `<|im_end|>` and
    `<|endoftext|>` here mirrors that decode behaviour for the vLLM response
    (which normally carries no special tokens either).

    qwen3vl-instruct has NO thinking mode: `<think>` tags are neither
    prepended nor parsed - any such text is returned verbatim, exactly like
    the HF path. The result is stripped of surrounding whitespace.
    """
    # Removal order matches the HF path: special tokens first, then the
    # textual artifacts.
    for artifact in ("<|im_end|>", "<|endoftext|>", "assistant\n", " addCriterion\n"):
        text = text.replace(artifact, "")
    return text.strip()

smart_resize_dimensions ¶

smart_resize_dimensions(
    height,
    width,
    factor=IMAGE_PATCH_FACTOR,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

Computes the (height, width) the Qwen3-VL image processor would resize to.

Source code in inference/models/vllm_proxy/qwen3vl_vllm.py

def smart_resize_dimensions(
    height: int,
    width: int,
    factor: int = IMAGE_PATCH_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
) -> Tuple[int, int]:
    """Computes the (height, width) the Qwen3-VL image processor would resize to.

    Thin wrapper: forwards every argument by keyword to
    `_smart_resize_dimensions`, with this module's `IMAGE_PATCH_FACTOR`,
    `MIN_PIXELS` and `MAX_PIXELS` as the defaults.

    Args:
        height: Height of the input image.
        width: Width of the input image.
        factor: Patch factor forwarded to the helper.
        min_pixels: Lower pixel budget forwarded to the helper.
        max_pixels: Upper pixel budget forwarded to the helper.

    Returns:
        The `(height, width)` pair produced by `_smart_resize_dimensions`.
    """
    return _smart_resize_dimensions(
        height=height,
        width=width,
        factor=factor,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

split_prompt_and_system_prompt ¶

split_prompt_and_system_prompt(prompt)

Replicates the <system_prompt> split from Qwen3VLHF.pre_process_generation.

Source code in inference/models/vllm_proxy/qwen3vl_vllm.py

def split_prompt_and_system_prompt(prompt: Optional[str]) -> Tuple[str, str]:
    """Replicates the `<system_prompt>` split from Qwen3VLHF.pre_process_generation.

    Thin wrapper: delegates to `_split_prompt_and_system_prompt`, supplying
    this module's `DEFAULT_SYSTEM_PROMPT` and `DEFAULT_PROMPT` as fallbacks.

    Args:
        prompt: Raw prompt, or None.

    Returns:
        Whatever the helper returns - presumably a
        `(user_prompt, system_prompt)` pair (confirm against the helper).
    """
    return _split_prompt_and_system_prompt(
        prompt=prompt,
        default_system_prompt=DEFAULT_SYSTEM_PROMPT,
        default_prompt=DEFAULT_PROMPT,
    )

inference.models.vllm_proxy.qwen_vllm_base ¶

Shared logic for Qwen VL family models proxied to a vLLM sidecar.

The proxy classes mirror the in-process HF adapters (see inference/models/qwen3_5vl/ and inference/models/qwen3vl/): the inference server keeps performing auth/billing/model-resolution/image-preprocessing per request, while generation runs in the vLLM container (continuous batching + dynamic LoRA).

QwenVLLMProxyBase implements everything that is identical across families: message construction (the <system_prompt> split semantics are shared by all Qwen HF adapters), smart-resize to the HF processor's pixel budget, the chat-completion call and the response shape. Family-specific bits are class attributes / hooks on the subclass:

image_patch_factor / min_pixels / max_pixels - the pixel budget the family's HF AutoProcessor is configured with.
default_system_prompt - differs between families.
default_max_new_tokens - each family reads its own env-configured default.
supports_thinking - whether enable_thinking is forwarded to the chat template (qwen3_5 only; qwen3vl-instruct has no thinking mode).
post_process_text - family-specific decoded-text cleanup (think-tag parsing for qwen3_5, plain artifact cleanup for qwen3vl).
_get_adapter_manager - defined in the family module so its module-level get_adapter_manager symbol stays patchable in tests.

Classes¶

QwenVLLMProxyBase ¶

Bases: Model

Base class for Qwen VL models served via a vLLM sidecar.

Source code in inference/models/vllm_proxy/qwen_vllm_base.py

class QwenVLLMProxyBase(Model):
    """Base class for Qwen VL models served via a vLLM sidecar.

    No model weights are loaded in this process: construction resolves and
    registers the model with the adapter manager, `preprocess` builds the chat
    messages (including the resized, PNG-encoded image), `predict` sends them
    to the sidecar client and `postprocess` wraps the cleaned text in an
    `LMMInferenceResponse`. Subclasses provide the family-specific class
    attributes below and override `post_process_text`.
    """

    # Family-specific knobs - subclasses must define these.
    image_patch_factor: int
    min_pixels: int
    max_pixels: int
    default_system_prompt: str
    default_max_new_tokens: int
    default_prompt: str = DEFAULT_PROMPT
    supports_thinking: bool = False

    def __init__(self, model_id: str, api_key: Optional[str] = None, **kwargs):
        """Resolves and registers the model with the adapter manager.

        Args:
            model_id: Identifier of the model to serve.
            api_key: API key to use; falls back to the module-level `API_KEY`
                when empty or None.
            **kwargs: `countinference` and `service_secret` are read (when
                present) to build the extra weights-provider headers.
        """
        super().__init__()
        self.metrics = {"num_inferences": 0, "avg_inference_time": 0.0}
        self.api_key = api_key if api_key else API_KEY
        self.task_type = "lmm"
        self.model_id = model_id
        # Kept for the request-path self-heal, which re-resolves the adapter
        # when the shared vLLM engine no longer knows it (engine restart /
        # per-worker map desync under NUM_WORKERS>1).
        self._weights_provider_extra_headers = get_extra_weights_provider_headers(
            countinference=kwargs.get("countinference"),
            service_secret=kwargs.get("service_secret"),
        )
        self._adapter_manager = self._get_adapter_manager()
        # Cheap "load": resolves metadata, downloads/patches/registers the
        # adapter when needed - no model weights are loaded in this process.
        self._served_name = self._adapter_manager.resolve_and_register(
            model_id=model_id,
            api_key=self.api_key,
            weights_provider_extra_headers=self._weights_provider_extra_headers,
        )
        self._client = self._adapter_manager.client
        self._inference_config = self._load_inference_config()

    def _get_adapter_manager(self) -> AdapterManager:
        """Family modules override this so their module-level
        `get_adapter_manager` symbol stays patchable in tests."""
        return get_adapter_manager()

    def _load_inference_config(self) -> Optional[InferenceConfig]:
        """Parses inference_config.json from the adapter package if present.

        Mirrors the HF adapters' from_pretrained, which parses the config
        with the same allowed resize modes (the HF path holds the config
        without applying it during generation pre-processing - the
        processor's smart resize governs sizing; the same applies here).

        Returns:
            The parsed config, or None when the served name has no
            registration or the registration's source directory has no
            `inference_config.json`.
        """
        registration = self._adapter_manager.get_registration(self._served_name)
        if registration is None:
            return None
        inference_config_path = os.path.join(
            registration.source_dir, "inference_config.json"
        )
        if not os.path.exists(inference_config_path):
            return None
        return parse_inference_config(
            config_path=inference_config_path,
            allowed_resize_modes=ALLOWED_RESIZE_MODES,
        )

    def preprocess(
        self, image: Any, prompt: str = "", **kwargs
    ) -> Tuple[List[dict], PreprocessReturnMetadata]:
        """Builds the chat messages for a single image + prompt.

        Args:
            image: A single image reference accepted by `load_image_bgr`;
                a list is rejected.
            prompt: Prompt text, split into user and system parts by
                `split_prompt_and_system_prompt` using this class's defaults.
            **kwargs: `disable_preproc_auto_orient` is forwarded to the image
                loader (defaults to False).

        Returns:
            `(messages, input_shape)`: a system message plus a user message
            (image part followed by the text part), and metadata holding
            `image_dims` as the loaded image's `shape[:2]` reversed, i.e.
            (width, height).

        Raises:
            ValueError: If `image` is a list (batched inference is not
                supported).
        """
        is_batch = isinstance(image, list)
        if is_batch:
            raise ValueError("This model does not support batched-inference.")
        np_image = load_image_bgr(
            image,
            disable_preproc_auto_orient=kwargs.get(
                "disable_preproc_auto_orient", False
            ),
        )
        # Dimensions are recorded before any resize, so the response reports
        # the size of the image as loaded.
        input_shape = PreprocessReturnMetadata({"image_dims": np_image.shape[:2][::-1]})
        user_prompt, system_prompt = split_prompt_and_system_prompt(
            prompt=prompt,
            default_system_prompt=self.default_system_prompt,
            default_prompt=self.default_prompt,
        )
        image_base64 = self._encode_image_to_png_base64(np_image=np_image)
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [
                    build_image_content_part(image_base64=image_base64),
                    {"type": "text", "text": user_prompt},
                ],
            },
        ]
        return messages, input_shape

    def _encode_image_to_png_base64(self, np_image: np.ndarray) -> str:
        """Resizes the image to the family's pixel budget and encodes it.

        The target size comes from `smart_resize_dimensions` with this
        class's `image_patch_factor` / `min_pixels` / `max_pixels`; the image
        is only resized (cubic interpolation) when the target differs from
        its current size.

        Returns:
            The PNG-encoded image as an ASCII base64 string.

        Raises:
            ValueError: If PNG encoding fails.
        """
        height, width = np_image.shape[:2]
        target_height, target_width = smart_resize_dimensions(
            height=height,
            width=width,
            factor=self.image_patch_factor,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
        )
        if (target_height, target_width) != (height, width):
            # cv2.resize takes the target size as (width, height).
            np_image = cv2.resize(
                np_image,
                (target_width, target_height),
                interpolation=cv2.INTER_CUBIC,
            )
        success, encoded_image = cv2.imencode(".png", np_image)
        if not success:
            raise ValueError("Could not encode input image to PNG.")
        return base64.b64encode(encoded_image.tobytes()).decode("ascii")

    def predict(self, inputs: List[dict], **kwargs) -> str:
        """Runs one chat completion and returns the generated text.

        Args:
            inputs: Chat messages as built by `preprocess`.
            **kwargs: `max_new_tokens` overrides `default_max_new_tokens`
                when given and not None; `enable_thinking` is considered by
                `_build_chat_template_kwargs`.

        Returns:
            The message content of the first choice, or `""` when that
            content is empty or None.

        Raises:
            VLLMHTTPError: For any HTTP error other than "served model
                unknown", or when the single retry after re-registration
                fails as well.
        """
        max_new_tokens = kwargs.get("max_new_tokens")
        if max_new_tokens is None:
            max_new_tokens = self.default_max_new_tokens
        chat_template_kwargs = self._build_chat_template_kwargs(kwargs=kwargs)
        try:
            response = self._chat_completion(
                inputs=inputs,
                max_new_tokens=max_new_tokens,
                chat_template_kwargs=chat_template_kwargs,
            )
        except VLLMHTTPError as error:
            if not self._is_unknown_served_model_error(error=error):
                raise
            # Self-heal: this process registered the adapter, but the SHARED
            # vLLM engine no longer knows it (engine restart, or per-worker
            # registration-map desync under NUM_WORKERS>1). Files are on the
            # shared volume, so re-registration is ~ms. Retry exactly once -
            # a second unknown-model failure propagates.
            logger.warning(
                "vLLM does not know served model %s (model_id=%s) despite "
                "local registration - re-registering and retrying once. "
                "vLLM said: %r",
                self._served_name,
                self.model_id,
                (error.response_body or "")[:200],
            )
            self._adapter_manager.invalidate(served_name=self._served_name)
            self._served_name = self._adapter_manager.resolve_and_register(
                model_id=self.model_id,
                api_key=self.api_key,
                weights_provider_extra_headers=self._weights_provider_extra_headers,
            )
            response = self._chat_completion(
                inputs=inputs,
                max_new_tokens=max_new_tokens,
                chat_template_kwargs=chat_template_kwargs,
            )
        return response["choices"][0]["message"]["content"] or ""

    def _chat_completion(
        self,
        inputs: List[dict],
        max_new_tokens: int,
        chat_template_kwargs: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Sends the messages to the sidecar client for the current served name.

        The request always uses `temperature=0`; `max_new_tokens` is passed
        as the client's `max_tokens`.
        """
        return self._client.chat_completion(
            model=self._served_name,
            messages=inputs,
            temperature=0,
            max_tokens=max_new_tokens,
            chat_template_kwargs=chat_template_kwargs,
        )

    def _is_unknown_served_model_error(self, error: VLLMHTTPError) -> bool:
        """True iff vLLM rejected the request because OUR served model is unknown.

        vLLM's OpenAI server answers 404 with `The model `<name>` does not
        exist.` for unknown served models. Matched defensively: HTTP 404 +
        the served name in the body + a not-found phrasing. Anything else
        (other adapters, genuine 4xx/5xx) must NOT trigger the self-heal
        retry.
        """
        if error.status_code != 404:
            return False
        # Both sides are lower-cased, so the match is case-insensitive.
        body = (error.response_body or "").lower()
        if self._served_name.lower() not in body:
            return False
        return "does not exist" in body or "not found" in body

    def _build_chat_template_kwargs(
        self, kwargs: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """Builds the chat-template kwargs for the request, if any.

        Returns:
            `{"enable_thinking": <bool>}` when the family supports thinking
            and `enable_thinking` was passed with a non-None value; otherwise
            None.
        """
        if not self.supports_thinking:
            # The family's chat template has no thinking switch - forwarding
            # `enable_thinking` would be a template error / silent no-op.
            return None
        if "enable_thinking" in kwargs and kwargs["enable_thinking"] is not None:
            return {"enable_thinking": bool(kwargs["enable_thinking"])}
        return None

    def post_process_text(self, text: str, **kwargs) -> Union[str, Dict[str, str]]:
        """Family-specific cleanup of the decoded generation."""
        raise NotImplementedError

    def postprocess(
        self,
        predictions: str,
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> List[LMMInferenceResponse]:
        """Wraps the cleaned generation in a single-element response list.

        Args:
            predictions: Raw text returned by `predict`.
            preprocess_return_metadata: Metadata from `preprocess`; its
                `image_dims` entry is read as (width, height).
            **kwargs: Forwarded to `post_process_text`.
        """
        result = self.post_process_text(text=predictions, **kwargs)
        return [
            LMMInferenceResponse(
                response=result,
                image=InferenceResponseImage(
                    width=preprocess_return_metadata["image_dims"][0],
                    height=preprocess_return_metadata["image_dims"][1],
                ),
            )
        ]

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """No-op: this implementation does nothing, whatever `delete_from_disk` is."""
        pass

Methods:¶

post_process_text ¶

post_process_text(text, **kwargs)

Family-specific cleanup of the decoded generation.

Source code in inference/models/vllm_proxy/qwen_vllm_base.py

def post_process_text(self, text: str, **kwargs) -> Union[str, Dict[str, str]]:
    """Family-specific cleanup of the decoded generation.

    The base implementation is only a placeholder; each model family is
    expected to override it.

    Args:
        text: Decoded generation to clean up.
        **kwargs: Extra options for the family-specific implementation.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

Functions:¶

smart_resize_dimensions ¶

smart_resize_dimensions(
    height, width, factor, min_pixels, max_pixels
)

Computes the (height, width) the Qwen image processor would resize to.

Mirrors the smart_resize math of the HF Qwen VL image processors so the image sent to vLLM carries the same pixel budget the in-process HF path used (min/max pixels, dimensions divisible by the patch factor).

Source code in inference/models/vllm_proxy/qwen_vllm_base.py

def smart_resize_dimensions(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
) -> Tuple[int, int]:
    """Computes the (height, width) the Qwen image processor would resize to.

    Mirrors the `smart_resize` math of the HF Qwen VL image processors so the
    image sent to vLLM carries the same pixel budget the in-process HF path
    used (min/max pixels, dimensions divisible by the patch factor).

    Args:
        height: Height of the input image in pixels.
        width: Width of the input image in pixels.
        factor: Patch factor; both returned dimensions are multiples of it
            and never smaller than it.
        min_pixels: Images whose rounded area falls below this are scaled up.
        max_pixels: Images whose rounded area exceeds this are scaled down.

    Returns:
        The `(height, width)` to resize to.

    Raises:
        ValueError: If the longer side is more than 200 times the shorter one.
    """
    aspect_ratio = max(height, width) / min(height, width)
    if aspect_ratio > 200:
        raise ValueError(
            "Absolute aspect ratio must be smaller than 200, got "
            f"{aspect_ratio}"
        )
    # Snap each side to the nearest multiple of `factor`, but never below one
    # patch.
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can reach 0 for the short side of a very elongated image
        # under a small budget; clamp to one patch so the result is always a
        # valid resize target.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

split_prompt_and_system_prompt ¶

split_prompt_and_system_prompt(
    prompt,
    default_system_prompt,
    default_prompt=DEFAULT_PROMPT,
)

Replicates the <system_prompt> split shared by the Qwen HF adapters.

Source code in inference/models/vllm_proxy/qwen_vllm_base.py

def split_prompt_and_system_prompt(
    prompt: Optional[str],
    default_system_prompt: str,
    default_prompt: str = DEFAULT_PROMPT,
) -> Tuple[str, str]:
    """Replicates the `<system_prompt>` split shared by the Qwen HF adapters.

    The text before the first `<system_prompt>` marker is the user prompt and
    the text between the first and second marker (or up to the end) is the
    system prompt. An empty or missing part falls back to its default.

    Args:
        prompt: Raw prompt, optionally containing `<system_prompt>`; None
            yields both defaults.
        default_system_prompt: System prompt used when none is supplied.
        default_prompt: User prompt used when none is supplied.

    Returns:
        A `(user_prompt, system_prompt)` tuple.
    """
    if prompt is None:
        return default_prompt, default_system_prompt
    pieces = prompt.split("<system_prompt>")
    user_part = pieces[0] or default_prompt
    system_part = default_system_prompt
    # Only the piece right after the first marker is considered; anything
    # after a further marker is ignored.
    if len(pieces) > 1 and pieces[1]:
        system_part = pieces[1]
    return user_part, system_part

inference.models.vllm_proxy.vllm_client ¶

HTTP client for the vLLM sidecar (OpenAI-compatible API).

The vLLM container owns the GPU and exposes:

- POST /v1/chat/completions - generation (with image_url base64 data-URI content parts for multimodal input),
- POST /v1/load_lora_adapter / POST /v1/unload_lora_adapter - dynamic LoRA registration (requires VLLM_ALLOW_RUNTIME_LORA_UPDATING=True on the vLLM side),
- GET /v1/models, GET /health.

A single shared requests.Session with a large connection pool is used so many concurrent uvicorn workers / threads can proxy requests without exhausting sockets.

Classes¶

VLLMClient ¶

Thin, typed client for the vLLM sidecar HTTP API.

Source code in inference/models/vllm_proxy/vllm_client.py

class VLLMClient:
    """Thin, typed client for the vLLM sidecar HTTP API.

    All calls go through one `requests.Session` whose HTTP and HTTPS
    adapters are sized by `pool_size`.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        request_timeout_s: Optional[float] = None,
        pool_size: int = CONNECTION_POOL_SIZE,
    ):
        """Create the client and its pooled HTTP session.

        Args:
            base_url: Root URL of the vLLM sidecar; when empty or None the
                value of `get_vllm_base_url()` is used. Trailing slashes are
                stripped.
            request_timeout_s: Per-request timeout in seconds; when None the
                value of `get_vllm_request_timeout_s()` is used.
            pool_size: Size used for both `pool_connections` and
                `pool_maxsize` of the mounted adapter.
        """
        resolved_url = base_url if base_url else get_vllm_base_url()
        self._base_url = resolved_url.rstrip("/")
        if request_timeout_s is None:
            request_timeout_s = get_vllm_request_timeout_s()
        self._request_timeout_s = request_timeout_s
        self._session = requests.Session()
        pooled_adapter = HTTPAdapter(
            pool_connections=pool_size, pool_maxsize=pool_size
        )
        # The same adapter instance serves both schemes.
        for scheme in ("http://", "https://"):
            self._session.mount(scheme, pooled_adapter)

    @property
    def base_url(self) -> str:
        """Root URL of the vLLM sidecar, without a trailing slash."""
        return self._base_url

    def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, Any]],
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        chat_template_kwargs: Optional[Dict[str, Any]] = None,
        **extra_body: Any,
    ) -> Dict[str, Any]:
        """Send `POST /v1/chat/completions` and return the decoded JSON body.

        Args:
            model: Served model name - either the base model name or a
                registered LoRA adapter name.
            messages: OpenAI-style chat messages. Image inputs must be
                provided as `image_url` content parts with base64 data URIs
                (see `build_image_content_part`).
            temperature: Sampling temperature (0 = greedy, matching the HF
                path's `do_sample=False` default).
            max_tokens: Maximum number of tokens to generate; omitted from
                the payload when None.
            chat_template_kwargs: Extra kwargs forwarded to the chat template
                (e.g. `{"enable_thinking": True}` for Qwen3.5); omitted from
                the payload when None.
            **extra_body: Additional fields merged into the request payload.

        Returns:
            The response body parsed from JSON.
        """
        optional_fields = {
            "max_tokens": max_tokens,
            "chat_template_kwargs": chat_template_kwargs,
        }
        request_body: Dict[str, Any] = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            # Unset optional fields are left out entirely rather than sent as null.
            **{
                field: value
                for field, value in optional_fields.items()
                if value is not None
            },
            **extra_body,
        }
        return self._request(
            "POST", "/v1/chat/completions", json=request_body
        ).json()

    def load_lora_adapter(self, name: str, path: str) -> None:
        """Register the LoRA adapter stored at `path` under `name`.

        Idempotent: vLLM responds with 400 and an 'already been loaded'
        message when the adapter is registered twice - that case is treated
        as success. Any other `VLLMHTTPError` is re-raised.
        """
        registration = {"lora_name": name, "lora_path": path}
        try:
            self._request("POST", "/v1/load_lora_adapter", json=registration)
        except VLLMHTTPError as error:
            duplicate = error.status_code == 400 and "already been loaded" in (
                error.response_body or ""
            )
            if not duplicate:
                raise
            logger.debug("LoRA adapter %s already loaded in vLLM", name)

    def unload_lora_adapter(self, name: str) -> None:
        """Unregister the LoRA adapter served under `name`.

        Operational/debugging use only - the AdapterManager never calls
        this: with NUM_WORKERS>1 all workers share one vLLM engine, so an
        automatic unload from one worker would break the others. vLLM's
        `--max-cpu-loras` LRU bounds memory by itself.
        """
        body = {"lora_name": name}
        self._request("POST", "/v1/unload_lora_adapter", json=body)

    def list_models(self) -> List[Dict[str, Any]]:
        """Return the list of served models (base + registered adapters).

        Reads the `data` field of the `GET /v1/models` response and falls
        back to an empty list when that field is missing.
        """
        models_payload = self._request("GET", "/v1/models").json()
        return models_payload.get("data", [])

    def health(self) -> bool:
        """Return True if `GET /health` on the sidecar answers with HTTP 200.

        Connectivity failures are reported as False instead of being raised.
        """
        try:
            status = self._session.get(
                f"{self._base_url}/health", timeout=self._request_timeout_s
            ).status_code
        except _CONNECTIVITY_ERRORS:
            return False
        return status == 200

    def _request(self, method: str, path: str, **kwargs: Any) -> requests.Response:
        """Issue one HTTP request against the sidecar and validate the status.

        Args:
            method: HTTP method name.
            path: Path appended to the base URL.
            **kwargs: Forwarded to `Session.request`.

        Returns:
            The response, for any status code below 400.

        Raises:
            VLLMConnectionError: When the sidecar cannot be reached.
            VLLMHTTPError: When the sidecar answers with a status >= 400.
        """
        target = f"{self._base_url}{path}"
        correlation_value = get_correlation_value()
        if correlation_value:
            # Work on a copy of the caller's headers; a correlation header the
            # caller set explicitly is kept as-is.
            merged_headers = dict(kwargs.get("headers") or {})
            merged_headers.setdefault(CORRELATION_HEADER, correlation_value)
            kwargs["headers"] = merged_headers
        try:
            response = self._session.request(
                method, target, timeout=self._request_timeout_s, **kwargs
            )
        except _CONNECTIVITY_ERRORS as error:
            raise VLLMConnectionError(
                f"Could not reach vLLM sidecar at {target}: {error}"
            ) from error
        if response.status_code < 400:
            return response
        raise VLLMHTTPError(
            message=f"vLLM sidecar returned HTTP {response.status_code} "
            f"for {method} {path}",
            status_code=response.status_code,
            response_body=response.text,
        )

Methods:¶

chat_completion ¶

chat_completion(
    model,
    messages,
    temperature=0.0,
    max_tokens=None,
    chat_template_kwargs=None,
    **extra_body
)

Runs POST /v1/chat/completions and returns the decoded response.

Parameters:

Name	Type	Description	Default
`model`	`str`	Served model name - either the base model name or a registered LoRA adapter name.	required
`messages`	`List[Dict[str, Any]]`	OpenAI-style chat messages. Image inputs must be provided as `image_url` content parts with base64 data URIs (see `build_image_content_part`).	required
`temperature`	`float`	Sampling temperature (0 = greedy, matching the HF path's `do_sample=False` default).	`0.0`
`max_tokens`	`Optional[int]`	Maximum number of tokens to generate.	`None`
`chat_template_kwargs`	`Optional[Dict[str, Any]]`	Extra kwargs forwarded to the chat template (e.g. `{"enable_thinking": True}` for Qwen3.5).	`None`
`**extra_body`	`Any`	Additional fields merged into the request payload.	`{}`

Source code in inference/models/vllm_proxy/vllm_client.py

def chat_completion(
    self,
    model: str,
    messages: List[Dict[str, Any]],
    temperature: float = 0.0,
    max_tokens: Optional[int] = None,
    chat_template_kwargs: Optional[Dict[str, Any]] = None,
    **extra_body: Any,
) -> Dict[str, Any]:
    """Send `POST /v1/chat/completions` and return the decoded JSON body.

    Args:
        model: Served model name - either the base model name or a
            registered LoRA adapter name.
        messages: OpenAI-style chat messages. Image inputs must be
            provided as `image_url` content parts with base64 data URIs
            (see `build_image_content_part`).
        temperature: Sampling temperature (0 = greedy, matching the HF
            path's `do_sample=False` default).
        max_tokens: Maximum number of tokens to generate; omitted from
            the payload when None.
        chat_template_kwargs: Extra kwargs forwarded to the chat template
            (e.g. `{"enable_thinking": True}` for Qwen3.5); omitted from
            the payload when None.
        **extra_body: Additional fields merged into the request payload.

    Returns:
        The response body parsed from JSON.
    """
    optional_fields = {
        "max_tokens": max_tokens,
        "chat_template_kwargs": chat_template_kwargs,
    }
    request_body: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        # Unset optional fields are left out entirely rather than sent as null.
        **{
            field: value
            for field, value in optional_fields.items()
            if value is not None
        },
        **extra_body,
    }
    return self._request("POST", "/v1/chat/completions", json=request_body).json()

health ¶

health()

Returns True if the vLLM sidecar reports healthy.

Source code in inference/models/vllm_proxy/vllm_client.py

def health(self) -> bool:
    """Return True if `GET /health` on the sidecar answers with HTTP 200.

    Connectivity failures are reported as False instead of being raised.
    """
    try:
        status = self._session.get(
            f"{self._base_url}/health", timeout=self._request_timeout_s
        ).status_code
    except _CONNECTIVITY_ERRORS:
        return False
    return status == 200

list_models ¶

list_models()

Returns the list of served models (base + registered adapters).

Source code in inference/models/vllm_proxy/vllm_client.py

def list_models(self) -> List[Dict[str, Any]]:
    """Return the list of served models (base + registered adapters).

    Reads the `data` field of the `GET /v1/models` response and falls back
    to an empty list when that field is missing.
    """
    models_payload = self._request("GET", "/v1/models").json()
    return models_payload.get("data", [])

load_lora_adapter ¶

load_lora_adapter(name, path)

Registers a LoRA adapter stored at path under name.

Idempotent: vLLM responds with 400 and an 'already been loaded' message when the adapter is registered twice - that case is treated as success.

Source code in inference/models/vllm_proxy/vllm_client.py

def load_lora_adapter(self, name: str, path: str) -> None:
    """Register the LoRA adapter stored at `path` under `name`.

    Idempotent: vLLM responds with 400 and an 'already been loaded'
    message when the adapter is registered twice - that case is treated
    as success. Any other `VLLMHTTPError` is re-raised.
    """
    registration = {"lora_name": name, "lora_path": path}
    try:
        self._request("POST", "/v1/load_lora_adapter", json=registration)
    except VLLMHTTPError as error:
        duplicate = error.status_code == 400 and "already been loaded" in (
            error.response_body or ""
        )
        if not duplicate:
            raise
        logger.debug("LoRA adapter %s already loaded in vLLM", name)

unload_lora_adapter ¶

unload_lora_adapter(name)

Unregisters the LoRA adapter served under name.

Operational/debugging use only - the AdapterManager never calls this: with NUM_WORKERS>1 all workers share one vLLM engine, so an automatic unload from one worker would break the others. vLLM's --max-cpu-loras LRU bounds memory by itself.

Source code in inference/models/vllm_proxy/vllm_client.py

def unload_lora_adapter(self, name: str) -> None:
    """Unregister the LoRA adapter served under `name`.

    Operational/debugging use only - the AdapterManager never calls
    this: with NUM_WORKERS>1 all workers share one vLLM engine, so an
    automatic unload from one worker would break the others. vLLM's
    `--max-cpu-loras` LRU bounds memory by itself.
    """
    body = {"lora_name": name}
    self._request("POST", "/v1/unload_lora_adapter", json=body)

Functions:¶

build_image_content_part ¶

build_image_content_part(
    image_base64, mime_type="image/png"
)

Builds an OpenAI-style image_url content part with a base64 data URI.

Source code in inference/models/vllm_proxy/vllm_client.py

def build_image_content_part(
    image_base64: str, mime_type: str = "image/png"
) -> Dict[str, Any]:
    """Build an OpenAI-style `image_url` content part with a base64 data URI.

    Args:
        image_base64: Base64 text placed after `;base64,` in the data URI.
        mime_type: MIME type placed after `data:` in the data URI.

    Returns:
        A dict with `type` set to `"image_url"` and the data URI under
        `image_url.url`.
    """
    data_uri = f"data:{mime_type};base64,{image_base64}"
    return {"type": "image_url", "image_url": {"url": data_uri}}

get_correlation_value ¶

get_correlation_value()

Best-effort read of the platform correlation id for the current context.

Prefers the workflow execution_id contextvar (inference_sdk.config, set from the EXECUTION_ID_HEADER request header / by the workflow executor) and falls back to the ASGI correlation_id (request id) contextvar - the same two sources inference.core.logger injects into structured log records. Deliberately defensive: any import or lookup failure yields None so the client also works outside a request context (and in environments without these packages).

Source code in inference/models/vllm_proxy/vllm_client.py

def get_correlation_value() -> Optional[str]:
    """Best-effort lookup of the platform correlation id for the current context.

    Two sources are tried in order: the workflow `execution_id` contextvar
    from `inference_sdk.config` (set from the `EXECUTION_ID_HEADER` request
    header / by the workflow executor), then the ASGI `correlation_id`
    (request id) contextvar - the same two sources `inference.core.logger`
    injects into structured log records.

    Returns:
        The first non-empty value as a string, or None. Deliberately
        defensive: any import or lookup failure counts as "no value", so the
        client also works outside a request context (and in environments
        without these packages).
    """

    def _workflow_execution_id():
        from inference_sdk.config import execution_id

        return None if execution_id is None else execution_id.get()

    def _asgi_request_id():
        from asgi_correlation_id import correlation_id

        return correlation_id.get()

    for read_source in (_workflow_execution_id, _asgi_request_id):
        try:
            candidate = read_source()
            if candidate:
                return str(candidate)
        except Exception:
            # Deliberate best-effort: a failing source just means "try the next one".
            continue
    return None

`models/yolact`¶

inference.models.yolact.yolact_instance_segmentation ¶

Classes¶

YOLACT ¶