`inference_cli` API Reference¶

`lib`¶

Internal adapters for Docker container management, benchmarking, cloud deployment, and inference execution.

inference_cli.lib.container_adapter ¶

Functions:¶

terminate_running_containers ¶

terminate_running_containers(
    containers, interactive_mode=True
)

Parameters:

Name	Type	Description	Default
`containers`	`List[Container]`	List of containers to handle	required
`interactive_mode`	`bool`	Flag to determine if user prompt should decide on container termination	`True`

Returns:

Type	Description
`bool`	boolean value that informs if there are containers that have not received SIGKILL as a result of procedure.

Source code in inference_cli/lib/container_adapter.py

def terminate_running_containers(
    containers: List[Container], interactive_mode: bool = True
) -> bool:
    """
    Select the running containers and hand the ones to terminate to `kill_containers`.

    Args:
        containers (List[Container]): List of containers to handle
        interactive_mode (bool): Flag to determine if user prompt should decide on container termination

    Returns: boolean value that informs if there are containers that have not received SIGKILL
        as a result of procedure (i.e. fewer containers were passed to `kill_containers`
        than were found running).
    """
    # Only containers reported as running are candidates for termination.
    running = []
    for candidate in containers:
        if is_container_running(container=candidate):
            running.append(candidate)

    # Without interaction every running container is killed; otherwise the
    # user is asked about each of them individually.
    approved = running
    if interactive_mode:
        approved = list(filter(ask_user_to_kill_container, running))

    kill_containers(containers=approved)
    return len(running) > len(approved)

`lib/enterprise/inference_compiler/core/compilation_handlers`¶

inference_cli.lib.enterprise.inference_compiler.core.compilation_handlers.engine_builder ¶

Classes¶

EngineBuilder ¶

Parses an ONNX graph and builds a TensorRT engine from it.

Source code in inference_cli/lib/enterprise/inference_compiler/core/compilation_handlers/engine_builder.py

class EngineBuilder:
    """
    Parses an ONNX graph and builds a TensorRT engine from it.

    Typical call order, as enforced by the methods themselves: `create_network`
    must run before `get_static_batch_size_of_input` / `create_engine`, because
    both read the single network input through `_get_image_input`.
    """

    def __init__(
        self,
        workspace: int,
    ):
        """
        Create the TensorRT builder and its builder config.

        :param workspace: Limit of the TensorRT WORKSPACE memory pool; the value
            is multiplied by 2**30 before being passed to TensorRT.
        """
        bytes_per_unit = 2**30
        self.trt_logger = InferenceTRTLogger()
        trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
        self.builder = trt.Builder(self.trt_logger)
        self.config = self.builder.create_builder_config()
        self.config.set_memory_pool_limit(
            trt.MemoryPoolType.WORKSPACE, workspace * bytes_per_unit
        )
        # Populated later by `create_network` / `set_timing_cache_manager`.
        self.network: Optional[trt.tensorrt.INetworkDefinition] = None
        self.parser: Optional[trt.OnnxParser] = None
        self.cache_manager: Optional[TimingCacheManager] = None

    def set_timing_cache_manager(self, cache_manager: TimingCacheManager) -> None:
        """Register the manager used by `create_engine` to load and save the timing cache."""
        self.cache_manager = cache_manager

    def create_network(self, onnx_path: str) -> None:
        """
        Parse the ONNX graph and create the corresponding TensorRT network definition.
        :param onnx_path: The path to the ONNX graph to load.
        :raises NetworkParsingError: when the ONNX parser rejects the file
            (all parser errors are logged first).
        """
        self.network = self.builder.create_network(0)
        self.parser = trt.OnnxParser(self.network, self.trt_logger)

        resolved_path = os.path.realpath(onnx_path)
        with open(resolved_path, "rb") as onnx_file:
            onnx_bytes = onnx_file.read()
        if not self.parser.parse(onnx_bytes):
            LOGGER.error(f"Failed to load ONNX file: {resolved_path}")
            for error_index in range(self.parser.num_errors):
                LOGGER.error(self.parser.get_error(error_index))
            raise NetworkParsingError("Could not parse ONNX file")

        # Collect every tensor first, then log the description in one go.
        described_tensors = [
            (
                "Input",
                [
                    self.network.get_input(index)
                    for index in range(self.network.num_inputs)
                ],
            ),
            (
                "Output",
                [
                    self.network.get_output(index)
                    for index in range(self.network.num_outputs)
                ],
            ),
        ]
        LOGGER.info("Network Description")
        for role, tensors in described_tensors:
            for tensor in tensors:
                LOGGER.info(
                    f"{role} '{tensor.name}' with shape {tensor.shape} and dtype {tensor.dtype}"
                )

    def get_static_batch_size_of_input(self) -> int:
        """
        Return the first dimension of the single network input as an int.

        :raises InvalidNetworkInputsError: when the first dimension cannot be
            converted with `int()` (ValueError), or when the network does not
            have exactly one input.
        :raises TRTModelCompilationError: when no network has been created yet.
        """
        image_input = self._get_image_input()
        try:
            batch_size = int(image_input.shape[0])
        except ValueError as error:
            raise InvalidNetworkInputsError(
                f"Expected the input to have static batch size, detected shape: {image_input.shape}"
            ) from error
        else:
            # NOTE(review): only a ValueError from int() is treated as "not static".
            # If dynamic dimensions are reported as -1 (TODO confirm for TensorRT),
            # that value is returned as-is instead of raising.
            return batch_size

    def create_engine(
        self,
        engine_path: str,
        precision: Literal["fp32", "fp16"],
        input_size: Tuple[int, int],
        dynamic_batch_sizes: Optional[Tuple[int, int, int]] = None,
        trt_version_compatible: bool = False,
        same_compute_compatibility: bool = False,
    ) -> None:
        """
        Build the serialized TensorRT engine for the parsed network and write it to disk.

        :param engine_path: Destination file; made absolute, parent directories are created.
        :param precision: "fp16" sets the FP16 builder flag (after checking
            `platform_has_fast_fp16`); any other value sets no precision flag.
        :param input_size: (height, width) used for the optimization profile shapes;
            only read when `dynamic_batch_sizes` is given.
        :param dynamic_batch_sizes: (min, opt, max) batch sizes for the optimization
            profile. When falsy, the profile is added without any shape set.
        :param trt_version_compatible: When true, sets the VERSION_COMPATIBLE builder flag.
        :param same_compute_compatibility: When true, sets the hardware compatibility
            level to SAME_COMPUTE_CAPABILITY.
        :raises QuantizationNotSupportedError: fp16 requested but not reported as fast
            on this platform.
        :raises TRTModelCompilationError: the builder returned no engine, or the network
            was not created yet.
        :raises InvalidNetworkInputsError: the network does not have exactly one input.
        """
        # Seed the build with a previously stored timing cache, if a manager is set.
        if self.cache_manager:
            stored_cache = self.cache_manager.get_cache_for_features()
            timing_cache = self.config.create_timing_cache(stored_cache)
            self.config.set_timing_cache(timing_cache, ignore_mismatch=False)

        engine_path = os.path.abspath(engine_path)
        os.makedirs(os.path.dirname(engine_path), exist_ok=True)
        LOGGER.info(f"Building {precision} Engine in {engine_path}")
        input_name = self._get_image_input().name

        if precision == "fp16":
            if not self.builder.platform_has_fast_fp16:
                raise QuantizationNotSupportedError("FP16 quantization not supported")
            self.config.set_flag(trt.BuilderFlag.FP16)
        if trt_version_compatible:
            self.config.set_flag(trt.BuilderFlag.VERSION_COMPATIBLE)
        if same_compute_compatibility:
            self.config.hardware_compatibility_level = (
                trt.HardwareCompatibilityLevel.SAME_COMPUTE_CAPABILITY
            )

        profile = self.builder.create_optimization_profile()
        if dynamic_batch_sizes:
            min_batch, opt_batch, max_batch = dynamic_batch_sizes
            height, width = input_size
            # Shapes are (batch, 3, height, width); the channel count is fixed to 3 here.
            profile_shapes = tuple(
                (batch, 3, height, width)
                for batch in (min_batch, opt_batch, max_batch)
            )
            profile.set_shape(input_name, *profile_shapes)
        self.config.add_optimization_profile(profile)

        serialized_engine = self.builder.build_serialized_network(
            self.network, self.config
        )
        if serialized_engine is None:
            raise TRTModelCompilationError("Failed to create TRT engine")
        with open(engine_path, "wb") as engine_file:
            LOGGER.info(f"Serializing engine to file: {engine_path}")
            engine_file.write(serialized_engine)

        # Persist the (possibly updated) timing cache for later builds.
        if self.cache_manager:
            updated_cache = self.config.get_timing_cache()
            self.cache_manager.save_cache_for_features(cache=updated_cache.serialize())

    def _get_image_input(self) -> trt.ITensor:
        """
        Return the only input tensor of the parsed network.

        :raises TRTModelCompilationError: when `create_network` has not set a network yet.
        :raises InvalidNetworkInputsError: when the number of inputs is not exactly one.
        """
        network = self.network
        if network is None:
            raise TRTModelCompilationError(
                "Attempted to get network input before parsing the model"
            )
        inputs = [network.get_input(index) for index in range(network.num_inputs)]
        if len(inputs) == 1:
            return inputs[0]
        # NOTE(review): this branch is also reached for a network with zero inputs,
        # although the message only mentions "multiple".
        raise InvalidNetworkInputsError("Detected network with multiple inputs")

Methods:¶

create_network ¶

create_network(onnx_path)

Parse the ONNX graph and create the corresponding TensorRT network definition.

Parameters:

Name	Description
`onnx_path`	The path to the ONNX graph to load.

Source code in inference_cli/lib/enterprise/inference_compiler/core/compilation_handlers/engine_builder.py

def create_network(self, onnx_path: str) -> None:
    """
    Parse the ONNX graph and create the corresponding TensorRT network definition.
    :param onnx_path: The path to the ONNX graph to load.
    :raises NetworkParsingError: when the ONNX parser rejects the file
        (all parser errors are logged first).
    """
    self.network = self.builder.create_network(0)
    self.parser = trt.OnnxParser(self.network, self.trt_logger)

    resolved_path = os.path.realpath(onnx_path)
    with open(resolved_path, "rb") as onnx_file:
        onnx_bytes = onnx_file.read()
    if not self.parser.parse(onnx_bytes):
        LOGGER.error(f"Failed to load ONNX file: {resolved_path}")
        for error_index in range(self.parser.num_errors):
            LOGGER.error(self.parser.get_error(error_index))
        raise NetworkParsingError("Could not parse ONNX file")

    # Collect every tensor first, then log the description in one go.
    described_tensors = [
        (
            "Input",
            [
                self.network.get_input(index)
                for index in range(self.network.num_inputs)
            ],
        ),
        (
            "Output",
            [
                self.network.get_output(index)
                for index in range(self.network.num_outputs)
            ],
        ),
    ]
    LOGGER.info("Network Description")
    for role, tensors in described_tensors:
        for tensor in tensors:
            LOGGER.info(
                f"{role} '{tensor.name}' with shape {tensor.shape} and dtype {tensor.dtype}"
            )

`lib/enterprise/inference_compiler/core`¶

inference_cli.lib.enterprise.inference_compiler.core.entities ¶

Classes¶

CompilationPipelineResult `dataclass` ¶

Outcome of compile → install → register/upload.

Source code in inference_cli/lib/enterprise/inference_compiler/core/entities.py

@dataclass
class CompilationPipelineResult:
    """Outcome of compile → install → register/upload."""

    model_id: str
    model_architecture: str
    compiled: bool = False
    installed_local: bool = False
    local_package_id: Optional[str] = None
    local_install_path: Optional[str] = None
    registered_platform: bool = False
    uploaded_sealed: bool = False
    compile_error: Optional[str] = None
    register_error: Optional[str] = None
    backend: str = "onnx_cuda"
    reason: str = ""
    variant_outcomes: List[CompilationVariantOutcome] = field(default_factory=list)

    def as_log_metadata(self) -> dict:
        """
        Flatten the result into a plain dict for logging.

        The top-level entry carries the model identifiers plus the status
        fields; `variant_outcomes` becomes a list with one dict per variant,
        holding `precision`, `dynamic_batch` and the same status fields read
        from the variant object.
        """
        # Status attributes reported both for the overall result and per variant.
        status_keys = (
            "compiled",
            "installed_local",
            "local_package_id",
            "local_install_path",
            "registered_platform",
            "uploaded_sealed",
            "compile_error",
            "register_error",
            "backend",
            "reason",
        )

        metadata = {
            "model_id": self.model_id,
            "model_architecture": self.model_architecture,
        }
        for key in status_keys:
            metadata[key] = getattr(self, key)

        variants_metadata = []
        for outcome in self.variant_outcomes:
            entry = {
                "precision": outcome.precision,
                "dynamic_batch": outcome.dynamic_batch,
            }
            for key in status_keys:
                entry[key] = getattr(outcome, key)
            variants_metadata.append(entry)
        metadata["variant_outcomes"] = variants_metadata
        return metadata

PlatformRegistrationPolicy ¶

Bases: str, Enum

Whether platform register/upload must succeed for the pipeline to succeed.

Source code in inference_cli/lib/enterprise/inference_compiler/core/entities.py

class PlatformRegistrationPolicy(str, Enum):
    """Policy for the platform register/upload step of the compilation pipeline.

    NOTE(review): the earlier wording ("must succeed for the pipeline to fail")
    looks inverted. Presumably REQUIRED means a failed register/upload fails the
    whole pipeline, while OPTIONAL lets the pipeline succeed without it - confirm
    against the code that consumes this enum.
    """

    # Members are plain strings as well (str mixin), with these exact values.
    REQUIRED = "required"
    OPTIONAL = "optional"

inference_cli.lib.enterprise.inference_compiler.core.local_trt_install ¶

Install compiled TRT artefacts into inference-models models-cache for local lookup.

Functions:¶

install_compiled_trt_package ¶

install_compiled_trt_package(
    model_id,
    model_architecture,
    task_type,
    package_manifest,
    trt_config,
    engine_path,
    inference_config_path,
    class_names_path,
    compilation_directory,
    keypoints_metadata_path=None,
)

Install compiled TRT into models-cache. Returns (package_id, install_dir).

Source code in inference_cli/lib/enterprise/inference_compiler/core/local_trt_install.py

def install_compiled_trt_package(
    model_id: str,
    model_architecture: str,
    task_type: Optional[str],
    package_manifest: TRTModelPackageV1,
    trt_config: TRTConfig,
    engine_path: str,
    inference_config_path: str,
    class_names_path: str,
    compilation_directory: str,
    keypoints_metadata_path: Optional[str] = None,
) -> Tuple[str, str]:
    """Install compiled TRT into models-cache. Returns (package_id, install_dir).

    Steps performed:
      1. Derive the package id from the manifest and (re)create an empty
         install directory for it.
      2. Write the adjusted inference config and the TRT config JSON into
         `compilation_directory`.
      3. Copy every package file into the shared blobs directory under its
         MD5 hash (skipped when a blob with that hash already exists) and
         symlink the blobs into the install directory.
      4. Write the local manifest (package manifest, per-file MD5, model
         architecture, task type) into the install directory.

    `keypoints_metadata_path` is optional; when given, that file is installed
    alongside the others.
    """
    # Imported lazily inside the function, as in the rest of this module.
    from inference_cli.lib.enterprise.inference_compiler.core.compilation_handlers.default import (
        prepare_adjusted_inference_config,
    )
    from inference_models.models.auto_loaders.core import (
        create_symlinks_to_shared_blobs,
        generate_model_package_cache_path,
        generate_shared_blobs_path,
    )

    package_id = local_package_id_for_manifest(package_manifest)
    install_dir = generate_model_package_cache_path(
        model_id=model_id, package_id=package_id
    )
    # Drop any previous installation of the same package before reinstalling.
    if os.path.isdir(install_dir):
        shutil.rmtree(install_dir, ignore_errors=True)
    os.makedirs(install_dir, exist_ok=True)

    adjusted_config_path = os.path.join(
        compilation_directory, "adjusted_inference_config.json"
    )
    prepare_adjusted_inference_config(
        inference_config_path=inference_config_path,
        target_path=adjusted_config_path,
    )
    trt_config_path = os.path.join(compilation_directory, TRT_CONFIG_FILE)
    dump_json(path=trt_config_path, content=trt_config.model_dump())

    # File name inside the package -> path of the file to install.
    source_files = {
        INFERENCE_CONFIG_FILE: adjusted_config_path,
        CLASS_NAMES_FILE: class_names_path,
        TRT_CONFIG_FILE: trt_config_path,
        ENGINE_PLAN_FILE: engine_path,
    }
    if keypoints_metadata_path is not None:
        source_files[KEYPOINTS_METADATA_FILE] = keypoints_metadata_path

    blobs_dir = generate_shared_blobs_path()
    os.makedirs(blobs_dir, exist_ok=True)
    checksums = {}
    blob_paths = {}
    for file_handle, local_path in source_files.items():
        checksum = calculate_local_file_md5(file_path=local_path)
        checksums[file_handle] = checksum
        # Blobs are stored under their MD5, so identical content is copied once.
        blob_path = os.path.join(blobs_dir, checksum)
        if not os.path.isfile(blob_path):
            shutil.copy2(local_path, blob_path)
        blob_paths[file_handle] = blob_path

    create_symlinks_to_shared_blobs(
        model_dir=install_dir,
        shared_files_mapping=blob_paths,
    )

    serialized_manifest = package_manifest.model_dump(
        by_alias=True, mode="json", exclude_none=True
    )
    manifest_payload = {
        "packageManifest": serialized_manifest,
        "files": checksums,
        "modelArchitecture": model_architecture,
        "taskType": task_type,
    }
    dump_json(
        path=os.path.join(install_dir, LOCAL_TRT_MANIFEST_FILE),
        content=manifest_payload,
    )

    logger.info(
        "Installed local TRT package model_id=%s package_id=%s path=%s backend=trt "
        "compiled=true installed_local=true files=%s",
        model_id,
        package_id,
        install_dir,
        list(source_files),
    )
    return package_id, install_dir

`lib/roboflow_cloud/data_staging`¶

Data staging operations for uploading and managing data in the Roboflow cloud.

inference_cli.lib.roboflow_cloud.data_staging.api_operations ¶

Functions:¶

create_images_batch_from_cloud_storage ¶

create_images_batch_from_cloud_storage(
    bucket_path,
    batch_id,
    api_key,
    batch_name=None,
    ingest_id=None,
    notifications_url=None,
    notification_categories=None,
    presign_expiration_seconds=86400,
)

Create image batch from cloud storage by generating presigned URLs.

Parameters:

Name	Type	Description	Default
`bucket_path`	`str`	Cloud path with optional glob pattern (e.g., 's3://bucket/**/*.jpg')	required
`batch_id`	`str`	Batch identifier	required
`api_key`	`str`	Roboflow API key	required
`presign_expiration_seconds`	`int`	Presigned URL expiration time (default: 24 hours)	`86400`

Internally calls trigger_images_references_ingest with generated presigned URLs.

Source code in inference_cli/lib/roboflow_cloud/data_staging/api_operations.py

def create_images_batch_from_cloud_storage(
    bucket_path: str,
    batch_id: str,
    api_key: str,
    batch_name: Optional[str] = None,
    ingest_id: Optional[str] = None,
    notifications_url: Optional[str] = None,
    notification_categories: Optional[List[str]] = None,
    presign_expiration_seconds: int = 86400,
) -> None:
    """
    Create image batch from cloud storage by generating presigned URLs.

    Args:
        bucket_path: Cloud path with optional glob pattern (e.g., 's3://bucket/**/*.jpg')
        batch_id: Batch identifier
        api_key: Roboflow API key
        batch_name: Forwarded unchanged to trigger_images_references_ingest
        ingest_id: Forwarded unchanged to trigger_images_references_ingest
            (the same value is used for every chunk)
        notifications_url: Forwarded to trigger_images_references_ingest; also
            decides which progress hint is printed at the end
        notification_categories: Forwarded unchanged to trigger_images_references_ingest
        presign_expiration_seconds: Presigned URL expiration time (default: 24 hours)

    Raises:
        ImportError: when `fsspec` cannot be imported
        ValueError: when no presigned references were generated

    Internally calls trigger_images_references_ingest with generated presigned URLs,
    once per chunk of at most MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST references.
    """
    try:
        import fsspec
    except ImportError:
        raise ImportError(
            "Cloud storage support requires additional dependencies. "
            "Install with: pip install 'inference-cli[cloud-storage]'"
        )

    base_path, glob_pattern = _parse_bucket_path(bucket_path)
    # Everything before the first "://" selects the fsspec filesystem.
    protocol, _, _ = base_path.partition("://")
    fs = fsspec.filesystem(protocol, **_get_fs_kwargs(protocol))

    # The file listing is streamed straight into the presigned-URL generation,
    # which consumes it and yields the final references.
    references = _generate_presigned_urls_parallel(
        fs,
        _list_and_filter_files_streaming(
            fs, base_path, glob_pattern, IMAGES_EXTENSIONS
        ),
        base_path,
        presign_expiration_seconds,
    )

    references_count = len(references)
    if references_count == 0:
        pattern_desc = glob_pattern or "all image files"
        raise ValueError(
            f"No image files found matching pattern: {pattern_desc} in {base_path}\n"
            f"Supported extensions: {', '.join(IMAGES_EXTENSIONS)}\n"
            f"Note: If you're getting connection errors, check your cloud credentials and network access."
        )

    if references_count > MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST:
        # Ceiling division: number of chunks needed to hold all references.
        num_chunks = (
            references_count - 1
        ) // MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST + 1
        print(
            f"Files will be split into {num_chunks} chunks of up to {MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST} files each"
        )

    workspace = get_workspace(api_key=api_key)

    ingest_parts = list(
        create_batches(
            sequence=references, batch_size=MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST
        )
    )
    if len(ingest_parts) > 1:
        print(
            f"Your ingest exceeds {MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST} files - we split the ingest "
            f"into {len(ingest_parts)} chunks."
        )

    # One ingest request per chunk, all sharing the same batch and options.
    for chunk in ingest_parts:
        response = trigger_images_references_ingest(
            workspace=workspace,
            batch_id=batch_id,
            references=chunk,
            api_key=api_key,
            ingest_id=ingest_id,
            batch_name=batch_name,
            notifications_url=notifications_url,
            notification_categories=notification_categories,
        )
        print(f"Ingest triggered. Ingest ID: {response.ingest_id}")

    details_command = (
        f"inference rf-cloud data-staging list-ingest-details --batch-id {batch_id}"
    )
    if not notifications_url:
        print(
            f"Use `{details_command}` "
            "command to watch the ingest progress. If you want automated updates - use `--notifications-url` option "
            "of this command."
        )
        return
    print(f"Monitor updates that will be sent to: {notifications_url}")
    print(f"You can also use `{details_command}` command to check progress.")

create_videos_batch_from_cloud_storage ¶

create_videos_batch_from_cloud_storage(
    bucket_path,
    batch_id,
    api_key,
    batch_name=None,
    ingest_id=None,
    notifications_url=None,
    notification_categories=None,
    presign_expiration_seconds=86400,
)

Create video batch from cloud storage by generating presigned URLs.

Parameters:

Name	Type	Description	Default
`bucket_path`	`str`	Cloud path with optional glob pattern (e.g., 's3://bucket/**/*.mp4')	required
`batch_id`	`str`	Batch identifier	required
`api_key`	`str`	Roboflow API key	required
`presign_expiration_seconds`	`int`	Presigned URL expiration time (default: 24 hours)	`86400`

Internally calls trigger_videos_references_ingest with generated presigned URLs.

Source code in inference_cli/lib/roboflow_cloud/data_staging/api_operations.py

def create_videos_batch_from_cloud_storage(
    bucket_path: str,
    batch_id: str,
    api_key: str,
    batch_name: Optional[str] = None,
    ingest_id: Optional[str] = None,
    notifications_url: Optional[str] = None,
    notification_categories: Optional[List[str]] = None,
    presign_expiration_seconds: int = 86400,
) -> None:
    """
    Create video batch from cloud storage by generating presigned URLs.

    Args:
        bucket_path: Cloud path with optional glob pattern (e.g., 's3://bucket/**/*.mp4')
        batch_id: Batch identifier
        api_key: Roboflow API key
        batch_name: Forwarded unchanged to trigger_videos_references_ingest
        ingest_id: Forwarded unchanged to trigger_videos_references_ingest
        notifications_url: Forwarded to trigger_videos_references_ingest; also
            decides which progress hint is printed at the end
        notification_categories: Forwarded unchanged to trigger_videos_references_ingest
        presign_expiration_seconds: Presigned URL expiration time (default: 24 hours)

    Raises:
        ImportError: when `fsspec` cannot be imported
        ValueError: when no presigned references were generated

    Internally calls trigger_videos_references_ingest with generated presigned URLs,
    in a single request (a warning is printed above SUGGESTED_MAX_VIDEOS_IN_BATCH).
    """
    try:
        import fsspec
    except ImportError:
        raise ImportError(
            "Cloud storage support requires additional dependencies. "
            "Install with: pip install 'inference-cli[cloud-storage]'"
        )

    base_path, glob_pattern = _parse_bucket_path(bucket_path)
    # Everything before the first "://" selects the fsspec filesystem.
    protocol, _, _ = base_path.partition("://")
    fs = fsspec.filesystem(protocol, **_get_fs_kwargs(protocol))

    # The file listing is streamed straight into the presigned-URL generation,
    # which consumes it and yields the final references.
    references = _generate_presigned_urls_parallel(
        fs,
        _list_and_filter_files_streaming(
            fs, base_path, glob_pattern, VIDEOS_EXTENSIONS
        ),
        base_path,
        presign_expiration_seconds,
    )

    if len(references) == 0:
        pattern_desc = glob_pattern or "all video files"
        raise ValueError(
            f"No video files found matching pattern: {pattern_desc} in {base_path}\n"
            f"Supported extensions: {', '.join(VIDEOS_EXTENSIONS)}\n"
            f"Note: If you're getting connection errors, check your cloud credentials and network access."
        )

    print(f"Found {len(references)} video files")
    # Exceeding the suggested maximum only warns; the ingest still goes ahead.
    if len(references) > SUGGESTED_MAX_VIDEOS_IN_BATCH:
        print(
            f"Warning: Found {len(references)} videos. Suggested max is {SUGGESTED_MAX_VIDEOS_IN_BATCH} videos per batch."
        )

    workspace = get_workspace(api_key=api_key)

    response = trigger_videos_references_ingest(
        workspace=workspace,
        batch_id=batch_id,
        references=references,
        api_key=api_key,
        ingest_id=ingest_id,
        batch_name=batch_name,
        notifications_url=notifications_url,
        notification_categories=notification_categories,
    )
    print(f"Ingest triggered. Ingest ID: {response.ingest_id}")

    details_command = (
        f"inference rf-cloud data-staging list-ingest-details --batch-id {batch_id}"
    )
    if not notifications_url:
        print(
            f"Use `{details_command}` "
            "command to watch the ingest progress. If you want automated updates - use `--notifications-url` option "
            "of this command."
        )
        return
    print(f"Monitor updates that will be sent to: {notifications_url}")
    print(f"You can also use `{details_command}` command to check progress.")

`lib/workflows`¶

Workflow execution adapters for local images, remote images, and video sources.

inference_cli API Reference¶

lib¶

inference_cli.lib.container_adapter ¶

Functions:¶

terminate_running_containers ¶

lib/enterprise/inference_compiler/core/compilation_handlers¶

inference_cli.lib.enterprise.inference_compiler.core.compilation_handlers.engine_builder ¶

Classes¶

EngineBuilder ¶

Methods:¶

create_network ¶

lib/enterprise/inference_compiler/core¶

inference_cli.lib.enterprise.inference_compiler.core.entities ¶

Classes¶

CompilationPipelineResult dataclass ¶

PlatformRegistrationPolicy ¶

inference_cli.lib.enterprise.inference_compiler.core.local_trt_install ¶

Functions:¶

install_compiled_trt_package ¶

lib/roboflow_cloud/data_staging¶

inference_cli.lib.roboflow_cloud.data_staging.api_operations ¶

Functions:¶

create_images_batch_from_cloud_storage ¶

create_videos_batch_from_cloud_storage ¶

lib/workflows¶

inference_cli.lib.workflows.local_image_adapter ¶

Classes¶

Functions:¶

`inference_cli` API Reference¶

`lib`¶

`lib/enterprise/inference_compiler/core/compilation_handlers`¶

`lib/enterprise/inference_compiler/core`¶

CompilationPipelineResult `dataclass` ¶

`lib/roboflow_cloud/data_staging`¶

`lib/workflows`¶