Skip to content

inference API Reference

core/active_learning

Active learning loop: sampling strategies, data collection middleware, and configuration.

inference.core.active_learning.accounting

Functions:

get_images_in_labeling_jobs_of_specific_batch

get_images_in_labeling_jobs_of_specific_batch(
    all_labeling_jobs, batch_id
)

Get the number of images in labeling jobs of a specific batch.

Parameters:

Name Type Description Default
all_labeling_jobs List[dict]

All labeling jobs.

required
batch_id str

ID of the batch.

required

Returns:

Type Description
int

The number of images in labeling jobs of the batch.

Source code in inference/core/active_learning/accounting.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def get_images_in_labeling_jobs_of_specific_batch(
    all_labeling_jobs: List[dict],
    batch_id: str,
) -> int:
    """Count the images held by labeling jobs that originate from one batch.

    A job is attributed to the batch when ``batch_id`` is contained in the
    job's ``"sourceBatch"`` value (an ``in`` test, not an equality test).

    Args:
        all_labeling_jobs: Labeling job records; each one is indexed by
            ``"sourceBatch"`` and, when it matches, by ``"numImages"``.
        batch_id: ID of the batch.

    Returns:
        The sum of ``"numImages"`` over the matching jobs (``0`` when no
        job matches).

    """
    # Select first, then add up, so every job is inspected before any count
    # is read.
    jobs_of_batch = [
        job for job in all_labeling_jobs if batch_id in job["sourceBatch"]
    ]
    image_count = 0
    for job in jobs_of_batch:
        image_count += job["numImages"]
    return image_count

get_matching_labeling_batch

get_matching_labeling_batch(
    all_labeling_batches, batch_name
)

Get the matching labeling batch.

Parameters:

Name Type Description Default
all_labeling_batches List[dict]

All labeling batches.

required
batch_name str

Name of the batch.

required

Returns:

Type Description
Optional[dict]

The matching labeling batch if found, None otherwise.

Source code in inference/core/active_learning/accounting.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def get_matching_labeling_batch(
    all_labeling_batches: List[dict],
    batch_name: str,
) -> Optional[dict]:
    """Find the first labeling batch whose ``"name"`` equals ``batch_name``.

    Args:
        all_labeling_batches: Labeling batch records, each indexed by
            ``"name"``.
        batch_name: Name of the batch to look for.

    Returns:
        The first record with a matching name, or ``None`` when there is
        no such record.

    """
    for candidate in all_labeling_batches:
        if candidate["name"] == batch_name:
            return candidate
    return None

image_can_be_submitted_to_batch

image_can_be_submitted_to_batch(
    batch_name,
    workspace_id,
    dataset_id,
    max_batch_images,
    api_key,
)

Check if an image can be submitted to a batch.

Parameters:

Name Type Description Default
batch_name str

Name of the batch.

required
workspace_id WorkspaceID

ID of the workspace.

required
dataset_id DatasetID

ID of the dataset.

required
max_batch_images Optional[int]

Maximum number of images allowed in the batch.

required
api_key str

API key to use for the request.

required

Returns:

Type Description
bool

True if the image can be submitted to the batch, False otherwise.

Source code in inference/core/active_learning/accounting.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def image_can_be_submitted_to_batch(
    batch_name: str,
    workspace_id: WorkspaceID,
    dataset_id: DatasetID,
    max_batch_images: Optional[int],
    api_key: str,
) -> bool:
    """Decide whether the named batch still has room for one more image.

    Args:
        batch_name: Name of the batch.
        workspace_id: ID of the workspace.
        dataset_id: ID of the dataset.
        max_batch_images: Maximum number of images allowed in the batch;
            ``None`` disables the limit.
        api_key: API key to use for the request.

    Returns:
        True if the image can be submitted to the batch, False otherwise.
    """
    # No limit configured: nothing to check, and no API request is made.
    if max_batch_images is None:
        return True
    batches_response = get_roboflow_labeling_batches(
        api_key=api_key,
        workspace_id=workspace_id,
        dataset_id=dataset_id,
    )
    target_batch = get_matching_labeling_batch(
        all_labeling_batches=batches_response["batches"],
        batch_name=batch_name,
    )
    # No batch with that name was found: it is treated as holding zero images.
    if target_batch is None:
        return max_batch_images > 0
    images_in_jobs = 0
    # Labeling jobs are fetched only when the batch reports having some.
    if target_batch["numJobs"] > 0:
        jobs_response = get_roboflow_labeling_jobs(
            api_key=api_key, workspace_id=workspace_id, dataset_id=dataset_id
        )
        images_in_jobs = get_images_in_labeling_jobs_of_specific_batch(
            all_labeling_jobs=jobs_response["jobs"],
            batch_id=target_batch["id"],
        )
    # The batch's own images plus the images counted in its labeling jobs.
    return max_batch_images > target_batch["images"] + images_in_jobs

inference.core.active_learning.configuration

Classes

Functions:

predictions_incompatible_with_dataset

predictions_incompatible_with_dataset(
    model_type, dataset_type
)

The incompatibility occurs when we mix classification with detection - as detection-based predictions are partially compatible (for instance - for key-points detection we may register bboxes from object detection and manually provide key-points annotations)

Source code in inference/core/active_learning/configuration.py
206
207
208
209
210
211
212
213
214
215
216
217
def predictions_incompatible_with_dataset(
    model_type: str,
    dataset_type: str,
) -> bool:
    """
    Report whether model predictions cannot be registered in the dataset.

    The only incompatibility is mixing classification with detection: exactly
    one of the two type strings contains ``CLASSIFICATION_TASK``. Detection-based
    predictions are treated as partially compatible with each other (for
    instance - for key-points detection we may register bboxes from object
    detection and manually provide key-points annotations).
    """
    return (CLASSIFICATION_TASK in model_type) != (
        CLASSIFICATION_TASK in dataset_type
    )

core/cache

Caching backends (in-memory, Redis) used for model artefacts and inference results.

inference.core.cache.air_gapped

Utilities for discovering models and foundation-model weights in the local cache.

Used by the air-gapped workflow builder to enumerate what is available for offline workflow construction.

Functions:

get_cached_foundation_models

get_cached_foundation_models(blocks=None)

Return metadata for workflow blocks whose required weights are cached.

Each block whose manifest class exposes get_supported_model_variants is inspected. If any variant it declares is present in the local cache the block is included in the result list.

Parameters:

Name Type Description Default
blocks Optional[list]

Optional pre-loaded list of block specifications. When None (the default) the blocks are loaded via the engine's block loader.

None
Source code in inference/core/cache/air_gapped.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def get_cached_foundation_models(
    blocks: Optional[list] = None,
) -> List[Dict[str, Any]]:
    """List workflow blocks that have at least one supported model variant cached.

    For every block, ``manifest_class.get_supported_model_variants()`` is
    called.  Blocks returning *None*, or whose variants are all absent from
    the local cache, are left out.

    Each entry has the keys ``model_id`` (the first declared variant),
    ``name`` (the ``"name"`` entry of the manifest JSON schema, falling back
    to ``model_id``), ``task_type`` and ``model_architecture`` (always empty
    strings), ``is_foundation`` (always ``True``) and ``block_type``.

    Args:
        blocks: Optional pre-loaded list of block specifications.  When
            *None* (the default) the blocks are loaded via ``_load_blocks()``;
            if that fails, the failure is logged at debug level and an empty
            list is returned.
    """
    found: List[Dict[str, Any]] = []
    if blocks is None:
        try:
            blocks = _load_blocks()
        except Exception:
            logger.debug(
                "Could not load workflow blocks for foundation model scan",
                exc_info=True,
            )
            return found

    for spec in blocks:
        manifest = spec.manifest_class
        variants = manifest.get_supported_model_variants()
        if variants is None or not has_cached_model_variant(variants):
            continue

        primary_id = variants[0] if variants else ""

        # Best effort: any failure while reading the schema keeps the
        # variant ID as the display name.
        try:
            display_name = manifest.model_json_schema().get("name", primary_id)
        except Exception:
            display_name = primary_id

        found.append(
            {
                "model_id": primary_id,
                "name": display_name,
                "task_type": "",
                "model_architecture": "",
                "is_foundation": True,
                "block_type": _get_block_type_identifier(spec),
            }
        )

    return found

get_task_type_to_block_mapping

get_task_type_to_block_mapping(blocks=None)

Build a reverse mapping from task_type to compatible block type identifiers.

Uses get_compatible_task_types() on block manifests. Blocks whose method returns None (the base-class default) are skipped.

Parameters:

Name Type Description Default
blocks Optional[list]

Optional pre-loaded list of block specifications. When None (the default) the blocks are loaded via the engine's block loader.

None
Source code in inference/core/cache/air_gapped.py
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def get_task_type_to_block_mapping(
    blocks: Optional[list] = None,
) -> Dict[str, List[str]]:
    """Group block type identifiers by the task types they are compatible with.

    ``manifest_class.get_compatible_task_types()`` is called for each block;
    blocks for which it returns *None* are skipped.  Identifiers appear in
    the order the blocks are visited.

    Args:
        blocks: Optional pre-loaded list of block specifications.  When
            *None* (the default) the blocks are loaded via ``_load_blocks()``;
            if that fails, the failure is logged at debug level and an empty
            mapping is returned.
    """
    blocks_by_task: Dict[str, List[str]] = {}
    if blocks is None:
        try:
            blocks = _load_blocks()
        except Exception:
            logger.debug(
                "Could not load workflow blocks for task-type mapping",
                exc_info=True,
            )
            return blocks_by_task

    for spec in blocks:
        supported_tasks = spec.manifest_class.get_compatible_task_types()
        if supported_tasks is None:
            continue

        identifier = _get_block_type_identifier(spec)
        for task_type in supported_tasks:
            if task_type not in blocks_by_task:
                blocks_by_task[task_type] = []
            blocks_by_task[task_type].append(identifier)

    return blocks_by_task

has_cached_model_variant

has_cached_model_variant(model_variants)

Return True if any of the given model variant IDs has cached artifacts.

Parameters:

Name Type Description Default
model_variants Optional[List[str]]

List of model IDs as returned by WorkflowBlockManifest.get_supported_model_variants(). Returns False when None or empty.

required
Source code in inference/core/cache/air_gapped.py
87
88
89
90
91
92
93
94
95
96
97
def has_cached_model_variant(model_variants: Optional[List[str]]) -> bool:
    """Tell whether at least one of the given model variant IDs is cached.

    Args:
        model_variants: List of model IDs as returned by
            ``WorkflowBlockManifest.get_supported_model_variants()``.
            ``None`` and an empty list both yield ``False``.
    """
    if not model_variants:
        return False
    # Stop at the first variant that ``is_model_cached`` reports as present.
    for variant_id in model_variants:
        if is_model_cached(variant_id):
            return True
    return False

is_model_cached

is_model_cached(model_id)

Best-effort check whether model_id has cached artifacts.

Checks both the traditional and inference-models cache layouts, respecting the USE_INFERENCE_MODELS flag to avoid false positives from one layout when the runtime uses the other.

.. note::

This is intentionally optimistic — a directory with non-hidden files is assumed to contain a usable model. Full integrity verification (hash checks, registry validation) happens at model-load time inside inference-models. Treat the result as "there is a chance the model is cached" rather than a guarantee.

Source code in inference/core/cache/air_gapped.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def is_model_cached(model_id: str) -> bool:
    """Best-effort check whether *model_id* has cached artifacts.

    The traditional layout (``MODEL_CACHE_DIR/<model_id>``) is always
    checked.  The ``inference-models`` layout
    (``MODEL_CACHE_DIR/models-cache/<slug>``) is checked only when
    ``USE_INFERENCE_MODELS`` is enabled, so it cannot produce a false
    positive while the runtime uses the traditional layout.

    .. note::

       This is intentionally optimistic — a directory with non-hidden files
       is assumed to contain a usable model.  Full integrity verification
       (hash checks, registry validation) happens at model-load time inside
       ``inference-models``.  Treat the result as *"there is a chance the
       model is cached"* rather than a guarantee.
    """
    legacy_dir = os.path.join(MODEL_CACHE_DIR, model_id)
    legacy_hit = os.path.isdir(legacy_dir) and _has_non_hidden_children(legacy_dir)

    if not USE_INFERENCE_MODELS:
        # With inference-models disabled the traditional layout is the only one.
        return legacy_hit

    # Models cached before the migration still sit in the traditional tree.
    if legacy_hit:
        return True

    migrated_dir = os.path.join(
        MODEL_CACHE_DIR, "models-cache", _slugify_model_id(model_id)
    )
    if not os.path.isdir(migrated_dir):
        return False
    if _has_non_hidden_children(migrated_dir):
        return True
    return False

scan_cached_models

scan_cached_models(cache_dir)

Walk cache_dir looking for model_type.json marker files.

Each marker is written by the model registry when a model is first downloaded. The file contains at least project_task_type and model_type keys.

Returns a list of dicts with the following shape::

{
    "model_id": "workspace/project/3",
    "name": "workspace/project/3",
    "task_type": "object-detection",
    "model_architecture": "yolov8n",
    "is_foundation": False,
}
Source code in inference/core/cache/air_gapped.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def scan_cached_models(cache_dir: str) -> List[Dict[str, Any]]:
    """Walk *cache_dir* looking for ``model_type.json`` marker files.

    Each marker is written by the model registry when a model is first
    downloaded.  The file contains at least ``project_task_type`` and
    ``model_type`` keys.

    Markers are skipped (the scan continues) when they sit directly in
    *cache_dir*, cannot be read or decoded, are not valid JSON, do not hold
    a JSON object, or carry no task type.  Top-level directories listed in
    ``_SKIP_TOP_LEVEL`` are not descended into.

    Args:
        cache_dir: Root of the model cache.  When it is not an existing
            directory an empty list is returned.

    Returns a list of dicts with the following shape::

        {
            "model_id": "workspace/project/3",
            "name": "workspace/project/3",
            "task_type": "object-detection",
            "model_architecture": "yolov8n",
            "is_foundation": False,
        }
    """
    results: List[Dict[str, Any]] = []
    if not os.path.isdir(cache_dir):
        return results

    for root, dirs, files in os.walk(cache_dir):
        # Prune top-level directories we know are not model trees.
        rel = os.path.relpath(root, cache_dir)
        if rel == ".":
            dirs[:] = [d for d in dirs if d not in _SKIP_TOP_LEVEL]
            continue

        if "model_type.json" not in files:
            continue

        model_type_path = os.path.join(root, "model_type.json")
        try:
            with open(model_type_path, "r") as fh:
                metadata = json.load(fh)
        except (ValueError, OSError) as exc:
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError, so a marker holding undecodable bytes is
            # skipped like any other unreadable marker instead of aborting
            # the whole scan.
            logger.warning(
                "Skipping unreadable model_type.json at %s: %s",
                model_type_path,
                exc,
            )
            continue

        if not isinstance(metadata, dict):
            continue

        # Support both traditional keys and inference-models metadata keys.
        task_type = metadata.get(PROJECT_TASK_TYPE_KEY) or metadata.get("taskType", "")
        model_architecture = metadata.get(MODEL_TYPE_KEY) or metadata.get(
            "modelArchitecture", ""
        )

        if not task_type:
            continue

        # The model ID is the marker directory relative to the cache root,
        # with path separators normalised on Windows.
        model_id = rel.replace(os.sep, "/")

        results.append(
            {
                "model_id": model_id,
                "name": model_id,
                "task_type": task_type,
                "model_architecture": model_architecture,
                "is_foundation": False,
            }
        )

    return results

inference.core.cache.base

Classes

BaseCache

BaseCache is an abstract base class that defines the interface for a cache.

Source code in inference/core/cache/base.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class BaseCache:
    """
    BaseCache defines the interface every cache backend implements.

    All storage operations raise ``NotImplementedError`` here; only ``lock``
    has a concrete implementation, built on top of ``acquire_lock``.
    """

    def get(self, key: str):
        """
        Gets the value associated with the given key.

        Args:
            key (str): The key to retrieve the value.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    def set(self, key: str, value: str, expire: Optional[float] = None):
        """
        Sets a value for a given key with an optional expire time.

        Args:
            key (str): The key to store the value.
            value (str): The value to store.
            expire (float, optional): The time, in seconds, after which the key will expire. Defaults to None.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    def zadd(
        self, key: str, value: str, score: float, expire: Optional[float] = None
    ):
        """
        Adds a member with the specified score to the sorted set stored at key.

        Args:
            key (str): The key of the sorted set.
            value (str): The value to add to the sorted set.
            score (float): The score associated with the value.
            expire (float, optional): The time, in seconds, after which the key will expire. Defaults to None.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    def zrangebyscore(
        self,
        key: str,
        min: Optional[float] = -1,
        max: Optional[float] = float("inf"),
        withscores: bool = False,
    ):
        """
        Retrieves the members of a sorted set whose scores fall within a range.

        Args:
            key (str): The key of the sorted set.
            min (float, optional): The minimum score of the range. Defaults to -1.
            max (float, optional): The maximum score of the range. Defaults to float("inf").
            withscores (bool, optional): Whether to return the scores along with the values. Defaults to False.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    def zremrangebyscore(
        self,
        key: str,
        start: Optional[float] = -1,
        stop: Optional[float] = float("inf"),
    ):
        """
        Removes all members in a sorted set within the given scores.

        Args:
            key (str): The key of the sorted set.
            start (float, optional): The minimum score of the range. Defaults to -1.
            stop (float, optional): The maximum score of the range. Defaults to float("inf").

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    def acquire_lock(self, key: str, expire: Optional[float] = None) -> Any:
        """
        Acquires a lock for the given cache key.

        ``lock`` calls ``release()`` on the returned object.

        Args:
            key (str): The cache key to lock.
            expire (float, optional): Expiry passed through by ``lock``. Defaults to None.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    @contextmanager
    def lock(self, key: str, expire: Optional[float] = None) -> Any:
        """
        Context manager that holds the lock for ``key`` for the duration of the ``with`` body.

        The lock object returned by ``acquire_lock`` is yielded and released
        on exit, including when the body raises.  A ``LockNotOwnedError``
        raised by the release is logged as a warning and suppressed.

        Args:
            key (str): The cache key to lock.
            expire (float, optional): Forwarded to ``acquire_lock``. Defaults to None.
        """
        # codeql[py/clear-text-logging-sensitive-data]: Cache lock keys; non-secret.
        logger.debug(f"Acquiring lock at cache key: {key}")
        held_lock = self.acquire_lock(key, expire=expire)
        try:
            yield held_lock
        finally:
            # codeql[py/clear-text-logging-sensitive-data]: Cache lock keys; non-secret.
            logger.debug(f"Releasing lock at cache key: {key}")
            try:
                held_lock.release()
            except LockNotOwnedError:
                # Lock TTL expired before release - this is expected in some cases
                # codeql[py/clear-text-logging-sensitive-data]: TTL expiry on cache key.
                logger.warning(f"Lock at cache key {key} expired before release")

    def set_numpy(self, key: str, value: Any, expire: Optional[float] = None):
        """
        Caches a numpy array.

        Args:
            key (str): The key to store the value.
            value (Any): The value to store.
            expire (float, optional): The time, in seconds, after which the key will expire. Defaults to None.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()

    def get_numpy(self, key: str) -> Any:
        """
        Retrieves a numpy array from the cache.

        Args:
            key (str): The key of the value to retrieve.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError()
Methods:
get
get(key)

Gets the value associated with the given key.

Parameters:

Name Type Description Default
key str

The key to retrieve the value.

required

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
14
15
16
17
18
19
20
21
22
23
24
def get(self, key: str):
    """
    Look up the value stored under ``key``.

    Args:
        key (str): The key whose value is requested.

    Raises:
        NotImplementedError: Always; concrete caches override this method.
    """
    raise NotImplementedError
get_numpy
get_numpy(key)

Retrieves a numpy array from the cache.

Parameters:

Name Type Description Default
key str

The key of the value to retrieve.

required

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
129
130
131
132
133
134
135
136
137
138
139
def get_numpy(self, key: str) -> Any:
    """
    Fetch a numpy array from the cache.

    Args:
        key (str): The key under which the array was stored.

    Raises:
        NotImplementedError: Always; concrete caches override this method.
    """
    raise NotImplementedError
set
set(key, value, expire=None)

Sets a value for a given key with an optional expire time.

Parameters:

Name Type Description Default
key str

The key to store the value.

required
value str

The value to store.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
26
27
28
29
30
31
32
33
34
35
36
37
38
def set(self, key: str, value: str, expire: float = None):
    """
    Store ``value`` under ``key``, optionally with an expiry.

    Args:
        key (str): The key to store the value under.
        value (str): The value to store.
        expire (float, optional): Number of seconds after which the key expires. Defaults to None.

    Raises:
        NotImplementedError: Always; concrete caches override this method.
    """
    raise NotImplementedError
set_numpy
set_numpy(key, value, expire=None)

Caches a numpy array.

Parameters:

Name Type Description Default
key str

The key to store the value.

required
value Any

The value to store.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
115
116
117
118
119
120
121
122
123
124
125
126
127
def set_numpy(self, key: str, value: Any, expire: float = None):
    """
    Store a numpy array in the cache.

    Args:
        key (str): The key to store the array under.
        value (Any): The array to store.
        expire (float, optional): Number of seconds after which the key expires. Defaults to None.

    Raises:
        NotImplementedError: Always; concrete caches override this method.
    """
    raise NotImplementedError
zadd
zadd(key, value, score, expire=None)

Adds a member with the specified score to the sorted set stored at key.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
value str

The value to add to the sorted set.

required
score float

The score associated with the value.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def zadd(self, key: str, value: str, score: float, expire: float = None):
    """
    Insert a scored member into the sorted set stored at ``key``.

    Args:
        key (str): The key of the sorted set.
        value (str): The member to add to the sorted set.
        score (float): The score attached to the member.
        expire (float, optional): Number of seconds after which the key expires. Defaults to None.

    Raises:
        NotImplementedError: Always; concrete caches override this method.
    """
    raise NotImplementedError
zrangebyscore
zrangebyscore(
    key, min=-1, max=float("inf"), withscores=False
)

Retrieves a range of members from a sorted set.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
min Optional[float]

The minimum score of the range. Defaults to -1.

-1
max Optional[float]

The maximum score of the range. Defaults to float("inf").

float('inf')
withscores bool

Whether to return the scores along with the values. Defaults to False.

False

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def zrangebyscore(
    self,
    key: str,
    min: Optional[float] = -1,
    max: Optional[float] = float("inf"),
    withscores: bool = False,
):
    """
    Retrieves the members of a sorted set whose scores fall within a range.

    Args:
        key (str): The key of the sorted set.
        min (float, optional): The minimum score of the range. Defaults to -1.
        max (float, optional): The maximum score of the range. Defaults to float("inf").
        withscores (bool, optional): Whether to return the scores along with the values. Defaults to False.

    Raises:
        NotImplementedError: This method must be implemented by subclasses.
    """
    raise NotImplementedError()
zremrangebyscore
zremrangebyscore(key, start=-1, stop=float('inf'))

Removes all members in a sorted set within the given scores.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
start int

The minimum score of the range. Defaults to -1.

-1
stop int

The maximum score of the range. Defaults to float("inf").

float('inf')

Raises:

Type Description
NotImplementedError

This method must be implemented by subclasses.

Source code in inference/core/cache/base.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def zremrangebyscore(
    self,
    key: str,
    start: Optional[float] = -1,
    stop: Optional[float] = float("inf"),
):
    """
    Removes all members in a sorted set within the given scores.

    Args:
        key (str): The key of the sorted set.
        start (float, optional): The minimum score of the range. Defaults to -1.
        stop (float, optional): The maximum score of the range. Defaults to float("inf").

    Raises:
        NotImplementedError: This method must be implemented by subclasses.
    """
    raise NotImplementedError()

inference.core.cache.memory

Classes

MemoryCache

Bases: BaseCache

MemoryCache is an in-memory cache that implements the BaseCache interface.

Attributes:

Name Type Description
cache dict

A dictionary to store the cache values.

expires dict

A dictionary to store the expiration times of the cache values.

zexpires dict

A dictionary to store the expiration times of the sorted set values.

_expire_thread Thread

A thread that runs the _expire method.

Source code in inference/core/cache/memory.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
class MemoryCache(BaseCache):
    """
    MemoryCache is an in-memory cache that implements the BaseCache interface.

    Plain values and sorted sets share the ``cache`` dictionary. A sorted set is
    stored as a ``{score: value}`` dictionary, so every score holds at most one
    member.

    Attributes:
        cache (dict): A dictionary to store the cache values.
        expires (dict): Maps a key to the absolute time (``time.time()`` based)
            at which its value expires.
        zexpires (dict): Maps ``(key, score)`` to the absolute time at which
            that sorted set member expires.
        _expire_thread (threading.Thread): A daemon thread that runs the _expire method.
    """

    def __init__(self) -> None:
        """
        Initializes a new instance of the MemoryCache class and starts the
        background expiry thread.
        """
        self.cache = dict()
        self.expires = dict()
        self.zexpires = dict()

        self._expire_thread = threading.Thread(target=self._expire, daemon=True)
        self._expire_thread.start()

    def _expire(self):
        """
        Removes the expired keys from the cache, expires and zexpires dictionaries.

        This method runs in an infinite loop and sleeps for
        MEMORY_CACHE_EXPIRE_INTERVAL seconds between each iteration.
        """
        while True:
            now = time.time()
            # Iterate over snapshots: other threads mutate these dictionaries.
            for key, expires_at in self.expires.copy().items():
                if expires_at < now:
                    # pop() instead of del: get() may already have evicted the
                    # entry, and a KeyError here would end this thread for good.
                    self.cache.pop(key, None)
                    self.expires.pop(key, None)
            for member, expires_at in self.zexpires.copy().items():
                if expires_at < now:
                    key, score = member
                    members = self.cache.get(key)
                    # The member may already be gone (zremrangebyscore) or the
                    # key may have been overwritten with a plain value.
                    if isinstance(members, dict):
                        members.pop(score, None)
                    self.zexpires.pop(member, None)
            while time.time() - now < MEMORY_CACHE_EXPIRE_INTERVAL:
                time.sleep(0.1)

    def get(self, key: str):
        """
        Gets the value associated with the given key.

        Args:
            key (str): The key to retrieve the value.

        Returns:
            The value associated with the key, or None if the key does not exist or is expired.
        """
        expires_at = self.expires.get(key)
        if expires_at is not None and expires_at < time.time():
            # Evict lazily; tolerate the expiry thread having done it first.
            self.cache.pop(key, None)
            self.expires.pop(key, None)
            return None
        return self.cache.get(key)

    def set(self, key: str, value: str, expire: float = None):
        """
        Sets a value for a given key with an optional expire time.

        Args:
            key (str): The key to store the value.
            value (str): The value to store.
            expire (float, optional): The time, in seconds, after which the key will expire.
                When omitted (or falsy) the value does not expire. Defaults to None.
        """
        self.cache[key] = value
        if expire:
            self.expires[key] = expire + time.time()
        else:
            # Drop the deadline of any previous value so the new value is not
            # evicted on the old schedule.
            self.expires.pop(key, None)

    def zadd(self, key: str, value: Any, score: float, expire: float = None):
        """
        Adds a member with the specified score to the sorted set stored at key.

        Args:
            key (str): The key of the sorted set.
            value (str): The value to add to the sorted set.
            score (float): The score associated with the value. An existing
                member with the same score is replaced.
            expire (float, optional): The time, in seconds, after which the member will expire. Defaults to None.
        """
        if key not in self.cache:
            self.cache[key] = dict()
        self.cache[key][score] = value
        if expire:
            self.zexpires[(key, score)] = expire + time.time()

    def zrangebyscore(
        self,
        key: str,
        min: Optional[float] = -1,
        max: Optional[float] = float("inf"),
        withscores: bool = False,
    ):
        """
        Retrieves a range of members from a sorted set.

        Args:
            key (str): The key of the sorted set.
            min (float, optional): The lowest score of the range (inclusive). Defaults to -1.
            max (float, optional): The highest score of the range (inclusive). Defaults to float("inf").
            withscores (bool, optional): Whether to return the scores along with the values. Defaults to False.

        Returns:
            list: A list of values (or value-score pairs if withscores is True), ordered by ascending score.
        """
        if key not in self.cache:
            return []
        keys = sorted([k for k in self.cache[key].keys() if min <= k <= max])
        if withscores:
            return [(self.cache[key][k], k) for k in keys]
        else:
            return [self.cache[key][k] for k in keys]

    def zremrangebyscore(
        self,
        key: str,
        min: Optional[float] = -1,
        max: Optional[float] = float("inf"),
    ):
        """
        Removes all members in a sorted set within the given scores.

        Args:
            key (str): The key of the sorted set.
            min (float, optional): The minimum score of the range (inclusive). Defaults to -1.
            max (float, optional): The maximum score of the range (inclusive). Defaults to float("inf").

        Returns:
            int: The number of members removed from the sorted set.
        """
        res = self.zrangebyscore(key, min=min, max=max, withscores=True)
        scores_to_delete = [pair[1] for pair in res]
        for score in scores_to_delete:
            self.cache[key].pop(score, None)
            # Forget the member's deadline too, otherwise a later member
            # re-added with the same score would inherit it.
            self.zexpires.pop((key, score), None)
        return len(scores_to_delete)

    def acquire_lock(self, key: str, expire=None) -> Any:
        """
        Acquires the lock stored under key, creating it on first use.

        Args:
            key (str): The cache key under which the lock object is stored.
            expire (float, optional): Used both as the maximum time, in seconds,
                to wait for the lock and as the lifetime of the cache entry
                holding it. None waits indefinitely and never expires the entry.

        Returns:
            The acquired threading lock.

        Raises:
            TimeoutError: If the lock could not be acquired within expire seconds.
        """
        lock: Optional[Lock] = self.get(key)
        if lock is None:
            lock = Lock()
            self.set(key, lock, expire=expire)
        # Lock.acquire() spells "wait forever" as -1. Keep that sentinel local:
        # passing it on to set() would store a deadline in the past, so the next
        # caller would not find this lock and would create an unrelated one.
        timeout = -1 if expire is None else expire
        acquired = lock.acquire(timeout=timeout)
        if not acquired:
            raise TimeoutError()
        # refresh the lock
        self.set(key, lock, expire=expire)
        return lock

    def set_numpy(self, key: str, value: Any, expire: float = None):
        """Stores value under key as-is; no serialization is needed in memory."""
        return self.set(key, value, expire=expire)

    def get_numpy(self, key: str):
        """Returns the object stored by set_numpy, or None if missing or expired."""
        return self.get(key)
Methods:
__init__
__init__()

Initializes a new instance of the MemoryCache class.

Source code in inference/core/cache/memory.py
21
22
23
24
25
26
27
28
29
30
31
def __init__(self) -> None:
    """
    Creates the empty storage dictionaries and starts the expiry worker.
    """
    self.cache = {}
    self.expires = {}
    self.zexpires = {}

    # Run the worker as a daemon thread, exactly as before.
    worker = threading.Thread(target=self._expire, daemon=True)
    self._expire_thread = worker
    worker.start()
get
get(key)

Gets the value associated with the given key.

Parameters:

Name Type Description Default
key str

The key to retrieve the value.

required

Returns:

Name Type Description
str

The value associated with the key, or None if the key does not exist or is expired.

Source code in inference/core/cache/memory.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get(self, key: str):
    """
    Gets the value associated with the given key.

    An expired key is evicted on access.

    Args:
        key (str): The key to retrieve the value.

    Returns:
        The value associated with the key, or None if the key does not exist or is expired.
    """
    expires_at = self.expires.get(key)
    if expires_at is not None and expires_at < time.time():
        # pop() instead of del: the entry may already have been removed from
        # either dictionary, and that must not surface as a KeyError.
        self.cache.pop(key, None)
        self.expires.pop(key, None)
        return None
    return self.cache.get(key)
set
set(key, value, expire=None)

Sets a value for a given key with an optional expire time.

Parameters:

Name Type Description Default
key str

The key to store the value.

required
value str

The value to store.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None
Source code in inference/core/cache/memory.py
75
76
77
78
79
80
81
82
83
84
85
86
def set(self, key: str, value: str, expire: float = None):
    """
    Sets a value for a given key with an optional expire time.

    Args:
        key (str): The key to store the value.
        value (str): The value to store.
        expire (float, optional): The time, in seconds, after which the key will expire.
            When omitted (or falsy) the value does not expire. Defaults to None.
    """
    self.cache[key] = value
    if expire:
        self.expires[key] = expire + time.time()
    else:
        # Overwriting without a TTL must clear the previous deadline; otherwise
        # the new value would be evicted on the old value's schedule.
        self.expires.pop(key, None)
zadd
zadd(key, value, score, expire=None)

Adds a member with the specified score to the sorted set stored at key.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
value str

The value to add to the sorted set.

required
score float

The score associated with the value.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None
Source code in inference/core/cache/memory.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def zadd(self, key: str, value: Any, score: float, expire: float = None):
    """
    Stores a member in the sorted set kept at key, indexed by its score.

    Args:
        key (str): The key of the sorted set.
        value (str): The value to add to the sorted set.
        score (float): The score associated with the value.
        expire (float, optional): The time, in seconds, after which the member will expire. Defaults to None.
    """
    # The sorted set is a {score: value} dictionary, created on first use.
    members = self.cache.setdefault(key, {})
    members[score] = value
    if not expire:
        return
    self.zexpires[(key, score)] = time.time() + expire
zrangebyscore
zrangebyscore(
    key, min=-1, max=float("inf"), withscores=False
)

Retrieves a range of members from a sorted set.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
min float

The lowest score of the range. Defaults to -1.

-1
max float

The highest score of the range. Defaults to float("inf").

float('inf')
withscores bool

Whether to return the scores along with the values. Defaults to False.

False

Returns:

Name Type Description
list

A list of values (or value-score pairs if withscores is True) in the specified score range.

Source code in inference/core/cache/memory.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def zrangebyscore(
    self,
    key: str,
    min: Optional[float] = -1,
    max: Optional[float] = float("inf"),
    withscores: bool = False,
):
    """
    Returns the members of a sorted set whose scores lie between min and max.

    Args:
        key (str): The key of the sorted set.
        min (float, optional): The lowest score to include. Defaults to -1.
        max (float, optional): The highest score to include. Defaults to float("inf").
        withscores (bool, optional): Whether to return the scores along with the values. Defaults to False.

    Returns:
        list: Values ordered by ascending score, or (value, score) pairs if
        withscores is True. Empty if the key does not exist.
    """
    if key not in self.cache:
        return []
    members = self.cache[key]
    scores = [score for score in members.keys() if min <= score <= max]
    scores.sort()
    if not withscores:
        return [members[score] for score in scores]
    return [(members[score], score) for score in scores]
zremrangebyscore
zremrangebyscore(key, min=-1, max=float('inf'))

Removes all members in a sorted set within the given scores.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
min float

The minimum score of the range. Defaults to -1.

-1
max float

The maximum score of the range. Defaults to float("inf").

float('inf')

Returns:

Name Type Description
int

The number of members removed from the sorted set.

Source code in inference/core/cache/memory.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def zremrangebyscore(
    self,
    key: str,
    min: Optional[float] = -1,
    max: Optional[float] = float("inf"),
):
    """
    Removes all members in a sorted set within the given scores.

    Args:
        key (str): The key of the sorted set.
        min (float, optional): The minimum score of the range. Defaults to -1.
        max (float, optional): The maximum score of the range. Defaults to float("inf").

    Returns:
        int: The number of members removed from the sorted set.
    """
    res = self.zrangebyscore(key, min=min, max=max, withscores=True)
    scores_to_delete = [pair[1] for pair in res]
    for score in scores_to_delete:
        del self.cache[key][score]
        # Drop the member's expiry record as well. A stale record would make
        # the expiry loop try to delete a member that no longer exists, and
        # would evict a later member re-added under the same score.
        self.zexpires.pop((key, score), None)
    return len(scores_to_delete)

inference.core.cache.model_artifacts

Functions:

clear_cache

clear_cache(model_id=None, delete_from_disk=True)

Clear the cache for a specific model or the entire cache directory.

Parameters:

Name Type Description Default
model_id Optional[str]

The model ID to clear cache for. If None, clears entire cache. Defaults to None.

None
delete_from_disk bool

Whether to delete cached files from disk. Defaults to True.

True
Source code in inference/core/cache/model_artifacts.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def clear_cache(model_id: Optional[str] = None, delete_from_disk: bool = True) -> None:
    """Delete the cache directory of one model, or the whole cache directory.

    The deletion runs under a file lock so that concurrent processes do not
    remove the same directory at once. Failures are logged as warnings and
    never raised to the caller.

    Args:
        model_id (Optional[str], optional): The model ID to clear cache for. If None, clears entire cache. Defaults to None.
        delete_from_disk (bool, optional): Whether to delete cached files from disk. When False the
            function does nothing. Defaults to True.
    """
    if not delete_from_disk:
        return
    cache_dir = get_cache_dir(model_id=model_id)
    if not os.path.exists(cache_dir):
        return

    # All lock files live in one dedicated directory inside the model cache.
    lock_dir = MODEL_CACHE_DIR + "/_file_locks"
    os.makedirs(lock_dir, exist_ok=True)

    # Name the lock file after the last two components of the cache path.
    path_parts = os.path.normpath(cache_dir).split(os.sep)
    if len(path_parts) >= 2:
        suffix = os.path.join(*path_parts[-2:])
    else:
        suffix = os.path.basename(cache_dir)
    lock_file = os.path.join(lock_dir, f"{suffix}.lock")

    try:
        with FileLock(lock_file, timeout=10):  # give up after 10 seconds
            # Another process may have removed the directory while we waited.
            if not os.path.exists(cache_dir):
                return

            max_retries = 3
            final_attempt = max_retries - 1
            retry_delay = 1  # seconds; doubled after every failed attempt

            for attempt in range(max_retries):
                try:
                    shutil.rmtree(cache_dir, onerror=_rmtree_onerror)
                    return
                except FileNotFoundError:
                    # Removed by someone else in the meantime.
                    return
                except Exception as e:
                    if attempt >= final_attempt:
                        logger.warning(
                            f"Error deleting cache %s: %s, max retries exceeded.",
                            cache_dir,
                            e,
                        )
                        return
                    logger.warning(
                        f"Error deleting cache %s: %s, retrying in %s seconds...",
                        cache_dir,
                        e,
                        retry_delay,
                    )
                    time.sleep(retry_delay)
                    retry_delay *= 2
    except Exception as e:
        logger.warning(
            f"Error acquiring lock for cache %s, skipping cache cleanup. %s",
            cache_dir,
            e,
        )

inference.core.cache.redis

Classes

RedisCache

Bases: BaseCache

RedisCache is a Redis-backed cache that implements the BaseCache interface.

Attributes:

Name Type Description
client Redis

The Redis client used for all cache operations.

zexpires dict

A dictionary to store the expiration times of the sorted set values.

_expire_thread Thread

A thread that runs the _expire method.

Source code in inference/core/cache/redis.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
class RedisCache(BaseCache):
    """
    RedisCache is a Redis-backed cache that implements the BaseCache interface.

    Attributes:
        client (redis.Redis): The client used for every Redis command.
        zexpires (dict): Maps ``(key, score)`` to the absolute time
            (``time.time()`` based) at which that sorted set member expires.
            This bookkeeping lives only in the current process.
        _expire_thread (threading.Thread): A daemon thread that runs the _expire method.
    """

    def __init__(
        self,
        host: str = "localhost",
        port: int = 6379,
        db: int = 0,
        ssl: bool = False,
        timeout: float = 2.0,
    ) -> None:
        """
        Initializes a new instance of the RedisCache class.

        Connects to Redis, pings the server to verify the connection and
        starts the background thread that trims expired sorted set members.

        Args:
            host (str): Redis host. Defaults to "localhost".
            port (int): Redis port. Defaults to 6379.
            db (int): Redis database index. Defaults to 0.
            ssl (bool): Whether to connect over SSL. Defaults to False.
            timeout (float): Socket and connect timeout, in seconds. Defaults to 2.0.
        """
        self.client = redis.Redis(
            host=host,
            port=port,
            db=db,
            decode_responses=False,
            ssl=ssl,
            socket_timeout=timeout,
            socket_connect_timeout=timeout,
        )
        logger.debug("Attempting to diagnose Redis connection...")
        self.client.ping()
        logger.debug("Redis connection established.")
        self.zexpires = dict()

        self._expire_thread = threading.Thread(target=self._expire, daemon=True)
        self._expire_thread.start()

    def _expire(self):
        """
        Removes expired sorted set members from Redis and from zexpires.

        This method runs in an infinite loop and sleeps for
        MEMORY_CACHE_EXPIRE_INTERVAL seconds between each iteration.
        """
        while True:
            now = time.time()
            # Snapshot the items: zadd() mutates the dictionary from other threads.
            for member, expires_at in list(self.zexpires.items()):
                if expires_at >= now:
                    continue
                key, score = member
                tolerance_factor = 1e-14  # floating point accuracy
                try:
                    self.zremrangebyscore(
                        key, score - tolerance_factor, score + tolerance_factor
                    )
                except Exception as error:
                    # An exception escaping here would end this thread and stop
                    # all further trimming. Keep the record and retry on the
                    # next pass instead.
                    logger.warning(
                        "Could not trim expired member of sorted set %s: %s",
                        key,
                        error,
                    )
                    continue
                self.zexpires.pop(member, None)
            sleep_time = MEMORY_CACHE_EXPIRE_INTERVAL - (time.time() - now)
            time.sleep(max(sleep_time, 0))

    def get(self, key: str):
        """
        Gets the value associated with the given key.

        Args:
            key (str): The key to retrieve the value.

        Returns:
            The JSON-decoded value, the raw stored item if it is not valid JSON,
            or None if the key does not exist.
        """
        item = self.client.get(key)
        if item is None:
            return None
        try:
            return json.loads(item)
        except (TypeError, ValueError):
            # Not JSON (for example bytes written by set_numpy): return as stored.
            return item

    def set(self, key: str, value: str, expire: float = None):
        """
        Sets a value for a given key with an optional expire time.

        Args:
            key (str): The key to store the value.
            value (str): The value to store. Bytes are stored unchanged; any
                other value is JSON-encoded first.
            expire (float, optional): The time, in seconds, after which the key will expire. Defaults to None.
        """
        if not isinstance(value, bytes):
            value = json.dumps(value)
        self.client.set(key, value, ex=expire)

    def zadd(self, key: str, value: Any, score: float, expire: float = None):
        """
        Adds a member with the specified score to the sorted set stored at key.

        Args:
            key (str): The key of the sorted set.
            value (str): The value to add to the sorted set; it is JSON-encoded.
            score (float): The score associated with the value.
            expire (float, optional): The time, in seconds, after which the member will expire. Defaults to None.
        """
        value = json.dumps(value)
        if expire:
            # Set a server-side (sliding) TTL on the whole sorted set in the same
            # round-trip as the ZADD. The per-member bookkeeping in
            # ``self.zexpires`` lives only in this process; if the process dies
            # before ``_expire()`` trims those members, only a real EXPIRE lets
            # Redis reclaim the key. ``max(1, ...)`` guards against EXPIRE 0
            # (immediate delete) for sub-second values.
            with self.client.pipeline() as pipe:
                pipe.zadd(key, {value: score})
                pipe.expire(key, max(1, int(expire)))
                pipe.execute()
            # Keep per-member bookkeeping so ``_expire()`` can still trim
            # individual expired members from an otherwise-live key.
            self.zexpires[(key, score)] = expire + time.time()
        else:
            self.client.zadd(key, {value: score})

    def zrangebyscore(
        self,
        key: str,
        min: Optional[float] = -1,
        max: Optional[float] = float("inf"),
        withscores: bool = False,
    ):
        """
        Retrieves a range of members from a sorted set.

        Args:
            key (str): The key of the sorted set.
            min (float, optional): The lowest score of the range. Defaults to -1.
            max (float, optional): The highest score of the range. Defaults to float("inf").
            withscores (bool, optional): Whether to return the scores along with the values. Defaults to False.

        Returns:
            list: A list of JSON-decoded values (or value-score pairs if withscores is True) in the specified score range.
        """
        res = self.client.zrangebyscore(key, min, max, withscores=withscores)
        if withscores:
            return [(json.loads(x), y) for x, y in res]
        else:
            return [json.loads(x) for x in res]

    def zremrangebyscore(
        self,
        key: str,
        min: Optional[float] = -1,
        max: Optional[float] = float("inf"),
    ):
        """
        Removes all members in a sorted set within the given scores.

        Args:
            key (str): The key of the sorted set.
            min (float, optional): The minimum score of the range. Defaults to -1.
            max (float, optional): The maximum score of the range. Defaults to float("inf").

        Returns:
            int: The number of members removed from the sorted set.
        """
        return self.client.zremrangebyscore(key, min, max)

    def ensure_serializable(self, value: Any):
        """
        Replaces non-serializable entries of a dictionary, in place.

        Exceptions become their string form and InferenceResponseImage
        instances become the result of their ``dict()`` method. Values that
        are not dictionaries are returned untouched.

        Args:
            value (Any): The value to sanitise.

        Returns:
            The same object that was passed in.
        """
        if isinstance(value, dict):
            for k, v in value.items():
                if isinstance(v, Exception):
                    value[k] = str(v)
                # The previous check also required inspect.isclass(v); a class
                # object is never an instance of the model, so this branch
                # could not run.
                elif isinstance(v, InferenceResponseImage):
                    value[k] = v.dict()
        return value

    def acquire_lock(self, key: str, expire=None) -> Any:
        """
        Acquires the Redis lock named key.

        Args:
            key (str): Name of the lock.
            expire (float, optional): Passed both as the lock timeout and as
                the maximum time to block while waiting for the lock.

        Returns:
            The acquired lock object.

        Raises:
            TimeoutError: If the lock could not be acquired.
        """
        lock = self.client.lock(key, blocking=True, timeout=expire)
        acquired = lock.acquire(blocking_timeout=expire)
        if not acquired:
            raise TimeoutError("Couldn't get lock")
        # refresh the lock
        if expire is not None:
            lock.extend(expire)
        return lock

    def set_numpy(self, key: str, value: Any, expire: float = None):
        """Pickles value and stores the resulting bytes under key."""
        serialized_value = pickle.dumps(value)
        self.set(key, serialized_value, expire=expire)

    def get_numpy(self, key: str) -> Any:
        """Returns the object stored by set_numpy, or None if the key is missing."""
        serialized_value = self.get(key)
        if serialized_value is not None:
            # SECURITY NOTE(review): pickle.loads executes code embedded in the
            # payload; this is only safe while the Redis instance is trusted.
            return pickle.loads(serialized_value)
        else:
            return None
Methods:
__init__
__init__(
    host="localhost",
    port=6379,
    db=0,
    ssl=False,
    timeout=2.0,
)

Initializes a new instance of the RedisCache class.

Source code in inference/core/cache/redis.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(
    self,
    host: str = "localhost",
    port: int = 6379,
    db: int = 0,
    ssl: bool = False,
    timeout: float = 2.0,
) -> None:
    """
    Connects to Redis, checks the connection and starts the expiry worker.

    The same timeout value is used for socket operations and for connecting.
    """
    connection_settings = dict(
        host=host,
        port=port,
        db=db,
        decode_responses=False,
        ssl=ssl,
        socket_timeout=timeout,
        socket_connect_timeout=timeout,
    )
    self.client = redis.Redis(**connection_settings)

    # Ping the server right after building the client.
    logger.debug("Attempting to diagnose Redis connection...")
    self.client.ping()
    logger.debug("Redis connection established.")

    self.zexpires = {}

    worker = threading.Thread(target=self._expire, daemon=True)
    self._expire_thread = worker
    worker.start()
get
get(key)

Gets the value associated with the given key.

Parameters:

Name Type Description Default
key str

The key to retrieve the value.

required

Returns:

Name Type Description
str

The value associated with the key, or None if the key does not exist or is expired.

Source code in inference/core/cache/redis.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def get(self, key: str):
    """
    Fetches the item stored under key and decodes it.

    Args:
        key (str): The key to retrieve the value.

    Returns:
        The JSON-decoded value, the raw stored item when it cannot be parsed
        as JSON, or None if the key does not exist.
    """
    raw = self.client.get(key)
    if raw is None:
        return None
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        # Not JSON: hand the stored item back unchanged.
        return raw
    return decoded
set
set(key, value, expire=None)

Sets a value for a given key with an optional expire time.

Parameters:

Name Type Description Default
key str

The key to store the value.

required
value str

The value to store.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None
Source code in inference/core/cache/redis.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def set(self, key: str, value: str, expire: float = None):
    """
    Writes value under key, optionally with an expiry.

    Args:
        key (str): The key to store the value.
        value (str): The value to store. Bytes are written unchanged; anything
            else is JSON-encoded first.
        expire (float, optional): The time, in seconds, after which the key will expire. Defaults to None.
    """
    payload = value if isinstance(value, bytes) else json.dumps(value)
    self.client.set(key, payload, ex=expire)
zadd
zadd(key, value, score, expire=None)

Adds a member with the specified score to the sorted set stored at key.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
value str

The value to add to the sorted set.

required
score float

The score associated with the value.

required
expire float

The time, in seconds, after which the key will expire. Defaults to None.

None
Source code in inference/core/cache/redis.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def zadd(self, key: str, value: Any, score: float, expire: float = None):
    """
    Adds a JSON-encoded member with the given score to the sorted set at key.

    Args:
        key (str): The key of the sorted set.
        value (str): The value to add to the sorted set.
        score (float): The score associated with the value.
        expire (float, optional): The time, in seconds, after which the member will expire. Defaults to None.
    """
    member = json.dumps(value)
    if not expire:
        self.client.zadd(key, {member: score})
        return

    # Send ZADD and EXPIRE together so the whole sorted set carries a
    # server-side TTL. The bookkeeping in ``self.zexpires`` exists only in this
    # process; if the process dies before ``_expire()`` trims the members,
    # only the Redis-side TTL lets the key be reclaimed. ``max(1, ...)`` keeps
    # sub-second values from becoming EXPIRE 0, which deletes the key at once.
    ttl_seconds = max(1, int(expire))
    with self.client.pipeline() as pipe:
        pipe.zadd(key, {member: score})
        pipe.expire(key, ttl_seconds)
        pipe.execute()

    # Record the member's own deadline so ``_expire()`` can trim it from a key
    # that is still alive.
    self.zexpires[(key, score)] = expire + time.time()
zrangebyscore
zrangebyscore(
    key, min=-1, max=float("inf"), withscores=False
)

Retrieves a range of members from a sorted set.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
min float

The lowest score of the range. Defaults to -1.

-1
max float

The highest score of the range. Defaults to float("inf").

float('inf')
withscores bool

Whether to return the scores along with the values. Defaults to False.

False

Returns:

Name Type Description
list

A list of values (or value-score pairs if withscores is True) in the specified score range.

Source code in inference/core/cache/redis.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def zrangebyscore(
    self,
    key: str,
    min: Optional[float] = -1,
    max: Optional[float] = float("inf"),
    withscores: bool = False,
):
    """
    Reads the members of a sorted set within a score range and decodes them.

    Args:
        key (str): The key of the sorted set.
        min (float, optional): The lowest score of the range. Defaults to -1.
        max (float, optional): The highest score of the range. Defaults to float("inf").
        withscores (bool, optional): Whether to return the scores along with the values. Defaults to False.

    Returns:
        list: JSON-decoded values, or (value, score) pairs if withscores is True.
    """
    rows = self.client.zrangebyscore(key, min, max, withscores=withscores)
    if not withscores:
        return [json.loads(member) for member in rows]
    return [(json.loads(member), score) for member, score in rows]
zremrangebyscore
zremrangebyscore(key, min=-1, max=float('inf'))

Removes all members in a sorted set within the given scores.

Parameters:

Name Type Description Default
key str

The key of the sorted set.

required
min float

The minimum score of the range. Defaults to -1.

-1
max float

The maximum score of the range. Defaults to float("inf").

float('inf')

Returns:

Name Type Description
int

The number of members removed from the sorted set.

Source code in inference/core/cache/redis.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def zremrangebyscore(
    self,
    key: str,
    min: Optional[float] = -1,
    max: Optional[float] = float("inf"),
):
    """
    Deletes every member of a sorted set whose score lies within the given range.

    Args:
        key (str): The key of the sorted set.
        min (float, optional): The minimum score of the range. Defaults to -1.
        max (float, optional): The maximum score of the range. Defaults to float("inf").

    Returns:
        int: The number of members removed from the sorted set.
    """
    removed_count = self.client.zremrangebyscore(key, min, max)
    return removed_count

core/devices

Hardware device detection and selection helpers.

inference.core.devices.utils

Functions:

get_cpu_id

get_cpu_id()

Fetches the CPU ID based on the operating system.

Attempts to get the CPU ID for Windows, Linux, and MacOS. In case of any error or an unsupported OS, returns None.

Returns:

Type Description

Optional[str]: CPU ID string if available, None otherwise.

Source code in inference/core/devices/utils.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def get_cpu_id():
    """Fetches the CPU ID based on the operating system.

    Attempts to get the CPU ID for Windows, Linux, and MacOS.
    In case of any error or an unsupported OS, returns None.

    Returns:
        Optional[str]: CPU ID string if available, None otherwise.
    """
    try:
        system_name = platform.system()
        if system_name == "Windows":
            # Close the pipe deterministically instead of leaving it to the GC.
            with os.popen("wmic cpu get ProcessorId") as pipe:
                return pipe.read().strip()
        if system_name == "Linux":
            with open("/proc/cpuinfo") as cpuinfo_file:
                cpuinfo = cpuinfo_file.read()
            # NOTE(review): this parses the text that precedes the first
            # "processor" token. If the file starts with "processor", that text is
            # empty, the index below raises IndexError and None is returned --
            # confirm this is the intended behavior.
            return cpuinfo.split("processor")[0].split(":")[1].strip()
        if system_name == "Darwin":
            import subprocess

            return (
                subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
                .strip()
                .decode()
            )
    except Exception:
        # Best-effort lookup: any failure is reported as "no CPU ID".
        return None
    # Unsupported operating system.
    return None

get_device_hostname

get_device_hostname()

Fetches the device's hostname.

Returns:

Name Type Description
str

The device's hostname.

Source code in inference/core/devices/utils.py
107
108
109
110
111
112
113
def get_device_hostname():
    """Returns the hostname of the machine running this process.

    Returns:
        str: The network name reported by ``platform.node()``.
    """
    hostname = platform.node()
    return hostname

get_gpu_id

get_gpu_id()

Fetches the GPU ID if a GPU is present.

Tries to import and use the pynvml (delivered by nvidia-ml-py) module to retrieve the GPU information.

Returns:

Type Description

Optional[int]: GPU ID if available, None otherwise.

Source code in inference/core/devices/utils.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def get_gpu_id():
    """Reports the GPU ID when at least one NVIDIA GPU is visible.

    Imports `pynvml` (delivered by nvidia-ml-py) lazily and asks it for the device count.
    The returned ID is always 0 when any GPU is detected.

    Returns:
        Optional[int]: 0 if a GPU is available; None if no GPU is found, `pynvml`
        is not installed, or the query fails for any other reason.
    """
    try:
        from pynvml import nvmlDeviceGetCount, nvmlInit

        nvmlInit()
        if nvmlDeviceGetCount():
            return 0
        return None
    except Exception:
        # Covers a missing pynvml (ImportError) as well as driver/NVML failures.
        return None

get_inference_server_id

get_inference_server_id()

Fetches a unique device ID.

Returns the configured INFERENCE_SERVER_ID when it is set. Otherwise generates a random string and appends the GPU ID if a GPU is detected, or else the Jetson serial number if one is available; with neither, the random string alone is returned.

Returns:

Name Type Description
str

A unique string representing the device. If unable to determine, returns "UNKNOWN".

Source code in inference/core/devices/utils.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def get_inference_server_id():
    """Builds an identifier for this inference server.

    Returns INFERENCE_SERVER_ID when it is configured. Otherwise a random
    string is generated and suffixed with the GPU ID if a GPU is detected,
    or else with the Jetson serial number if one is available.

    Returns:
        str: The server identifier. If anything fails, returns "UNKNOWN".
    """
    try:
        if INFERENCE_SERVER_ID is not None:
            return INFERENCE_SERVER_ID
        random_prefix = random_string(6)
        gpu_id = get_gpu_id()
        if gpu_id is not None:
            return f"{random_prefix}-GPU-{gpu_id}"
        jetson_id = get_jetson_id()
        if jetson_id is None:
            return random_prefix
        return f"{random_prefix}-JETSON-{jetson_id}"
    except Exception:
        return "UNKNOWN"

get_jetson_id

get_jetson_id()

Fetches the Jetson device's serial number.

Attempts to read the serial number from the device tree. In case of any error, returns None.

Returns:

Type Description

Optional[str]: Jetson device serial number if available, None otherwise.

Source code in inference/core/devices/utils.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def get_jetson_id():
    """Fetches the Jetson device's serial number.

    Attempts to read the serial number from the device tree.
    In case of any error, returns None.

    Returns:
        Optional[str]: Jetson device serial number if available, None otherwise.
    """
    serial_number_path = "/proc/device-tree/serial-number"
    try:
        if not os.path.exists(serial_number_path):
            return None
        # Read the file directly instead of spawning `cat` through a shell; the
        # context manager also guarantees the handle is closed.
        with open(serial_number_path) as serial_file:
            serial_number = serial_file.read().strip()
        if serial_number == "":
            return None
        return serial_number
    except Exception:
        # Best-effort lookup: unreadable or undecodable content means "no ID".
        return None

is_running_in_docker

is_running_in_docker()

Checks if the current process is running inside a Docker container.

Returns:

Name Type Description
bool

True if running inside a Docker container, False otherwise.

Source code in inference/core/devices/utils.py
10
11
12
13
14
15
16
def is_running_in_docker():
    """Tells whether this process appears to run inside a Docker container.

    The check looks for the ``/.dockerenv`` marker file.

    Returns:
        bool: True if the marker file exists, False otherwise.
    """
    dockerenv_marker = "/.dockerenv"
    return os.path.exists(dockerenv_marker)

core/entities/requests

inference.core.entities.requests.clip

Classes

ClipCompareRequest

Bases: ClipInferenceRequest

Request for CLIP comparison.

Attributes:

Name Type Description
subject Union[InferenceRequestImage, str]

The subject of the comparison: an image or a text string, as indicated by subject_type.

subject_type str

The type of subject, one of 'image' or 'text'.

prompt Union[List[InferenceRequestImage], InferenceRequestImage, str, List[str], Dict[str, Union[InferenceRequestImage, str]]]

The prompt for comparison.

prompt_type str

The type of prompt, one of 'image' or 'text'.

Source code in inference/core/entities/requests/clip.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
class ClipCompareRequest(ClipInferenceRequest):
    """Request for CLIP comparison.

    Attributes:
        subject (Union[InferenceRequestImage, str]): The item compared against the prompt(s); an image or a text string. Required.
        subject_type (str): The type of subject, one of 'image' or 'text'. Defaults to 'image'.
        prompt (Union[List[InferenceRequestImage], InferenceRequestImage, str, List[str], Dict[str, Union[InferenceRequestImage, str]]]): The prompt for comparison. Required.
        prompt_type (str): The type of prompt, one of 'image' or 'text'. Defaults to 'text'.
    """

    # NOTE(review): the example and description below talk about an image data
    # type ('url' / 'base64'), which does not match a field holding the subject
    # itself -- looks copied from an image "type" field; confirm intended wording.
    subject: Union[InferenceRequestImage, str] = Field(
        examples=["url"],
        description="The type of image data provided, one of 'url' or 'base64'",
    )
    subject_type: str = Field(
        default="image",
        examples=["image"],
        description="The type of subject, one of 'image' or 'text'",
    )
    # No Field()/default is declared, so a prompt must always be supplied.
    prompt: Union[
        List[InferenceRequestImage],
        InferenceRequestImage,
        str,
        List[str],
        Dict[str, Union[InferenceRequestImage, str]],
    ]
    prompt_type: str = Field(
        default="text",
        examples=["text"],
        description="The type of prompt, one of 'image' or 'text'",
    )

ClipImageEmbeddingRequest

Bases: ClipInferenceRequest

Request for CLIP image embedding.

Attributes:

Name Type Description
image Union[List[InferenceRequestImage], InferenceRequestImage]

Image(s) to be embedded.

Source code in inference/core/entities/requests/clip.py
38
39
40
41
42
43
44
45
class ClipImageEmbeddingRequest(ClipInferenceRequest):
    """Request for CLIP image embedding.

    Attributes:
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): Image(s) to be embedded. Required.
    """

    # Accepts either a single image or a list of images; no default is declared.
    image: Union[List[InferenceRequestImage], InferenceRequestImage]

ClipInferenceRequest

Bases: BaseRequest

Request for CLIP inference.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

clip_version_id Optional[str]

The version ID of CLIP to be used for this request.

Source code in inference/core/entities/requests/clip.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class ClipInferenceRequest(BaseRequest):
    """Common base for CLIP inference requests.

    Attributes:
        api_key (Optional[str]): Roboflow API Key (inherited from BaseRequest).
        clip_version_id (Optional[str]): The version ID of CLIP to be used for this request.
        model_id (Optional[str]): Model identifier. When not supplied it is derived
            from clip_version_id as "clip/<clip_version_id>".
    """

    clip_version_id: Optional[str] = Field(
        default=CLIP_VERSION_ID,
        examples=["ViT-B-16"],
        description="The version ID of CLIP to be used for this request. Must be one of RN101, RN50, RN50x16, RN50x4, RN50x64, ViT-B-16, ViT-B-32, ViT-L-14-336px, and ViT-L-14.",
    )
    model_id: Optional[str] = Field(None)

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True)
    def validate_model_id(cls, value, values):
        """Keep an explicit model_id; otherwise build it from clip_version_id."""
        if value is not None:
            return value
        version_id = values.get("clip_version_id")
        return None if version_id is None else f"clip/{version_id}"

ClipTextEmbeddingRequest

Bases: ClipInferenceRequest

Request for CLIP text embedding.

Attributes:

Name Type Description
text Union[List[str], str]

A string or list of strings.

Source code in inference/core/entities/requests/clip.py
48
49
50
51
52
53
54
55
56
57
58
class ClipTextEmbeddingRequest(ClipInferenceRequest):
    """Request for CLIP text embedding.

    Attributes:
        text (Union[List[str], str]): A string or list of strings. Required.
    """

    # No default is given to Field(), so callers must always provide the text.
    text: Union[List[str], str] = Field(
        examples=["The quick brown fox jumps over the lazy dog"],
        description="A string or list of strings",
    )

inference.core.entities.requests.doctr

Classes

DoctrOCRInferenceRequest

Bases: BaseRequest

DocTR inference request.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

Source code in inference/core/entities/requests/doctr.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
class DoctrOCRInferenceRequest(BaseRequest):
    """
    DocTR inference request.

    Attributes:
        api_key (Optional[str]): Roboflow API Key (inherited from BaseRequest).
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): Image(s) to run OCR on.
        doctr_version_id (Optional[str]): DocTR version identifier. Defaults to "default".
        model_id (Optional[str]): Model identifier. When not supplied it is derived
            from doctr_version_id as "doctr/<doctr_version_id>".
        generate_bounding_boxes (Optional[bool]): Whether to generate bounding box data
            rather than just a string. Defaults to False.
    """

    image: Union[List[InferenceRequestImage], InferenceRequestImage]
    doctr_version_id: Optional[str] = "default"
    model_id: Optional[str] = Field(None)
    # flag to generate bounding box data rather than just a string, set to False for backwards compatibility
    generate_bounding_boxes: Optional[bool] = False

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True, allow_reuse=True)
    def validate_model_id(cls, value, values):
        """Keep an explicit model_id; otherwise build it from doctr_version_id."""
        if value is not None:
            return value
        version_id = values.get("doctr_version_id")
        return None if version_id is None else f"doctr/{version_id}"

inference.core.entities.requests.dynamic_class_base

Classes

DynamicClassBaseInferenceRequest

Bases: CVInferenceRequest

Request for zero-shot object detection models (with dynamic class lists).

Attributes:

Name Type Description
text List[str]

A list of strings.

Source code in inference/core/entities/requests/dynamic_class_base.py
 8
 9
10
11
12
13
14
15
16
17
18
19
class DynamicClassBaseInferenceRequest(CVInferenceRequest):
    """Request for zero-shot object detection models (with dynamic class lists).

    Attributes:
        model_id (Optional[str]): Model identifier. Defaults to None.
        text (List[str]): A list of strings. Required.
    """

    # Redeclared here with an explicit None default.
    model_id: Optional[str] = Field(None)
    # The example suggests these strings are class names -- TODO confirm.
    text: List[str] = Field(
        examples=[["person", "dog", "cat"]],
        description="A list of strings",
    )

inference.core.entities.requests.easy_ocr

Classes

EasyOCRInferenceRequest

Bases: BaseRequest

EasyOCR inference request.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

Source code in inference/core/entities/requests/easy_ocr.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
class EasyOCRInferenceRequest(BaseRequest):
    """
    EasyOCR inference request.

    Attributes:
        api_key (Optional[str]): Roboflow API Key (inherited from BaseRequest).
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): Image(s) to run OCR on.
        easy_ocr_version_id (Optional[str]): EasyOCR version identifier. Defaults to EASYOCR_VERSION_ID.
        model_id (Optional[str]): Model identifier. When not supplied it is derived
            from easy_ocr_version_id as "easy_ocr/<easy_ocr_version_id>".
        language_codes (Optional[List[str]]): Language codes. Defaults to ["en"].
        quantize (Optional[bool]): Whether to use a quantized model. Defaults to False.
    """

    image: Union[List[InferenceRequestImage], InferenceRequestImage]
    easy_ocr_version_id: Optional[str] = EASYOCR_VERSION_ID
    model_id: Optional[str] = Field(None)
    language_codes: Optional[List[str]] = Field(default=["en"])
    quantize: Optional[bool] = Field(
        default=False,
        description="Quantized models are smaller and faster, but may be less accurate and won't work correctly on all hardware.",
    )

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True, allow_reuse=True)
    def validate_model_id(cls, value, values):
        """Keep an explicit model_id; otherwise build it from easy_ocr_version_id."""
        if value is not None:
            return value
        version_id = values.get("easy_ocr_version_id")
        return None if version_id is None else f"easy_ocr/{version_id}"

inference.core.entities.requests.groundingdino

Classes

GroundingDINOInferenceRequest

Bases: DynamicClassBaseInferenceRequest

Request for Grounding DINO zero-shot predictions.

Attributes:

Name Type Description
text List[str]

A list of strings.

Source code in inference/core/entities/requests/groundingdino.py
 9
10
11
12
13
14
15
16
17
18
19
class GroundingDINOInferenceRequest(DynamicClassBaseInferenceRequest):
    """Request for Grounding DINO zero-shot predictions.

    Attributes:
        text (List[str]): A list of strings (inherited from DynamicClassBaseInferenceRequest).
        box_threshold (Optional[float]): Defaults to 0.5.
        grounding_dino_version_id (Optional[str]): Defaults to "default".
        text_threshold (Optional[float]): Defaults to 0.5.
        class_agnostic_nms (Optional[bool]): Defaults to the CLASS_AGNOSTIC_NMS constant.
    """

    # NOTE(review): presumably the score cut-offs applied to predicted boxes and
    # to text matches respectively -- confirm against the model implementation.
    box_threshold: Optional[float] = 0.5
    grounding_dino_version_id: Optional[str] = "default"
    text_threshold: Optional[float] = 0.5
    class_agnostic_nms: Optional[bool] = CLASS_AGNOSTIC_NMS

inference.core.entities.requests.inference

Classes

BaseRequest

Bases: BaseModel

Base request for inference.

Attributes:

Name Type Description
id str

A unique request identifier.

api_key Optional[str]

Roboflow API Key that will be passed to the model during initialization for artifact retrieval.

start Optional[float]

start time of request

disable_model_monitoring Optional[bool]

If true, disables model monitoring for this request.

Source code in inference/core/entities/requests/inference.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
class BaseRequest(BaseModel):
    """Root model shared by every inference request.

    Attributes:
        id (str): A unique request identifier; a UUID4 string is generated when the caller gives none.
        api_key (Optional[str]): Roboflow API Key that will be passed to the model during initialization for artifact retrieval.
        usage_billable (bool): Defaults to True.
        start (Optional[float]): start time of request
        source (Optional[str]): Defaults to None.
        source_info (Optional[str]): Defaults to None.
        disable_model_monitoring (Optional[bool]): If true, disables model monitoring for this request.
    """

    def __init__(self, **kwargs):
        # Only fills in "id" when the caller did not pass one.
        kwargs.setdefault("id", str(uuid4()))
        super().__init__(**kwargs)

    model_config = ConfigDict(protected_namespaces=())
    id: str
    api_key: Optional[str] = ApiKey
    usage_billable: bool = True
    start: Optional[float] = None
    source: Optional[str] = None
    source_info: Optional[str] = None
    disable_model_monitoring: Optional[bool] = Field(
        default=False, description="If true, disables model monitoring for this request"
    )

CVInferenceRequest

Bases: InferenceRequest

Computer Vision inference request.

Attributes:

Name Type Description
image Union[List[InferenceRequestImage], InferenceRequestImage]

Image(s) for inference.

disable_preproc_auto_orient Optional[bool]

If true, the auto orient preprocessing step is disabled for this call. Default is False.

disable_preproc_contrast Optional[bool]

If true, the auto contrast preprocessing step is disabled for this call. Default is False.

disable_preproc_grayscale Optional[bool]

If true, the grayscale preprocessing step is disabled for this call. Default is False.

disable_preproc_static_crop Optional[bool]

If true, the static crop preprocessing step is disabled for this call. Default is False.

Source code in inference/core/entities/requests/inference.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class CVInferenceRequest(InferenceRequest):
    """Computer Vision inference request.

    Adds the input image(s) and four per-call switches that turn off individual
    preprocessing steps. Every switch defaults to False.

    Attributes:
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): Image(s) for inference. Required.
        disable_preproc_auto_orient (Optional[bool]): If true, the auto orient preprocessing step is disabled for this call. Default is False.
        disable_preproc_contrast (Optional[bool]): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
        disable_preproc_grayscale (Optional[bool]): If true, the grayscale preprocessing step is disabled for this call. Default is False.
        disable_preproc_static_crop (Optional[bool]): If true, the static crop preprocessing step is disabled for this call. Default is False.
    """

    # A single image or a list of images; no default is declared.
    image: Union[List[InferenceRequestImage], InferenceRequestImage]
    disable_preproc_auto_orient: Optional[bool] = Field(
        default=False,
        description="If true, the auto orient preprocessing step is disabled for this call.",
    )
    disable_preproc_contrast: Optional[bool] = Field(
        default=False,
        description="If true, the auto contrast preprocessing step is disabled for this call.",
    )
    disable_preproc_grayscale: Optional[bool] = Field(
        default=False,
        description="If true, the grayscale preprocessing step is disabled for this call.",
    )
    disable_preproc_static_crop: Optional[bool] = Field(
        default=False,
        description="If true, the static crop preprocessing step is disabled for this call.",
    )

ClassificationInferenceRequest

Bases: CVInferenceRequest

Classification inference request.

Attributes:

Name Type Description
confidence Optional[float]

The confidence threshold used to filter out predictions.

visualization_stroke_width Optional[int]

The stroke width used when visualizing predictions.

visualize_predictions Optional[bool]

If true, the predictions will be drawn on the original image and returned as a base64 string.

Source code in inference/core/entities/requests/inference.py
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
class ClassificationInferenceRequest(CVInferenceRequest):
    """Classification inference request.

    Attributes:
        confidence (Confidence): Confidence threshold: a float, "best", or "default" (see the field description). Defaults to 0.4.
        visualization_stroke_width (Optional[int]): The stroke width used when visualizing predictions.
        visualize_predictions (Optional[bool]): If true, the predictions will be drawn on the original image and returned as a base64 string.
        disable_active_learning (Optional[bool]): If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled).
        active_learning_target_dataset (Optional[str]): Dataset to register Active Learning data against when it differs from the one pointed to by model_id.
    """

    def __init__(self, **kwargs):
        # model_type is always forced to "classification", overriding any caller value.
        kwargs["model_type"] = "classification"
        super().__init__(**kwargs)

    confidence: Confidence = Field(
        default=0.4,
        examples=[0.5, "best", "default"],
        description=(
            'Confidence threshold. "best" uses model-eval thresholds, '
            '"default" uses the model built-in, or pass a float.'
        ),
    )
    visualization_stroke_width: Optional[int] = Field(
        default=1,
        examples=[1],
        description="The stroke width used when visualizing predictions",
    )
    visualize_predictions: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, the predictions will be drawn on the original image and returned as a base64 string",
    )
    disable_active_learning: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled)",
    )
    active_learning_target_dataset: Optional[str] = Field(
        default=None,
        examples=["my_dataset"],
        description="Parameter to be used when Active Learning data registration should happen against different dataset than the one pointed by model_id",
    )

DepthEstimationRequest

Bases: InferenceRequest

Request for depth estimation.

Attributes:

Name Type Description
image Union[List[InferenceRequestImage], InferenceRequestImage]

Image(s) to be estimated.

model_id str

The model ID to use for depth estimation.

depth_version_id Optional[str]

The version ID of the depth estimation model.

Source code in inference/core/entities/requests/inference.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
class DepthEstimationRequest(InferenceRequest):
    """Request for depth estimation.

    Attributes:
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): Image(s) to be estimated.
        model_id (Optional[str]): The model ID to use for depth estimation. When not
            supplied it is derived from depth_version_id as
            "depth-anything-v2/<depth_version_id>".
        depth_version_id (Optional[str]): The version ID of the depth estimation model. Defaults to "small".
    """

    image: Union[List[InferenceRequestImage], InferenceRequestImage]
    model_id: Optional[str] = Field(None)
    depth_version_id: Optional[str] = Field(
        default="small",
        examples=["small"],
        description="The version ID of the depth estimation model",
    )

    @validator("model_id", always=True)
    def validate_model_id(cls, value, values):
        """Keep an explicit model_id; otherwise build it from depth_version_id."""
        if value is not None:
            return value
        version_id = values.get("depth_version_id")
        return None if version_id is None else f"depth-anything-v2/{version_id}"

InferenceRequest

Bases: BaseRequest

Base request for inference.

Attributes:

Name Type Description
model_id str

A unique model identifier.

model_type Optional[str]

The type of the model, usually referring to what task the model performs.

Source code in inference/core/entities/requests/inference.py
36
37
38
39
40
41
42
43
44
45
class InferenceRequest(BaseRequest):
    """Base request for inference.

    Attributes:
        model_id (Optional[str]): A unique model identifier.
        model_type (Optional[str]): The type of the model, usually referring to what task the model performs.
    """

    # ModelID and ModelType are defined outside this class; presumably shared
    # Field definitions -- TODO confirm.
    model_id: Optional[str] = ModelID
    model_type: Optional[str] = ModelType

InferenceRequestImage

Bases: BaseModel

Image data for inference request.

Attributes:

Name Type Description
type str

The type of image data provided, one of 'url', 'base64', or 'numpy'.

value Optional[Any]

Image data corresponding to the image type.

Source code in inference/core/entities/requests/inference.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class InferenceRequestImage(BaseModel):
    """Image data for inference request.

    Attributes:
        type (str): The type of image data provided, one of 'url', 'base64', or 'numpy'. Required.
        value (Optional[Any]): Image data corresponding to the image type. Defaults to None.
    """

    type: str = Field(
        examples=["url"],
        description="The type of image data provided, one of 'url', 'base64', or 'numpy'",
    )
    # NOTE(review): the 'numpy' variant is described as pickle-serialized data;
    # unpickling untrusted input is unsafe -- confirm how consumers guard this.
    value: Optional[Any] = Field(
        None,
        examples=["http://www.example-image-url.com"],
        description="Image data corresponding to the image type, if type = 'url' then value is a string containing the url of an image, else if type = 'base64' then value is a string containing base64 encoded image data, else if type = 'numpy' then value is binary numpy data serialized using pickle.dumps(); array should 3 dimensions, channels last, with values in the range [0,255].",
    )

InstanceSegmentationInferenceRequest

Bases: ObjectDetectionInferenceRequest

Instance Segmentation inference request.

Attributes:

Name Type Description
mask_decode_mode Optional[str]

The mode used to decode instance segmentation masks, one of 'accurate', 'fast', 'tradeoff'.

tradeoff_factor Optional[float]

The amount to tradeoff between 0='fast' and 1='accurate'.

Source code in inference/core/entities/requests/inference.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
class InstanceSegmentationInferenceRequest(ObjectDetectionInferenceRequest):
    """Instance Segmentation inference request.

    Attributes:
        mask_decode_mode (Optional[str]): The mode used to decode instance segmentation masks, one of 'accurate', 'fast', 'tradeoff'. Defaults to 'accurate'.
        tradeoff_factor (Optional[float]): The amount to tradeoff between 0='fast' and 1='accurate'. Defaults to 0.0.
        response_mask_format (Literal["polygon", "rle"]): Requested output mask format. Defaults to "polygon".
        enforce_dense_masks_in_inference_models (Optional[bool]): Flag to enforce dense masks in inference models. Defaults to False.
    """

    mask_decode_mode: Optional[str] = Field(
        default="accurate",
        examples=["accurate"],
        description="The mode used to decode instance segmentation masks, one of 'accurate', 'fast', 'tradeoff'",
    )
    tradeoff_factor: Optional[float] = Field(
        default=0.0,
        examples=[0.5],
        description="The amount to tradeoff between 0='fast' and 1='accurate'",
    )
    # Per the description, "rle" is opt-in and only honored when the server runs
    # with USE_INFERENCE_MODELS=True.
    response_mask_format: Literal["polygon", "rle"] = Field(
        default="polygon",
        examples=["rle"],
        description="Requested output mask format - `polygon` is the default Roboflow format, which however is "
        "not capable representing certain shapes - RLE is compact and more standard representation, yet "
        "require special decoding on the caller side - currently supported in `opt-in` mode when server is "
        "running with `USE_INFERENCE_MODELS=True` - otherwise it's ignored.",
    )
    enforce_dense_masks_in_inference_models: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="Flag to enforce dense masks in inference models. Such masks are faster than "
        "RLE but consume more memory which may be unstable in some cases. This flag cannot be tweaked "
        "when used on Roboflow serverless platform.",
    )

ObjectDetectionInferenceRequest

Bases: CVInferenceRequest

Object Detection inference request.

Attributes:

Name Type Description
class_agnostic_nms Optional[bool]

If true, NMS is applied to all detections at once, if false, NMS is applied per class.

class_filter Optional[List[str]]

If provided, only predictions for the listed classes will be returned.

confidence Optional[float]

The confidence threshold used to filter out predictions.

fix_batch_size Optional[bool]

If true, the batch size will be fixed to the maximum batch size configured for this server.

iou_threshold Optional[float]

The IoU threshold that must be met for a box pair to be considered duplicate during NMS.

max_detections Optional[int]

The maximum number of detections that will be returned.

max_candidates Optional[int]

The maximum number of candidate detections passed to NMS.

visualization_labels Optional[bool]

If true, labels will be rendered on prediction visualizations.

visualization_stroke_width Optional[int]

The stroke width used when visualizing predictions.

visualize_predictions Optional[bool]

If true, the predictions will be drawn on the original image and returned as a base64 string.

Source code in inference/core/entities/requests/inference.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
class ObjectDetectionInferenceRequest(CVInferenceRequest):
    """Object Detection inference request.

    Attributes:
        class_agnostic_nms (Optional[bool]): If true, NMS is applied to all detections at once, if false, NMS is applied per class.
        class_filter (Optional[List[str]]): If provided, only predictions for the listed classes will be returned.
        confidence (Confidence): Confidence threshold: a float, "best", or "default" (see the field description). Defaults to 0.4.
        fix_batch_size (Optional[bool]): If true, the batch size will be fixed to the maximum batch size configured for this server.
        iou_threshold (Optional[float]): The IoU threshold that must be met for a box pair to be considered duplicate during NMS.
        max_detections (Optional[int]): The maximum number of detections that will be returned.
        max_candidates (Optional[int]): The maximum number of candidate detections passed to NMS.
        visualization_labels (Optional[bool]): If true, labels will be rendered on prediction visualizations.
        visualization_stroke_width (Optional[int]): The stroke width used when visualizing predictions.
        visualize_predictions (Optional[bool]): If true, the predictions will be drawn on the original image and returned as a base64 string.
        disable_active_learning (Optional[bool]): If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled).
        active_learning_target_dataset (Optional[str]): Dataset to register Active Learning data against when it differs from the one pointed to by model_id.
    """

    class_agnostic_nms: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, NMS is applied to all detections at once, if false, NMS is applied per class",
    )
    class_filter: Optional[List[str]] = Field(
        default=None,
        examples=[["class-1", "class-2", "class-n"]],
        description="If provided, only predictions for the listed classes will be returned",
    )
    confidence: Confidence = Field(
        default=0.4,
        examples=[0.5, "best", "default"],
        description=(
            'Confidence threshold. "best" uses model-eval thresholds, '
            '"default" uses the model built-in, or pass a float.'
        ),
    )
    fix_batch_size: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, the batch size will be fixed to the maximum batch size configured for this server",
    )
    # NOTE(review): the description string below misspells "threshold"; it is
    # runtime text and is left untouched here.
    iou_threshold: Optional[float] = Field(
        default=0.3,
        examples=[0.5],
        description="The IoU threhsold that must be met for a box pair to be considered duplicate during NMS",
    )
    max_detections: Optional[int] = Field(
        default=300,
        examples=[300],
        description="The maximum number of detections that will be returned",
    )
    max_candidates: Optional[int] = Field(
        default=3000,
        description="The maximum number of candidate detections passed to NMS",
    )
    visualization_labels: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, labels will be rendered on prediction visualizations",
    )
    visualization_stroke_width: Optional[int] = Field(
        default=1,
        examples=[1],
        description="The stroke width used when visualizing predictions",
    )
    visualize_predictions: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, the predictions will be drawn on the original image and returned as a base64 string",
    )
    disable_active_learning: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled)",
    )
    active_learning_target_dataset: Optional[str] = Field(
        default=None,
        examples=["my_dataset"],
        description="Parameter to be used when Active Learning data registration should happen against different dataset than the one pointed by model_id",
    )

SemanticSegmentationInferenceRequest

Bases: CVInferenceRequest

Semantic Segmentation inference request.

Source code in inference/core/entities/requests/inference.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
class SemanticSegmentationInferenceRequest(CVInferenceRequest):
    """Inference request for semantic segmentation models.

    The ``model_type`` is always forced to ``"semantic-segmentation"``,
    whatever the caller passes in.
    """

    def __init__(self, **kwargs):
        # Any caller-supplied model_type is overridden on purpose.
        super().__init__(**{**kwargs, "model_type": "semantic-segmentation"})

    confidence: Confidence = Field(
        default=0.4,
        examples=[0.5, "default"],
        description=(
            '"default" uses the model built-in threshold, or pass a float. '
            '"best" (model-eval threshold) is not supported for semantic '
            "segmentation yet."
        ),
    )

    # TODO: drop this validator once model eval supports semantic segmentation.
    @field_validator("confidence", mode="before")
    @classmethod
    def _reject_best_confidence(cls, value: Any) -> Any:
        """Pass ``value`` through unchanged unless it is the string "best".

        Raises:
            ValueError: If ``value`` equals ``"best"``.
        """
        unsupported = value == "best"
        if not unsupported:
            return value
        raise ValueError(
            'confidence="best" is not supported for semantic segmentation '
            "— model eval does not yet produce per-class thresholds for "
            'this task. Use a float or "default".'
        )

Functions:

request_from_type

request_from_type(model_type, request_dict)

Builds the request object matching model_type from request_dict, keeping the original request id when one is supplied.

Source code in inference/core/entities/requests/inference.py
348
349
350
351
352
353
354
355
356
357
358
359
360
361
def request_from_type(model_type, request_dict):
    """Build the inference request for ``model_type``, keeping the original id.

    Args:
        model_type: Task type; one of "classification",
            "instance-segmentation", "object-detection" or
            "semantic-segmentation".
        request_dict: Keyword arguments forwarded to the request constructor.

    Returns:
        The constructed request. Its ``id`` is set to ``request_dict["id"]``
        when that key is present, otherwise it keeps the id it was built with.

    Raises:
        ValueError: If ``model_type`` is not one of the supported task types.
    """
    # Pick the class first; only the matching branch resolves its class name.
    if model_type == "classification":
        request_cls = ClassificationInferenceRequest
    elif model_type == "instance-segmentation":
        request_cls = InstanceSegmentationInferenceRequest
    elif model_type == "object-detection":
        request_cls = ObjectDetectionInferenceRequest
    elif model_type == "semantic-segmentation":
        request_cls = SemanticSegmentationInferenceRequest
    else:
        raise ValueError(f"Unknown task type {model_type}")

    request = request_cls(**request_dict)
    original_id = request_dict.get("id", request.id)
    request.id = original_id
    return request

inference.core.entities.requests.moondream2

Classes

Moondream2InferenceRequest

Bases: DynamicClassBaseInferenceRequest

Request for Moondream 2 zero-shot predictions.

Attributes:

Name Type Description
prompt str

The text prompt for the request.

Source code in inference/core/entities/requests/moondream2.py
 6
 7
 8
 9
10
11
12
13
class Moondream2InferenceRequest(DynamicClassBaseInferenceRequest):
    """Request for Moondream 2 zero-shot predictions.

    Attributes:
        prompt (str): The text prompt for the request. It has no default, so
            it must always be supplied.
    """

    prompt: str

inference.core.entities.requests.owlv2

Classes

OwlV2InferenceRequest

Bases: BaseRequest

Request for OwlV2 inference.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

owlv2_version_id Optional[str]

The version ID of OwlV2 to be used for this request.

image Union[List[InferenceRequestImage], InferenceRequestImage]

Image(s) for inference.

training_data List[TrainingImage]

Training data to ground the model on

confidence float

Confidence threshold to filter predictions by

Source code in inference/core/entities/requests/owlv2.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class OwlV2InferenceRequest(BaseRequest):
    """Request for OwlV2 inference.

    Attributes:
        api_key (Optional[str]): Roboflow API Key.
        owlv2_version_id (Optional[str]): The version ID of OwlV2 to be used for this request.
        model_id (Optional[str]): Model id to be used in the request. When not
            supplied it is derived as ``google/<owlv2_version_id>``; it stays
            ``None`` only if ``owlv2_version_id`` is ``None`` as well.
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): Image(s) for inference.
        training_data (List[TrainingImage]): Training data to ground the model on
        confidence (Optional[float]): Confidence threshold to filter predictions by
        visualize_predictions (Optional[bool]): Whether predictions are drawn on the
            original image and returned as a base64 string.
        visualization_labels (Optional[bool]): Whether labels are rendered on
            prediction visualizations.
        visualization_stroke_width (Optional[int]): Stroke width used when
            visualizing predictions.
    """

    owlv2_version_id: Optional[str] = Field(
        default=OWLV2_VERSION_ID,
        examples=["owlv2-base-patch16-ensemble"],
        description="The version ID of owlv2 to be used for this request.",
    )
    model_id: Optional[str] = Field(
        default=None, description="Model id to be used in the request."
    )

    image: Union[List[InferenceRequestImage], InferenceRequestImage] = Field(
        description="Images to run the model on"
    )
    training_data: List[TrainingImage] = Field(
        description="Training images for the owlvit model to learn form"
    )
    confidence: Optional[float] = Field(
        default=0.99,
        examples=[0.99],
        description="Default confidence threshold for owlvit predictions. "
        "Needs to be much higher than you're used to, probably 0.99 - 0.9999",
    )
    # This field used to be declared twice in the class body. Only the later
    # declaration took effect, so its description is kept here, at the position
    # of the first declaration (which is where the field was ordered).
    visualize_predictions: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, the predictions will be drawn on the original image and returned as a base64 string",
    )
    visualization_labels: Optional[bool] = Field(
        default=False,
        examples=[False],
        description="If true, labels will be rendered on prediction visualizations",
    )
    visualization_stroke_width: Optional[int] = Field(
        default=1,
        examples=[1],
        description="The stroke width used when visualizing predictions",
    )

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True, allow_reuse=True)
    def validate_model_id(cls, value, values):
        """Default ``model_id`` from ``owlv2_version_id`` when it is not given.

        Args:
            value: The ``model_id`` supplied by the caller, or ``None``.
            values: Previously validated fields; ``owlv2_version_id`` is
                declared before ``model_id`` so it is available here.

        Returns:
            ``value`` if it is not ``None``; otherwise
            ``google/<owlv2_version_id>``, or ``None`` when no version id is set.
        """
        if value is not None:
            return value
        # The lookup key must match the declared field name. The previous key
        # ("owl2_version_id") never existed, so model_id always stayed None.
        version_id = values.get("owlv2_version_id")
        if version_id is None:
            return None
        return f"google/{version_id}"

inference.core.entities.requests.perception_encoder

Classes

PerceptionEncoderCompareRequest

Bases: PerceptionEncoderInferenceRequest

Request for PERCEPTION_ENCODER comparison.

Attributes:

Name Type Description
subject Union[InferenceRequestImage, str]

The type of image data provided, one of 'url' or 'base64'.

subject_type str

The type of subject, one of 'image' or 'text'.

prompt Union[List[InferenceRequestImage], InferenceRequestImage, str, List[str], Dict[str, Union[InferenceRequestImage, str]]]

The prompt for comparison.

prompt_type str

The type of prompt, one of 'image' or 'text'.

Source code in inference/core/entities/requests/perception_encoder.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
class PerceptionEncoderCompareRequest(PerceptionEncoderInferenceRequest):
    """Request for PERCEPTION_ENCODER comparison.

    Attributes:
        subject (Union[InferenceRequestImage, str]): The subject of the comparison, given either as an image or as a string (see ``subject_type``).
        subject_type (str): The type of subject, one of 'image' or 'text'.
        prompt (Union[List[InferenceRequestImage], InferenceRequestImage, str, List[str], Dict[str, Union[InferenceRequestImage, str]]]): The prompt for comparison.
        prompt_type (str): The type of prompt, one of 'image' or 'text'.
    """

    # NOTE(review): the schema description below talks about 'url'/'base64',
    # which does not match the declared type (an image or a string). It looks
    # copied from an image-type field; confirm and correct the schema text.
    subject: Union[InferenceRequestImage, str] = Field(
        examples=["url"],
        description="The type of image data provided, one of 'url' or 'base64'",
    )
    subject_type: str = Field(
        default="image",
        examples=["image"],
        description="The type of subject, one of 'image' or 'text'",
    )
    # No default is declared, so a prompt must always be supplied. It may be a
    # single item, a list of items, or a dict mapping string keys to items.
    prompt: Union[
        List[InferenceRequestImage],
        InferenceRequestImage,
        str,
        List[str],
        Dict[str, Union[InferenceRequestImage, str]],
    ]
    prompt_type: str = Field(
        default="text",
        examples=["text"],
        description="The type of prompt, one of 'image' or 'text'",
    )

PerceptionEncoderImageEmbeddingRequest

Bases: PerceptionEncoderInferenceRequest

Request for PERCEPTION_ENCODER image embedding.

Attributes:

Name Type Description
image Union[List[InferenceRequestImage], InferenceRequestImage]

Image(s) to be embedded.

Source code in inference/core/entities/requests/perception_encoder.py
38
39
40
41
42
43
44
45
class PerceptionEncoderImageEmbeddingRequest(PerceptionEncoderInferenceRequest):
    """Request for PERCEPTION_ENCODER image embedding.

    Attributes:
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): A single image or a list of images to be embedded.
    """

    # No default is declared, so at least one image must always be supplied.
    image: Union[List[InferenceRequestImage], InferenceRequestImage]

PerceptionEncoderInferenceRequest

Bases: BaseRequest

Request for PERCEPTION_ENCODER inference.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

perception_encoder_version_id Optional[str]

The version ID of PERCEPTION_ENCODER to be used for this request.

Source code in inference/core/entities/requests/perception_encoder.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class PerceptionEncoderInferenceRequest(BaseRequest):
    """Request for PERCEPTION_ENCODER inference.

    Attributes:
        api_key (Optional[str]): Roboflow API Key.
        perception_encoder_version_id (Optional[str]): The version ID of PERCEPTION_ENCODER to be used for this request.
        model_id (Optional[str]): Model identifier; derived as
            ``perception_encoder/<perception_encoder_version_id>`` when not supplied.
    """

    # NOTE(review): the description enumerates RN*/ViT-* version names while the
    # example is "PE-Core-L14-336" — confirm which list of versions is correct.
    perception_encoder_version_id: Optional[str] = Field(
        default=PERCEPTION_ENCODER_VERSION_ID,
        examples=["PE-Core-L14-336"],
        description="The version ID of PERCEPTION_ENCODER to be used for this request. Must be one of RN101, RN50, RN50x16, RN50x4, RN50x64, ViT-B-16, ViT-B-32, ViT-L-14-336px, and ViT-L-14.",
    )
    model_id: Optional[str] = Field(default=None)

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True)
    def validate_model_id(cls, value, values):
        """Fill in ``model_id`` from the version id when the caller gave none.

        Returns:
            ``value`` when it is not ``None``; otherwise
            ``perception_encoder/<perception_encoder_version_id>``, or ``None``
            if no version id is available.
        """
        if value is not None:
            return value
        version_id = values.get("perception_encoder_version_id")
        return None if version_id is None else f"perception_encoder/{version_id}"

PerceptionEncoderTextEmbeddingRequest

Bases: PerceptionEncoderInferenceRequest

Request for PERCEPTION_ENCODER text embedding.

Attributes:

Name Type Description
text Union[List[str], str]

A string or list of strings.

Source code in inference/core/entities/requests/perception_encoder.py
48
49
50
51
52
53
54
55
56
57
58
class PerceptionEncoderTextEmbeddingRequest(PerceptionEncoderInferenceRequest):
    """Request for PERCEPTION_ENCODER text embedding.

    Attributes:
        text (Union[List[str], str]): A single string or a list of strings to be embedded.
    """

    # No default is declared, so the text must always be supplied.
    text: Union[List[str], str] = Field(
        examples=["The quick brown fox jumps over the lazy dog"],
        description="A string or list of strings",
    )

inference.core.entities.requests.sam

Classes

SamEmbeddingRequest

Bases: SamInferenceRequest

SAM embedding request.

Attributes:

Name Type Description
image Optional[InferenceRequestImage]

The image to be embedded.

image_id Optional[str]

The ID of the image to be embedded used to cache the embedding.

format Optional[str]

The format of the response. Must be one of json or binary.

Source code in inference/core/entities/requests/sam.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class SamEmbeddingRequest(SamInferenceRequest):
    """SAM embedding request.

    All three fields are optional and have defaults.

    Attributes:
        image (Optional[inference.core.entities.requests.inference.InferenceRequestImage]): The image to be embedded. Defaults to None.
        image_id (Optional[str]): The ID of the image to be embedded used to cache the embedding. Defaults to None.
        format (Optional[str]): The format of the response. Must be one of json or binary. Defaults to "json".
    """

    image: Optional[InferenceRequestImage] = Field(
        default=None,
        description="The image to be embedded",
    )
    image_id: Optional[str] = Field(
        default=None,
        examples=["image_id"],
        description="The ID of the image to be embedded used to cache the embedding.",
    )
    # NOTE(review): the allowed values (json / binary) are only stated in the
    # description; nothing in this class enforces them.
    format: Optional[str] = Field(
        default="json",
        examples=["json"],
        description="The format of the response. Must be one of json or binary. If binary, embedding is returned as a binary numpy array.",
    )

SamInferenceRequest

Bases: BaseRequest

SAM inference request.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

sam_version_id Optional[str]

The version ID of SAM to be used for this request.

Source code in inference/core/entities/requests/sam.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class SamInferenceRequest(BaseRequest):
    """SAM inference request.

    Attributes:
        api_key (Optional[str]): Roboflow API Key.
        sam_version_id (Optional[str]): The version ID of SAM to be used for this request.
        model_id (Optional[str]): Model identifier; derived as
            ``sam/<sam_version_id>`` when not supplied.
    """

    sam_version_id: Optional[str] = Field(
        default=SAM_VERSION_ID,
        examples=["vit_h"],
        description="The version ID of SAM to be used for this request. Must be one of vit_h, vit_l, or vit_b.",
    )

    model_id: Optional[str] = Field(default=None)

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True)
    def validate_model_id(cls, value, values):
        """Fill in ``model_id`` from ``sam_version_id`` when the caller gave none.

        Returns:
            ``value`` when it is not ``None``; otherwise ``sam/<sam_version_id>``,
            or ``None`` if no version id is available.
        """
        if value is not None:
            return value
        version_id = values.get("sam_version_id")
        return None if version_id is None else f"sam/{version_id}"

SamSegmentationRequest

Bases: SamInferenceRequest

SAM segmentation request.

Attributes:

Name Type Description
embeddings Optional[Union[List[List[List[List[float]]]], Any]]

The embeddings to be decoded.

embeddings_format Optional[str]

The format of the embeddings.

format Optional[str]

The format of the response.

image Optional[InferenceRequestImage]

The image to be segmented.

image_id Optional[str]

The ID of the image to be segmented used to retrieve cached embeddings.

has_mask_input Optional[bool]

Whether or not the request includes a mask input.

mask_input Optional[Union[List[List[List[float]]], Any]]

The set of output masks.

mask_input_format Optional[str]

The format of the mask input.

orig_im_size Optional[List[int]]

The original size of the image used to generate the embeddings.

point_coords Optional[List[List[float]]]

The coordinates of the interactive points used during decoding.

point_labels Optional[List[float]]

The labels of the interactive points used during decoding.

use_mask_input_cache Optional[bool]

Whether or not to use the mask input cache.

Source code in inference/core/entities/requests/sam.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class SamSegmentationRequest(SamInferenceRequest):
    """SAM segmentation request.

    Every field is optional and has a default. Per the field descriptions,
    either ``embeddings`` or ``image`` must be provided, and ``orig_im_size``
    is needed when the image is not provided; none of these cross-field
    requirements is enforced in this class.

    Attributes:
        embeddings (Optional[Union[List[List[List[List[float]]]], Any]]): The embeddings to be decoded.
        embeddings_format (Optional[str]): The format of the embeddings.
        format (Optional[str]): The format of the response.
        image (Optional[InferenceRequestImage]): The image to be segmented.
        image_id (Optional[str]): The ID of the image to be segmented used to retrieve cached embeddings.
        has_mask_input (Optional[bool]): Whether or not the request includes a mask input.
        mask_input (Optional[Union[List[List[List[float]]], Any]]): The mask input; per its description, the low resolution output mask from the previous inference.
        mask_input_format (Optional[str]): The format of the mask input.
        orig_im_size (Optional[List[int]]): The original size of the image used to generate the embeddings.
        point_coords (Optional[List[List[float]]]): The coordinates of the interactive points used during decoding.
        point_labels (Optional[List[float]]): The labels of the interactive points used during decoding.
        use_mask_input_cache (Optional[bool]): Whether or not to use the mask input cache.
    """

    embeddings: Optional[Union[List[List[List[List[float]]]], Any]] = Field(
        None,
        examples=["[[[[0.1, 0.2, 0.3, ...] ...] ...]]"],
        description="The embeddings to be decoded. The dimensions of the embeddings are 1 x 256 x 64 x 64. If embeddings is not provided, image must be provided.",
    )
    embeddings_format: Optional[str] = Field(
        default="json",
        examples=["json"],
        description="The format of the embeddings. Must be one of json or binary. If binary, embeddings are expected to be a binary numpy array.",
    )
    format: Optional[str] = Field(
        default="json",
        examples=["json"],
        description="The format of the response. Must be one of json or binary. If binary, masks are returned as binary numpy arrays. If json, masks are converted to polygons, then returned as json.",
    )
    image: Optional[InferenceRequestImage] = Field(
        default=None,
        description="The image to be segmented. Only required if embeddings are not provided.",
    )
    image_id: Optional[str] = Field(
        default=None,
        examples=["image_id"],
        description="The ID of the image to be segmented used to retrieve cached embeddings. If an embedding is cached, it will be used instead of generating a new embedding. If no embedding is cached, a new embedding will be generated and cached.",
    )
    has_mask_input: Optional[bool] = Field(
        default=False,
        examples=[True],
        description="Whether or not the request includes a mask input. If true, the mask input must be provided.",
    )
    # NOTE(review): the description opens with "The set of output masks" although
    # this is an input field — presumably it means the masks returned by a
    # previous call; confirm and reword the schema text.
    mask_input: Optional[Union[List[List[List[float]]], Any]] = Field(
        default=None,
        description="The set of output masks. If request format is json, masks is a list of polygons, where each polygon is a list of points, where each point is a tuple containing the x,y pixel coordinates of the point. If request format is binary, masks is a list of binary numpy arrays. The dimensions of each mask are 256 x 256. This is the same as the output, low resolution mask from the previous inference.",
    )
    mask_input_format: Optional[str] = Field(
        default="json",
        examples=["json"],
        description="The format of the mask input. Must be one of json or binary. If binary, mask input is expected to be a binary numpy array.",
    )
    orig_im_size: Optional[List[int]] = Field(
        default=None,
        examples=[[640, 320]],
        description="The original size of the image used to generate the embeddings. This is only required if the image is not provided.",
    )
    # The defaults for point_coords / point_labels pair a single point at (0, 0)
    # with label -1, which the point_labels description defines as a negative point.
    point_coords: Optional[List[List[float]]] = Field(
        default=[[0.0, 0.0]],
        examples=[[[10.0, 10.0]]],
        description="The coordinates of the interactive points used during decoding. Each point (x,y pair) corresponds to a label in point_labels.",
    )
    point_labels: Optional[List[float]] = Field(
        default=[-1],
        examples=[[1]],
        description="The labels of the interactive points used during decoding. A 1 represents a positive point (part of the object to be segmented). A -1 represents a negative point (not part of the object to be segmented). Each label corresponds to a point in point_coords.",
    )
    use_mask_input_cache: Optional[bool] = Field(
        default=True,
        examples=[True],
        description="Whether or not to use the mask input cache. If true, the mask input cache will be used if it exists. If false, the mask input cache will not be used.",
    )

inference.core.entities.requests.sam2

Classes

Sam2EmbeddingRequest

Bases: Sam2InferenceRequest

SAM2 embedding request.

Attributes:

Name Type Description
image Optional[InferenceRequestImage]

The image to be embedded.

image_id Optional[str]

The ID of the image to be embedded used to cache the embedding.

format Optional[str]

The format of the response. Must be one of json or binary.

Source code in inference/core/entities/requests/sam2.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
class Sam2EmbeddingRequest(Sam2InferenceRequest):
    """SAM2 embedding request.

    Attributes:
        image (Optional[inference.core.entities.requests.inference.InferenceRequestImage]): The image to be embedded. Defaults to None.
        image_id (Optional[str]): The ID of the image to be embedded used to cache the embedding. Defaults to None.
    """

    # NOTE(review): earlier documentation also listed a `format` attribute, but
    # this class declares no such field — confirm whether one was intended.
    image: Optional[InferenceRequestImage] = Field(
        default=None,
        description="The image to be embedded",
    )
    image_id: Optional[str] = Field(
        default=None,
        examples=["image_id"],
        description="The ID of the image to be embedded used to cache the embedding.",
    )

Sam2InferenceRequest

Bases: BaseRequest

SAM2 inference request.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

sam2_version_id Optional[str]

The version ID of SAM2 to be used for this request.

Source code in inference/core/entities/requests/sam2.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class Sam2InferenceRequest(BaseRequest):
    """SAM2 inference request.

    Attributes:
        api_key (Optional[str]): Roboflow API Key.
        sam2_version_id (Optional[str]): The version ID of SAM2 to be used for this request.
        model_id (Optional[str]): Model identifier. When not supplied it is
            derived as ``sam2/<sam2_version_id>``; it stays ``None`` only if
            ``sam2_version_id`` is ``None`` as well.
    """

    sam2_version_id: Optional[str] = Field(
        default=SAM2_VERSION_ID,
        examples=["hiera_large"],
        description="The version ID of SAM to be used for this request. Must be one of hiera_tiny, hiera_small, hiera_large, hiera_b_plus",
    )

    model_id: Optional[str] = Field(None)

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True)
    def validate_model_id(cls, value, values):
        """Default ``model_id`` from ``sam2_version_id`` when it is not given.

        Args:
            value: The ``model_id`` supplied by the caller, or ``None``.
            values: Previously validated fields; ``sam2_version_id`` is
                declared before ``model_id`` so it is available here.

        Returns:
            ``value`` if it is not ``None``; otherwise
            ``sam2/<sam2_version_id>``, or ``None`` when no version id is set.
        """
        if value is not None:
            return value
        # The lookup key must match the declared field name. The previous key
        # ("sam_version_id") is not a field of this model, so model_id always
        # stayed None.
        version_id = values.get("sam2_version_id")
        if version_id is None:
            return None
        return f"sam2/{version_id}"

Sam2SegmentationRequest

Bases: Sam2InferenceRequest

SAM2 segmentation request.

Attributes:

Name Type Description
format Optional[str]

The format of the response.

image InferenceRequestImage

The image to be segmented.

image_id Optional[str]

The ID of the image to be segmented used to retrieve cached embeddings.

point_coords Optional[List[List[float]]]

The coordinates of the interactive points used during decoding.

point_labels Optional[List[float]]

The labels of the interactive points used during decoding.

Source code in inference/core/entities/requests/sam2.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
class Sam2SegmentationRequest(Sam2InferenceRequest):
    """SAM2 segmentation request.

    Attributes:
        format (Optional[str]): The format of the response.
        image (InferenceRequestImage): The image to be segmented.
        image_id (Optional[str]): The ID of the image to be segmented used to retrieve cached embeddings.
        prompts (Sam2PromptSet): Prompts for the masks to predict. Also accepts a
            flat list of prompts or a single prompt dict, which are coerced to a
            ``Sam2PromptSet`` before validation.
        multimask_output (bool): Whether the model returns three masks instead of one.
        save_logits_to_cache (bool): Whether low-resolution logits are saved to the cache.
        load_logits_from_cache (bool): Whether previously cached low-resolution logits are loaded.
    """

    format: Optional[str] = Field(
        default="json",
        examples=["json"],
        description="The format of the response. Must be one of 'json', 'rle', or 'binary'. If binary, masks are returned as binary numpy arrays. If json, masks are converted to polygons. If rle, masks are converted to RLE format.",
    )
    image: InferenceRequestImage = Field(
        description="The image to be segmented.",
    )
    image_id: Optional[str] = Field(
        default=None,
        examples=["image_id"],
        description="The ID of the image to be segmented used to retrieve cached embeddings. If an embedding is cached, it will be used instead of generating a new embedding. If no embedding is cached, a new embedding will be generated and cached.",
    )
    # `examples` (plural) matches every other field in this file; the value was
    # already a list of example payloads but was passed under the `example` key.
    prompts: Sam2PromptSet = Field(
        default=Sam2PromptSet(prompts=None),
        examples=[{"prompts": [{"points": [{"x": 100, "y": 100, "positive": True}]}]}],
        description="A list of prompts for masks to predict. Each prompt can include a bounding box and / or a set of postive or negative points. "
        "Also accepts a flat array of prompts (e.g. 'prompts': [{...}, {...}]) for convenience.",
    )
    multimask_output: bool = Field(
        default=True,
        examples=[True],
        description="If true, the model will return three masks. "
        "For ambiguous input prompts (such as a single click), this will often "
        "produce better masks than a single prediction. If only a single "
        "mask is needed, the model's predicted quality score can be used "
        "to select the best mask. For non-ambiguous prompts, such as multiple "
        "input prompts, multimask_output=False can give better results.",
    )

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("prompts", pre=True, always=True)
    def _coerce_prompts(cls, value):
        """
        Accepts any of the following and coerces to Sam2PromptSet:
        - None
        - Sam2PromptSet
        - {"prompts": [...]} (nested)
        - [...] (flat list of prompts)
        - single prompt dict (wrapped to list)

        Any other value is returned unchanged so that regular field validation
        can accept or reject it.

        Raises:
            ValueError: If a flat list contains an entry that is neither a dict
                nor a Sam2Prompt instance.
        """
        if value is None:
            return Sam2PromptSet(prompts=None)
        if isinstance(value, Sam2PromptSet):
            return value
        # Nested dict with key 'prompts'
        if isinstance(value, dict):
            if "prompts" in value:
                return Sam2PromptSet(**value)
            # Single prompt dict - wrap and parse
            try:
                return Sam2PromptSet(prompts=[Sam2Prompt(**value)])
            except Exception:
                # Deliberate best-effort: if the dict is not a valid single
                # prompt, let Sam2PromptSet raise its own error for it.
                return Sam2PromptSet(**value)
        # Flat list of prompts (dicts or Sam2Prompt instances)
        if isinstance(value, list):
            prompts: List[Sam2Prompt] = []
            for item in value:
                if isinstance(item, Sam2Prompt):
                    prompts.append(item)
                elif isinstance(item, dict):
                    prompts.append(Sam2Prompt(**item))
                else:
                    raise ValueError(
                        "Invalid prompt entry; expected dict or Sam2Prompt instance"
                    )
            return Sam2PromptSet(prompts=prompts)
        # Fallback: let Pydantic try
        return value

    save_logits_to_cache: bool = Field(
        default=False,
        description="If True, saves the low-resolution logits to the cache for potential future use. "
        "This can speed up subsequent requests with similar prompts on the same image. "
        "This feature is ignored if DISABLE_SAM2_LOGITS_CACHE env variable is set True",
    )
    load_logits_from_cache: bool = Field(
        default=False,
        description="If True, attempts to load previously cached low-resolution logits for the given image and prompt set. "
        "This can significantly speed up inference when making multiple similar requests on the same image. "
        "This feature is ignored if DISABLE_SAM2_LOGITS_CACHE env variable is set True",
    )

inference.core.entities.requests.sam3

Classes

Sam3InferenceRequest

Bases: BaseRequest

SAM3 inference request.

Attributes:

Name Type Description
model_id Optional[str]

The model ID to be used; defaults to sam3/sam3_final.

Source code in inference/core/entities/requests/sam3.py
89
90
91
92
93
94
95
96
97
98
99
class Sam3InferenceRequest(BaseRequest):
    """SAM3 inference request.

    Attributes:
        model_id (Optional[str]): The model ID of SAM3 to be used; defaults to
            ``sam3/sam3_final``.
    """

    model_id: Optional[str] = Field(
        default="sam3/sam3_final",
        description="The model ID of SAM3. Use 'sam3/sam3_final' to target the generic base model.",
    )

Sam3Prompt

Bases: BaseModel

Unified prompt that can contain text and/or geometry.

Absolute pixel coordinates are used for boxes. Labels accept 0/1 or booleans.

Source code in inference/core/entities/requests/sam3.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
class Sam3Prompt(BaseModel):
    """A single SAM3 prompt made of a text concept, exemplar boxes, or both.

    Boxes are expressed in absolute pixel coordinates. Box labels may be given
    as 0/1 integers or as booleans.

    Attributes:
        type (Optional[str]): Optional hint, 'text' or 'visual'. A 'visual'
            prompt must carry at least one box.
        text (Optional[str]): Short noun phrase naming the concept to segment.
        output_prob_thresh (Optional[float]): Per-prompt score threshold in
            [0.0, 1.0]; overrides the request-level threshold when set.
        boxes (Optional[List[Union[Box, BoxXYXY]]]): Exemplar boxes, each
            entry either XYWH or XYXY.
        box_labels (Optional[List[Union[int, bool]]]): One label per box; when
            provided, its length must equal the number of boxes.
    """

    type: Optional[str] = Field(
        description="Optional hint: 'text' or 'visual'. 'visual' requires at least one box.",
        default=None,
    )
    text: Optional[str] = Field(
        description="Concept to segment as a short noun phrase (e.g. 'person'). "
        "All matching instances are returned. Can be combined with exemplar boxes in the same prompt.",
        default=None,
    )

    output_prob_thresh: Optional[float] = Field(
        description="Score threshold for this prompt's outputs. Overrides request-level threshold if set.",
        default=None,
    )

    # Two accepted box encodings, both in absolute pixels.
    # Encoding 1: XYWH (x, y, width, height).
    class Box(BaseModel):
        x: float
        y: float
        width: float
        height: float

    # Encoding 2: XYXY (two corner points).
    class BoxXYXY(BaseModel):
        x0: float
        y0: float
        x1: float
        y1: float

    # One field carries both encodings; each entry is matched against Box
    # first and BoxXYXY second.
    boxes: Optional[List[Union[Box, BoxXYXY]]] = Field(
        description="Exemplar boxes in absolute pixels, as XYWH entries "
        "({x, y, width, height}, top-left anchored) or XYXY entries ({x0, y0, x1, y1}). "
        "Each box marks an example object; the model segments every instance matching "
        "the exemplars (and text, if provided), not just the boxed objects. "
        "Requires box_labels.",
        default=None,
    )
    box_labels: Optional[List[Union[int, bool]]] = Field(
        description="Per-box exemplar labels, one per entry in boxes: "
        "1/true marks a positive exemplar (segment objects like this), "
        "0/false marks a negative exemplar (exclude objects like this). "
        "Required when boxes is set.",
        default=None,
    )

    @validator("boxes", always=True)
    def _validate_visual_boxes(cls, boxes, values):
        """Reject a 'visual' prompt that has no boxes (None or empty list)."""
        # `type` is declared before `boxes`, so it is looked up in `values`.
        is_visual = values.get("type") == "visual"
        if is_visual and not boxes:
            raise ValueError("Visual prompt requires at least one box")
        return boxes

    @validator("box_labels", always=True)
    def _validate_box_labels(cls, labels, values):
        """Require labels, when given, to pair one-to-one with boxes."""
        # NOTE(review): the field descriptions say box_labels is required when
        # boxes is set, but labels left as None are accepted here regardless
        # of boxes — confirm whether that leniency is intended.
        if labels is None:
            return labels
        boxes = values.get("boxes")
        if boxes is not None and len(labels) == len(boxes):
            return labels
        raise ValueError("box_labels must match boxes length when provided")

    @validator("output_prob_thresh")
    def _validate_output_prob_thresh(cls, v):
        """Ensure a provided threshold lies within [0.0, 1.0]."""
        if v is None:
            return v
        if v < 0.0 or v > 1.0:
            raise ValueError("output_prob_thresh must be between 0.0 and 1.0")
        return v

inference.core.entities.requests.sam3_3d

Classes

Sam3_3D_Objects_InferenceRequest

Bases: BaseRequest

SAM3D inference request for 3D object generation.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

image InferenceRequestImage

The input image to be used for 3D generation.

mask_input Any

Mask(s) in any supported format - polygon, binary mask, or RLE.

Source code in inference/core/entities/requests/sam3_3d.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class Sam3_3D_Objects_InferenceRequest(BaseRequest):
    """SAM3D inference request for 3D object generation.

    Attributes:
        api_key (Optional[str]): Roboflow API Key.
        image (InferenceRequestImage): The input image to be used for 3D generation.
        mask_input (Any): Mask(s) in any supported format - polygon, binary mask, or RLE.
        model_id (Optional[str]): The model ID for SAM3_3D; an explicit None
            is replaced with "sam3-3d-objects".
        output_meshes (Optional[bool]): Whether to output object meshes in
            addition to the always-produced object gaussians.
        output_scene (Optional[bool]): Whether to output the combined scene
            reconstruction in addition to individual objects.
        with_mesh_postprocess (Optional[bool]): Enable mesh postprocessing.
        with_texture_baking (Optional[bool]): Enable texture baking for meshes.
        use_distillations (Optional[bool]): Use the distilled versions of the
            model components.
    """

    image: InferenceRequestImage = Field(
        description="The input image to be used for 3D generation.",
    )

    mask_input: Any = Field(
        description="Mask input in any supported format: "
        "polygon [x1,y1,x2,y2,...], binary mask (base64), RLE dict, or list of these.",
    )

    model_id: Optional[str] = Field(
        description="The model ID for SAM3_3D.",
        default="sam3-3d-objects",
    )

    output_meshes: Optional[bool] = Field(
        description="SAM3 3D always outputs object gaussians, and can optionally output object meshes if output_meshes is True.",
        default=True,
    )

    output_scene: Optional[bool] = Field(
        description="Output the combined scene reconstruction in addition to individual object reconstructions.",
        default=True,
    )

    with_mesh_postprocess: Optional[bool] = Field(
        description="Enable mesh postprocessing.",
        default=True,
    )

    with_texture_baking: Optional[bool] = Field(
        description="Enable texture baking for meshes.",
        default=True,
    )

    use_distillations: Optional[bool] = Field(
        description="Use the distilled versions of the model components.",
        default=False,
    )

    @validator("model_id", always=True)
    def validate_model_id(cls, value):
        """Fall back to the default model id when None is supplied."""
        return "sam3-3d-objects" if value is None else value

inference.core.entities.requests.server_state

Classes

AddModelRequest

Bases: BaseModel

Request to add a model to the inference server.

Attributes:

Name Type Description
model_id str

A unique model identifier.

model_type Optional[str]

The type of the model, usually referring to what task the model performs.

api_key Optional[str]

Roboflow API Key that will be passed to the model during initialization for artifact retrieval.

Source code in inference/core/entities/requests/server_state.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
class AddModelRequest(BaseModel):
    """Request to add a model to the inference server.

    Attributes:
        model_id (str): A unique model identifier.
        model_type (Optional[str]): The type of the model, usually referring to what task the model performs.
        api_key (Optional[str]): Roboflow API Key that will be passed to the model during initialization for artifact retrieval.
    """

    # Pydantic reserves the `model_` prefix by default; clearing the protected
    # namespaces lets `model_id` / `model_type` be declared as regular fields.
    model_config = ConfigDict(protected_namespaces=())
    # ModelID, ModelType and ApiKey are defined outside this class —
    # presumably shared pydantic field definitions; verify at their definition.
    model_id: str = ModelID
    model_type: Optional[str] = ModelType
    api_key: Optional[str] = ApiKey

ClearModelRequest

Bases: BaseModel

Request to clear a model from the inference server.

Attributes:

Name Type Description
model_id str

A unique model identifier.

Source code in inference/core/entities/requests/server_state.py
23
24
25
26
27
28
29
30
31
class ClearModelRequest(BaseModel):
    """Request to clear a model from the inference server.

    Attributes:
        model_id (str): A unique model identifier.
    """

    # Pydantic reserves the `model_` prefix by default; clearing the protected
    # namespaces lets `model_id` be declared as a regular field.
    model_config = ConfigDict(protected_namespaces=())
    # ModelID is defined outside this class — presumably a shared pydantic
    # field definition; verify at its definition.
    model_id: str = ModelID

inference.core.entities.requests.trocr

Classes

TrOCRInferenceRequest

Bases: BaseRequest

TrOCR inference request.

Attributes:

Name Type Description
api_key Optional[str]

Roboflow API Key.

Source code in inference/core/entities/requests/trocr.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
class TrOCRInferenceRequest(BaseRequest):
    """TrOCR inference request.

    Attributes:
        api_key (Optional[str]): Roboflow API Key.
        image (Union[List[InferenceRequestImage], InferenceRequestImage]): A
            single input image or a list of them.
        trocr_version_id (Optional[str]): TrOCR version identifier; defaults
            to "trocr-base-printed".
        model_id (Optional[str]): Model identifier. When not supplied it is
            derived as "trocr/<trocr_version_id>", or left as None if the
            version id is None as well.
    """

    image: Union[List[InferenceRequestImage], InferenceRequestImage]
    trocr_version_id: Optional[str] = "trocr-base-printed"
    model_id: Optional[str] = Field(default=None)

    # TODO[pydantic]: We couldn't refactor the `validator`, please replace it by `field_validator` manually.
    # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
    @validator("model_id", always=True, allow_reuse=True)
    def validate_model_id(cls, value, values):
        """Derive model_id from trocr_version_id unless one was given explicitly."""
        if value is not None:
            return value
        version_id = values.get("trocr_version_id")
        return None if version_id is None else f"trocr/{version_id}"

inference.core.entities.requests.yolo_world

Classes

YOLOWorldInferenceRequest

Bases: DynamicClassBaseInferenceRequest

Request for YOLO-World zero-shot predictions.

Attributes:

Name Type Description
text List[str]

A list of strings.

Source code in inference/core/entities/requests/yolo_world.py
 9
10
11
12
13
14
15
16
17
class YOLOWorldInferenceRequest(DynamicClassBaseInferenceRequest):
    """Request for YOLO-World zero-shot predictions.

    Attributes:
        text (List[str]): A list of strings.
        yolo_world_version_id (Optional[str]): YOLO-World version identifier; defaults to "l".
        confidence (Optional[float]): Confidence setting; defaults to DEFAULT_CONFIDENCE.
    """

    yolo_world_version_id: Optional[str] = "l"
    confidence: Optional[float] = DEFAULT_CONFIDENCE

core/entities/responses

inference.core.entities.responses.clip

Classes

ClipCompareResponse

Bases: InferenceResponse

Response for CLIP comparison.

Attributes:

Name Type Description
similarity Union[List[float], Dict[str, float]]

Similarity scores.

time float

The time in seconds it took to produce the similarity scores including preprocessing.

Source code in inference/core/entities/responses/clip.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class ClipCompareResponse(InferenceResponse):
    """Response for CLIP comparison.

    Attributes:
        similarity (Union[List[float], Dict[str, float]]): Similarity scores.
        time (Optional[float]): The time in seconds it took to produce the similarity scores including preprocessing.
        parent_id (Optional[str]): Identifier of parent image region, for referring to the RoI that was the input to inference when a stack of detection models is in use.
    """

    similarity: Union[List[float], Dict[str, float]]
    time: Optional[float] = Field(
        default=None,
        description="The time in seconds it took to produce the similarity scores including preprocessing",
    )
    parent_id: Optional[str] = Field(
        description="Identifier of parent image region. Useful when stack of detection-models is in use to refer the RoI being the input to inference",
        default=None,
    )

ClipEmbeddingResponse

Bases: InferenceResponse

Response for CLIP embedding.

Attributes:

Name Type Description
embeddings List[List[float]]

A list of embeddings, each embedding is a list of floats.

time float

The time in seconds it took to produce the embeddings including preprocessing.

Source code in inference/core/entities/responses/clip.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
class ClipEmbeddingResponse(InferenceResponse):
    """Response for CLIP embedding.

    Attributes:
        embeddings (List[List[float]]): A list of embeddings, each embedding is a list of floats.
        time (Optional[float]): The time in seconds it took to produce the embeddings including preprocessing.
    """

    embeddings: List[List[float]] = Field(
        examples=["[[0.12, 0.23, 0.34, ..., 0.43]]"],
        description="A list of embeddings, each embedding is a list of floats",
    )
    time: Optional[float] = Field(
        default=None,
        description="The time in seconds it took to produce the embeddings including preprocessing",
    )

inference.core.entities.responses.inference

Classes

ClassificationInferenceResponse

Bases: CvInferenceResponse, WithVisualizationResponse

Classification inference response.

Attributes:

Name Type Description
predictions List[ClassificationPrediction]

List of classification predictions.

top str

The top predicted class label.

confidence float

The confidence of the top predicted class label.

Source code in inference/core/entities/responses/inference.py
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
class ClassificationInferenceResponse(CvInferenceResponse, WithVisualizationResponse):
    """Classification inference response.

    Attributes:
        predictions (List[inference.core.entities.responses.inference.ClassificationPrediction]): List of classification predictions.
        top (str): The top predicted class label. Defaults to an empty string.
        confidence (float): The confidence of the top predicted class label. Defaults to 0.0.
        parent_id (Optional[str]): Identifier of parent image region, for referring to the RoI that was the input to inference when a stack of detection models is in use.
    """

    predictions: List[ClassificationPrediction]
    top: str = Field(
        description="The top predicted class label", default=""
    )  # Not making this field optional to avoid breaking change - in other parts of the codebase `model_dump` is called with `exclude_none=True`
    confidence: float = Field(
        description="The confidence of the top predicted class label",
        default=0.0,
    )
    parent_id: Optional[str] = Field(
        description="Identifier of parent image region. Useful when stack of detection-models is in use to refer the RoI being the input to inference",
        default=None,
    )

ClassificationPrediction

Bases: BaseModel

Classification prediction.

Attributes:

Name Type Description
class_name str

The predicted class label.

class_id int

Numeric ID associated with the class label.

confidence float

The class label confidence as a fraction between 0 and 1.

Source code in inference/core/entities/responses/inference.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
class ClassificationPrediction(BaseModel):
    """Classification prediction.

    Attributes:
        class_name (str): The predicted class label (aliased as "class").
        class_id (int): Numeric ID associated with the class label.
        confidence (float): The class label confidence as a fraction between 0 and 1.
    """

    # `class` is a reserved word in Python, so the attribute is named
    # `class_name` and mapped to the "class" key through the alias.
    class_name: str = Field(alias="class", description="The predicted class label")
    class_id: int = Field(description="Numeric ID associated with the class label")
    confidence: float = Field(
        description="The class label confidence as a fraction between 0 and 1"
    )

CvInferenceResponse

Bases: InferenceResponse

Computer Vision inference response.

Attributes:

Name Type Description
image Union[List[InferenceResponseImage], InferenceResponseImage]

Image(s) used in inference.

Source code in inference/core/entities/responses/inference.py
202
203
204
205
206
207
208
209
class CvInferenceResponse(InferenceResponse):
    """Computer Vision inference response.

    Attributes:
        image (Union[List[inference.core.entities.responses.inference.InferenceResponseImage], inference.core.entities.responses.inference.InferenceResponseImage]): Image(s) used in inference.
    """

    # Either a single image descriptor or a list of them is accepted.
    image: Union[List[InferenceResponseImage], InferenceResponseImage]

DepthEstimationResponse

Bases: BaseModel

Response for depth estimation inference.

Attributes:

Name Type Description
normalized_depth List[List[float]]

The normalized depth map as a 2D array of floats between 0 and 1.

image Optional[str]

Base64 encoded visualization of the depth map if visualize_predictions is True.

time float

The processing time in seconds.

visualization Optional[str]

Base64 encoded visualization of the depth map if visualize_predictions is True.

Source code in inference/core/entities/responses/inference.py
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
class DepthEstimationResponse(BaseModel):
    """Response for depth estimation inference.

    Attributes:
        normalized_depth (List[List[float]]): The normalized depth map as a 2D array of floats between 0 and 1.
        image (Optional[str]): Base64 encoded visualization of the depth map if visualize_predictions is True.
    """

    # NOTE(review): this docstring previously also listed `time` and
    # `visualization`, but no such fields are declared on this model —
    # confirm whether they should be added or were intentionally dropped.
    normalized_depth: List[List[float]] = Field(
        description="The normalized depth map as a 2D array of floats between 0 and 1"
    )
    image: Optional[str] = Field(
        None,
        description="Base64 encoded visualization of the depth map if visualize_predictions is True",
    )

FaceDetectionPrediction

Bases: ObjectDetectionPrediction

Face Detection prediction.

Attributes:

Name Type Description
class_name str

fixed value "face".

landmarks Union[List[Point], List[Point3D]]

The detected face landmarks.

Source code in inference/core/entities/responses/inference.py
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
class FaceDetectionPrediction(ObjectDetectionPrediction):
    """Face Detection prediction.

    Attributes:
        class_id (Optional[int]): The class id of the prediction; defaults to 0.
        class_name (str): The predicted class label (aliased as "class"); defaults to "face".
        landmarks (Union[List[inference.core.entities.responses.inference.Point], List[inference.core.entities.responses.inference.Point3D]]): The detected face landmarks.
    """

    # Both fields below redeclare ObjectDetectionPrediction fields in order to
    # give them defaults.
    class_id: Optional[int] = Field(
        description="The class id of the prediction", default=0
    )
    class_name: str = Field(
        alias="class", default="face", description="The predicted class label"
    )
    landmarks: Union[List[Point], List[Point3D]]

InferenceResponse

Bases: BaseModel

Base inference response.

Attributes:

Name Type Description
inference_id Optional[str]

Unique identifier of inference

frame_id Optional[int]

The frame id of the image used in inference if the input was a video.

time Optional[float]

The time in seconds it took to produce the predictions including image preprocessing.

Source code in inference/core/entities/responses/inference.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
class InferenceResponse(BaseModel):
    """Base inference response.

    Attributes:
        inference_id (Optional[str]): Unique identifier of inference
        frame_id (Optional[int]): The frame id of the image used in inference if the input was a video.
        time (Optional[float]): The time in seconds it took to produce the predictions including image preprocessing.
    """

    # Clears pydantic's protected namespaces (`model_` by default) —
    # presumably so that fields starting with `model_` can be declared on
    # this class or its subclasses without a namespace warning.
    model_config = ConfigDict(protected_namespaces=())
    inference_id: Optional[str] = Field(
        description="Unique identifier of inference", default=None
    )
    frame_id: Optional[int] = Field(
        default=None,
        description="The frame id of the image used in inference if the input was a video",
    )
    time: Optional[float] = Field(
        default=None,
        description="The time in seconds it took to produce the predictions including image preprocessing",
    )

InferenceResponseImage

Bases: BaseModel

Inference response image information.

Attributes:

Name Type Description
width int

The original width of the image used in inference.

height int

The original height of the image used in inference.

Source code in inference/core/entities/responses/inference.py
165
166
167
168
169
170
171
172
173
174
175
176
class InferenceResponseImage(BaseModel):
    """Inference response image information.

    Carries the original dimensions of the image used in inference; both
    fields are required.

    Attributes:
        width (int): The original width of the image used in inference.
        height (int): The original height of the image used in inference.
    """

    width: int = Field(description="The original width of the image used in inference")
    height: int = Field(
        description="The original height of the image used in inference"
    )

InstanceSegmentationInferenceResponse

Bases: CvInferenceResponse, WithVisualizationResponse

Instance Segmentation inference response.

Attributes:

Name Type Description
predictions List[Union[InstanceSegmentationPrediction, InstanceSegmentationRLEPrediction]]

List of instance segmentation predictions.

Source code in inference/core/entities/responses/inference.py
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
class InstanceSegmentationInferenceResponse(
    CvInferenceResponse, WithVisualizationResponse
):
    """Instance Segmentation inference response.

    Attributes:
        predictions (List[Union[inference.core.entities.responses.inference.InstanceSegmentationPrediction, inference.core.entities.responses.inference.InstanceSegmentationRLEPrediction]]): List of instance segmentation predictions.
    """

    # Each entry is either a regular or an RLE instance segmentation prediction.
    predictions: List[
        Union[InstanceSegmentationPrediction, InstanceSegmentationRLEPrediction]
    ]

MultiLabelClassificationInferenceResponse

Bases: CvInferenceResponse, WithVisualizationResponse

Multi-label Classification inference response.

Attributes:

Name Type Description
predictions Dict[str, MultiLabelClassificationPrediction]

Dictionary of multi-label classification predictions.

predicted_classes List[str]

The list of predicted classes.

Source code in inference/core/entities/responses/inference.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
class MultiLabelClassificationInferenceResponse(
    CvInferenceResponse, WithVisualizationResponse
):
    """Multi-label Classification inference response.

    Attributes:
        predictions (Dict[str, inference.core.entities.responses.inference.MultiLabelClassificationPrediction]): Dictionary of multi-label classification predictions.
        predicted_classes (List[str]): The list of predicted classes.
        parent_id (Optional[str]): Identifier of parent image region, for referring to the RoI that was the input to inference when a stack of detection models is in use.
    """

    predictions: Dict[str, MultiLabelClassificationPrediction]
    predicted_classes: List[str] = Field(description="The list of predicted classes")
    parent_id: Optional[str] = Field(
        description="Identifier of parent image region. Useful when stack of detection-models is in use to refer the RoI being the input to inference",
        default=None,
    )

MultiLabelClassificationPrediction

Bases: BaseModel

Multi-label Classification prediction.

Attributes:

Name Type Description
confidence float

The class label confidence as a fraction between 0 and 1.

Source code in inference/core/entities/responses/inference.py
152
153
154
155
156
157
158
159
160
161
162
class MultiLabelClassificationPrediction(BaseModel):
    """Multi-label Classification prediction.

    Attributes:
        confidence (float): The class label confidence as a fraction between 0 and 1.
        class_id (int): Numeric ID associated with the class label.
    """

    confidence: float = Field(
        description="The class label confidence as a fraction between 0 and 1"
    )
    class_id: int = Field(description="Numeric ID associated with the class label")

ObjectDetectionInferenceResponse

Bases: CvInferenceResponse, WithVisualizationResponse

Object Detection inference response.

Attributes:

Name Type Description
predictions List[ObjectDetectionPrediction]

List of object detection predictions.

Source code in inference/core/entities/responses/inference.py
231
232
233
234
235
236
237
238
class ObjectDetectionInferenceResponse(CvInferenceResponse, WithVisualizationResponse):
    """Object Detection inference response.

    Combines the fields of CvInferenceResponse and WithVisualizationResponse
    with the list of detections.

    Attributes:
        predictions (List[inference.core.entities.responses.inference.ObjectDetectionPrediction]): List of object detection predictions.
    """

    predictions: List[ObjectDetectionPrediction]

ObjectDetectionPrediction

Bases: BaseModel

Object Detection prediction.

Attributes:

Name Type Description
x float

The center x-axis pixel coordinate of the prediction.

y float

The center y-axis pixel coordinate of the prediction.

width float

The width of the prediction bounding box in number of pixels.

height float

The height of the prediction bounding box in number of pixels.

confidence float

The detection confidence as a fraction between 0 and 1.

class_name str

The predicted class label.

class_confidence Union[float, None]

The class label confidence as a fraction between 0 and 1.

class_id int

The class id of the prediction

Source code in inference/core/entities/responses/inference.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class ObjectDetectionPrediction(BaseModel):
    """Object Detection prediction.

    Attributes:
        x (float): The center x-axis pixel coordinate of the prediction.
        y (float): The center y-axis pixel coordinate of the prediction.
        width (float): The width of the prediction bounding box in number of pixels.
        height (float): The height of the prediction bounding box in number of pixels.
        confidence (float): The detection confidence as a fraction between 0 and 1.
        class_name (str): The predicted class label (aliased as "class").
        class_confidence (Union[float, None]): The class label confidence as a fraction between 0 and 1.
        class_id (int): The class id of the prediction
        tracker_id (Optional[int]): The tracker id of the prediction if tracking is enabled.
        detection_id (str): Unique identifier of detection; a new UUID4 string is generated when not provided.
        parent_id (Optional[str]): Identifier of parent image region, for referring to the RoI that was the input to inference when a stack of detection models is in use.
    """

    x: float = Field(description="The center x-axis pixel coordinate of the prediction")
    y: float = Field(description="The center y-axis pixel coordinate of the prediction")
    width: float = Field(
        description="The width of the prediction bounding box in number of pixels"
    )
    height: float = Field(
        description="The height of the prediction bounding box in number of pixels"
    )
    confidence: float = Field(
        description="The detection confidence as a fraction between 0 and 1"
    )
    # `class` is a reserved word in Python, hence the alias.
    class_name: str = Field(alias="class", description="The predicted class label")

    class_confidence: Union[float, None] = Field(
        None, description="The class label confidence as a fraction between 0 and 1"
    )
    class_id: int = Field(description="The class id of the prediction")
    tracker_id: Optional[int] = Field(
        description="The tracker id of the prediction if tracking is enabled",
        default=None,
    )
    # default_factory runs per instance, so every prediction created without
    # an explicit id gets its own freshly generated UUID4 string.
    detection_id: str = Field(
        description="Unique identifier of detection",
        default_factory=lambda: str(uuid4()),
    )
    parent_id: Optional[str] = Field(
        description="Identifier of parent image region. Useful when stack of detection-models is in use to refer the RoI being the input to inference",
        default=None,
    )

Point

Bases: BaseModel

Point coordinates.

Attributes:

Name Type Description
x float

The x-axis pixel coordinate of the point.

y float

The y-axis pixel coordinate of the point.

Source code in inference/core/entities/responses/inference.py
53
54
55
56
57
58
59
60
61
62
class Point(BaseModel):
    """Point coordinates.

    Both coordinates are required.

    Attributes:
        x (float): The x-axis pixel coordinate of the point.
        y (float): The y-axis pixel coordinate of the point.
    """

    x: float = Field(description="The x-axis pixel coordinate of the point")
    y: float = Field(description="The y-axis pixel coordinate of the point")

Point3D

Bases: Point

3D Point coordinates.

Attributes:

Name Type Description
z float

The z-axis pixel coordinate of the point.

Source code in inference/core/entities/responses/inference.py
65
66
67
68
69
70
71
72
class Point3D(Point):
    """3D Point coordinates.

    Extends Point, which supplies `x` and `y`, with a third coordinate.

    Attributes:
        z (float): The z-axis pixel coordinate of the point.
    """

    z: float = Field(description="The z-axis pixel coordinate of the point")

SemanticSegmentationInferenceResponse

Bases: CvInferenceResponse, WithVisualizationResponse

Semantic Segmentation inference response.

Attributes:

Name Type Description
predictions SemanticSegmentationPrediction

Semantic segmentation predictions.

Source code in inference/core/entities/responses/inference.py
276
277
278
279
280
281
282
283
284
285
class SemanticSegmentationInferenceResponse(
    CvInferenceResponse, WithVisualizationResponse
):
    """Semantic Segmentation inference response.

    Attributes:
        predictions (inference.core.entities.responses.inference.SemanticSegmentationPrediction): Semantic segmentation predictions.
    """

    # A single prediction object, not a list as in the detection and
    # instance-segmentation responses.
    predictions: SemanticSegmentationPrediction

WithVisualizationResponse

Bases: BaseModel

Response with visualization.

Attributes:

Name Type Description
visualization Optional[Any]

Base64 encoded string containing prediction visualization image data.

Source code in inference/core/entities/responses/inference.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
class WithVisualizationResponse(BaseModel):
    """Mixin-style response carrying an optional prediction visualization.

    Attributes:
        visualization (Optional[Any]): Prediction visualization image data.
            When the model is serialized to JSON the value is emitted as a
            Base64 encoded string.
    """

    visualization: Optional[Any] = Field(
        description="Base64 encoded string containing prediction visualization image data",
        default=None,
    )

    @field_serializer("visualization", when_used="json")
    def serialize_visualisation(self, visualization: Optional[Any]) -> Optional[str]:
        """Base64-encode the visualization for JSON output; None passes through."""
        if visualization is not None:
            encoded = base64.b64encode(visualization)
            return encoded.decode("utf-8")
        return None

inference.core.entities.responses.notebooks

Classes

NotebookStartResponse

Bases: BaseModel

Response model for notebook start request

Source code in inference/core/entities/responses/notebooks.py
4
5
6
7
8
class NotebookStartResponse(BaseModel):
    """Response model for notebook start request.

    Attributes:
        success (str): Status of the request. Required.
        message (str): Message of the request. Optional; defaults to an
            empty string when omitted.
    """

    success: str = Field(..., description="Status of the request")
    # `optional=True` is not a recognised Field argument (pydantic v2 treats
    # unknown keyword arguments as deprecated JSON-schema extras), so combined
    # with `...` the field was still required. An empty-string default makes
    # it genuinely optional while keeping the declared `str` type.
    message: str = Field(default="", description="Message of the request")

inference.core.entities.responses.ocr

Classes

OCRInferenceResponse

Bases: BaseModel

OCR Inference response.

Attributes:

Name Type Description
result str

The combined OCR recognition result.

predictions List[ObjectDetectionPrediction]

List of objects detected by OCR

time float

The time in seconds it took to produce the inference including preprocessing

Source code in inference/core/entities/responses/ocr.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class OCRInferenceResponse(BaseModel):
    """
    OCR Inference response.

    Attributes:
        result (str): The combined OCR recognition result.
        image (Optional[InferenceResponseImage]): Metadata about input image dimensions.
        predictions (Optional[List[ObjectDetectionPrediction]]): List of objects detected by OCR
        time (float): The time in seconds it took to produce the inference including preprocessing
        parent_id (Optional[str]): Identifier of parent image region, for referring to the RoI that was the input to inference when a stack of detection models is in use.
    """

    result: str = Field(description="The combined OCR recognition result.")
    image: Optional[InferenceResponseImage] = Field(
        description="Metadata about input image dimensions", default=None
    )
    predictions: Optional[List[ObjectDetectionPrediction]] = Field(
        description="List of objects detected by OCR",
        default=None,
    )
    # Unlike most response models in this package, `time` has no default and
    # is therefore required.
    time: float = Field(
        description="The time in seconds it took to produce the inference including preprocessing."
    )
    parent_id: Optional[str] = Field(
        description="Identifier of parent image region. Useful when stack of detection-models is in use to refer the RoI being the input to inference",
        default=None,
    )

inference.core.entities.responses.perception_encoder

Classes

PerceptionEncoderCompareResponse

Bases: InferenceResponse

Response for PERCEPTION_ENCODER comparison.

Attributes:

Name Type Description
similarity Union[List[float], Dict[str, float]]

Similarity scores.

time float

The time in seconds it took to produce the similarity scores including preprocessing.

Source code in inference/core/entities/responses/perception_encoder.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class PerceptionEncoderCompareResponse(InferenceResponse):
    """Response for PERCEPTION_ENCODER comparison.

    Attributes:
        similarity (Union[List[float], Dict[str, float]]): Similarity scores.
        time (Optional[float]): The time in seconds it took to produce the similarity scores including preprocessing.
        parent_id (Optional[str]): Identifier of parent image region, for referring to the RoI that was the input to inference when a stack of detection models is in use.
    """

    similarity: Union[List[float], Dict[str, float]]
    time: Optional[float] = Field(
        default=None,
        description="The time in seconds it took to produce the similarity scores including preprocessing",
    )
    parent_id: Optional[str] = Field(
        description="Identifier of parent image region. Useful when stack of detection-models is in use to refer the RoI being the input to inference",
        default=None,
    )

PerceptionEncoderEmbeddingResponse

Bases: InferenceResponse

Response for PERCEPTION_ENCODER embedding.

Attributes:

Name Type Description
embeddings List[List[float]]

A list of embeddings, each embedding is a list of floats.

time float

The time in seconds it took to produce the embeddings including preprocessing.

Source code in inference/core/entities/responses/perception_encoder.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
class PerceptionEncoderEmbeddingResponse(InferenceResponse):
    """Response for PERCEPTION_ENCODER embedding.

    Attributes:
        embeddings (List[List[float]]): A list of embeddings, each embedding is a list of floats.
        time (Optional[float]): The time in seconds it took to produce the embeddings including
            preprocessing. Defaults to None.
    """

    embeddings: List[List[float]] = Field(
        examples=["[[0.12, 0.23, 0.34, ..., 0.43]]"],
        description="A list of embeddings, each embedding is a list of floats",
    )
    time: Optional[float] = Field(
        None,
        description="The time in seconds it took to produce the embeddings including preprocessing",
    )

inference.core.entities.responses.sam

Classes

SamEmbeddingResponse

Bases: BaseModel

SAM embedding response.

Attributes:

Name Type Description
embeddings Union[List[List[List[List[float]]]], Any]

The SAM embedding.

time float

The time in seconds it took to produce the embeddings including preprocessing.

Source code in inference/core/entities/responses/sam.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
class SamEmbeddingResponse(BaseModel):
    """SAM embedding response.

    Attributes:
        embeddings (Union[List[List[List[List[float]]]], Any]): The SAM embedding. Nested
            lists when the request format is json, a binary numpy array when the request
            format is binary; the embedding dimensions are 1 x 256 x 64 x 64.
        time (float): The time in seconds it took to produce the embeddings including preprocessing.
    """

    embeddings: Union[List[List[List[List[float]]]], Any] = Field(
        examples=["[[[[0.1, 0.2, 0.3, ...] ...] ...]]"],
        description="If request format is json, embeddings is a series of nested lists representing the SAM embedding. If request format is binary, embeddings is a binary numpy array. The dimensions of the embedding are 1 x 256 x 64 x 64.",
    )
    time: float = Field(
        description="The time in seconds it took to produce the embeddings including preprocessing"
    )

SamSegmentationResponse

Bases: BaseModel

SAM segmentation response.

Attributes:

Name Type Description
masks Union[List[List[List[int]]], Any]

The set of output masks.

low_res_masks Union[List[List[List[int]]], Any]

The set of output low-resolution masks.

time float

The time in seconds it took to produce the segmentation including preprocessing.

Source code in inference/core/entities/responses/sam.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
class SamSegmentationResponse(BaseModel):
    """SAM segmentation response.

    Attributes:
        masks (Union[List[List[List[int]]], Any]): The set of output masks. A list of
            polygons (lists of x,y points) for json requests, a list of binary numpy
            arrays for binary requests, sized like the input image.
        low_res_masks (Union[List[List[List[int]]], Any]): The set of output low-resolution
            masks, in the same representation as ``masks`` but with dimensions 256 x 256.
        time (float): The time in seconds it took to produce the segmentation including preprocessing.
    """

    masks: Union[List[List[List[int]]], Any] = Field(
        description="The set of output masks. If request format is json, masks is a list of polygons, where each polygon is a list of points, where each point is a tuple containing the x,y pixel coordinates of the point. If request format is binary, masks is a list of binary numpy arrays. The dimensions of each mask are the same as the dimensions of the input image.",
    )
    # The description previously repeated the text of `masks` verbatim, so the
    # generated schema described this field as the full-resolution masks.
    low_res_masks: Union[List[List[List[int]]], Any] = Field(
        description="The set of output low-resolution masks. If request format is json, low_res_masks is a list of polygons, where each polygon is a list of points, where each point is a tuple containing the x,y pixel coordinates of the point. If request format is binary, low_res_masks is a list of binary numpy arrays. The dimensions of each mask are 256 x 256",
    )
    time: float = Field(
        description="The time in seconds it took to produce the segmentation including preprocessing"
    )

inference.core.entities.responses.sam2

Classes

Sam2EmbeddingResponse

Bases: BaseModel

SAM2 embedding response.

Attributes:

Name Type Description
image_id str

Image id embeddings are cached to.

time float

The time in seconds it took to produce the embeddings including preprocessing.

Source code in inference/core/entities/responses/sam2.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
class Sam2EmbeddingResponse(BaseModel):
    """SAM2 embedding response.

    The response does not carry the embedding itself; it reports the id under
    which the embeddings were cached.

    Attributes:
        image_id (str): Image id embeddings are cached to.
        time (float): The time in seconds it took to produce the embeddings including preprocessing.
    """

    image_id: str = Field(description="Image id embeddings are cached to")
    time: float = Field(
        description="The time in seconds it took to produce the embeddings including preprocessing"
    )

Sam2SegmentationPrediction

Bases: BaseModel

SAM segmentation prediction.

Attributes:

Name Type Description
masks Union[List[List[List[int]]], Dict[str, Any], Any]

Mask data - either polygon coordinates or RLE encoding.

confidence float

Masks confidences.

format Optional[str]

Format of the mask data: 'polygon' or 'rle'.

Source code in inference/core/entities/responses/sam2.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class Sam2SegmentationPrediction(BaseModel):
    """SAM segmentation prediction.

    Attributes:
        masks (Union[List[List[List[int]]], Dict[str, Any]]): Mask data. In polygon format,
            a list of polygons where each polygon is a list of x,y points; in rle format,
            a dictionary with the keys 'size' and 'counts'.
        confidence (float): Masks confidences.
        format (Optional[str]): Format of the mask data: 'polygon' or 'rle'. Defaults to
            'polygon'.
    """

    masks: Union[List[List[List[int]]], Dict[str, Any]] = Field(
        description="If polygon format, masks is a list of polygons, where each polygon is a list of points, where each point is a tuple containing the x,y pixel coordinates of the point. If rle format, masks is a dictionary with the keys 'size' and 'counts' containing the size and counts of the RLE encoding."
    )
    confidence: float = Field(description="Masks confidences")
    format: Optional[str] = Field(
        default="polygon", description="Format of the mask data: 'polygon' or 'rle'"
    )

inference.core.entities.responses.sam3_3d

Classes

Sam3_3D_Object_Item

Bases: BaseModel

Individual 3D object output with mesh, gaussian, and transformation metadata.

Source code in inference/core/entities/responses/sam3_3d.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
class Sam3_3D_Object_Item(BaseModel):
    """Individual 3D object output with mesh, gaussian, and transformation metadata.

    Attributes:
        mesh_glb (Optional[bytes]): The 3D mesh in GLB format (binary). Defaults to None.
        gaussian_ply (Optional[bytes]): The Gaussian splatting in PLY format (binary).
            Defaults to None.
        metadata (Sam3_3D_Objects_Metadata): 3D transformation metadata (rotation,
            translation, scale). A fresh Sam3_3D_Objects_Metadata is created when omitted.
    """

    mesh_glb: Optional[bytes] = Field(
        default=None, description="The 3D mesh in GLB format (binary)"
    )
    gaussian_ply: Optional[bytes] = Field(
        default=None, description="The Gaussian splatting in PLY format (binary)"
    )
    # default_factory builds a new metadata object per instance instead of
    # sharing one default between instances.
    metadata: Sam3_3D_Objects_Metadata = Field(
        default_factory=Sam3_3D_Objects_Metadata,
        description="3D transformation metadata (rotation, translation, scale)",
    )

    # NOTE(review): presumably needed so the model accepts field types pydantic
    # has no built-in validation for -- confirm which field requires it.
    class Config:
        arbitrary_types_allowed = True

inference.core.entities.responses.server_state

Classes

ServerVersionInfo

Bases: BaseModel

Server version information.

Attributes:

Name Type Description
name str

Server name.

version str

Server version.

uuid str

Server UUID.

Source code in inference/core/entities/responses/server_state.py
 8
 9
10
11
12
13
14
15
16
17
18
19
class ServerVersionInfo(BaseModel):
    """Server version information.

    All three fields are strings declared without defaults; the values passed
    to ``Field(examples=...)`` are illustrative only.

    Attributes:
        name (str): Server name.
        version (str): Server version.
        uuid (str): Server UUID.
    """

    name: str = Field(examples=["Roboflow Inference Server"])
    version: str = Field(examples=["0.0.1"])
    uuid: str = Field(examples=["9c18c6f4-2266-41fb-8a0f-c12ae28f6fbe"])

core

Core framework internals: environment config, data entities, and shared utilities.

inference.core.exceptions

Classes

CacheUnavailableError

Bases: Exception

Raised when the ephemeral cache (e.g. Redis/Dragonfly) cannot be reached.

Source code in inference/core/exceptions.py
194
195
class CacheUnavailableError(Exception):
    """Raised when the ephemeral cache (e.g. Redis/Dragonfly) cannot be reached.

    Defines no members of its own; it behaves exactly like ``Exception``.
    """

ContentTypeInvalid

Bases: Exception

Raised when the content type is invalid.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
4
5
6
7
8
9
class ContentTypeInvalid(Exception):
    """Raised when the content type is invalid.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

ContentTypeMissing

Bases: Exception

Raised when the content type is missing.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
12
13
14
15
16
17
class ContentTypeMissing(Exception):
    """Raised when the content type is missing.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

EngineIgnitionFailure

Bases: Exception

Raised when the engine fails to ignite.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
20
21
22
23
24
25
class EngineIgnitionFailure(Exception):
    """Raised when the engine fails to ignite.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

FeatureDeprecatedError

Bases: Exception

Raised when a removed/deprecated feature is invoked.

Maps to HTTP 410 Gone via the standard error handler; surfaces as error_type="FeatureDeprecatedError" in InferencePipeline StatusUpdate payloads and as the inner_error of ClientCausedStepExecutionError(status_code=410) when raised from a workflow block.

Source code in inference/core/exceptions.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
class FeatureDeprecatedError(Exception):
    """Signals that a removed/deprecated feature was invoked.

    Maps to HTTP 410 Gone via the standard error handler; surfaces as
    error_type="FeatureDeprecatedError" in InferencePipeline StatusUpdate
    payloads and as the inner_error of
    ClientCausedStepExecutionError(status_code=410) when raised from a
    workflow block.

    Args:
        feature: Name of the removed feature; always part of the message.
        removal_release: Release in which the feature was removed. Mentioned
            in the message only when truthy.
        replacement: Closest available replacement. Mentioned in the message
            only when truthy.
        reason: Why the feature was removed. Mentioned in the message only
            when truthy.

    The assembled public message is passed to ``Exception`` and also kept on
    ``_public_message``; the raw arguments stay available as attributes.
    """

    def __init__(
        self,
        feature: str,
        *,
        removal_release: Optional[str] = None,
        replacement: Optional[str] = None,
        reason: Optional[str] = None,
    ):
        self.feature = feature
        self.removal_release = removal_release
        self.replacement = replacement
        self.reason = reason

        # Sentences are collected in their final order and joined once; each
        # optional sentence carries its own leading space.
        sentences = [f"Feature '{feature}' has been removed from inference."]
        if reason:
            sentences.append(f" Reason: {reason}.")
        if removal_release:
            sentences.append(f" Removed in {removal_release}.")
        sentences.append(
            " No drop-in replacement is provided; contact Roboflow if you "
            "require this capability."
        )
        if replacement:
            sentences.append(f" Closest replacement: {replacement}.")

        message = "".join(sentences)
        self._public_message = message
        super().__init__(message)

    def get_structured_public_error_details(self) -> dict:
        """Return the constructor arguments as a plain dict keyed by argument name."""
        detail_names = ("feature", "removal_release", "replacement", "reason")
        return {name: getattr(self, name) for name in detail_names}

InferenceModelNotFound

Bases: Exception

Raised when the inference model is not found.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
28
29
30
31
32
33
class InferenceModelNotFound(Exception):
    """Raised when the inference model is not found.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

InvalidEnvironmentVariableError

Bases: Exception

Raised when an environment variable is invalid.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
36
37
38
39
40
41
class InvalidEnvironmentVariableError(Exception):
    """Raised when an environment variable is invalid.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

InvalidMaskDecodeArgument

Bases: Exception

Raised when an invalid argument is provided for mask decoding.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
44
45
46
47
48
49
class InvalidMaskDecodeArgument(Exception):
    """Raised when an invalid argument is provided for mask decoding.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

InvalidNumpyInput

Bases: InputImageLoadError

Raised when the input is an invalid NumPy array.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
94
95
96
97
98
99
class InvalidNumpyInput(InputImageLoadError):
    """Raised when the input is an invalid NumPy array.

    Adds nothing on top of ``InputImageLoadError``; construction and any
    message handling are whatever that base class provides.
    """

MissingApiKeyError

Bases: Exception

Raised when the API key is missing.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
52
53
54
55
56
57
class MissingApiKeyError(Exception):
    """Raised when the API key is missing.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

MissingServiceSecretError

Bases: Exception

Raised when the service secret is missing.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
60
61
62
63
64
65
class MissingServiceSecretError(Exception):
    """Raised when the service secret is missing.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

OnnxProviderNotAvailable

Bases: Exception

Raised when the ONNX provider is not available.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
68
69
70
71
72
73
class OnnxProviderNotAvailable(Exception):
    """Raised when the ONNX provider is not available.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

WorkspaceLoadError

Bases: Exception

Raised when there is an error loading the workspace.

Attributes:

Name Type Description
message str

Optional message describing the error.

Source code in inference/core/exceptions.py
76
77
78
79
80
81
class WorkspaceLoadError(Exception):
    """Raised when there is an error loading the workspace.

    Defines no members of its own. An optional message describing the error
    may be passed as the regular ``Exception`` constructor argument.
    """

WorkspaceStreamQuotaError

Bases: Exception

Raised when the workspace stream quota has been exceeded.

This error is returned when a workspace has reached its maximum number of concurrent WebRTC streams. The limit prevents a single user from consuming all of our modal resources.

Source code in inference/core/exceptions.py
249
250
251
252
253
254
255
256
257
class WorkspaceStreamQuotaError(Exception):
    """Raised when the workspace stream quota has been exceeded.

    Returned once a workspace has reached its maximum number of concurrent
    WebRTC streams. The limit prevents a single user from consuming all of
    our modal resources.
    """

inference.core.nms

Functions:

non_max_suppression_fast

non_max_suppression_fast(boxes, overlapThresh)

Applies non-maximum suppression to bounding boxes.

Parameters:

Name Type Description Default
boxes ndarray

Array of bounding boxes with confidence scores.

required
overlapThresh float

Overlap threshold for suppression.

required

Returns:

Name Type Description
list

List of bounding boxes after non-maximum suppression.

Source code in inference/core/nms.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def non_max_suppression_fast(boxes, overlapThresh):
    """Applies greedy non-maximum suppression to bounding boxes.

    Boxes are visited from highest to lowest confidence. Each visited box is
    kept, and every remaining box whose intersection with it covers more than
    ``overlapThresh`` of that remaining box's own area is discarded. The ratio
    is intersection / candidate area, not intersection-over-union.

    Args:
        boxes (np.ndarray): 2D array with one row per box. Columns 0-3 are read
            as x1, y1, x2, y2 and column 4 as the confidence score; any further
            columns are carried through untouched.
        overlapThresh (float): Overlap threshold for suppression.

    Returns:
        np.ndarray: The kept rows cast to float, ordered from highest to lowest
        confidence. An empty list is returned when ``boxes`` is empty.
    """
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return []
    # Promote integer boxes (signed or unsigned) to float. The overlap ratio
    # needs true division, and with unsigned integers ``xx2 - xx1 + 1`` wraps
    # around for disjoint boxes, which would make them look overlapping.
    if boxes.dtype.kind in "iu":
        boxes = boxes.astype("float")
    # initialize the list of picked indexes
    pick = []
    # grab the coordinates and confidence of the bounding boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    conf = boxes[:, 4]
    # compute the area of the bounding boxes (inclusive pixel convention,
    # hence the +1) and order the boxes by ascending confidence so that the
    # best remaining box is always the last index
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = np.argsort(conf)
    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # grab the last index in the indexes list (highest remaining
        # confidence) and add the index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
        # every other box still in play
        rest = idxs[:last]
        # find the largest (x, y) coordinates for the start of
        # the bounding box and the smallest (x, y) coordinates
        # for the end of the bounding box
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])
        # compute the width and height of the intersection
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        # compute the ratio of overlap relative to each remaining box's area
        overlap = (w * h) / area[rest]
        # delete the picked index and all indexes whose overlap with the
        # picked box exceeds the threshold
        idxs = np.delete(
            idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0]))
        )
    # return only the bounding boxes that were picked, as floats
    return boxes[pick].astype("float")

w_np_non_max_suppression

w_np_non_max_suppression(
    prediction,
    conf_thresh=0.25,
    iou_thresh=0.45,
    class_agnostic=False,
    max_detections=300,
    max_candidate_detections=3000,
    timeout_seconds=None,
    num_masks=0,
    box_format="xywh",
)

Applies non-maximum suppression to predictions.

Parameters:

Name Type Description Default
prediction ndarray

Array of predictions. Format for single prediction is [bbox x 4, max_class_confidence, (confidence) x num_of_classes, additional_element x num_masks]

required
conf_thresh float

Confidence threshold. Defaults to 0.25.

0.25
iou_thresh float

IOU threshold. Defaults to 0.45.

0.45
class_agnostic bool

Whether to ignore class labels. Defaults to False.

False
max_detections int

Maximum number of detections. Defaults to 300.

300
max_candidate_detections int

Maximum number of candidate detections. Defaults to 3000.

3000
timeout_seconds Optional[int]

Timeout in seconds. Defaults to None.

None
num_masks int

Number of masks. Defaults to 0.

0
box_format str

Format of bounding boxes. Either 'xywh' or 'xyxy'. Defaults to 'xywh'.

'xywh'

Returns:

Name Type Description
list

List of filtered predictions after non-maximum suppression. Format of a single result is: [bbox x 4, max_class_confidence, max_class_confidence, id_of_class_with_max_confidence, additional_element x num_masks]

Source code in inference/core/nms.py
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def w_np_non_max_suppression(
    prediction,
    conf_thresh: float = 0.25,
    iou_thresh: float = 0.45,
    class_agnostic: bool = False,
    max_detections: int = 300,
    max_candidate_detections: int = 3000,
    timeout_seconds: Optional[int] = None,
    num_masks: int = 0,
    box_format: str = "xywh",
):
    """Applies non-maximum suppression to a batch of predictions.

    Every image in the batch is handled independently: rows whose column-4
    confidence is below ``conf_thresh`` are dropped, the best class of each
    remaining row is determined, and ``non_max_suppression_fast`` is run either
    once over all rows (``class_agnostic``) or once per predicted class. The
    survivors are sorted by descending column-4 confidence and truncated to
    ``max_detections``.

    Note:
        With ``box_format="xywh"`` the first four columns of ``prediction`` are
        overwritten in place with corner coordinates, so the caller's array is
        modified. ``max_candidate_detections`` and ``timeout_seconds`` are
        accepted but not used by this implementation.

    Args:
        prediction (np.ndarray): Array of predictions. Format for single prediction is
            [bbox x 4, max_class_confidence, (confidence) x num_of_classes, additional_element x num_masks]
        conf_thresh (float, optional): Confidence threshold. Defaults to 0.25.
        iou_thresh (float, optional): IOU threshold. Defaults to 0.45.
        class_agnostic (bool, optional): Whether to ignore class labels. Defaults to False.
        max_detections (int, optional): Maximum number of detections. Defaults to 300.
        max_candidate_detections (int, optional): Maximum number of candidate detections. Defaults to 3000.
        timeout_seconds (Optional[int], optional): Timeout in seconds. Defaults to None.
        num_masks (int, optional): Number of masks. Defaults to 0.
        box_format (str, optional): Format of bounding boxes. Either 'xywh' or 'xyxy'. Defaults to 'xywh'.

    Returns:
        list: One entry per image in the batch. Each entry is a list of 1D arrays
            (an empty list when nothing survives). Format of a single result is:
            [bbox x 4 as x1, y1, x2, y2, input column-4 confidence, max_class_confidence,
            id_of_class_with_max_confidence, additional_element x num_masks]

    Raises:
        ValueError: If ``box_format`` is neither 'xywh' nor 'xyxy'.
    """
    num_classes = prediction.shape[2] - 5 - num_masks

    if box_format == "xywh":
        pred_view = prediction[:, :, :4]

        # Compute all four corners before writing anything back: the
        # assignments below overwrite the centre/size columns read here.
        x1 = pred_view[:, :, 0] - pred_view[:, :, 2] / 2
        y1 = pred_view[:, :, 1] - pred_view[:, :, 3] / 2
        x2 = pred_view[:, :, 0] + pred_view[:, :, 2] / 2
        y2 = pred_view[:, :, 1] + pred_view[:, :, 3] / 2

        # Assign directly to the view (mutates `prediction`)
        pred_view[:, :, 0] = x1
        pred_view[:, :, 1] = y1
        pred_view[:, :, 2] = x2
        pred_view[:, :, 3] = y2
    elif box_format != "xyxy":
        raise ValueError(
            f"box_format must be either 'xywh' or 'xyxy', got {box_format}"
        )

    batch_predictions = []

    for np_image_pred in prediction:
        np_conf_mask = np_image_pred[:, 4] >= conf_thresh
        if not np.any(np_conf_mask):  # no box passes the threshold
            batch_predictions.append([])
            continue

        np_image_pred = np_image_pred[np_conf_mask]

        cls_confs = np_image_pred[:, 5 : num_classes + 5]
        # Without any class columns there is nothing to rank
        if cls_confs.shape[1] == 0:
            batch_predictions.append([])
            continue

        np_class_conf = np.max(cls_confs, axis=1, keepdims=True)
        np_class_pred = np.argmax(cls_confs, axis=1, keepdims=True)

        # Detection rows: box + confidence, best class confidence, class id,
        # followed by the mask elements when present.
        columns = [
            np_image_pred[:, :5],
            np_class_conf,
            np_class_pred.astype(np.float32),
        ]
        if num_masks > 0:
            columns.append(np_image_pred[:, 5 + num_classes :])
        np_detections = np.concatenate(columns, axis=1)

        # Groups of detections that compete with each other during NMS
        if class_agnostic:
            detection_groups = [np_detections]
        else:
            class_ids = np_class_pred[:, 0]
            detection_groups = [
                np_detections[class_ids == c] for c in np.unique(class_ids)
            ]

        filtered_predictions = []
        for group in detection_groups:
            # Sort by confidence (highest first) before suppression
            sorted_indices = np.argsort(-group[:, 4])
            filtered_predictions.extend(
                non_max_suppression_fast(group[sorted_indices], iou_thresh)
            )

        if not filtered_predictions:
            batch_predictions.append([])
            continue

        # Sort final predictions by confidence and limit to max_detections
        filtered_np = np.array(filtered_predictions)
        filtered_np = filtered_np[np.argsort(-filtered_np[:, 4])]
        batch_predictions.append(list(filtered_np[:max_detections]))

    return batch_predictions

inference.core.roboflow_api

Classes

Functions:

get_workflow_specification

get_workflow_specification(
    api_key,
    workspace_id,
    workflow_id,
    use_cache=True,
    ephemeral_cache=None,
    workflow_version_id=None,
)

Fetch a workflow specification from cache or the Roboflow API.

When ephemeral cache (Redis/Dragonfly) is enabled but unreachable, falls back to the Roboflow API instead of failing the request.

Parameters:

Name Type Description Default
api_key Optional[str]

Roboflow API key, or None for unauthenticated fetches.

required
workspace_id WorkspaceID

Workspace slug, or local for filesystem-backed workflows.

required
workflow_id str

Workflow identifier within the workspace.

required
use_cache bool

If True, read and write the ephemeral workflow-definition cache.

True
ephemeral_cache Optional[BaseCache]

Cache backend; defaults to the process-global cache.

None
workflow_version_id Optional[str]

Optional pinned workflow version.

None

Returns:

Type Description
dict

Parsed workflow specification dict.

Raises:

Type Description
MalformedWorkflowResponseError

API response lacks a valid specification.

RoboflowAPIRequestError

API request failed and no file-cache fallback applies.

FileNotFoundError

Local workspace workflow file is missing.

ValueError

Invalid local workflow id.

Source code in inference/core/roboflow_api.py
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
@wrap_roboflow_api_errors()
def get_workflow_specification(
    api_key: Optional[str],
    workspace_id: WorkspaceID,
    workflow_id: str,
    use_cache: bool = True,
    ephemeral_cache: Optional[BaseCache] = None,
    workflow_version_id: Optional[str] = None,
) -> dict:
    """Resolve a workflow specification from cache, local disk or the Roboflow API.

    Lookup order: the ephemeral cache (only when ``use_cache`` is set), then
    either a JSON file on disk (``workspace_id == "local"``) or the Roboflow
    API. When the API cannot be reached and the file cache for workflow
    definitions is enabled, the previously stored API response is used.

    When the ephemeral cache (Redis/Dragonfly) is enabled but unreachable, the
    lookup falls back to the Roboflow API instead of failing the request.

    Args:
        api_key: Roboflow API key, or None for unauthenticated fetches.
        workspace_id: Workspace slug, or ``local`` for filesystem-backed workflows.
        workflow_id: Workflow identifier within the workspace.
        use_cache: If True, read and write the ephemeral workflow-definition cache.
        ephemeral_cache: Cache backend; defaults to the process-global cache.
        workflow_version_id: Optional pinned workflow version.

    Returns:
        Parsed workflow specification dict.

    Raises:
        MalformedWorkflowResponseError: API response lacks a valid specification.
        RoboflowAPIRequestError: API request failed and no file-cache fallback applies.
        FileNotFoundError: Local workspace workflow file is missing.
        ValueError: Invalid local workflow id.
    """
    if not ephemeral_cache:
        ephemeral_cache = cache

    if use_cache:
        cached_specification = (
            _try_retrieve_workflow_specification_from_ephemeral_cache(
                api_key=api_key,
                workspace_id=workspace_id,
                workflow_id=workflow_id,
                workflow_version_id=workflow_version_id,
                ephemeral_cache=ephemeral_cache,
            )
        )
        if cached_specification:
            return cached_specification

    if workspace_id == "local":
        # Local ids may only contain word characters and dashes.
        if re.match(r"^[\w\-]+$", workflow_id) is None:
            raise ValueError("Invalid workflow id")

        # The file on disk is named after the SHA-256 digest of the id.
        hashed_id = sha256(workflow_id.encode()).hexdigest()
        workflow_file = (
            Path(MODEL_CACHE_DIR) / "workflow" / "local" / f"{hashed_id}.json"
        )
        if not workflow_file.exists():
            raise FileNotFoundError(f"Local workflow file not found: {workflow_file}")

        with workflow_file.open("r", encoding="utf-8") as handle:
            # Wrap the file content so it has the same shape as the cloud response.
            response = {"workflow": json.load(handle)}
    else:
        query_params = []
        if api_key is not None and api_key != LOCAL_API_KEY:
            query_params.append(("api_key", api_key))
        if workflow_version_id is not None:
            query_params.append(("workflow_version", workflow_version_id))
        api_url = _add_params_to_url(
            url=f"{API_BASE_URL}/{workspace_id}/workflows/{workflow_id}",
            params=query_params,
        )
        try:
            response = _get_from_url(url=api_url)
            if USE_FILE_CACHE_FOR_WORKFLOWS_DEFINITIONS:
                # Persist the fresh response so it can serve as an offline fallback.
                cache_workflow_response(
                    workspace_id=workspace_id,
                    workflow_id=workflow_id,
                    api_key=api_key,
                    response=response,
                    workflow_version_id=workflow_version_id,
                )
        except (
            requests.exceptions.ConnectionError,
            ConnectionError,
            requests.exceptions.Timeout,
        ) as error:
            # Connectivity problem: use the file cache if it is enabled and has
            # an entry, otherwise surface the original error.
            response = (
                load_cached_workflow_response(
                    workspace_id=workspace_id,
                    workflow_id=workflow_id,
                    api_key=api_key,
                    workflow_version_id=workflow_version_id,
                )
                if USE_FILE_CACHE_FOR_WORKFLOWS_DEFINITIONS
                else None
            )
            if response is None:
                raise error

    if not ("workflow" in response and "config" in response["workflow"]):
        raise MalformedWorkflowResponseError(
            "Could not find workflow specification in API response"
        )
    try:
        workflow_entry = response["workflow"]
        # ``config`` is a JSON document serialized as a string.
        specification = json.loads(workflow_entry["config"])["specification"]
        if isinstance(specification, dict):
            specification["id"] = workflow_entry.get("id")
        if use_cache:
            _try_cache_workflow_specification_in_ephemeral_cache(
                api_key=api_key,
                workspace_id=workspace_id,
                workflow_id=workflow_id,
                workflow_version_id=workflow_version_id,
                specification=specification,
                ephemeral_cache=ephemeral_cache,
            )

        return specification
    except KeyError as error:
        raise MalformedWorkflowResponseError(
            "Workflow specification not found in Roboflow API response"
        ) from error
    except (ValueError, TypeError) as error:
        raise MalformedWorkflowResponseError(
            "Could not decode workflow specification in Roboflow API response"
        ) from error

post_to_roboflow_api

post_to_roboflow_api(
    endpoint,
    api_key,
    payload=None,
    params=None,
    http_errors_handlers=None,
)

Generic function to make a POST request to the Roboflow API.

Parameters:

Name Type Description Default
endpoint str

API endpoint path

required
api_key Optional[str]

Roboflow API key

required
payload Optional[dict]

JSON payload

None
params Optional[List[Tuple[str, str]]]

Additional URL parameters

None
http_errors_handlers Optional[Dict[int, Callable[[Union[HTTPError]], None]]]

Optional custom HTTP error handlers by status code

None
Source code in inference/core/roboflow_api.py
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
def post_to_roboflow_api(
    endpoint: str,
    api_key: Optional[str],
    payload: Optional[dict] = None,
    params: Optional[List[Tuple[str, str]]] = None,
    http_errors_handlers: Optional[
        Dict[int, Callable[[Union[requests.exceptions.HTTPError]], None]]
    ] = None,
) -> dict:
    """Send a JSON POST request to a Roboflow API endpoint.

    Args:
        endpoint: API endpoint path; leading and trailing slashes are stripped.
        api_key: Roboflow API key. It is added as the ``api_key`` query
            parameter unless it is empty or equal to ``LOCAL_API_KEY``.
        payload: JSON payload sent as the request body.
        params: Additional URL parameters appended after the API key.
        http_errors_handlers: Optional custom HTTP error handlers by status code.

    Returns:
        The response body decoded from JSON.
    """

    @wrap_roboflow_api_errors(http_errors_handlers=http_errors_handlers)
    def _make_request():
        include_api_key = api_key and api_key != LOCAL_API_KEY
        url_params = [("api_key", api_key)] if include_api_key else []
        url_params.extend(params or [])

        base_url = _api_base_url_for_endpoint(endpoint=endpoint).rstrip("/")
        request_url = wrap_url(
            _add_params_to_url(
                url=f"{base_url}/{endpoint.strip('/')}", params=url_params
            )
        )

        response = requests.post(
            url=request_url,
            json=payload,
            headers=build_roboflow_api_headers(),
            timeout=ROBOFLOW_API_REQUEST_TIMEOUT,
            verify=ROBOFLOW_API_VERIFY_SSL,
        )
        # Status check happens before the body is decoded.
        api_key_safe_raise_for_status(response=response)
        return response.json()

    return _make_request()

inference.core.telemetry

OpenTelemetry tracing setup and helpers for the inference server.

All public helpers are safe to import and call even when opentelemetry is not installed — they degrade to no-ops. Business-logic code should never need to check whether OTel is available.

Usage::

from inference.core.telemetry import (
    start_span, record_error, set_span_attribute,
)

with start_span("model.infer", {"model.id": model_id}):
    result = do_inference(...)
    set_span_attribute("model.load_time_seconds", elapsed)

record_error(error)

Functions:

attach_context

attach_context(ctx)

Attach a previously captured OTel context in the current thread.

Returns a token that MUST be passed to detach_context() when done.

Source code in inference/core/telemetry.py
127
128
129
130
131
132
133
134
135
136
def attach_context(ctx: Any) -> Any:
    """Make a previously captured OTel context current in this thread.

    Returns:
        A token that MUST later be handed to detach_context(), or None when
        ``ctx`` is None or OTel is unavailable.
    """
    if ctx is not None and _OTEL_AVAILABLE:
        from opentelemetry import context as otel_context

        return otel_context.attach(ctx)
    return None

capture_context

capture_context()

Capture the current OTel context for propagation to another thread.

Returns an opaque token (or None). Pass to attach_context() in the target thread.

Source code in inference/core/telemetry.py
115
116
117
118
119
120
121
122
123
124
def capture_context() -> Any:
    """Snapshot the current OTel context so another thread can adopt it.

    Returns:
        An opaque context object to pass to attach_context() in the target
        thread, or None when OTel is unavailable.
    """
    if _OTEL_AVAILABLE:
        from opentelemetry import context as otel_context

        return otel_context.get_current()
    return None

detach_context

detach_context(token)

Detach a previously attached context. Must be called in a finally block.

Source code in inference/core/telemetry.py
139
140
141
142
143
144
145
def detach_context(token: Any) -> None:
    """Undo a prior attach_context() call; intended for use in a finally block.

    Does nothing when ``token`` is None or OTel is unavailable.
    """
    if token is not None and _OTEL_AVAILABLE:
        from opentelemetry import context as otel_context

        otel_context.detach(token)

get_trace_id

get_trace_id()

Return the current trace ID as a hex string, or None.

Source code in inference/core/telemetry.py
91
92
93
94
95
96
97
98
99
def get_trace_id() -> Optional[str]:
    """Return the active trace ID as 32 lowercase hex characters, or None.

    None is returned when OTel is unavailable or the current span context
    carries no (non-zero) trace ID.
    """
    if not _OTEL_AVAILABLE:
        return None
    span_context = trace.get_current_span().get_span_context()
    if not span_context or not span_context.trace_id:
        return None
    return format(span_context.trace_id, "032x")

inject_trace_context

inject_trace_context(headers)

Inject W3C traceparent/tracestate into headers dict and return it.

Safe to call when OTel is not installed (returns headers unchanged).

Source code in inference/core/telemetry.py
148
149
150
151
152
153
154
155
156
157
158
def inject_trace_context(headers: dict) -> dict:
    """Add W3C traceparent/tracestate entries to *headers* and return it.

    When OTel is not installed the argument is returned untouched (including
    None). Otherwise a None argument is replaced by a fresh dict before the
    propagation headers are written into it.
    """
    if _OTEL_AVAILABLE:
        carrier = {} if headers is None else headers
        _otel_inject(carrier)
        return carrier
    return headers

record_api_call

record_api_call(function_name, duration)

Record a Roboflow API call duration.

Source code in inference/core/telemetry.py
205
206
207
208
209
210
211
def record_api_call(function_name: str, duration: float) -> None:
    """Record how long a Roboflow API call took.

    The duration is recorded on the ``api_call_duration`` instrument, tagged
    with the calling function's name. No-op while metrics are not set up.
    """
    if _metrics is not None:
        histogram = _metrics["api_call_duration"]
        histogram.record(duration, {"roboflow_api.function": function_name})

record_error

record_error(error)

Record an exception on the current active span and set ERROR status.

Safe to call when OTel is not installed or there is no active span.

Source code in inference/core/telemetry.py
78
79
80
81
82
83
84
85
86
87
88
def record_error(error: Exception) -> None:
    """Attach *error* to the active span and mark the span as failed.

    Nothing happens when OTel is not installed, when there is no current
    span, or when the current span is not recording.
    """
    if not _OTEL_AVAILABLE:
        return
    active_span = trace.get_current_span()
    if not active_span or not active_span.is_recording():
        return
    active_span.record_exception(error)
    active_span.set_status(StatusCode.ERROR, str(error))

record_error_metric

record_error_metric(error_type)

Increment the error counter by error type.

Source code in inference/core/telemetry.py
214
215
216
217
218
def record_error_metric(error_type: str) -> None:
    """Add one to the error counter, tagged with *error_type*.

    No-op while metrics are not set up.
    """
    if _metrics is not None:
        error_counter = _metrics["errors"]
        error_counter.add(1, {"error.type": error_type})

record_inference

record_inference(model_id, duration)

Record an inference execution.

Source code in inference/core/telemetry.py
197
198
199
200
201
202
def record_inference(model_id: str, duration: float) -> None:
    """Count one inference for *model_id* and record its duration.

    No-op while metrics are not set up.
    """
    if _metrics is not None:
        _metrics["model_infer_count"].add(1, {"model.id": model_id})
        _metrics["model_infer_duration"].record(duration, {"model.id": model_id})

record_model_loaded

record_model_loaded(model_id, load_time)

Record a model load event: increment counters and record load duration.

Source code in inference/core/telemetry.py
180
181
182
183
184
185
186
def record_model_loaded(model_id: str, load_time: float) -> None:
    """Register a model load.

    Bumps the loaded-models gauge, counts the load for *model_id* and records
    *load_time* on the load-duration instrument. No-op while metrics are not
    set up.
    """
    if _metrics is not None:
        _metrics["models_loaded"].add(1)
        _metrics["model_loads"].add(1, {"model.id": model_id})
        _metrics["model_load_duration"].record(load_time, {"model.id": model_id})

record_model_unloaded

record_model_unloaded(model_id)

Record a model unload event.

Source code in inference/core/telemetry.py
189
190
191
192
193
194
def record_model_unloaded(model_id: str) -> None:
    """Register a model unload.

    Decrements the loaded-models gauge and counts the unload for *model_id*.
    No-op while metrics are not set up.
    """
    if _metrics is not None:
        _metrics["models_loaded"].add(-1)
        _metrics["model_unloads"].add(1, {"model.id": model_id})

set_span_attribute

set_span_attribute(key, value)

Set an attribute on the current active span.

Noop when OTel is unavailable or there is no recording span. Callers never need to check for None spans.

Source code in inference/core/telemetry.py
102
103
104
105
106
107
108
109
110
111
112
def set_span_attribute(key: str, value: Any) -> None:
    """Write a single attribute onto the currently active span.

    Does nothing when OTel is unavailable or no recording span is active, so
    callers never have to guard against a missing span themselves.
    """
    if not _OTEL_AVAILABLE:
        return
    active_span = trace.get_current_span()
    if not active_span or not active_span.is_recording():
        return
    active_span.set_attribute(key, value)

setup_telemetry

setup_telemetry(app)

Initialize OTel TracerProvider, MeterProvider, and instrument the FastAPI app.

Must be called before any middleware is added so the FastAPI instrumentor wraps at the outermost ASGI layer.

Source code in inference/core/telemetry.py
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
def setup_telemetry(app: Any) -> None:
    """Initialize OTel TracerProvider, MeterProvider, and instrument the FastAPI app.

    Must be called before any middleware is added so the FastAPI instrumentor
    wraps at the outermost ASGI layer.

    Steps performed, in order:
        1. Install the W3C TraceContext propagator globally.
        2. Build the sampler from ``OTEL_SAMPLING_RATE`` (with force-trace
           override) and the resource describing this service instance.
        3. Create the span exporter (HTTP or gRPC, chosen by
           ``OTEL_EXPORTER_PROTOCOL``) and register the tracer provider.
        4. When ``OTEL_METRICS_ENABLED`` is set, create the metric exporter,
           meter provider and the instruments stored in the module-level
           ``_metrics`` dict.
        5. Add the trace-ID middleware, instrument FastAPI and ``requests``,
           then add the force-trace middleware.

    Args:
        app: Application object to instrument. It is passed to
            ``FastAPIInstrumentor.instrument_app`` and must provide
            ``add_middleware``.

    Side effects:
        Rebinds the module globals ``_provider``, ``_tracer`` and, when metrics
        are enabled, ``_meter_provider`` and ``_metrics``.
    """
    global _provider, _tracer, _meter_provider, _metrics

    # OTel SDK/exporter imports are function-local.
    # NOTE(review): presumably so this module stays importable when the
    # optional opentelemetry packages are not installed — confirm.
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
        OTLPSpanExporter as GRPCExporter,
    )
    from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
        OTLPSpanExporter as HTTPExporter,
    )
    from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.sdk.trace.sampling import (
        ALWAYS_OFF,
        ALWAYS_ON,
        ParentBased,
        TraceIdRatioBased,
    )

    from inference.core.env import (
        OTEL_EXPORTER_ENDPOINT,
        OTEL_EXPORTER_PROTOCOL,
        OTEL_METRIC_EXPORT_INTERVAL_MS,
        OTEL_METRIC_EXPORTER_ENDPOINT,
        OTEL_METRICS_ENABLED,
        OTEL_SAMPLING_RATE,
        OTEL_SERVICE_NAME,
        OTEL_TRACE_EXPORT_INTERVAL_MS,
    )

    # W3C TraceContext propagator — always set so extract/inject are safe
    set_global_textmap(CompositePropagator([TraceContextTextMapPropagator()]))

    # Build root sampler with force-trace override
    # Rates <= 0 disable sampling, rates >= 1.0 sample everything, anything
    # in between samples by trace-ID ratio.
    if OTEL_SAMPLING_RATE <= 0:
        root_sampler = ALWAYS_OFF
    elif OTEL_SAMPLING_RATE >= 1.0:
        root_sampler = ALWAYS_ON
    else:
        root_sampler = TraceIdRatioBased(OTEL_SAMPLING_RATE)

    # ParentBased: child spans honour parent decision, root spans use our
    # custom sampler that checks for X-Force-Trace before falling back to
    # the ratio-based sampler.
    sampler = ParentBased(root=_ForceTraceRootSampler(root_sampler))

    from inference.core.devices.utils import GLOBAL_INFERENCE_SERVER_ID

    # Resource attributes are shared by the tracer and meter providers.
    resource = Resource.create(
        {
            "service.name": OTEL_SERVICE_NAME,
            "service.instance.id": GLOBAL_INFERENCE_SERVER_ID,
        }
    )

    # Any protocol value other than "http" selects the gRPC exporter.
    # The HTTP endpoint is always built with the plain ``http://`` scheme and
    # the gRPC exporter is created with ``insecure=True``.
    if OTEL_EXPORTER_PROTOCOL == "http":
        exporter = HTTPExporter(
            endpoint=f"http://{OTEL_EXPORTER_ENDPOINT}/v1/traces",
        )
    else:
        exporter = GRPCExporter(
            endpoint=OTEL_EXPORTER_ENDPOINT,
            insecure=True,
        )

    _provider = TracerProvider(resource=resource, sampler=sampler)
    _provider.add_span_processor(
        BatchSpanProcessor(
            exporter, schedule_delay_millis=OTEL_TRACE_EXPORT_INTERVAL_MS
        )
    )
    trace.set_tracer_provider(_provider)

    _tracer = trace.get_tracer("inference")

    # --- Metrics ---
    if OTEL_METRICS_ENABLED:
        from opentelemetry import metrics as otel_metrics
        from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
            OTLPMetricExporter as GRPCMetricExporter,
        )
        from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
            OTLPMetricExporter as HTTPMetricExporter,
        )
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader

        # Metrics go to their own endpoint when one is configured, otherwise
        # to the same endpoint as traces.
        metric_endpoint = OTEL_METRIC_EXPORTER_ENDPOINT or OTEL_EXPORTER_ENDPOINT
        if OTEL_EXPORTER_PROTOCOL == "http":
            metric_exporter = HTTPMetricExporter(
                endpoint=f"http://{metric_endpoint}/v1/metrics",
            )
        else:
            metric_exporter = GRPCMetricExporter(
                endpoint=metric_endpoint,
                insecure=True,
            )

        metric_reader = PeriodicExportingMetricReader(
            metric_exporter, export_interval_millis=OTEL_METRIC_EXPORT_INTERVAL_MS
        )
        _meter_provider = MeterProvider(
            resource=resource, metric_readers=[metric_reader]
        )
        otel_metrics.set_meter_provider(_meter_provider)

        meter = _meter_provider.get_meter("inference")
        # Instruments looked up by key from the module-level ``_metrics`` dict.
        _metrics = {
            "models_loaded": meter.create_up_down_counter(
                "inference.models.loaded",
                description="Number of models currently loaded",
            ),
            "model_loads": meter.create_counter(
                "inference.model.loads",
                description="Total model loads (cold starts)",
            ),
            "model_unloads": meter.create_counter(
                "inference.model.unloads",
                description="Total model unloads",
            ),
            "model_load_duration": meter.create_histogram(
                "inference.model.load.duration",
                unit="s",
                description="Model load time in seconds",
            ),
            "model_infer_count": meter.create_counter(
                "inference.model.infer.count",
                description="Total inference requests",
            ),
            "model_infer_duration": meter.create_histogram(
                "inference.model.infer.duration",
                unit="s",
                description="Inference latency in seconds",
            ),
            "api_call_duration": meter.create_histogram(
                "inference.roboflow_api.duration",
                unit="s",
                description="Roboflow API call latency in seconds",
            ),
            "errors": meter.create_counter(
                "inference.errors",
                description="Total errors by type",
            ),
        }
        _install_export_error_filter("opentelemetry.sdk.metrics._internal.export")

    # Replace noisy connection-refused tracebacks with a single-line warning.
    _install_export_error_filter("opentelemetry.sdk.trace.export")

    # Add trace-ID response middleware BEFORE the instrumentor so it sits
    # INSIDE the OTel span context (earlier-added = innermost in Starlette).
    app.add_middleware(_TraceIdResponseMiddleware)

    # Auto-instrument FastAPI: creates server spans, extracts traceparent
    FastAPIInstrumentor.instrument_app(app)

    # Auto-instrument outgoing requests: creates http.client spans, injects traceparent
    from opentelemetry.instrumentation.requests import RequestsInstrumentor

    RequestsInstrumentor().instrument()

    # Add force-trace middleware AFTER the instrumentor so it wraps outermost.
    # Starlette builds middleware last-added = outermost, so this runs BEFORE
    # the instrumentor's ASGI middleware, ensuring the ContextVar is set
    # before should_sample() is called.
    app.add_middleware(_ForceTraceASGIMiddleware)

    logger.info(
        "OpenTelemetry tracing enabled (service=%s, endpoint=%s, protocol=%s, sampling_rate=%s)",
        OTEL_SERVICE_NAME,
        OTEL_EXPORTER_ENDPOINT,
        OTEL_EXPORTER_PROTOCOL,
        OTEL_SAMPLING_RATE,
    )

shutdown_telemetry

shutdown_telemetry()

Flush pending spans/metrics and shut down providers.

Source code in inference/core/telemetry.py
514
515
516
517
518
519
def shutdown_telemetry() -> None:
    """Shut down the tracer provider and then the meter provider.

    Each provider is skipped when it was never created or does not expose a
    ``shutdown`` method.
    """
    for provider in (_provider, _meter_provider):
        if provider is not None and hasattr(provider, "shutdown"):
            provider.shutdown()

start_span

start_span(name, attributes=None)

Start a new span as a child of the current context.

Yields the span (or None when OTel is not available).

Source code in inference/core/telemetry.py
64
65
66
67
68
69
70
71
72
73
74
75
@contextmanager
def start_span(name: str, attributes: Optional[Dict[str, Any]] = None):
    """Open a span named *name* as a child of the current context.

    Yields:
        The started span, or None when OTel is not available (the body of
        the ``with`` block still runs in that case).
    """
    if _OTEL_AVAILABLE:
        with _get_tracer().start_as_current_span(
            name, attributes=attributes
        ) as active_span:
            yield active_span
    else:
        yield None

trace_context_log_processor

trace_context_log_processor(
    logger_instance, method_name, event_dict
)

Structlog processor that injects trace_id and span_id into log entries.

Source code in inference/core/telemetry.py
161
162
163
164
165
166
167
168
169
170
171
172
def trace_context_log_processor(
    logger_instance: Any, method_name: str, event_dict: Dict[str, Any]
) -> Dict[str, Any]:
    """Structlog processor adding ``trace_id`` and ``span_id`` to log entries.

    The IDs are written as zero-padded lowercase hex (32 and 16 characters).
    The event dict is returned unchanged when OTel is unavailable or the
    current span context has no trace ID. ``logger_instance`` and
    ``method_name`` are accepted but not used.
    """
    if _OTEL_AVAILABLE:
        span_context = trace.get_current_span().get_span_context()
        if span_context and span_context.trace_id:
            event_dict["trace_id"] = format(span_context.trace_id, "032x")
            event_dict["span_id"] = format(span_context.span_id, "016x")
    return event_dict

inference.core.usage

Functions:

trackUsage

trackUsage(endpoint, actor, n=1)

Tracks the usage of an endpoint by an actor.

This function increments the usage count for a given endpoint by an actor. It also handles initialization if the count does not exist.

Parameters:

Name Type Description Default
endpoint str

The endpoint being accessed.

required
actor str

The actor accessing the endpoint.

required
n int

The number of times the endpoint was accessed. Defaults to 1.

1

Returns:

Name Type Description
None

This function does not return anything but updates the memcache client.

Source code in inference/core/usage.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def trackUsage(endpoint, actor, n=1):
    """Tracks the usage of an endpoint by an actor.

    This function increments the usage count for a given endpoint by an actor.
    When the counter does not exist yet it is created with the value ``n``, the
    counter key is registered under ``JOB_KEYS`` and the actor's total under
    ``ACTOR_KEYS`` is increased by ``n``.

    All failures are swallowed and logged at debug level so that usage
    accounting can never break the request being counted.

    Args:
        endpoint (str): The endpoint being accessed.
        actor (str): The actor accessing the endpoint.
        n (int, optional): The number of times the endpoint was accessed. Defaults to 1.

    Returns:
        None: This function does not return anything but updates the memcache client.
    """
    # count an inference
    try:
        job = endpoint + "endpoint:::actor" + actor
        current_infers = memcache_client.incr(job, n)
        if current_infers is None:  # not yet set; initialize at n
            memcache_client.set(job, n)

            # store key
            job_keys = memcache_client.get("JOB_KEYS")
            if job_keys is None:
                memcache_client.add("JOB_KEYS", json.dumps([job]))
            else:
                known_jobs = json.loads(job_keys)
                known_jobs.append(job)
                known_jobs = list(set(known_jobs))
                memcache_client.set("JOB_KEYS", json.dumps(known_jobs))

            # NOTE(review): actor totals are only touched when a job counter is
            # first created, not on every increment — confirm this is intended.
            actor_keys = memcache_client.get("ACTOR_KEYS")
            if actor_keys is None:
                memcache_client.add("ACTOR_KEYS", json.dumps({actor: n}))
            else:
                # Work on the decoded mapping; the raw cache payload is a JSON
                # string and cannot be indexed by actor.
                actor_counts = json.loads(actor_keys)
                actor_counts[actor] = actor_counts.get(actor, 0) + n
                memcache_client.set("ACTOR_KEYS", json.dumps(actor_counts))

    except Exception as e:
        logger.debug("WARNING: there was an error in counting this inference")
        logger.debug(e)

core/interfaces

High-level inference interfaces: camera, HTTP, and stream processing.

inference.core.interfaces.base

Classes

BaseInterface

Base interface class which accepts a model manager on initialization

Source code in inference/core/interfaces/base.py
4
5
6
7
8
class BaseInterface:
    """Base interface class which accepts a model manager on initialization

    Attributes:
        model_manager: The model manager passed to the constructor.
    """

    def __init__(self, model_manager: ModelManager) -> None:
        """Store the given model manager on the instance.

        Args:
            model_manager: Model manager kept as ``self.model_manager``.
        """
        self.model_manager = model_manager

core/interfaces/camera

inference.core.interfaces.camera.camera

Classes

WebcamStream

Class to handle webcam streaming using a separate thread.

Attributes:

Name Type Description
stream_id int

The ID of the webcam stream.

frame_id int

A counter for the current frame.

vcap VideoCapture

OpenCV video capture object.

width int

The width of the video frame.

height int

The height of the video frame.

fps_input_stream int

Frames per second of the input stream.

grabbed bool

A flag indicating if a frame was successfully grabbed.

frame array

The current frame as a NumPy array.

pil_image Image

The current frame as a PIL image.

stopped bool

A flag indicating if the stream is stopped.

t Thread

The thread used to update the stream.

Source code in inference/core/interfaces/camera/camera.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
class WebcamStream:
    """Class to handle webcam streaming using a separate thread.

    Attributes:
        stream_id (int): The ID of the webcam stream.
        enforce_fps (bool): When True (and the source is a file), frames are
            grabbed 1:1 with the consumer speed expressed by `max_fps`.
        frame_id (int): A counter for the current frame.
        vcap (VideoCapture): OpenCV video capture object.
        width (int): The width of the video frame.
        height (int): The height of the video frame.
        file_mode (bool): True when the capture reports a positive frame count.
        max_fps (Optional[float]): Retrieval rate used by `update`; starts as
            None and falls back to 30 if nobody sets it.
        fps_input_stream (int): Frames per second of the input stream.
        grabbed (bool): A flag indicating if a frame was successfully grabbed.
        frame (array): The current frame as a NumPy array.
        pil_image (Image): The first frame (read during initialisation) as a
            PIL image; `update` does not refresh it.
        stopped (bool): A flag indicating if the stream is stopped.
        t (Thread): The thread used to update the stream.
    """

    def __init__(self, stream_id=0, enforce_fps=False):
        """Initialize the webcam stream.

        Opens the capture, applies `CV2_CAP_PROP_*` environment overrides,
        reads the first frame and prepares (but does not start) the reader
        thread. The process exits if the capture cannot be opened or the first
        frame cannot be read.

        Args:
            stream_id (int, optional): The ID of the webcam stream. Defaults to 0.
            enforce_fps (bool, optional): Pace frame grabbing to `max_fps`.
                Only honoured for sources with a positive frame count; it is
                reset to False otherwise. Defaults to False.
        """
        self.stream_id = stream_id
        self.enforce_fps = enforce_fps
        self.frame_id = 0
        self.vcap = cv2.VideoCapture(self.stream_id)

        # Every environment variable named CV2_CAP_PROP_<X> is applied to the
        # capture as cv2.CAP_PROP_<X> (the "CV2_" prefix is stripped).
        for key in os.environ:
            if key.startswith("CV2_CAP_PROP"):
                opencv_prop = key[4:]
                opencv_constant = getattr(cv2, opencv_prop, None)
                if opencv_constant is not None:
                    value = int(os.getenv(key))
                    self.vcap.set(opencv_constant, value)
                    logger.info(f"set {opencv_prop} to {value}")
                else:
                    logger.warning(f"Property {opencv_prop} not found in cv2")

        self.width = int(self.vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.height = int(self.vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.file_mode = self.vcap.get(cv2.CAP_PROP_FRAME_COUNT) > 0
        if self.enforce_fps and not self.file_mode:
            logger.warning(
                "Ignoring enforce_fps flag for this stream. It is not compatible with streams and will cause the process to crash"
            )
            self.enforce_fps = False
        self.max_fps = None
        if self.vcap.isOpened() is False:
            logger.debug("[Exiting]: Error accessing webcam stream.")
            exit(0)
        self.fps_input_stream = int(self.vcap.get(cv2.CAP_PROP_FPS))
        logger.debug(
            "FPS of webcam hardware/input stream: {}".format(self.fps_input_stream)
        )
        self.grabbed, self.frame = self.vcap.read()
        # Validate the read BEFORE converting the frame: on a failed read there
        # is no image to convert and cv2.cvtColor would raise instead of
        # letting us take the clean exit path below.
        if not self.grabbed or self.frame is None:
            logger.debug("[Exiting] No more frames to read")
            exit(0)
        self.pil_image = Image.fromarray(cv2.cvtColor(self.frame, cv2.COLOR_BGR2RGB))
        self.stopped = True
        self.t = Thread(target=self.update, args=())
        self.t.daemon = True

    def start(self):
        """Start the thread for reading frames."""
        self.stopped = False
        self.t.start()

    def update(self):
        """Update the frame by reading from the webcam.

        Runs until `stopped` is set or the capture yields no more frames, then
        releases the capture. Intended to be the target of the reader thread.
        """
        frame_id = 0
        next_frame_time = 0
        t0 = time.perf_counter()
        while True:
            t1 = time.perf_counter()
            if self.stopped is True:
                break

            self.grabbed = self.vcap.grab()
            if self.grabbed is False:
                logger.debug("[Exiting] No more frames to read")
                self.stopped = True
                break
            frame_id += 1
            # We can't retrieve each frame on nano and other lower powered devices quickly enough to keep up with the stream.
            # By default, we will only retrieve frames when we'll be ready process them (determined by self.max_fps).
            if t1 > next_frame_time:
                ret, frame = self.vcap.retrieve()
                if frame is None:
                    logger.debug("[Exiting] Frame not available for read")
                    self.stopped = True
                    break
                # Guard against a zero elapsed time on coarse clocks.
                elapsed = t1 - t0
                effective_fps = frame_id / elapsed if elapsed > 0 else 0.0
                logger.debug(
                    f"retrieved frame {frame_id}, effective FPS: {effective_fps:.2f}"
                )
                self.frame_id = frame_id
                self.frame = frame
                while self.file_mode and self.enforce_fps and self.max_fps is None:
                    # sleep until we have processed the first frame and we know what our FPS should be
                    time.sleep(0.01)
                if self.max_fps is None:
                    self.max_fps = 30
                next_frame_time = t1 + (1 / self.max_fps) + 0.02
            if self.file_mode:
                t2 = time.perf_counter()
                if self.enforce_fps:
                    # when enforce_fps is true, grab video frames 1:1 with inference speed
                    time_to_sleep = next_frame_time - t2
                elif self.fps_input_stream > 0:
                    # otherwise, grab at native FPS of the video file
                    time_to_sleep = (1 / self.fps_input_stream) - (t2 - t1)
                else:
                    # The capture reported no usable FPS (0 after int()
                    # truncation); skip pacing instead of dividing by zero.
                    time_to_sleep = 0
                if time_to_sleep > 0:
                    time.sleep(time_to_sleep)
        self.vcap.release()

    def read_opencv(self):
        """Read the current frame using OpenCV.

        Returns:
            array, int: The current frame as a NumPy array, and the frame ID.
        """
        return self.frame, self.frame_id

    def stop(self):
        """Stop the webcam stream."""
        self.stopped = True
Methods:
__init__
__init__(stream_id=0, enforce_fps=False)

Initialize the webcam stream.

Parameters:

Name Type Description Default
stream_id int

The ID of the webcam stream. Defaults to 0.

0
Source code in inference/core/interfaces/camera/camera.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def __init__(self, stream_id=0, enforce_fps=False):
    """Initialize the webcam stream.

    Opens the capture, applies `CV2_CAP_PROP_*` environment overrides, reads
    the first frame and prepares (but does not start) the reader thread. The
    process exits if the capture cannot be opened or the first frame cannot be
    read.

    Args:
        stream_id (int, optional): The ID of the webcam stream. Defaults to 0.
        enforce_fps (bool, optional): Pace frame grabbing to `max_fps`. Only
            honoured for sources with a positive frame count; it is reset to
            False otherwise. Defaults to False.
    """
    self.stream_id = stream_id
    self.enforce_fps = enforce_fps
    self.frame_id = 0
    self.vcap = cv2.VideoCapture(self.stream_id)

    # Every environment variable named CV2_CAP_PROP_<X> is applied to the
    # capture as cv2.CAP_PROP_<X> (the "CV2_" prefix is stripped).
    for key in os.environ:
        if key.startswith("CV2_CAP_PROP"):
            opencv_prop = key[4:]
            opencv_constant = getattr(cv2, opencv_prop, None)
            if opencv_constant is not None:
                value = int(os.getenv(key))
                self.vcap.set(opencv_constant, value)
                logger.info(f"set {opencv_prop} to {value}")
            else:
                logger.warning(f"Property {opencv_prop} not found in cv2")

    self.width = int(self.vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
    self.height = int(self.vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    self.file_mode = self.vcap.get(cv2.CAP_PROP_FRAME_COUNT) > 0
    if self.enforce_fps and not self.file_mode:
        logger.warning(
            "Ignoring enforce_fps flag for this stream. It is not compatible with streams and will cause the process to crash"
        )
        self.enforce_fps = False
    self.max_fps = None
    if self.vcap.isOpened() is False:
        logger.debug("[Exiting]: Error accessing webcam stream.")
        exit(0)
    self.fps_input_stream = int(self.vcap.get(cv2.CAP_PROP_FPS))
    logger.debug(
        "FPS of webcam hardware/input stream: {}".format(self.fps_input_stream)
    )
    self.grabbed, self.frame = self.vcap.read()
    # Validate the read BEFORE converting the frame: on a failed read there is
    # no image to convert and cv2.cvtColor would raise instead of letting us
    # take the clean exit path below.
    if not self.grabbed or self.frame is None:
        logger.debug("[Exiting] No more frames to read")
        exit(0)
    self.pil_image = Image.fromarray(cv2.cvtColor(self.frame, cv2.COLOR_BGR2RGB))
    self.stopped = True
    self.t = Thread(target=self.update, args=())
    self.t.daemon = True
read_opencv
read_opencv()

Read the current frame using OpenCV.

Returns:

Type Description

array, int: The current frame as a NumPy array, and the frame ID.

Source code in inference/core/interfaces/camera/camera.py
127
128
129
130
131
132
133
def read_opencv(self):
    """Hand back the most recently stored frame together with its counter.

    Returns:
        array, int: The frame currently held in ``self.frame`` and the value
            of ``self.frame_id``.
    """
    latest_frame = self.frame
    latest_frame_id = self.frame_id
    return latest_frame, latest_frame_id
start
start()

Start the thread for reading frames.

Source code in inference/core/interfaces/camera/camera.py
75
76
77
78
def start(self):
    """Clear the stopped flag, then launch the frame-reading thread."""
    # The flag must be cleared first: the thread target checks it in its loop.
    self.stopped = False
    reader_thread = self.t
    reader_thread.start()
stop
stop()

Stop the webcam stream.

Source code in inference/core/interfaces/camera/camera.py
135
136
137
def stop(self):
    """Flag the stream as stopped so the reader loop exits on its next check."""
    self.stopped = True
update
update()

Update the frame by reading from the webcam.

Source code in inference/core/interfaces/camera/camera.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def update(self):
    """Update the frame by reading from the webcam.

    Runs until `stopped` is set or the capture yields no more frames, then
    releases the capture. Intended to be the target of the reader thread.
    """
    frame_id = 0
    next_frame_time = 0
    t0 = time.perf_counter()
    while True:
        t1 = time.perf_counter()
        if self.stopped is True:
            break

        self.grabbed = self.vcap.grab()
        if self.grabbed is False:
            logger.debug("[Exiting] No more frames to read")
            self.stopped = True
            break
        frame_id += 1
        # We can't retrieve each frame on nano and other lower powered devices quickly enough to keep up with the stream.
        # By default, we will only retrieve frames when we'll be ready process them (determined by self.max_fps).
        if t1 > next_frame_time:
            ret, frame = self.vcap.retrieve()
            if frame is None:
                logger.debug("[Exiting] Frame not available for read")
                self.stopped = True
                break
            # Guard against a zero elapsed time on coarse clocks.
            elapsed = t1 - t0
            effective_fps = frame_id / elapsed if elapsed > 0 else 0.0
            logger.debug(
                f"retrieved frame {frame_id}, effective FPS: {effective_fps:.2f}"
            )
            self.frame_id = frame_id
            self.frame = frame
            while self.file_mode and self.enforce_fps and self.max_fps is None:
                # sleep until we have processed the first frame and we know what our FPS should be
                time.sleep(0.01)
            if self.max_fps is None:
                self.max_fps = 30
            next_frame_time = t1 + (1 / self.max_fps) + 0.02
        if self.file_mode:
            t2 = time.perf_counter()
            if self.enforce_fps:
                # when enforce_fps is true, grab video frames 1:1 with inference speed
                time_to_sleep = next_frame_time - t2
            elif self.fps_input_stream > 0:
                # otherwise, grab at native FPS of the video file
                time_to_sleep = (1 / self.fps_input_stream) - (t2 - t1)
            else:
                # The capture reported no usable FPS (0 after int()
                # truncation); skip pacing instead of dividing by zero.
                time_to_sleep = 0
            if time_to_sleep > 0:
                time.sleep(time_to_sleep)
    self.vcap.release()

inference.core.interfaces.camera.entities

Classes

StatusUpdate dataclass

Represents a status update event in the system.

Attributes:

Name Type Description
timestamp datetime

The timestamp when the status update was created.

severity UpdateSeverity

The severity level of the update.

event_type str

A string representing the type of the event.

payload dict

A dictionary containing data relevant to the update.

context str

A string providing additional context about the update.

Source code in inference/core/interfaces/camera/entities.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
@dataclass(frozen=True)
class StatusUpdate:
    """Frozen dataclass describing a single status update event.

    Attributes:
        timestamp (datetime): Moment at which the update was created.
        severity (UpdateSeverity): How severe the update is.
        event_type (str): Name identifying the kind of event.
        payload (dict): Data attached to the update.
        context (str): Extra context describing where the update comes from.
    """

    timestamp: datetime
    severity: UpdateSeverity
    event_type: str
    payload: dict
    context: str

UpdateSeverity

Bases: Enum

Enumeration for defining different levels of update severity.

Attributes:

Name Type Description
DEBUG int

A debugging severity level.

INFO int

An informational severity level.

WARNING int

A warning severity level.

ERROR int

An error severity level.

Source code in inference/core/interfaces/camera/entities.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class UpdateSeverity(Enum):
    """Severity levels for status updates, mirroring the `logging` level constants.

    Attributes:
        DEBUG (int): Debugging severity, equal to `logging.DEBUG`.
        INFO (int): Informational severity, equal to `logging.INFO`.
        WARNING (int): Warning severity, equal to `logging.WARNING`.
        ERROR (int): Error severity, equal to `logging.ERROR`.
    """

    DEBUG = logging.DEBUG
    INFO = logging.INFO
    WARNING = logging.WARNING
    ERROR = logging.ERROR

VideoFrame dataclass

Represents a single frame of video data.

Attributes:

Name Type Description
image ndarray

The image data of the frame as a NumPy array.

frame_id FrameID

A unique identifier for the frame.

frame_timestamp FrameTimestamp

The timestamp when the frame was captured.

source_id int

The index of the video_reference element which was passed to InferencePipeline for this frame (useful when multiple streams are passed to InferencePipeline).

fps Optional[float]

declared FPS of source (if possible to be acquired)

measured_fps Optional[float]

measured FPS of live stream

comes_from_video_file Optional[bool]

flag to determine if frame comes from video file

Source code in inference/core/interfaces/camera/entities.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
@dataclass(frozen=True)
class VideoFrame:
    """Frozen dataclass carrying one video frame plus its metadata.

    Attributes (listed in declaration order):
        image (np.ndarray): The image data of the frame as a NumPy array.
        frame_id (FrameID): A unique identifier for the frame.
        frame_timestamp (FrameTimestamp): The timestamp when the frame was captured.
        fps (Optional[float]): declared FPS of source (if possible to be acquired)
        measured_fps (Optional[float]): measured FPS of live stream
        source_id (Optional[int]): The index of the video_reference element which was
            passed to InferencePipeline for this frame (useful when multiple streams
            are passed to InferencePipeline).
        comes_from_video_file (Optional[bool]): flag to determine if frame comes
            from video file
    """

    image: np.ndarray
    frame_id: FrameID
    frame_timestamp: FrameTimestamp
    # TODO: in next major version of inference replace `fps` with `declared_fps`
    fps: Optional[float] = None
    measured_fps: Optional[float] = None
    source_id: Optional[int] = None
    comes_from_video_file: Optional[bool] = None

inference.core.interfaces.camera.utils

Classes

RateLimiter

Implements upper-bound rate limiting: estimate_next_action_delay() returns how long to wait so that consecutive ticks are at least 1 / desired_fps apart, so a client that honours the returned delay cannot exceed the desired rate.

Source code in inference/core/interfaces/camera/utils.py
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
class RateLimiter:
    """
    Upper-bound rate limiter. `estimate_next_action_delay()` reports how long a
    client should still wait so that consecutive `tick()` calls end up at least
    1 / desired_fps seconds apart; a client that honours the returned delay
    cannot exceed the desired rate.
    """

    def __init__(self, desired_fps: Union[float, int]):
        # The requested rate is clamped from below by MINIMAL_FPS.
        self._desired_fps = max(desired_fps, MINIMAL_FPS)
        # Monotonic timestamp of the latest tick; None until the first tick.
        self._last_tick: Optional[float] = None

    def tick(self) -> None:
        """Record the current monotonic time as the latest tick."""
        self._last_tick = time.monotonic()

    def estimate_next_action_delay(self) -> float:
        """Return the remaining wait in seconds (0.0 before the first tick)."""
        previous_tick = self._last_tick
        if previous_tick is None:
            return 0.0
        min_interval = 1 / self._desired_fps
        elapsed = time.monotonic() - previous_tick
        remaining = min_interval - elapsed
        return max(remaining, 0.0)

VideoSourcesManager

This class should be treated as internal building block of stream multiplexer - not for external use.

Source code in inference/core/interfaces/camera/utils.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
class VideoSourcesManager:
    """
    This class should be treated as internal building block of stream multiplexer - not for external use.

    It polls a collection of video sources for frames, remembers which sources
    have ended, and runs re-connection attempts for disconnected sources on
    background threads (at most one thread per source ordinal).
    """

    @classmethod
    def init(
        cls,
        video_sources: VideoSources,
        should_stop: Callable[[], bool],
        on_reconnection_error: Callable[[Optional[int], SourceConnectionError], None],
    ) -> "VideoSourcesManager":
        """Alternate constructor - forwards all arguments unchanged to `__init__`."""
        return cls(
            video_sources=video_sources,
            should_stop=should_stop,
            on_reconnection_error=on_reconnection_error,
        )

    def __init__(
        self,
        video_sources: VideoSources,
        should_stop: Callable[[], bool],
        on_reconnection_error: Callable[[Optional[int], SourceConnectionError], None],
    ):
        """Store collaborators and initialise empty bookkeeping structures.

        Args:
            video_sources: Object exposing `all_sources` and `allow_reconnection`,
                both indexed by source ordinal.
            should_stop: External stop signal; polled while collecting frames
                and handed (wrapped) to re-connection threads.
            on_reconnection_error: Callback passed through to the re-connection
                routine.
        """
        self._video_sources = video_sources
        # source ordinal -> thread currently handling re-connection of that source
        self._reconnection_threads: Dict[int, Thread] = {}
        self._external_should_stop = should_stop
        self._on_reconnection_error = on_reconnection_error
        # source ordinal -> flag read by `_should_stop` for that source's thread
        self._enforce_stop: Dict[int, bool] = {}
        # ordinals of sources that will not be read again
        self._ended_sources: Set[int] = set()
        # ordinals whose re-connection thread asked to be joined
        self._threads_to_join: Set[int] = set()
        # reference point for the next batch collection deadline
        self._last_batch_yielded_time = datetime.now()

    def retrieve_frames_from_sources(
        self,
        batch_collection_timeout: Optional[float],
    ) -> Optional[List[VideoFrame]]:
        """Try to read one frame from every active source.

        Args:
            batch_collection_timeout: Time budget in seconds for the whole
                batch, counted from the moment the previous batch was returned
                (or from construction for the first batch). With `None`,
                `timeout=None` is passed to each source's `read_frame`.

        Returns:
            List of frames that were read (possibly empty), or `None` when the
            external stop signal was raised while iterating over sources.
        """
        batch_frames = []
        if batch_collection_timeout is not None:
            batch_timeout_moment = self._last_batch_yielded_time + timedelta(
                seconds=batch_collection_timeout
            )
        else:
            batch_timeout_moment = None
        # NOTE(review): `source_should_reconnect` is not used inside this loop;
        # the reconnection flag is looked up again in `_register_end_of_stream`.
        for source_ord, (source, source_should_reconnect) in enumerate(
            zip(self._video_sources.all_sources, self._video_sources.allow_reconnection)
        ):
            if self._external_should_stop():
                # Stop requested: also wait for threads that are still running.
                self.join_all_reconnection_threads(include_not_finished=True)
                return None
            if self._is_source_inactive(source_ord=source_ord):
                continue
            # Remaining share of the batch budget, clamped at zero once the
            # deadline has passed.
            batch_time_left = (
                None
                if batch_timeout_moment is None
                else max((batch_timeout_moment - datetime.now()).total_seconds(), 0.0)
            )
            try:
                frame = source.read_frame(timeout=batch_time_left)
                if frame is not None:
                    batch_frames.append(frame)
            except EndOfStreamError:
                self._register_end_of_stream(source_ord=source_ord)
        # Only threads registered in `_threads_to_join` are joined here.
        self.join_all_reconnection_threads()
        self._last_batch_yielded_time = datetime.now()
        return batch_frames

    def all_sources_ended(self) -> bool:
        """Return True once the number of ended sources reaches the number of sources."""
        return len(self._ended_sources) >= len(self._video_sources.all_sources)

    def join_all_reconnection_threads(self, include_not_finished: bool = False) -> None:
        """Join re-connection threads registered for joining.

        Args:
            include_not_finished: When True, every remaining re-connection
                thread is purged as well (each is told to stop via
                `_enforce_stop` and then joined).
        """
        # Iterate over a copy: `_purge_reconnection_thread` mutates the set.
        for source_ord in copy(self._threads_to_join):
            self._purge_reconnection_thread(source_ord=source_ord)
        if not include_not_finished:
            return None
        # Snapshot of keys: purging deletes entries from the dict.
        for source_ord in list(self._reconnection_threads.keys()):
            self._purge_reconnection_thread(source_ord=source_ord)

    def _is_source_inactive(self, source_ord: int) -> bool:
        """Return True for a source that has ended or has a re-connection thread registered."""
        return (
            source_ord in self._ended_sources
            or source_ord in self._reconnection_threads
        )

    def _register_end_of_stream(self, source_ord: int) -> None:
        """Start re-connection if allowed for this source, otherwise mark it as ended."""
        source_should_reconnect = self._video_sources.allow_reconnection[source_ord]
        if source_should_reconnect:
            self._reconnect_source(source_ord=source_ord)
        else:
            self._ended_sources.add(source_ord)

    def _reconnect_source(self, source_ord: int) -> None:
        """Spawn a re-connection thread for the source unless one is already registered."""
        if source_ord in self._reconnection_threads:
            return None
        # `_attempt_reconnect` is defined elsewhere in the module; presumably it
        # retries until success, stop signal or fatal error and invokes the
        # matching callback - confirm against its definition.
        self._reconnection_threads[source_ord] = Thread(
            target=_attempt_reconnect,
            args=(
                self._video_sources.all_sources[source_ord],
                partial(self._should_stop, source_ord=source_ord),
                self._on_reconnection_error,
                partial(self._register_thread_to_join, source_ord=source_ord),
                partial(self._register_reconnection_fatal_error, source_ord=source_ord),
            ),
        )
        self._reconnection_threads[source_ord].start()

    def _register_reconnection_fatal_error(self, source_ord: int) -> None:
        """Register the thread for joining and mark the source as ended."""
        self._register_thread_to_join(source_ord=source_ord)
        self._ended_sources.add(source_ord)

    def _register_thread_to_join(self, source_ord: int) -> None:
        """Remember that the re-connection thread of this source should be joined."""
        self._threads_to_join.add(source_ord)

    def _purge_reconnection_thread(self, source_ord: int) -> None:
        """Stop, join and forget the re-connection thread of the given source (no-op if absent)."""
        if source_ord not in self._reconnection_threads:
            return None
        # Raise the per-source stop flag so `_should_stop` returns True for the
        # thread while we wait for it.
        self._enforce_stop[source_ord] = True
        self._reconnection_threads[source_ord].join()
        del self._reconnection_threads[source_ord]
        # Reset the flag so a later re-connection thread is not stopped at once.
        self._enforce_stop[source_ord] = False
        if source_ord in self._threads_to_join:
            self._threads_to_join.remove(source_ord)

    def _should_stop(self, source_ord: int) -> bool:
        """Stop signal for one source: external signal OR the per-source enforce flag."""
        if self._external_should_stop():
            return True
        return self._enforce_stop.get(source_ord, False)

Functions:

get_video_frames_generator

get_video_frames_generator(
    video, max_fps=None, limiter_strategy=None
)

Util function to create a frames generator from VideoSource with possibility to limit FPS of consumed frames and dictate what to do if frames are produced too fast.

Parameters:

Name Type Description Default
video Union[VideoSource, str, int]

Either instance of VideoSource or video reference accepted by VideoSource.init(...)

required
max_fps Optional[Union[float, int]]

value of maximum FPS rate of generated frames - can be used to limit generation frequency

None
limiter_strategy Optional[FPSLimiterStrategy]

strategy used to deal with frames decoding exceeding limit of max_fps. By default - for files, in the interest of processing all frames - generation will be awaited, for streams - frames will be dropped on the floor.

None
Example
from inference.core.interfaces.camera.utils import get_video_frames_generator

for frame in get_video_frames_generator(
    video="./some.mp4",
    max_fps=50,
):
     pass
Source code in inference/core/interfaces/camera/utils.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def get_video_frames_generator(
    video: Union[VideoSource, str, int],
    max_fps: Optional[Union[float, int]] = None,
    limiter_strategy: Optional[FPSLimiterStrategy] = None,
) -> Generator[VideoFrame, None, None]:
    """
    Util function to create a frames generator from `VideoSource` with possibility to
    limit FPS of consumed frames and dictate what to do if frames are produced too fast.

    When `video` is given as a reference (`str` or `int`), the function creates and
    starts the `VideoSource` itself and terminates it when the generator finishes -
    including when the consumer stops iterating early (generator closed) or an
    error propagates. A `VideoSource` passed in by the caller is never terminated here.

    Args:
        video (Union[VideoSource, str, int]): Either instance of VideoSource or video reference accepted
            by VideoSource.init(...)
        max_fps (Optional[Union[float, int]]): value of maximum FPS rate of generated frames - can be used to limit
            generation frequency
        limiter_strategy (Optional[FPSLimiterStrategy]): strategy used to deal with frames decoding exceeding
            limit of `max_fps`. By default - for files, in the interest of processing all frames -
            generation will be awaited, for streams - frames will be dropped on the floor.
    Returns: generator of `VideoFrame`

    Example:
        ```python
        from inference.core.interfaces.camera.utils import get_video_frames_generator

        for frame in get_video_frames_generator(
            video="./some.mp4",
            max_fps=50,
        ):
             pass
        ```
    """
    is_managed_source = isinstance(video, (str, int))
    if is_managed_source:
        video = VideoSource.init(
            video_reference=video,
        )
        video.start()
    try:
        if max_fps is None:
            # No rate limit requested - pass frames through untouched.
            yield from video
        else:
            limiter_strategy = resolve_limiter_strategy(
                explicitly_defined_strategy=limiter_strategy,
                source_properties=video.describe_source().source_properties,
            )
            yield from limit_frame_rate(
                frames_generator=video, max_fps=max_fps, strategy=limiter_strategy
            )
    finally:
        # Runs on normal exhaustion, on generator close (consumer broke out of
        # the loop) and on errors, so a source we started is always released.
        if is_managed_source:
            video.terminate(purge_frames_buffer=True)
    return None

multiplex_videos

multiplex_videos(
    videos,
    max_fps=None,
    limiter_strategy=None,
    batch_collection_timeout=None,
    force_stream_reconnection=True,
    should_stop=never_stop,
    on_reconnection_error=log_error,
)

Function that is supposed to provide a generator over frames from multiple video sources. It is capable to initialise VideoSource from references to video files or streams and grab frames from all the sources - each running individual decoding on separate thread. In each cycle it attempts to grab frames from all sources (and wait at max batch_collection_timeout for whole batch to be collected). If frame from specific source cannot be collected in that time - it is simply not included in returned list. If after batch collection list of frames is empty - new collection start immediately. Collection does not account for sources that lost connectivity (example: streams that went offline). If that does not happen and stream has large latency - without reasonable batch_collection_timeout it will slow down processing - so please set it up in PROD solutions. In case of video streams (not video files) - given that force_stream_reconnection=True function will attempt to re-connect to disconnected source using background thread, not impairing batch frames collection and that source is not going to block frames retrieval even if infinite batch_collection_timeout=None is set. Similarly, when processing files - video file that is shorter than other passed into processing will not block the whole flow after End Of Stream (EOS).

All sources must be accessible on start - if that's not the case, the function raises SourceConnectionError and closes all video sources it opened on its own. Disconnections at later stages are handled by the re-connection mechanism.

Parameters:

Name Type Description Default
videos List[Union[VideoSource, str, int]]

List with references to video sources. Elements can be pre-initialised VideoSource instances, str with stream URI or file location or int representing camera device attached to the PC/server running the code.

required
max_fps Optional[Union[float, int]]

Upper-bound of processing speed - to be used when one wants at max max_fps video frames per second to be yielded from all sources by the generator.

None
limiter_strategy Optional[FPSLimiterStrategy]

strategy used to deal with frames decoding exceeding limit of max_fps. For video files, in the interest of processing all frames - we recommend WAIT mode, for streams - frames should be dropped on the floor with DROP strategy. Not setting the strategy equals using automatic mode - WAIT if all sources are files and DROP otherwise

None
batch_collection_timeout Optional[float]

maximum await time to get batch of predictions from all sources. None means infinite timeout.

None
force_stream_reconnection bool

Flag to decide on reconnection to streams (files are never re-connected)

True
should_stop Callable[[], bool]

external stop signal that is periodically checked - to signal that video consumption should stop, make the function return True

never_stop
on_reconnection_error Callable[[Optional[int], SourceConnectionError], None]

Function that will be called whenever source cannot re-connect after disconnection. First parameter is source_id, second is connection error instance.

log_error

Returns Generator[List[VideoFrame], None, None]: allowing to iterate through frames from multiple video sources.

Raises:

Type Description
SourceConnectionError

when one or more source is not reachable at start of generation

Example
from inference.core.interfaces.camera.utils import multiplex_videos

for frames in multiplex_videos(videos=["./some.mp4", "./other.mp4"]):
     for frame in frames:
        pass  # do something with frame
Source code in inference/core/interfaces/camera/utils.py
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
def multiplex_videos(
    videos: List[Union[VideoSource, str, int]],
    max_fps: Optional[Union[float, int]] = None,
    limiter_strategy: Optional[FPSLimiterStrategy] = None,
    batch_collection_timeout: Optional[float] = None,
    force_stream_reconnection: bool = True,
    should_stop: Callable[[], bool] = never_stop,
    on_reconnection_error: Callable[
        [Optional[int], SourceConnectionError], None
    ] = log_error,
) -> Generator[List[VideoFrame], None, None]:
    """
    Yield batches of frames gathered from several video sources at once.

    References passed in `videos` are turned into `VideoSource` instances (pre-initialised instances are
    accepted as well) and each source decodes frames on its own thread. Every cycle tries to grab one frame
    per source, waiting at most `batch_collection_timeout` for the whole batch. A source that fails to deliver
    within that time is simply missing from the yielded list, and an empty batch causes a new collection to
    start immediately.

    Sources that lost connectivity (example: a stream that went offline) are not awaited during collection.
    A connected stream with large latency will, however, slow processing down unless a reasonable
    `batch_collection_timeout` is set - so please configure it in PROD solutions. For video streams (not
    files), with `force_stream_reconnection=True`, re-connection to a disconnected source is attempted in
    a background thread, so that source does not block frames retrieval even with infinite
    `batch_collection_timeout=None`. Likewise, a video file shorter than the others does not block the flow
    after its End Of Stream (EOS).

    All sources must be accessible on start - otherwise `SourceConnectionError` is raised and all video
    sources opened by this function are closed. Later disconnections are handled by the re-connection
    mechanism. Additionally, when the reconnection rule of any prepared source is undetermined (`None`),
    a warning is logged and the generator finishes without yielding anything.

    Args:
        videos (List[Union[VideoSource, str, int]]): References to video sources. Each element is either
            a pre-initialised `VideoSource`, a str with stream URI or file location, or an int denoting
            a camera device attached to the machine running the code.
        max_fps (Optional[Union[float, int]]): Upper bound of processing speed, expressed as the maximum
            number of video frames per second yielded from all sources together. Internally the value is
            divided by the number of elements in `videos` before being passed to the rate limiter.
            `None` disables rate limiting.
        limiter_strategy (Optional[FPSLimiterStrategy]): How to handle frames decoded faster than `max_fps`.
            WAIT is recommended for video files (to process all frames), DROP for streams. When not given,
            the strategy is negotiated automatically - WAIT if all sources are files and DROP otherwise.
        batch_collection_timeout (Optional[float]): Maximum time to wait for the whole batch to be collected
            from all sources. `None` means infinite timeout.
        force_stream_reconnection (bool): Whether disconnected streams should be re-connected
            (files are never re-connected).
        should_stop (Callable[[], bool]): External stop signal checked periodically - make it return True
            to denote that video consumption should stop.
        on_reconnection_error (Callable[[Optional[int], SourceConnectionError], None]): Callback invoked
            whenever a source cannot re-connect after disconnection. It receives the source_id and the
            connection error instance.

    Returns Generator[List[VideoFrame], None, None]: allowing to iterate through frames from multiple video sources.

    Raises:
        SourceConnectionError: when one or more sources are not reachable at start of generation

    Example:
        ```python
        from inference.core.interfaces.camera.utils import multiplex_videos

        for frames in multiplex_videos(videos=["./some.mp4", "./other.mp4"]):
             for frame in frames:
                pass  # do something with frame
        ```
    """
    sources = _prepare_video_sources(
        videos=videos, force_stream_reconnection=force_stream_reconnection
    )
    for reconnection_rule in sources.allow_reconnection:
        if reconnection_rule is None:
            # at least one source has no established reconnection rule - nothing to iterate over
            logger.warning("Could not connect to all sources.")
            return
    frames_batches = _multiplex_videos(
        video_sources=sources,
        batch_collection_timeout=batch_collection_timeout,
        should_stop=should_stop,
        on_reconnection_error=on_reconnection_error,
    )
    if max_fps is not None:
        # the limit applies to frames from all sources together, hence the per-batch pace is a fraction of it
        per_batch_fps = max_fps / len(videos)
        strategy = limiter_strategy
        if strategy is None:
            strategy = negotiate_rate_limiter_strategy_for_multiple_sources(
                video_sources=sources.all_sources,
            )
        frames_batches = limit_frame_rate(
            frames_generator=frames_batches,
            max_fps=per_batch_fps,
            strategy=strategy,
        )
    yield from frames_batches

inference.core.interfaces.camera.video_source

Classes

VideoConsumer

This class should be consumed as part of internal implementation. It provides abstraction around stream consumption strategies.

It must always be given the same video source for consecutive invocations, otherwise the internal state does not make sense.

Source code in inference/core/interfaces/camera/video_source.py
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
class VideoConsumer:
    """
    This class should be consumed as part of internal implementation.
    It provides abstraction around stream consumption strategies.

    It must always be given the same video source for consecutive invocations,
    otherwise the internal state does not make sense.
    """

    @classmethod
    def init(
        cls,
        buffer_filling_strategy: Optional[BufferFillingStrategy],
        adaptive_mode_stream_pace_tolerance: float,
        adaptive_mode_reader_pace_tolerance: float,
        minimum_adaptive_mode_samples: int,
        maximum_adaptive_frames_dropped_in_row: int,
        status_update_handlers: List[Callable[[StatusUpdate], None]],
        desired_fps: Optional[Union[float, int]] = None,
    ) -> "VideoConsumer":
        """
        Alternative constructor that creates the pace monitors on behalf of the caller.

        `minimum_adaptive_mode_samples` is raised to at least 2 and each of the three
        `sv.FPSMonitor` instances (reader, stream consumption, decoding) keeps
        10x that many samples.

        Returns: initialised `VideoConsumer`
        """
        minimum_adaptive_mode_samples = max(minimum_adaptive_mode_samples, 2)
        monitor_sample_size = 10 * minimum_adaptive_mode_samples
        reader_pace_monitor = sv.FPSMonitor(sample_size=monitor_sample_size)
        stream_consumption_pace_monitor = sv.FPSMonitor(
            sample_size=monitor_sample_size
        )
        decoding_pace_monitor = sv.FPSMonitor(sample_size=monitor_sample_size)
        return cls(
            buffer_filling_strategy=buffer_filling_strategy,
            adaptive_mode_stream_pace_tolerance=adaptive_mode_stream_pace_tolerance,
            adaptive_mode_reader_pace_tolerance=adaptive_mode_reader_pace_tolerance,
            minimum_adaptive_mode_samples=minimum_adaptive_mode_samples,
            maximum_adaptive_frames_dropped_in_row=maximum_adaptive_frames_dropped_in_row,
            status_update_handlers=status_update_handlers,
            reader_pace_monitor=reader_pace_monitor,
            stream_consumption_pace_monitor=stream_consumption_pace_monitor,
            decoding_pace_monitor=decoding_pace_monitor,
            desired_fps=desired_fps,
        )

    def __init__(
        self,
        buffer_filling_strategy: Optional[BufferFillingStrategy],
        adaptive_mode_stream_pace_tolerance: float,
        adaptive_mode_reader_pace_tolerance: float,
        minimum_adaptive_mode_samples: int,
        maximum_adaptive_frames_dropped_in_row: int,
        status_update_handlers: List[Callable[[StatusUpdate], None]],
        reader_pace_monitor: sv.FPSMonitor,
        stream_consumption_pace_monitor: sv.FPSMonitor,
        decoding_pace_monitor: sv.FPSMonitor,
        desired_fps: Optional[Union[float, int]],
    ):
        self._buffer_filling_strategy = buffer_filling_strategy
        # number of frames successfully grabbed so far - also used as frame id
        self._frame_counter = 0
        self._adaptive_mode_stream_pace_tolerance = adaptive_mode_stream_pace_tolerance
        self._adaptive_mode_reader_pace_tolerance = adaptive_mode_reader_pace_tolerance
        self._minimum_adaptive_mode_samples = minimum_adaptive_mode_samples
        self._maximum_adaptive_frames_dropped_in_row = (
            maximum_adaptive_frames_dropped_in_row
        )
        self._adaptive_frames_dropped_in_row = 0
        self._reader_pace_monitor = reader_pace_monitor
        self._stream_consumption_pace_monitor = stream_consumption_pace_monitor
        self._decoding_pace_monitor = decoding_pace_monitor
        self._desired_fps = desired_fps
        # source properties below are discovered lazily on first `consume_frame(...)` call
        self._declared_source_fps = None
        self._is_source_video_file = None
        self._timestamp_created: Optional[datetime] = None
        self._status_update_handlers = status_update_handlers
        # id of the next frame that passes FPS sub-sampling (see `_video_fps_should_be_sub_sampled`)
        self._next_frame_from_video_to_accept = 1

    @property
    def buffer_filling_strategy(self) -> Optional[BufferFillingStrategy]:
        return self._buffer_filling_strategy

    def reset(self, source_properties: SourceProperties) -> None:
        """
        Prepare the consumer for a (re)started source.

        Picks the default buffer filling strategy matching the source type (only if none
        was set so far), clears all pace monitors and the adaptive-drop counter, and makes
        the very next frame eligible for acceptance by FPS sub-sampling. The frame counter
        itself is not reset.
        """
        if source_properties.is_file:
            self._set_file_mode_buffering_strategies()
        else:
            self._set_stream_mode_buffering_strategies()
        self._reader_pace_monitor.reset()
        self.reset_stream_consumption_pace()
        self._decoding_pace_monitor.reset()
        self._adaptive_frames_dropped_in_row = 0
        self._next_frame_from_video_to_accept = self._frame_counter + 1

    def reset_stream_consumption_pace(self) -> None:
        self._stream_consumption_pace_monitor.reset()

    def notify_frame_consumed(self) -> None:
        """Register in the reader pace monitor that a frame was consumed."""
        self._reader_pace_monitor.tick()

    def consume_frame(
        self,
        video: VideoFrameProducer,
        declared_source_fps: Optional[float],
        is_source_video_file: Optional[bool],
        buffer: Queue,
        frames_buffering_allowed: bool,
        source_id: Optional[int] = None,
    ) -> bool:
        """
        Grab a single frame from `video` and decide what to do with it
        (skip, drop or decode into `buffer`).

        Args:
            video: producer to grab (and possibly decode) the frame from
            declared_source_fps: FPS declared by the source, passed down to decoding
                and to the adaptive dropping decision
            is_source_video_file: whether the source is a video file - for other sources
                the measured stream consumption pace is reported as measured FPS
            buffer: queue receiving decoded frames
            frames_buffering_allowed: when False, grabbed frame is dropped
            source_id: optional identifier of the source attached to status updates

        Returns: False if the frame could not be grabbed; True if the frame was skipped
            by FPS sub-sampling; otherwise the status returned by `_consume_stream_frame(...)`
        """
        if self._is_source_video_file is None:
            # source properties are discovered once and cached
            source_properties = video.discover_source_properties()
            self._is_source_video_file = source_properties.is_file
            self._declared_source_fps = source_properties.fps
            self._timestamp_created = source_properties.timestamp_created

        # Timestamp can be derived from the source creation time only when a positive FPS is
        # declared - otherwise the division would fail (None / zero FPS), so wall-clock is used.
        declared_fps_usable = (
            self._declared_source_fps is not None and self._declared_source_fps > 0
        )
        if self._timestamp_created and declared_fps_usable:
            frame_timestamp = self._timestamp_created + timedelta(
                seconds=self._frame_counter / self._declared_source_fps
            )
        else:
            frame_timestamp = datetime.now()

        success = video.grab()
        self._stream_consumption_pace_monitor.tick()
        if not success:
            return False
        self._frame_counter += 1
        if self._status_update_handlers:
            send_video_source_status_update(
                severity=UpdateSeverity.DEBUG,
                event_type=FRAME_CAPTURED_EVENT,
                payload={
                    "frame_timestamp": frame_timestamp,
                    "frame_id": self._frame_counter,
                    "source_id": source_id,
                },
                status_update_handlers=self._status_update_handlers,
            )
        measured_source_fps = declared_source_fps
        if not is_source_video_file:
            measured_source_fps = self._read_monitor_fps(
                fps_monitor=self._stream_consumption_pace_monitor
            )

        if self._video_fps_should_be_sub_sampled():
            return True
        return self._consume_stream_frame(
            video=video,
            declared_source_fps=declared_source_fps,
            measured_source_fps=measured_source_fps,
            is_source_video_file=is_source_video_file,
            frame_timestamp=frame_timestamp,
            buffer=buffer,
            frames_buffering_allowed=frames_buffering_allowed,
            source_id=source_id,
        )

    @staticmethod
    def _read_monitor_fps(fps_monitor: sv.FPSMonitor) -> float:
        """
        Read current pace from the monitor - through `fps` attribute when the monitor
        has one, otherwise by calling the monitor (presumably needed to support
        different `supervision` versions - TODO confirm).
        """
        if hasattr(fps_monitor, "fps"):
            return fps_monitor.fps
        return fps_monitor()

    def _set_file_mode_buffering_strategies(self) -> None:
        if self._buffer_filling_strategy is None:
            self._buffer_filling_strategy = BufferFillingStrategy.WAIT

    def _set_stream_mode_buffering_strategies(self) -> None:
        if self._buffer_filling_strategy is None:
            self._buffer_filling_strategy = BufferFillingStrategy.ADAPTIVE_DROP_OLDEST

    def _video_fps_should_be_sub_sampled(self) -> bool:
        """
        Decide if the frame just grabbed should be skipped to approach `desired_fps`.

        Returns: True when the frame is to be skipped, False when it is accepted
            (or when `desired_fps` is not set).
        """
        if self._desired_fps is None:
            return False
        if self._is_source_video_file:
            actual_fps = self._declared_source_fps
        else:
            fraction_of_pace_monitor_samples = (
                len(self._stream_consumption_pace_monitor.all_timestamps)
                / self._stream_consumption_pace_monitor.all_timestamps.maxlen
            )
            if fraction_of_pace_monitor_samples < 0.9:
                # monitor window is not filled enough - fall back to declared FPS
                actual_fps = self._declared_source_fps
            else:
                actual_fps = self._read_monitor_fps(
                    fps_monitor=self._stream_consumption_pace_monitor
                )
        if self._frame_counter == self._next_frame_from_video_to_accept:
            stride = calculate_video_file_stride(
                actual_fps=actual_fps,
                desired_fps=self._desired_fps,
            )
            self._next_frame_from_video_to_accept += stride
            return False
        # skipping frame
        return True

    def _consume_stream_frame(
        self,
        video: VideoFrameProducer,
        declared_source_fps: Optional[float],
        measured_source_fps: Optional[float],
        is_source_video_file: Optional[bool],
        frame_timestamp: datetime,
        buffer: Queue,
        frames_buffering_allowed: bool,
        source_id: Optional[int],
    ) -> bool:
        """
        Apply buffering rules to an already grabbed frame: drop it (buffering not allowed,
        adaptive strategy, DROP_LATEST behaviour on full buffer) or decode it into `buffer`
        (optionally evicting the oldest buffered frame first).

        Returns: boolean flag with success status
        """
        if not frames_buffering_allowed:
            send_frame_drop_update(
                frame_timestamp=frame_timestamp,
                frame_id=self._frame_counter,
                cause="Buffering not allowed at the moment",
                status_update_handlers=self._status_update_handlers,
                source_id=source_id,
            )
            return True
        if self._frame_should_be_adaptively_dropped(
            declared_source_fps=declared_source_fps
        ):
            self._adaptive_frames_dropped_in_row += 1
            send_frame_drop_update(
                frame_timestamp=frame_timestamp,
                frame_id=self._frame_counter,
                cause="ADAPTIVE strategy",
                status_update_handlers=self._status_update_handlers,
                source_id=source_id,
            )
            return True
        self._adaptive_frames_dropped_in_row = 0
        if (
            not buffer.full()
            or self._buffer_filling_strategy is BufferFillingStrategy.WAIT
        ):
            return decode_video_frame_to_buffer(
                frame_timestamp=frame_timestamp,
                frame_id=self._frame_counter,
                video=video,
                buffer=buffer,
                decoding_pace_monitor=self._decoding_pace_monitor,
                source_id=source_id,
                declared_source_fps=declared_source_fps,
                measured_source_fps=measured_source_fps,
                comes_from_video_file=is_source_video_file,
            )
        if self._buffer_filling_strategy in DROP_OLDEST_STRATEGIES:
            return self._process_stream_frame_dropping_oldest(
                frame_timestamp=frame_timestamp,
                video=video,
                buffer=buffer,
                source_id=source_id,
                is_video_file=is_source_video_file,
                declared_source_fps=declared_source_fps,
                measured_source_fps=measured_source_fps,
            )
        send_frame_drop_update(
            frame_timestamp=frame_timestamp,
            frame_id=self._frame_counter,
            cause="DROP_LATEST strategy",
            status_update_handlers=self._status_update_handlers,
            source_id=source_id,
        )
        return True

    def _frame_should_be_adaptively_dropped(
        self, declared_source_fps: Optional[float]
    ) -> bool:
        """
        Decide if the frame should be dropped by one of ADAPTIVE strategies.

        Frame is dropped when stream consumption pace falls behind the announced stream FPS
        by more than the stream pace tolerance, or when decoding pace exceeds the reader
        pace by more than the reader pace tolerance. Frame is never dropped when
        non-adaptive strategy is in use, when the limit of frames dropped in a row
        was reached or when monitors hold too few samples.
        """
        if self._buffer_filling_strategy not in ADAPTIVE_STRATEGIES:
            return False
        if (
            self._adaptive_frames_dropped_in_row
            >= self._maximum_adaptive_frames_dropped_in_row
        ):
            return False
        if (
            len(self._stream_consumption_pace_monitor.all_timestamps)
            <= self._minimum_adaptive_mode_samples
        ):
            # not enough observations
            return False
        stream_consumption_pace = self._read_monitor_fps(
            fps_monitor=self._stream_consumption_pace_monitor
        )
        announced_stream_fps = stream_consumption_pace
        if declared_source_fps is not None and declared_source_fps > 0:
            announced_stream_fps = declared_source_fps
        if (
            announced_stream_fps - stream_consumption_pace
            > self._adaptive_mode_stream_pace_tolerance
        ):
            # cannot keep up with stream emission
            return True
        if (
            len(self._reader_pace_monitor.all_timestamps)
            <= self._minimum_adaptive_mode_samples
        ) or (
            len(self._decoding_pace_monitor.all_timestamps)
            <= self._minimum_adaptive_mode_samples
        ):
            # not enough observations
            return False
        actual_reader_pace = get_fps_if_tick_happens_now(
            fps_monitor=self._reader_pace_monitor
        )
        decoding_pace = self._read_monitor_fps(fps_monitor=self._decoding_pace_monitor)
        if (
            decoding_pace - actual_reader_pace
            > self._adaptive_mode_reader_pace_tolerance
        ):
            # we are too fast for the reader - time to save compute on decoding
            return True
        return False

    def _process_stream_frame_dropping_oldest(
        self,
        frame_timestamp: datetime,
        video: VideoFrameProducer,
        buffer: Queue,
        source_id: Optional[int],
        is_video_file: bool,
        declared_source_fps: Optional[float] = None,
        measured_source_fps: Optional[float] = None,
    ) -> bool:
        """
        Drop a single frame from `buffer` and decode the current frame in its place.

        FPS values are forwarded to decoding so that frames decoded on this path carry
        the same metadata as frames decoded on the regular path.

        Returns: status returned by `decode_video_frame_to_buffer(...)`
        """
        drop_single_frame_from_buffer(
            buffer=buffer,
            cause="DROP_OLDEST strategy",
            status_update_handlers=self._status_update_handlers,
        )
        return decode_video_frame_to_buffer(
            frame_timestamp=frame_timestamp,
            frame_id=self._frame_counter,
            video=video,
            buffer=buffer,
            decoding_pace_monitor=self._decoding_pace_monitor,
            source_id=source_id,
            declared_source_fps=declared_source_fps,
            measured_source_fps=measured_source_fps,
            comes_from_video_file=is_video_file,
        )

VideoSource

Source code in inference/core/interfaces/camera/video_source.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
class VideoSource:
    @classmethod
    def init(
        cls,
        video_reference: VideoSourceIdentifier,
        buffer_size: int = DEFAULT_BUFFER_SIZE,
        status_update_handlers: Optional[List[Callable[[StatusUpdate], None]]] = None,
        buffer_filling_strategy: Optional[BufferFillingStrategy] = None,
        buffer_consumption_strategy: Optional[BufferConsumptionStrategy] = None,
        adaptive_mode_stream_pace_tolerance: float = DEFAULT_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE,
        adaptive_mode_reader_pace_tolerance: float = DEFAULT_ADAPTIVE_MODE_READER_PACE_TOLERANCE,
        minimum_adaptive_mode_samples: int = DEFAULT_MINIMUM_ADAPTIVE_MODE_SAMPLES,
        maximum_adaptive_frames_dropped_in_row: int = DEFAULT_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW,
        video_source_properties: Optional[Dict[str, float]] = None,
        source_id: Optional[int] = None,
        desired_fps: Optional[Union[float, int]] = None,
    ):
        """
        This class is meant to represent abstraction over video sources - both video files and
        on-line streams that are possible to be consumed and used by other components of `inference`
        library.

        Before digging into details of the class behaviour, it is advised to familiarise with the following
        concepts and implementation assumptions:

        1. Video file can be accessed from local (or remote) storage by the consumer in a pace dictated by
            its processing capabilities. If processing is faster than the frame rate of video, operations
            may be executed in a time shorter than the time of video playback. In the opposite case - consumer
            may freely decode and process frames in its own pace, without risk for failures due to temporal
            dependencies of processing - this is classical offline processing example.
        2. Video streams, on the other hand, usually need to be consumed in a pace near to their frame-rate -
            in other words - this is on-line processing example. Consumer being faster than incoming stream
            frames cannot utilise its resources to the full extent as not-yet-delivered data would be needed.
            Slow consumer, however, may not be able to process everything on time and to keep up with the pace
            of stream - some frames would need to be dropped. Otherwise - over time, consumer could go out of
            sync with the stream causing decoding failures or unpredictable behavior.

        To fit those two types of video sources, `VideoSource` introduces the concept of buffered decoding of
        video stream (like on YouTube - the player buffers some frames that are soon to be displayed).
        The way the buffer is filled and consumed dictates the behavior of `VideoSource`.

        Starting from `BufferFillingStrategy` - we have 3 basic options:
        * WAIT: in case of slow video consumption, when buffer is full - `VideoSource` will wait for
        the empty spot in buffer before next frame will be processed - this is suitable in cases when
        we want to ensure EACH FRAME of the video to be processed
        * DROP_OLDEST: when buffer is full, the frame that sits there for the longest time will be dropped -
        this is suitable for cases when we want to process the most recent frames possible
        * DROP_LATEST: when buffer is full, the newly decoded frame is dropped - useful in cases when
        it is expected to have processing performance drops, but we would like to consume portions of
        video that are locally smooth - but this is probably the least common use-case.

        On top of that - there are two ADAPTIVE strategies: ADAPTIVE_DROP_OLDEST and ADAPTIVE_DROP_LATEST,
        which are equivalent to DROP_OLDEST and DROP_LATEST with adaptive decoding feature enabled. The notion
        of that mode will be described later.

        Naturally, decoded frames must also be consumed. `VideoSource` provides a handy interface for reading
        a video source frames by a SINGLE consumer. Consumption strategy can also be dictated via
        `BufferConsumptionStrategy`:
        * LAZY - consume all the frames from decoding buffer one-by-one
        * EAGER - at each readout - take all frames already buffered, drop all of them apart from the most recent

        In consequence - there are various combinations of `BufferFillingStrategy` and `BufferConsumptionStrategy`.
        The most popular would be:
        * `BufferFillingStrategy.WAIT` and `BufferConsumptionStrategy.LAZY` - to always decode and process each and
            every frame of the source (useful while processing video files - and default behaviour enforced by
            `inference` if there is no explicit configuration)
        * `BufferFillingStrategy.DROP_OLDEST` and `BufferConsumptionStrategy.EAGER` - to always process the most
            recent frames of source (useful while processing video streams when low latency [real-time experience]
            is required - ADAPTIVE version of this is default for streams)

        ADAPTIVE strategies were introduced to handle corner-cases, when consumer hardware is not capable to consume
        video stream and process frames at the same time (for instance - Nvidia Jetson devices running processing
        against hi-res streams with high FPS ratio). It acts with buffer in nearly the same way as `DROP_OLDEST`
        and `DROP_LATEST` strategies, but there are two more conditions that may influence frame drop:
        * announced rate of source - which in fact dictates the pace of frames grabbing from incoming stream that
        MUST be met by consumer to avoid strange decoding issues causing decoder to fail - if the pace of frame grabbing
        deviates too much - decoding will be postponed, and frames dropped to grab next ones sooner
        * consumption rate - in resource-constrained environments, not only decoding is problematic from the performance
        perspective - but also heavy processing. If consumer is not quick enough - allocating more useful resources
        for decoding frames that may never be processed is a waste. That's why - if decoding happens more frequently
        than consumption of frame - ADAPTIVE mode causes decoding to be done in a slower pace and more frames are just
        grabbed and dropped on the floor.
        ADAPTIVE mode increases latency slightly, but may be the only way to operate in some cases.
        Behaviour of adaptive mode, including the maximum acceptable deviations of frames grabbing pace from source,
        reader pace and maximum number of consecutive frames dropped in ADAPTIVE mode are configurable by clients,
        with reasonable defaults being set.

        `VideoSource` emits events regarding its activity - which can be intercepted by custom handlers. Take
        into account that they are always executed in context of thread invoking them (and should be fast to complete,
        otherwise may block the flow of stream consumption). All errors raised will be emitted as logger warnings only.

        `VideoSource` implementation is naturally multithreaded, with different thread decoding video and different
        one consuming it and manipulating source state. Implementation of user interface is thread-safe, although
        the stream is meant to be consumed by a single thread only.

        ENV variables involved:
        * VIDEO_SOURCE_BUFFER_SIZE - default: 64
        * VIDEO_SOURCE_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE - default: 0.1
        * VIDEO_SOURCE_ADAPTIVE_MODE_READER_PACE_TOLERANCE - default: 5.0
        * VIDEO_SOURCE_MINIMUM_ADAPTIVE_MODE_SAMPLES - default: 10
        * VIDEO_SOURCE_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW - default: 16

        As an `inference` user, please use .init() method instead of constructor to instantiate objects.

        Args:
            video_reference (Union[str, int]): Either str with file or stream reference, or int representing device ID.
                A callable is also accepted - on start it is invoked without arguments and its result is used
                as the frame producer.
            buffer_size (int): size of decoding buffer
            status_update_handlers (Optional[List[Callable[[StatusUpdate], None]]]): List of handlers for status updates
            buffer_filling_strategy (Optional[BufferFillingStrategy]): Settings for buffer filling strategy - if not
                given - automatic choice regarding source type will be applied
            buffer_consumption_strategy (Optional[BufferConsumptionStrategy]): Settings for buffer consumption strategy,
                if not given - automatic choice regarding source type will be applied
            adaptive_mode_stream_pace_tolerance (float): Maximum deviation between frames grabbing pace and stream pace
                that will not trigger adaptive mode frame drop
            adaptive_mode_reader_pace_tolerance (float): Maximum deviation between decoding pace and stream consumption
                pace that will not trigger adaptive mode frame drop
            minimum_adaptive_mode_samples (int): Minimal number of frames to be used to establish actual pace of
                processing, before adaptive mode can drop any frame
            maximum_adaptive_frames_dropped_in_row (int): Maximum number of frames dropped in row due to application of
                adaptive strategy
            video_source_properties (Optional[dict[str, float]]): Optional dictionary with video source properties
                corresponding to OpenCV VideoCapture properties cv2.CAP_PROP_* to set values for the video source.
            source_id (Optional[int]): Optional identifier of video source - mainly useful to recognise specific source
                when multiple ones are in use. Identifier will be added to emitted frames and updates. It is advised
                to keep it unique within all sources in use.
            desired_fps (Optional[Union[float, int]]): Optional value forwarded unchanged to `VideoConsumer.init(...)`.
                Presumably the target frame rate for consumption - the exact effect is defined by `VideoConsumer`
                and should be confirmed there.

        Returns: Instance of `VideoSource` class
        """
        # The buffer is shared between the decoding thread (producer) and the reader (consumer).
        frames_buffer = Queue(maxsize=buffer_size)
        if status_update_handlers is None:
            status_update_handlers = []
        # The very same handlers list is handed to both the consumer and the source.
        video_consumer = VideoConsumer.init(
            buffer_filling_strategy=buffer_filling_strategy,
            adaptive_mode_stream_pace_tolerance=adaptive_mode_stream_pace_tolerance,
            adaptive_mode_reader_pace_tolerance=adaptive_mode_reader_pace_tolerance,
            minimum_adaptive_mode_samples=minimum_adaptive_mode_samples,
            maximum_adaptive_frames_dropped_in_row=maximum_adaptive_frames_dropped_in_row,
            status_update_handlers=status_update_handlers,
            desired_fps=desired_fps,
        )
        return cls(
            stream_reference=video_reference,
            frames_buffer=frames_buffer,
            status_update_handlers=status_update_handlers,
            buffer_consumption_strategy=buffer_consumption_strategy,
            video_consumer=video_consumer,
            video_source_properties=video_source_properties,
            source_id=source_id,
        )

    def __init__(
        self,
        stream_reference: VideoSourceIdentifier,
        frames_buffer: Queue,
        status_update_handlers: List[Callable[[StatusUpdate], None]],
        buffer_consumption_strategy: Optional[BufferConsumptionStrategy],
        video_consumer: "VideoConsumer",
        video_source_properties: Optional[Dict[str, float]],
        source_id: Optional[int],
    ):
        """
        Low-level constructor that only stores the collaborators and sets the initial state.
        No connection to the source is made here.

        Args:
            stream_reference: Reference of the source (or a callable producing the frame producer).
            frames_buffer: Queue shared between the decoding thread and the reader.
            status_update_handlers: Handlers notified about status updates.
            buffer_consumption_strategy: Consumption strategy; when None it is chosen at start,
                depending on whether the source is a file or a stream.
            video_consumer: Object responsible for grabbing / decoding frames into the buffer.
            video_source_properties: Properties to apply on the source once opened; None is
                stored as an empty dict.
            source_id: Optional identifier of this source.
        """
        # Identity and configuration of the source.
        self._source_id = source_id
        self._stream_reference = stream_reference
        self._video_source_properties = (
            video_source_properties if video_source_properties else {}
        )
        self._buffer_consumption_strategy = buffer_consumption_strategy

        # Collaborators handed over by the factory method.
        self._video_consumer = video_consumer
        self._frames_buffer = frames_buffer
        self._status_update_handlers = status_update_handlers

        # Runtime state - filled in once the source gets started.
        self._state = StreamState.NOT_STARTED
        self._video: Optional[VideoFrameProducer] = None
        self._source_properties: Optional[SourceProperties] = None
        self._stream_consumption_thread: Optional[Thread] = None

        # Synchronisation primitives and flags steering the decoding thread.
        self._state_change_lock = Lock()
        self._playback_allowed = Event()
        self._frames_buffering_allowed = True

        # Values cached lazily on frame read-out.
        self._last_frame_timestamp: int = time.time_ns()
        self._fps: Optional[float] = None
        self._is_file: Optional[bool] = None

    @property
    def source_id(self) -> Optional[int]:
        """Identifier given to this source at construction time (None if not provided)."""
        return self._source_id

    @lock_state_transition
    def restart(
        self, wait_on_frames_consumption: bool = True, purge_frames_buffer: bool = False
    ) -> None:
        """
        Method to restart source consumption. Eligible to be used in states:
        [MUTED, RUNNING, PAUSED, ENDED, ERROR].
        End state:
        * INITIALISING - that should change into RUNNING once first frame is ready to be grabbed
        * ERROR - if it was not possible to connect with source

        Thread safe - only one transition of states possible at the time.

        Args:
            wait_on_frames_consumption (bool): Flag telling if all frames from buffer must be consumed before
                completion of this operation.
            purge_frames_buffer (bool): Forwarded to the termination step - when True, the frames buffer is
                read with `purge=True` before the consumption thread is joined (presumably discarding frames
                that are still buffered - confirm against `get_from_queue`).

        Returns: None
        Throws:
            * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
            * SourceConnectionError: if source cannot be connected
        """
        # Guard: refuse the transition unless the current state allows RESTART.
        if self._state not in RESTART_ELIGIBLE_STATES:
            raise StreamOperationNotAllowedError(
                f"Could not RESTART stream in state: {self._state}"
            )
        self._restart(
            wait_on_frames_consumption=wait_on_frames_consumption,
            purge_frames_buffer=purge_frames_buffer,
        )

    @lock_state_transition
    def start(self) -> None:
        """
        Method to be used to start source consumption. Eligible to be used in states:
        [NOT_STARTED, ENDED, (RESTARTING - which is internal state only)]
        End state:
        * INITIALISING - that should change into RUNNING once first frame is ready to be grabbed
        * ERROR - if it was not possible to connect with source

        Thread safe - only one transition of states possible at the time.

        Returns: None
        Throws:
            * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
            * SourceConnectionError: if source cannot be connected
        """
        # Guard: refuse the transition unless the current state allows START.
        if self._state not in START_ELIGIBLE_STATES:
            raise StreamOperationNotAllowedError(
                f"Could not START stream in state: {self._state}"
            )
        self._start()

    @lock_state_transition
    def terminate(
        self, wait_on_frames_consumption: bool = True, purge_frames_buffer: bool = False
    ) -> None:
        """
        Method to be used to terminate source consumption. Eligible to be used in states:
        [MUTED, RUNNING, PAUSED, ENDED, ERROR, (RESTARTING - which is internal state only)]
        End state:
        * ENDED - indicating success of the process
        * ERROR - if error with processing occurred

        Must be used to properly dispose resources at the end.

        Thread safe - only one transition of states possible at the time.

        Args:
            wait_on_frames_consumption (bool): Flag telling if all frames from buffer must be consumed before
                completion of this operation.
            purge_frames_buffer (bool): When True, the frames buffer is read with `purge=True` before the
                consumption thread is joined (presumably discarding frames that are still buffered -
                confirm against `get_from_queue`).

        Returns: None
        Throws:
            * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
        """
        # Guard: refuse the transition unless the current state allows TERMINATE.
        if self._state not in TERMINATE_ELIGIBLE_STATES:
            raise StreamOperationNotAllowedError(
                f"Could not TERMINATE stream in state: {self._state}"
            )
        self._terminate(
            wait_on_frames_consumption=wait_on_frames_consumption,
            purge_frames_buffer=purge_frames_buffer,
        )

    @lock_state_transition
    def pause(self) -> None:
        """
        Method to be used to pause source consumption. During pause - no new frames are consumed.
        Pausing an on-line stream for too long may cause the stream to disconnect.
        Eligible to be used in states:
        [RUNNING]
        End state:
        * PAUSED

        Thread safe - only one transition of states possible at the time.

        Returns: None
        Throws:
            * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
        """
        # Guard: refuse the transition unless the current state allows PAUSE.
        if self._state not in PAUSE_ELIGIBLE_STATES:
            raise StreamOperationNotAllowedError(
                f"Could not PAUSE stream in state: {self._state}"
            )
        self._pause()

    @lock_state_transition
    def mute(self) -> None:
        """
        Method to be used to mute source consumption. Muting is an equivalent of pause for a stream - frames
        grabbing is not put on hold, only decoding and buffering of new frames is disallowed - causing
        intermediate frames to be dropped. May be also used against files, although arguably less useful.
        Eligible to be used in states:
        [RUNNING]
        End state:
        * MUTED

        Thread safe - only one transition of states possible at the time.

        Returns: None
        Throws:
            * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
        """
        # Guard: refuse the transition unless the current state allows MUTE.
        if self._state not in MUTE_ELIGIBLE_STATES:
            raise StreamOperationNotAllowedError(
                f"Could not MUTE stream in state: {self._state}"
            )
        self._mute()

    @lock_state_transition
    def resume(self) -> None:
        """
        Method to recover from pause or mute into running state.
        Eligible to be used in states:
        [PAUSED, MUTED]
        End state:
        * RUNNING

        Thread safe - only one transition of states possible at the time.

        Returns: None
        Throws:
            * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
        """
        # Guard: refuse the transition unless the current state allows RESUME.
        if self._state not in RESUME_ELIGIBLE_STATES:
            raise StreamOperationNotAllowedError(
                f"Could not RESUME stream in state: {self._state}"
            )
        self._resume()

    def get_state(self) -> StreamState:
        """
        Method to get current state of the `VideoSource`.

        The value is read directly, without taking the state-transition lock, so it is a snapshot
        that may already be outdated when a transition is running in another thread.

        Returns: StreamState
        """
        return self._state

    def frame_ready(self) -> bool:
        """
        Method to check if decoded frame is ready for consumer

        The check only tells whether the frames buffer is non-empty. The end-of-stream marker
        is also placed in that buffer, so True may be returned when only that marker is left -
        in which case `read_frame()` raises `EndOfStreamError`.

        Returns: boolean flag indicating frame readiness
        """
        return not self._frames_buffer.empty()

    def read_frame(self, timeout: Optional[float] = None) -> Optional[VideoFrame]:
        """
        Method to be used by the consumer to get decoded source frame.

        On the first call made after source properties were discovered, the method caches
        whether the source is a file and its FPS (falling back to 30 when the declared
        value is missing, non-positive or above 1000).

        Args:
            timeout (Optional[float]): Value forwarded to the buffer read-out. Presumably the
                maximum time (in seconds) to wait for a frame, with None meaning "wait until
                a frame arrives" - confirm against `get_from_queue`.

        Returns: VideoFrame object with decoded frame and its metadata (presumably None when nothing
            could be read within `timeout`).
        Throws:
            * EndOfStreamError: when trying to get the frame from closed source.
        """
        # Snapshot once - the attribute is reset to None during restart, and it stays None
        # until the source is started. Dereferencing it unguarded used to raise AttributeError
        # when the reader was called before (or concurrently with) start-up.
        source_properties = self._source_properties
        if self._is_file is None and source_properties is not None:
            self._is_file = source_properties.is_file
            fps = source_properties.fps
            if not fps or fps <= 0 or fps > 1000:
                fps = 30  # sane default
            self._fps = fps
        video_frame: Optional[Union[VideoFrame, str]] = get_from_queue(
            queue=self._frames_buffer,
            on_successful_read=self._video_consumer.notify_frame_consumed,
            timeout=timeout,
            purge=self._buffer_consumption_strategy is BufferConsumptionStrategy.EAGER,
        )
        # The decoding thread puts POISON_PILL into the buffer once the source is exhausted.
        if video_frame == POISON_PILL:
            raise EndOfStreamError(
                "Attempted to retrieve frame from stream that already ended."
            )
        # Skip building the event payload when nobody listens.
        if video_frame is not None and self._status_update_handlers:
            send_video_source_status_update(
                severity=UpdateSeverity.DEBUG,
                event_type=FRAME_CONSUMED_EVENT,
                payload={
                    "frame_timestamp": video_frame.frame_timestamp,
                    "frame_id": video_frame.frame_id,
                    "source_id": video_frame.source_id,
                },
                status_update_handlers=self._status_update_handlers,
            )
        return video_frame

    def describe_source(self) -> SourceMetadata:
        """
        Build a snapshot describing the source: its properties, reference, buffer size,
        current state, both buffer strategies and the source identifier.

        A callable source reference is reported as its `str()` representation.

        Returns: SourceMetadata
        """
        reference = self._stream_reference
        reported_reference = str(reference) if callable(reference) else reference
        return SourceMetadata(
            source_properties=self._source_properties,
            source_reference=reported_reference,
            buffer_size=self._frames_buffer.maxsize,
            state=self._state,
            buffer_filling_strategy=self._video_consumer.buffer_filling_strategy,
            buffer_consumption_strategy=self._buffer_consumption_strategy,
            source_id=self._source_id,
        )

    def _restart(
        self, wait_on_frames_consumption: bool = True, purge_frames_buffer: bool = False
    ) -> None:
        """
        Terminate the current consumption, reset per-run state and start the source again.

        Does not validate the current state and does not take the state-transition lock itself -
        the public `restart()` wrapper is the guarded entry point.

        Args:
            wait_on_frames_consumption: Forwarded to `_terminate(...)`.
            purge_frames_buffer: Forwarded to `_terminate(...)`.
        """
        self._terminate(
            wait_on_frames_consumption=wait_on_frames_consumption,
            purge_frames_buffer=purge_frames_buffer,
        )
        self._change_state(target_state=StreamState.RESTARTING)
        # Fresh (unset) event and default flags, so the new run starts from a clean slate.
        self._playback_allowed = Event()
        self._frames_buffering_allowed = True
        # Drop references to the previous producer and its properties - `_start()` recreates both.
        self._video: Optional[VideoFrameProducer] = None
        self._source_properties: Optional[SourceProperties] = None
        self._start()

    def _start(self) -> None:
        """
        Connect to the source, discover its properties and launch the consumption thread.

        Does not validate the current state - the public `start()` wrapper is the guarded entry point.

        Throws:
            * SourceConnectionError: if the created producer reports it is not opened
              (state is switched to ERROR beforehand).
        """
        self._change_state(target_state=StreamState.INITIALISING)
        # Pick the frame producer: a user-supplied factory, the built-in test pattern, or OpenCV-based one.
        if callable(self._stream_reference):
            self._video = self._stream_reference()
        elif _is_test_pattern_reference(self._stream_reference):
            # Imported here rather than at module level.
            from inference.core.interfaces.camera.test_pattern_producer import (
                TestPatternStreamProducer,
            )

            self._video = TestPatternStreamProducer()
        else:
            self._video = CV2VideoFrameProducer(self._stream_reference)
        if not self._video.isOpened():
            self._change_state(target_state=StreamState.ERROR)
            raise SourceConnectionError(
                f"Cannot connect to video source under reference: {self._stream_reference}"
            )
        # Apply requested properties first, then read back what the source actually reports.
        self._video.initialize_source_properties(self._video_source_properties)
        self._source_properties = self._video.discover_source_properties()
        self._video_consumer.reset(source_properties=self._source_properties)
        # A consumption strategy given explicitly is kept; otherwise it depends on the source type.
        if self._source_properties.is_file:
            self._set_file_mode_consumption_strategies()
        else:
            self._set_stream_mode_consumption_strategies()
        # Allow playback before the thread starts, so it does not block on its first wait.
        self._playback_allowed.set()
        self._stream_consumption_thread = Thread(target=self._consume_video)
        self._stream_consumption_thread.start()

    def _terminate(
        self, wait_on_frames_consumption: bool, purge_frames_buffer: bool
    ) -> None:
        """
        Stop the consumption thread and move the source into its final state.

        Does not validate the current state - the public `terminate()` wrapper is the guarded entry point.

        Args:
            wait_on_frames_consumption: When True, blocks on `Queue.join()` of the frames buffer,
                i.e. until every buffered item has been marked as processed.
            purge_frames_buffer: When True, the frames buffer is read with `purge=True` before the
                consumption thread is joined (presumably discarding buffered frames - confirm
                against `get_from_queue`).
        """
        # Resume first, so that a paused consumption thread is not left blocked on `_playback_allowed`.
        if self._state in RESUME_ELIGIBLE_STATES:
            self._resume()
        previous_state = self._state
        # The consumption loop checks for TERMINATING and breaks out.
        self._change_state(target_state=StreamState.TERMINATING)
        if purge_frames_buffer:
            _ = get_from_queue(queue=self._frames_buffer, timeout=0.0, purge=True)
        if self._stream_consumption_thread is not None:
            self._stream_consumption_thread.join()
        if wait_on_frames_consumption:
            self._frames_buffer.join()
        # When termination was requested in ERROR state, ENDED is not forced here.
        if previous_state is not StreamState.ERROR:
            self._change_state(target_state=StreamState.ENDED)

    def _pause(self) -> None:
        """Clear the playback event (the consumption loop waits on it) and switch state to PAUSED."""
        self._playback_allowed.clear()
        self._change_state(target_state=StreamState.PAUSED)

    def _mute(self) -> None:
        """Disallow buffering of frames (flag passed to the video consumer) and switch state to MUTED."""
        self._frames_buffering_allowed = False
        self._change_state(target_state=StreamState.MUTED)

    def _resume(self) -> None:
        """
        Switch state to RUNNING and undo whatever put the source on hold:
        * after PAUSED - the consumption pace is reset and playback is allowed again,
        * after MUTED - buffering of frames is allowed again.
        """
        state_before_resume = self._state
        self._change_state(target_state=StreamState.RUNNING)
        if state_before_resume is StreamState.PAUSED:
            self._video_consumer.reset_stream_consumption_pace()
            self._playback_allowed.set()
        elif state_before_resume is StreamState.MUTED:
            self._frames_buffering_allowed = True

    def _set_file_mode_consumption_strategies(self) -> None:
        """Default the consumption strategy to LAZY for file sources, unless one was given explicitly."""
        if self._buffer_consumption_strategy is None:
            self._buffer_consumption_strategy = BufferConsumptionStrategy.LAZY

    def _set_stream_mode_consumption_strategies(self) -> None:
        """Default the consumption strategy to EAGER for stream sources, unless one was given explicitly."""
        if self._buffer_consumption_strategy is None:
            self._buffer_consumption_strategy = BufferConsumptionStrategy.EAGER

    def _consume_video(self) -> None:
        """
        Body of the consumption thread: keeps asking the video consumer to consume frames
        into the buffer until the source closes, consumption fails or termination is requested.

        On a regular finish, the end-of-stream marker is put into the buffer, the video is released
        and state becomes ENDED. Any exception is caught, reported as an ERROR status update and
        logged - it is not propagated out of the thread.
        """
        send_video_source_status_update(
            severity=UpdateSeverity.INFO,
            event_type=VIDEO_CONSUMPTION_STARTED_EVENT,
            status_update_handlers=self._status_update_handlers,
            payload={"source_id": self._source_id},
        )
        logger.info(f"Video consumption started")
        try:
            # Termination may have been requested before this thread got to run - do not override it.
            if self._state is not StreamState.TERMINATING:
                self._change_state(target_state=StreamState.RUNNING)
            declared_source_fps, is_video_file = None, None
            if self._source_properties is not None:
                declared_source_fps = self._source_properties.fps
                is_video_file = self._source_properties.is_file
            while self._video.isOpened():
                if self._state is StreamState.TERMINATING:
                    break
                # Blocks here while the source is paused (the event is cleared by `_pause()`).
                self._playback_allowed.wait()
                success = self._video_consumer.consume_frame(
                    video=self._video,
                    declared_source_fps=declared_source_fps,
                    is_source_video_file=is_video_file,
                    buffer=self._frames_buffer,
                    frames_buffering_allowed=self._frames_buffering_allowed,
                    source_id=self._source_id,
                )
                if not success:
                    break
            # End-of-stream marker - `read_frame()` turns it into EndOfStreamError.
            self._frames_buffer.put(POISON_PILL)
            self._video.release()
            self._change_state(target_state=StreamState.ENDED)
            send_video_source_status_update(
                severity=UpdateSeverity.INFO,
                event_type=VIDEO_CONSUMPTION_FINISHED_EVENT,
                status_update_handlers=self._status_update_handlers,
                payload={"source_id": self._source_id},
            )
            logger.info(f"Video consumption finished")
        except Exception as error:
            # NOTE(review): on this path the end-of-stream marker is not enqueued and
            # `self._video.release()` is not called - confirm that this is intended.
            self._change_state(target_state=StreamState.ERROR)
            payload = {
                "source_id": self._source_id,
                "error_type": error.__class__.__name__,
                "error_message": str(error),
                "error_context": "stream_consumer_thread",
            }
            send_video_source_status_update(
                severity=UpdateSeverity.ERROR,
                event_type=SOURCE_ERROR_EVENT,
                payload=payload,
                status_update_handlers=self._status_update_handlers,
            )
            logger.exception("Encountered error in video consumption thread")

    def _change_state(self, target_state: StreamState) -> None:
        """
        Set the new state and emit an INFO status update describing the transition
        (previous state, new state and source identifier).

        Args:
            target_state: State to switch into.
        """
        state_before_change = self._state
        self._state = target_state
        send_video_source_status_update(
            severity=UpdateSeverity.INFO,
            event_type=SOURCE_STATE_UPDATE_EVENT,
            payload={
                "previous_state": state_before_change,
                "new_state": target_state,
                "source_id": self._source_id,
            },
            status_update_handlers=self._status_update_handlers,
        )

    def __iter__(self) -> "VideoSource":
        """Return the source itself - it is its own iterator (see `__next__`)."""
        return self

    def __next__(self) -> VideoFrame:
        """
        Iterator protocol support - lets a `VideoSource` be used directly in a `for` loop.
        Each step reads one frame; the end of the stream finishes the iteration.

        Returns: VideoFrame

        Example:
            ```python
            source = VideoSource.init(video_reference="./some.mp4")
            source.start()

            for frame in source:
                 pass
            ```
        """
        try:
            next_frame = self.read_frame()
        except EndOfStreamError:
            # Translate the end-of-stream signal into the iterator-protocol one.
            raise StopIteration()
        else:
            return next_frame
Methods:
__next__
__next__()

Method that lets a VideoSource be iterated directly (for example in a for loop) to read frames one by one

Returns: VideoFrame

Example
source = VideoSource.init(video_reference="./some.mp4")
source.start()

for frame in source:
     pass
Source code in inference/core/interfaces/camera/video_source.py
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
def __next__(self) -> VideoFrame:
    """
    Iterator protocol support - lets a `VideoSource` be used directly in a `for` loop.
    Each step reads one frame; the end of the stream finishes the iteration.

    Returns: VideoFrame

    Example:
        ```python
        source = VideoSource.init(video_reference="./some.mp4")
        source.start()

        for frame in source:
             pass
        ```
    """
    try:
        next_frame = self.read_frame()
    except EndOfStreamError:
        # Translate the end-of-stream signal into the iterator-protocol one.
        raise StopIteration()
    else:
        return next_frame
frame_ready
frame_ready()

Method to check if decoded frame is ready for consumer

Returns: boolean flag indicating frame readiness

Source code in inference/core/interfaces/camera/video_source.py
537
538
539
540
541
542
543
def frame_ready(self) -> bool:
    """
    Method to check if decoded frame is ready for consumer

    The check only tells whether the frames buffer (`self._frames_buffer`) is non-empty
    at the moment of the call.

    Returns: boolean flag indicating frame readiness
    """
    return not self._frames_buffer.empty()
get_state
get_state()

Method to get current state of the VideoSource

Returns: StreamState

Source code in inference/core/interfaces/camera/video_source.py
529
530
531
532
533
534
535
def get_state(self) -> StreamState:
    """
    Method to get current state of the `VideoSource`

    The value of `self._state` is returned as-is.
    NOTE(review): this accessor is not decorated with `lock_state_transition`, so the
    state may presumably change right after the read - confirm if callers rely on it.

    Returns: StreamState
    """
    return self._state
init classmethod
init(
    video_reference,
    buffer_size=DEFAULT_BUFFER_SIZE,
    status_update_handlers=None,
    buffer_filling_strategy=None,
    buffer_consumption_strategy=None,
    adaptive_mode_stream_pace_tolerance=DEFAULT_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE,
    adaptive_mode_reader_pace_tolerance=DEFAULT_ADAPTIVE_MODE_READER_PACE_TOLERANCE,
    minimum_adaptive_mode_samples=DEFAULT_MINIMUM_ADAPTIVE_MODE_SAMPLES,
    maximum_adaptive_frames_dropped_in_row=DEFAULT_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW,
    video_source_properties=None,
    source_id=None,
    desired_fps=None,
)

This class is meant to represent abstraction over video sources - both video files and on-line streams that are possible to be consumed and used by other components of inference library.

Before digging into details of the class behaviour, it is advised to familiarise with the following concepts and implementation assumptions:

  1. Video file can be accessed from local (or remote) storage by the consumer in a pace dictated by its processing capabilities. If processing is faster than the frame rate of video, operations may be executed in a time shorter than the time of video playback. In the opposite case - consumer may freely decode and process frames in its own pace, without risk for failures due to temporal dependencies of processing - this is classical offline processing example.
  2. Video streams, on the other hand, usually need to be consumed in a pace near to their frame-rate - in other words - this is on-line processing example. Consumer being faster than incoming stream frames cannot utilise its resources to the full extent as not-yet-delivered data would be needed. Slow consumer, however, may not be able to process everything on time and to keep up with the pace of stream - some frames would need to be dropped. Otherwise - over time, consumer could go out of sync with the stream causing decoding failures or unpredictable behavior.

To fit both types of video sources, VideoSource introduces the concept of buffered decoding of the video stream (similar to YouTube, where the player buffers frames that are about to be displayed). The way the buffer is filled and consumed dictates the behavior of VideoSource.

Starting from BufferFillingStrategy - we have 3 basic options: * WAIT: in case of slow video consumption, when buffer is full - VideoSource will wait for the empty spot in buffer before next frame will be processed - this is suitable in cases when we want to ensure EACH FRAME of the video to be processed * DROP_OLDEST: when buffer is full, the frame that sits there for the longest time will be dropped - this is suitable for cases when we want to process the most recent frames possible * DROP_LATEST: when buffer is full, the newly decoded frame is dropped - useful in cases when it is expected to have processing performance drops, but we would like to consume portions of video that are locally smooth - but this is probably the least common use-case.

On top of that - there are two ADAPTIVE strategies: ADAPTIVE_DROP_OLDEST and ADAPTIVE_DROP_LATEST, which are equivalent to DROP_OLDEST and DROP_LATEST with adaptive decoding feature enabled. The notion of that mode will be described later.

Naturally, decoded frames must also be consumed. VideoSource provides a handy interface for reading a video source frames by a SINGLE consumer. Consumption strategy can also be dictated via BufferConsumptionStrategy: * LAZY - consume all the frames from decoding buffer one-by-one * EAGER - at each readout - take all frames already buffered, drop all of them apart from the most recent

In consequence - there are various combinations of BufferFillingStrategy and BufferConsumptionStrategy. The most popular would be: * BufferFillingStrategy.WAIT and BufferConsumptionStrategy.LAZY - to always decode and process each and every frame of the source (useful while processing video files - and default behaviour enforced by inference if there is no explicit configuration) * BufferFillingStrategy.DROP_OLDEST and BufferConsumptionStrategy.EAGER - to always process the most recent frames of source (useful while processing video streams when low latency [real-time experience] is required - ADAPTIVE version of this is default for streams)

ADAPTIVE strategies were introduced to handle corner-cases, when consumer hardware is not capable to consume video stream and process frames at the same time (for instance - Nvidia Jetson devices running processing against hi-res streams with high FPS ratio). It acts with buffer in nearly the same way as DROP_OLDEST and DROP_LATEST strategies, but there are two more conditions that may influence frame drop: * announced rate of source - which in fact dictate the pace of frames grabbing from incoming stream that MUST be met by consumer to avoid strange decoding issues causing decoder to fail - if the pace of frame grabbing deviates too much - decoding will be postponed, and frames dropped to grab next ones sooner * consumption rate - in resource constraints environment, not only decoding is problematic from the performance perspective - but also heavy processing. If consumer is not quick enough - allocating more useful resources for decoding frames that may never be processed is a waste. That's why - if decoding happens more frequently than consumption of frame - ADAPTIVE mode causes decoding to be done in a slower pace and more frames are just grabbed and dropped on the floor. ADAPTIVE mode increases latency slightly, but may be the only way to operate in some cases. Behaviour of adaptive mode, including the maximum acceptable deviations of frames grabbing pace from source, reader pace and maximum number of consecutive frames dropped in ADAPTIVE mode are configurable by clients, with reasonable defaults being set.

VideoSource emits events regarding its activity - which can be intercepted by custom handlers. Take into account that they are always executed in context of thread invoking them (and should be fast to complete, otherwise may block the flow of stream consumption). All errors raised will be emitted as logger warnings only.

VideoSource implementation is inherently multithreaded, with one thread decoding the video and another one consuming it and manipulating the source state. The user-facing interface is thread-safe, although the stream is meant to be consumed by a single thread only.

ENV variables involved: * VIDEO_SOURCE_BUFFER_SIZE - default: 64 * VIDEO_SOURCE_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE - default: 0.1 * VIDEO_SOURCE_ADAPTIVE_MODE_READER_PACE_TOLERANCE - default: 5.0 * VIDEO_SOURCE_MINIMUM_ADAPTIVE_MODE_SAMPLES - default: 10 * VIDEO_SOURCE_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW - default: 16

As an inference user, please use .init() method instead of constructor to instantiate objects.

Parameters:

Name Type Description Default
video_reference Union[str, int]

Either str with file or stream reference, or int representing device ID

required
buffer_size int

size of decoding buffer

DEFAULT_BUFFER_SIZE
status_update_handlers Optional[List[Callable[[StatusUpdate], None]]]

List of handlers for status updates

None
buffer_filling_strategy Optional[BufferFillingStrategy]

Settings for buffer filling strategy - if not given - automatic choice regarding source type will be applied

None
buffer_consumption_strategy Optional[BufferConsumptionStrategy]

Settings for buffer consumption strategy, if not given - automatic choice regarding source type will be applied

None
adaptive_mode_stream_pace_tolerance float

Maximum deviation between frames grabbing pace and stream pace that will not trigger adaptive mode frame drop

DEFAULT_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE
adaptive_mode_reader_pace_tolerance float

Maximum deviation between decoding pace and stream consumption pace that will not trigger adaptive mode frame drop

DEFAULT_ADAPTIVE_MODE_READER_PACE_TOLERANCE
minimum_adaptive_mode_samples int

Minimal number of frames to be used to establish actual pace of processing, before adaptive mode can drop any frame

DEFAULT_MINIMUM_ADAPTIVE_MODE_SAMPLES
maximum_adaptive_frames_dropped_in_row int

Maximum number of frames dropped in row due to application of adaptive strategy

DEFAULT_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW
video_source_properties Optional[dict[str, float]]

Optional dictionary with video source properties corresponding to OpenCV VideoCapture properties cv2.CAP_PROP_* to set values for the video source.

None
source_id Optional[int]

Optional identifier of video source - mainly useful to recognise specific source when multiple ones are in use. Identifier will be added to emitted frames and updates. It is advised to keep it unique within all sources in use.

None
Source code in inference/core/interfaces/camera/video_source.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
@classmethod
def init(
    cls,
    video_reference: VideoSourceIdentifier,
    buffer_size: int = DEFAULT_BUFFER_SIZE,
    status_update_handlers: Optional[List[Callable[[StatusUpdate], None]]] = None,
    buffer_filling_strategy: Optional[BufferFillingStrategy] = None,
    buffer_consumption_strategy: Optional[BufferConsumptionStrategy] = None,
    adaptive_mode_stream_pace_tolerance: float = DEFAULT_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE,
    adaptive_mode_reader_pace_tolerance: float = DEFAULT_ADAPTIVE_MODE_READER_PACE_TOLERANCE,
    minimum_adaptive_mode_samples: int = DEFAULT_MINIMUM_ADAPTIVE_MODE_SAMPLES,
    maximum_adaptive_frames_dropped_in_row: int = DEFAULT_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW,
    video_source_properties: Optional[Dict[str, float]] = None,
    source_id: Optional[int] = None,
    desired_fps: Optional[Union[float, int]] = None,
):
    """
    Factory method assembling a `VideoSource` together with its frames buffer and `VideoConsumer`.

    `VideoSource` is an abstraction over video sources - both video files and on-line streams - that
    can be consumed and used by other components of `inference` library.

    Before digging into details of the class behaviour, it is advised to get familiar with the following
    concepts and implementation assumptions:

    1. Video file can be accessed from local (or remote) storage by the consumer at a pace dictated by
        its processing capabilities. If processing is faster than the frame rate of video, operations
        may be executed in a time shorter than the time of video playback. In the opposite case - consumer
        may freely decode and process frames at its own pace, without risk of failures due to temporal
        dependencies of processing - this is a classical offline processing example.
    2. Video streams, on the other hand, usually need to be consumed at a pace close to their frame-rate -
        in other words - this is an on-line processing example. Consumer being faster than incoming stream
        frames cannot utilise its resources to the full extent, as not-yet-delivered data would be needed.
        Slow consumer, however, may not be able to process everything on time and keep up with the pace
        of stream - some frames would need to be dropped. Otherwise - over time, consumer could go out of
        sync with the stream, causing decoding failures or unpredictable behavior.

    To fit both types of video sources, `VideoSource` introduces the concept of buffered decoding of
    video stream (similar to YouTube, where the player buffers frames that are about to be displayed).
    The way the buffer is filled and consumed dictates the behavior of `VideoSource`.

    Starting from `BufferFillingStrategy` - we have 3 basic options:
    * WAIT: in case of slow video consumption, when buffer is full - `VideoSource` will wait for
    an empty spot in buffer before next frame is processed - this is suitable in cases when
    we want to ensure EACH FRAME of the video is processed
    * DROP_OLDEST: when buffer is full, the frame that sits there for the longest time will be dropped -
    this is suitable for cases when we want to process the most recent frames possible
    * DROP_LATEST: when buffer is full, the newly decoded frame is dropped - useful in cases when
    processing performance drops are expected, but we would like to consume portions of
    video that are locally smooth - this is probably the least common use-case.

    On top of that - there are two ADAPTIVE strategies: ADAPTIVE_DROP_OLDEST and ADAPTIVE_DROP_LATEST,
    which are equivalent to DROP_OLDEST and DROP_LATEST with adaptive decoding feature enabled. The notion
    of that mode is described later.

    Naturally, decoded frames must also be consumed. `VideoSource` provides a handy interface for reading
    video source frames by a SINGLE consumer. Consumption strategy can also be dictated via
    `BufferConsumptionStrategy`:
    * LAZY - consume all the frames from decoding buffer one-by-one
    * EAGER - at each readout - take all frames already buffered, drop all of them apart from the most recent

    In consequence - there are various combinations of `BufferFillingStrategy` and `BufferConsumptionStrategy`.
    The most popular would be:
    * `BufferFillingStrategy.WAIT` and `BufferConsumptionStrategy.LAZY` - to always decode and process each and
        every frame of the source (useful while processing video files - and default behaviour enforced by
        `inference` if there is no explicit configuration)
    * `BufferFillingStrategy.DROP_OLDEST` and `BufferConsumptionStrategy.EAGER` - to always process the most
        recent frames of source (useful while processing video streams when low latency [real-time experience]
        is required - ADAPTIVE version of this is default for streams)

    ADAPTIVE strategies were introduced to handle corner-cases, when consumer hardware is not capable of consuming
    video stream and processing frames at the same time (for instance - Nvidia Jetson devices running processing
    against hi-res streams with high FPS ratio). They act on the buffer in nearly the same way as `DROP_OLDEST`
    and `DROP_LATEST` strategies, but there are two more conditions that may influence frame drop:
    * announced rate of source - which in fact dictates the pace of frames grabbing from incoming stream that
    MUST be met by consumer to avoid strange decoding issues causing decoder to fail - if the pace of frame grabbing
    deviates too much - decoding will be postponed, and frames dropped to grab next ones sooner
    * consumption rate - in resource-constrained environment, not only decoding is problematic from the performance
    perspective - but also heavy processing. If consumer is not quick enough - allocating more useful resources
    for decoding frames that may never be processed is a waste. That's why - if decoding happens more frequently
    than consumption of frames - ADAPTIVE mode causes decoding to be done at a slower pace and more frames are just
    grabbed and dropped on the floor.
    ADAPTIVE mode increases latency slightly, but may be the only way to operate in some cases.
    Behaviour of adaptive mode, including the maximum acceptable deviations of frames grabbing pace from source,
    reader pace and maximum number of consecutive frames dropped in ADAPTIVE mode are configurable by clients,
    with reasonable defaults being set.

    `VideoSource` emits events regarding its activity - which can be intercepted by custom handlers. Take
    into account that they are always executed in context of thread invoking them (and should be fast to complete,
    otherwise may block the flow of stream consumption). All errors raised will be emitted as logger warnings only.

    `VideoSource` implementation is inherently multithreaded, with one thread decoding video and another
    one consuming it and manipulating source state. Implementation of user interface is thread-safe, although
    the stream is meant to be consumed by a single thread only.

    ENV variables involved:
    * VIDEO_SOURCE_BUFFER_SIZE - default: 64
    * VIDEO_SOURCE_ADAPTIVE_MODE_STREAM_PACE_TOLERANCE - default: 0.1
    * VIDEO_SOURCE_ADAPTIVE_MODE_READER_PACE_TOLERANCE - default: 5.0
    * VIDEO_SOURCE_MINIMUM_ADAPTIVE_MODE_SAMPLES - default: 10
    * VIDEO_SOURCE_MAXIMUM_ADAPTIVE_FRAMES_DROPPED_IN_ROW - default: 16

    As an `inference` user, please use .init() method instead of constructor to instantiate objects.

    Args:
        video_reference (Union[str, int]): Either str with file or stream reference, or int representing device ID
        buffer_size (int): size of decoding buffer
        status_update_handlers (Optional[List[Callable[[StatusUpdate], None]]]): List of handlers for status
            updates - when not given, an empty list is used. The very same list object is handed both to
            the `VideoConsumer` and to the created `VideoSource`.
        buffer_filling_strategy (Optional[BufferFillingStrategy]): Settings for buffer filling strategy - if not
            given - automatic choice regarding source type will be applied
        buffer_consumption_strategy (Optional[BufferConsumptionStrategy]): Settings for buffer consumption strategy,
            if not given - automatic choice regarding source type will be applied
        adaptive_mode_stream_pace_tolerance (float): Maximum deviation between frames grabbing pace and stream pace
            that will not trigger adaptive mode frame drop
        adaptive_mode_reader_pace_tolerance (float): Maximum deviation between decoding pace and stream consumption
            pace that will not trigger adaptive mode frame drop
        minimum_adaptive_mode_samples (int): Minimal number of frames to be used to establish actual pace of
            processing, before adaptive mode can drop any frame
        maximum_adaptive_frames_dropped_in_row (int): Maximum number of frames dropped in row due to application of
            adaptive strategy
        video_source_properties (Optional[dict[str, float]]): Optional dictionary with video source properties
            corresponding to OpenCV VideoCapture properties cv2.CAP_PROP_* to set values for the video source.
        source_id (Optional[int]): Optional identifier of video source - mainly useful to recognise specific source
            when multiple ones are in use. Identifier will be added to emitted frames and updates. It is advised
            to keep it unique within all sources in use.
        desired_fps (Optional[Union[float, int]]): Value forwarded unchanged to `VideoConsumer.init(...)`.
            NOTE(review): presumably the target frame rate of frames delivery - confirm against `VideoConsumer`.

    Returns: Instance of `VideoSource` class
    """
    decoded_frames_queue = Queue(maxsize=buffer_size)
    # single handlers list is shared by the consumer and the source created below
    handlers = status_update_handlers if status_update_handlers is not None else []
    consumer = VideoConsumer.init(
        buffer_filling_strategy=buffer_filling_strategy,
        adaptive_mode_stream_pace_tolerance=adaptive_mode_stream_pace_tolerance,
        adaptive_mode_reader_pace_tolerance=adaptive_mode_reader_pace_tolerance,
        minimum_adaptive_mode_samples=minimum_adaptive_mode_samples,
        maximum_adaptive_frames_dropped_in_row=maximum_adaptive_frames_dropped_in_row,
        status_update_handlers=handlers,
        desired_fps=desired_fps,
    )
    return cls(
        stream_reference=video_reference,
        frames_buffer=decoded_frames_queue,
        status_update_handlers=handlers,
        buffer_consumption_strategy=buffer_consumption_strategy,
        video_consumer=consumer,
        video_source_properties=video_source_properties,
        source_id=source_id,
    )
mute
mute()

Method to be used to mute source consumption. Muting is an equivalent of pause for stream - where frames grabbing is not put on hold, just new frames decoding and buffering is not allowed - causing intermediate frames to be dropped. May be also used against files, although arguably less useful. Eligible to be used in states: [RUNNING] End state: * MUTED

Thread safe - only one transition of states possible at the time.

Source code in inference/core/interfaces/camera/video_source.py
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
@lock_state_transition
def mute(self) -> None:
    """
    Mute source consumption. Muting is an equivalent of pause for stream - frames grabbing
    is not put on hold, only decoding and buffering of new frames is not allowed - which causes
    intermediate frames to be dropped. May also be used against files, although arguably less useful.
    Eligible to be used in states:
    [RUNNING]
    End state:
    * MUTED

    Thread safe - only one transition of states possible at the time.

    Returns: None
    Throws:
        * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
    """
    mute_allowed = self._state in MUTE_ELIGIBLE_STATES
    if not mute_allowed:
        raise StreamOperationNotAllowedError(
            f"Could not MUTE stream in state: {self._state}"
        )
    self._mute()
pause
pause()

Method to be used to pause source consumption. During pause - no new frames are consumed. Used on on-line streams for too long may cause stream disconnection. Eligible to be used in states: [RUNNING] End state: * PAUSED

Thread safe - only one transition of states possible at the time.

Source code in inference/core/interfaces/camera/video_source.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
@lock_state_transition
def pause(self) -> None:
    """
    Pause source consumption. While paused - no new frames are consumed.
    Keeping an on-line stream paused for too long may cause stream disconnection.
    Eligible to be used in states:
    [RUNNING]
    End state:
    * PAUSED

    Thread safe - only one transition of states possible at the time.

    Returns: None
    Throws:
        * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
    """
    pause_allowed = self._state in PAUSE_ELIGIBLE_STATES
    if not pause_allowed:
        raise StreamOperationNotAllowedError(
            f"Could not PAUSE stream in state: {self._state}"
        )
    self._pause()
read_frame
read_frame(timeout=None)

Method to be used by the consumer to get decoded source frame.

Source code in inference/core/interfaces/camera/video_source.py
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
def read_frame(self, timeout: Optional[float] = None) -> Optional[VideoFrame]:
    """
    Entry point for the consumer to obtain the next decoded frame of the source.

    On the first call, source metadata is fetched via `describe_source()` to cache whether the
    source is a file and what its FPS is (falling back to 30 when reported FPS is missing,
    non-positive or above 1000). When status update handlers are registered, each frame
    returned is announced with `FRAME_CONSUMED_EVENT`.

    Args:
        timeout (Optional[float]): value passed as `timeout` to `get_from_queue(...)` reading
            the frames buffer.

    Returns: VideoFrame object with decoded frame and its metadata, or None when
        `get_from_queue(...)` did not provide any element.
    Throws:
        * EndOfStreamError: when trying to get the frame from closed source.
    """
    if self._is_file is None:
        # lazy, one-off discovery of source properties
        metadata: SourceMetadata = self.describe_source()
        self._is_file = metadata.source_properties.is_file
        self._fps = metadata.source_properties.fps
        if not self._fps or self._fps <= 0 or self._fps > 1000:
            self._fps = 30  # sane default
    # EAGER strategy asks the queue reader to skip to the most recent buffered element
    eager_consumption = (
        self._buffer_consumption_strategy is BufferConsumptionStrategy.EAGER
    )
    frame: Optional[Union[VideoFrame, str]] = get_from_queue(
        queue=self._frames_buffer,
        on_successful_read=self._video_consumer.notify_frame_consumed,
        timeout=timeout,
        purge=eager_consumption,
    )
    if frame == POISON_PILL:
        raise EndOfStreamError(
            "Attempted to retrieve frame from stream that already ended."
        )
    if frame is None or not self._status_update_handlers:
        # nothing to announce - either no frame was read or nobody listens
        return frame
    send_video_source_status_update(
        severity=UpdateSeverity.DEBUG,
        event_type=FRAME_CONSUMED_EVENT,
        payload={
            "frame_timestamp": frame.frame_timestamp,
            "frame_id": frame.frame_id,
            "source_id": frame.source_id,
        },
        status_update_handlers=self._status_update_handlers,
    )
    return frame
restart
restart(
    wait_on_frames_consumption=True,
    purge_frames_buffer=False,
)

Method to restart source consumption. Eligible to be used in states: [MUTED, RUNNING, PAUSED, ENDED, ERROR]. End state: * INITIALISING - that should change into RUNNING once first frame is ready to be grabbed * ERROR - if it was not possible to connect with source

Thread safe - only one transition of states possible at the time.

Parameters:

Name Type Description Default
wait_on_frames_consumption bool

Flag telling if all frames from buffer must be consumed before completion of this operation.

True
Source code in inference/core/interfaces/camera/video_source.py
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
@lock_state_transition
def restart(
    self, wait_on_frames_consumption: bool = True, purge_frames_buffer: bool = False
) -> None:
    """
    Restart source consumption. Eligible to be used in states:
    [MUTED, RUNNING, PAUSED, ENDED, ERROR].
    End state:
    * INITIALISING - that should change into RUNNING once first frame is ready to be grabbed
    * ERROR - if it was not possible to connect with source

    Thread safe - only one transition of states possible at the time.

    Args:
        wait_on_frames_consumption (bool): Flag telling if all frames from buffer must be consumed before
            completion of this operation.
        purge_frames_buffer (bool): Flag forwarded to `_restart(...)`.
            NOTE(review): presumably requests discarding of frames left in the buffer - confirm.

    Returns: None
    Throws:
        * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
        * SourceConnectionError: if source cannot be connected
    """
    restart_allowed = self._state in RESTART_ELIGIBLE_STATES
    if not restart_allowed:
        raise StreamOperationNotAllowedError(
            f"Could not RESTART stream in state: {self._state}"
        )
    self._restart(
        wait_on_frames_consumption=wait_on_frames_consumption,
        purge_frames_buffer=purge_frames_buffer,
    )
resume
resume()

Method to recover from pause or mute into running state. Eligible to be used in states: [PAUSED, MUTED] End state: * RUNNING

Thread safe - only one transition of states possible at the time.

Source code in inference/core/interfaces/camera/video_source.py
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
@lock_state_transition
def resume(self) -> None:
    """
    Bring the source back to running state after pause or mute.
    Eligible to be used in states:
    [PAUSED, MUTED]
    End state:
    * RUNNING

    Thread safe - only one transition of states possible at the time.

    Returns: None
    Throws:
        * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
    """
    resume_allowed = self._state in RESUME_ELIGIBLE_STATES
    if not resume_allowed:
        raise StreamOperationNotAllowedError(
            f"Could not RESUME stream in state: {self._state}"
        )
    self._resume()
start
start()

Method to be used to start source consumption. Eligible to be used in states: [NOT_STARTED, ENDED, (RESTARTING - which is internal state only)] End state: * INITIALISING - that should change into RUNNING once first frame is ready to be grabbed * ERROR - if it was not possible to connect with source

Thread safe - only one transition of states possible at the time.

Source code in inference/core/interfaces/camera/video_source.py
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
@lock_state_transition
def start(self) -> None:
    """
    Start source consumption. Eligible to be used in states:
    [NOT_STARTED, ENDED, (RESTARTING - which is internal state only)]
    End state:
    * INITIALISING - that should change into RUNNING once first frame is ready to be grabbed
    * ERROR - if it was not possible to connect with source

    Thread safe - only one transition of states possible at the time.

    Returns: None
    Throws:
        * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
        * SourceConnectionError: if source cannot be connected
    """
    start_allowed = self._state in START_ELIGIBLE_STATES
    if not start_allowed:
        raise StreamOperationNotAllowedError(
            f"Could not START stream in state: {self._state}"
        )
    self._start()
terminate
terminate(
    wait_on_frames_consumption=True,
    purge_frames_buffer=False,
)

Method to be used to terminate source consumption. Eligible to be used in states: [MUTED, RUNNING, PAUSED, ENDED, ERROR, (RESTARTING - which is internal state only)] End state: * ENDED - indicating success of the process * ERROR - if error with processing occurred

Must be used to properly dispose resources at the end.

Thread safe - only one transition of states possible at the time.

Parameters:

Name Type Description Default
wait_on_frames_consumption bool

Flag telling if all frames from buffer must be consumed before completion of this operation.

True
Source code in inference/core/interfaces/camera/video_source.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
@lock_state_transition
def terminate(
    self, wait_on_frames_consumption: bool = True, purge_frames_buffer: bool = False
) -> None:
    """
    Terminate source consumption. Eligible to be used in states:
    [MUTED, RUNNING, PAUSED, ENDED, ERROR, (RESTARTING - which is internal state only)]
    End state:
    * ENDED - indicating success of the process
    * ERROR - if error with processing occurred

    Must be used to properly dispose resources at the end.

    Thread safe - only one transition of states possible at the time.

    Args:
        wait_on_frames_consumption (bool): Flag telling if all frames from buffer must be consumed before
            completion of this operation.
        purge_frames_buffer (bool): Flag forwarded to `_terminate(...)`.
            NOTE(review): presumably requests discarding of frames left in the buffer - confirm.

    Returns: None
    Throws:
        * StreamOperationNotAllowedError: if executed in context of incorrect state of the source
    """
    terminate_allowed = self._state in TERMINATE_ELIGIBLE_STATES
    if not terminate_allowed:
        raise StreamOperationNotAllowedError(
            f"Could not TERMINATE stream in state: {self._state}"
        )
    self._terminate(
        wait_on_frames_consumption=wait_on_frames_consumption,
        purge_frames_buffer=purge_frames_buffer,
    )

Functions:

get_from_queue

get_from_queue(
    queue,
    timeout=None,
    on_successful_read=lambda: None,
    purge=False,
)

Takes an element from the queue, waiting up to timeout for the first element to appear. To skip to the very last element currently in the queue and return it, set purge to True. In that mode there is no additional wait for new elements - the purge stops as soon as the queue is empty and the last element consumed is returned. queue.task_done() and on_successful_read(...) are called for each received element.

Source code in inference/core/interfaces/camera/video_source.py
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
def get_from_queue(
    queue: Queue,
    timeout: Optional[float] = None,
    on_successful_read: Callable[[], None] = lambda: None,
    purge: bool = False,
) -> Optional[Any]:
    """
    Take an element from the queue, optionally draining it down to the newest element.

    When the queue is empty, or when `purge` is False, the function waits up to `timeout`
    seconds (indefinitely when `timeout` is None) for one element. When `purge` is True,
    every element that is immediately available is then consumed without any further
    waiting, and the last one consumed is returned.
    queue.task_done() and on_successful_read(...) are called once per received element.

    Args:
        queue: Queue to read from.
        timeout: Maximum number of seconds to wait for the first element; None waits forever.
        on_successful_read: Zero-argument callback invoked after each element is received.
        purge: If True, drain the queue and return the last element consumed.

    Returns:
        The element read (the last one when purging), or None if nothing was received.
    """
    result = None
    if queue.empty() or not purge:
        try:
            result = queue.get(timeout=timeout)
        except Empty:
            pass
        else:
            queue.task_done()
            on_successful_read()
    # Drain with non-blocking reads: a separate `empty()` check followed by a blocking
    # `get()` could hang forever if another consumer takes the element in between.
    while purge:
        try:
            result = queue.get_nowait()
        except Empty:
            break
        queue.task_done()
        on_successful_read()
    return result

core/interfaces/http/builder

inference.core.interfaces.http.builder.routes

Functions:

builder_browse async

builder_browse()

Loads the main builder UI (editor.html). Injects the CSRF token and BUILDER_ORIGIN so the client can parse them on page load.

Source code in inference/core/interfaces/http/builder/routes.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
@router.get(
    "",
    summary="Workflow Builder List",
    description="Loads the list of Workflows available for editing",
)
@with_route_exceptions_async
async def builder_browse():
    """
    Serve the main builder UI.

    Reads editor.html from the directory containing this module and substitutes
    the {{BUILDER_ORIGIN}} and {{CSRF}} placeholders, so the client can parse
    both values on page load.
    """
    page = (Path(__file__).parent / "editor.html").read_text(encoding="utf-8")
    substitutions = (
        ("{{BUILDER_ORIGIN}}", BUILDER_ORIGIN),
        ("{{CSRF}}", csrf),
    )
    for placeholder, value in substitutions:
        page = page.replace(placeholder, value)
    return HTMLResponse(page)

builder_edit async

builder_edit(workflow_id)

Loads a specific workflow for editing.

Parameters:

Name Type Description Default
workflow_id str

The ID of the workflow to be edited.

required
Source code in inference/core/interfaces/http/builder/routes.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@router.get(
    "/edit/{workflow_id}",
    summary="Workflow Builder",
    description="Loads a specific workflow for editing",
)
@with_route_exceptions_async
async def builder_edit(workflow_id: str):
    """
    Serve the builder UI for editing a specific workflow.

    The response is the editor.html template with the {{BUILDER_ORIGIN}} and
    {{CSRF}} placeholders filled in.

    Args:
        workflow_id (str): The ID of the workflow to be edited. It is not used
            when rendering the page here.
    """
    template_path = Path(__file__).parent.joinpath("editor.html")
    html = template_path.read_text(encoding="utf-8")
    html = html.replace("{{BUILDER_ORIGIN}}", BUILDER_ORIGIN).replace(
        "{{CSRF}}", csrf
    )
    return HTMLResponse(html)

builder_maybe_redirect async

builder_maybe_redirect(workflow_id)

If the workflow_id.json file exists, redirect to /build/edit/{workflow_id}. Otherwise, redirect back to /build.

Source code in inference/core/interfaces/http/builder/routes.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
@router.get("/{workflow_id}", include_in_schema=False)
@with_route_exceptions_async
async def builder_maybe_redirect(workflow_id: str):
    """
    If the workflow_id.json file exists, redirect to /build/edit/{workflow_id}.
    Otherwise, redirect back to /build.

    Workflow files are stored under the SHA-256 hex digest of the workflow ID,
    so the lookup hashes the ID before checking the filesystem.

    Args:
        workflow_id (str): Workflow ID taken from the URL path.
    """
    # fullmatch is used instead of match(r"^...$"): "$" also matches just before
    # a trailing newline, which would let an ID such as "abc\n" pass validation.
    if not re.fullmatch(r"[\w\-]+", workflow_id):
        return RedirectResponse(url="/build", status_code=302)

    workflow_hash = sha256(workflow_id.encode()).hexdigest()
    file_path = workflow_local_dir / f"{workflow_hash}.json"
    if file_path.exists():
        return RedirectResponse(url=f"/build/edit/{workflow_id}", status_code=302)
    return RedirectResponse(url="/build", status_code=302)

builder_redirect async

builder_redirect()

If user hits /build/ with trailing slash, redirect to /build

Source code in inference/core/interfaces/http/builder/routes.py
80
81
82
83
84
85
@router.get("/", include_in_schema=False)
async def builder_redirect():
    """
    Redirect requests for /build/ (with trailing slash) to /build.
    """
    canonical_redirect = RedirectResponse(url="/build", status_code=302)
    return canonical_redirect

create_or_overwrite_workflow async

create_or_overwrite_workflow(
    workflow_id, request_body=Body(...)
)

Create or overwrite a workflow's JSON file on disk. Protected by CSRF token check.

Source code in inference/core/interfaces/http/builder/routes.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
@router.post("/api/{workflow_id}", dependencies=[Depends(verify_csrf_token)])
@with_route_exceptions_async
async def create_or_overwrite_workflow(
    workflow_id: str, request_body: dict = Body(...)
):
    """
    Create or overwrite a workflow's JSON file on disk.
    Protected by CSRF token check.

    The file name is the SHA-256 hex digest of the workflow ID. When the request
    body carries an "id" that differs from the one in the URL, the request is
    treated as a rename: the file stored under the body's ID is removed and the
    body is saved under the URL's ID.

    Args:
        workflow_id (str): Target workflow ID taken from the URL path.
        request_body (dict): Workflow definition; its "id" key is overwritten
            with `workflow_id` before saving.

    Returns:
        JSONResponse with status 201 on success, 400 for an invalid or reserved
        ID, or 500 when the old file cannot be deleted or the new one written.
    """
    # fullmatch is used instead of match(r"^...$"): "$" also matches just before
    # a trailing newline, which would let an ID such as "abc\n" pass validation.
    if not re.fullmatch(r"[\w\-]+", workflow_id):
        return JSONResponse({"error": "invalid id"}, status_code=HTTP_400_BAD_REQUEST)
    if workflow_id in _RESERVED_WORKFLOW_IDS:
        return JSONResponse(
            {"error": f"'{workflow_id}' is a reserved identifier"},
            status_code=HTTP_400_BAD_REQUEST,
        )

    workflow_local_dir.mkdir(parents=True, exist_ok=True)

    # If the body claims a different ID, treat that as a "rename".
    old_id = request_body.get("id")
    if old_id and old_id != workflow_id:
        # The body is arbitrary JSON, so "id" may not be a string; reject it
        # explicitly rather than letting the regex call raise TypeError.
        if not isinstance(old_id, str) or not re.fullmatch(r"[\w\-]+", old_id):
            return JSONResponse(
                {"error": "invalid id"}, status_code=HTTP_400_BAD_REQUEST
            )

        old_workflow_hash = sha256(old_id.encode()).hexdigest()
        old_file_path = workflow_local_dir / f"{old_workflow_hash}.json"
        if old_file_path.exists():
            try:
                old_file_path.unlink()
            except Exception as e:
                logger.error(f"Error deleting {old_id} from {old_file_path}: {e}")
                return JSONResponse({"error": "unable to delete file"}, status_code=500)

    request_body["id"] = workflow_id

    workflow_hash = sha256(workflow_id.encode()).hexdigest()
    file_path = workflow_local_dir / f"{workflow_hash}.json"
    try:
        with file_path.open("w", encoding="utf-8") as f:
            json.dump(request_body, f, indent=2)
    except Exception as e:
        logger.error(f"Error writing JSON for {workflow_id} to {file_path}: {e}")
        return JSONResponse({"error": "unable to write file"}, status_code=500)

    return JSONResponse(
        {"message": f"Workflow '{workflow_id}' created/updated successfully."},
        status_code=HTTP_201_CREATED,
    )

delete_workflow async

delete_workflow(workflow_id)

Delete a workflow's JSON file from disk. Protected by CSRF token check.

Source code in inference/core/interfaces/http/builder/routes.py
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
@router.delete("/api/{workflow_id}", dependencies=[Depends(verify_csrf_token)])
@with_route_exceptions_async
async def delete_workflow(workflow_id: str):
    """
    Delete a workflow's JSON file from disk.
    Protected by CSRF token check.

    Args:
        workflow_id (str): ID of the workflow to delete, taken from the URL path.

    Returns:
        JSONResponse with status 200 on success, 400 for an invalid or reserved
        ID, 404 when no file exists for the ID, or 500 when deletion fails.
    """
    # fullmatch is used instead of match(r"^...$"): "$" also matches just before
    # a trailing newline, which would let an ID such as "abc\n" pass validation.
    if not re.fullmatch(r"[\w\-]+", workflow_id):
        return JSONResponse({"error": "invalid id"}, status_code=HTTP_400_BAD_REQUEST)
    if workflow_id in _RESERVED_WORKFLOW_IDS:
        return JSONResponse(
            {"error": f"'{workflow_id}' is a reserved identifier"},
            status_code=HTTP_400_BAD_REQUEST,
        )

    workflow_hash = sha256(workflow_id.encode()).hexdigest()
    file_path = workflow_local_dir / f"{workflow_hash}.json"

    # Attempt the deletion directly instead of checking exists() first, so a file
    # removed by a concurrent request is reported as 404 rather than as a 500.
    try:
        file_path.unlink()
    except FileNotFoundError:
        return JSONResponse({"error": "not found"}, status_code=HTTP_404_NOT_FOUND)
    except Exception as e:
        logger.error(f"Error deleting {workflow_id} from {file_path}: {e}")
        return JSONResponse({"error": "unable to delete file"}, status_code=500)

    return JSONResponse(
        {"message": f"Workflow '{workflow_id}' deleted successfully."}, status_code=200
    )

get_all_workflows async

get_all_workflows()

Returns JSON info about all .json files in {MODEL_CACHE_DIR}/workflow/local. Protected by CSRF token check.

Source code in inference/core/interfaces/http/builder/routes.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
@router.get("/api", dependencies=[Depends(verify_csrf_token)])
@with_route_exceptions_async
async def get_all_workflows():
    """
    Returns JSON info about all .json files in {MODEL_CACHE_DIR}/workflow/local.
    Protected by CSRF token check.

    Each entry is keyed by the "id" stored in the file (falling back to the file
    stem) and carries the file's ctime/mtime plus the parsed config. Files that
    cannot be read, decoded, or that do not hold a JSON object are logged and
    skipped so that a single bad file does not fail the whole listing.

    Returns:
        Response with a JSON body of the form {"data": {<id>: {...}, ...}}.
    """
    data = {}
    for json_file in workflow_local_dir.glob("*.json"):
        try:
            stat_info = json_file.stat()
            with json_file.open("r", encoding="utf-8") as f:
                config_contents: Dict[str, Any] = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logger.error(f"Error decoding JSON from {json_file}: {e}")
            continue
        except OSError as e:
            # E.g. the file was deleted between glob() and stat()/open().
            logger.error(f"Error reading {json_file}: {e}")
            continue

        # Valid JSON need not be an object; `.get` below requires a mapping.
        if not isinstance(config_contents, dict):
            logger.error(f"Skipping {json_file}: top-level JSON value is not an object")
            continue

        data[config_contents.get("id", json_file.stem)] = {
            "createTime": {"_seconds": int(stat_info.st_ctime)},
            "updateTime": {"_seconds": int(stat_info.st_mtime)},
            "config": config_contents,
        }

    return Response(
        content=json.dumps({"data": data}, indent=4),
        media_type="application/json",
        status_code=200,
    )

get_cached_models async

get_cached_models()

Return all models available in the local cache.

Combines user-trained models discovered via model_type.json markers with foundation-model blocks whose weights are fully cached. Results are cached for 30 seconds to avoid repeated filesystem scans.

Source code in inference/core/interfaces/http/builder/routes.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
@router.get("/api/models", dependencies=[Depends(verify_csrf_token)])
@with_route_exceptions_async
async def get_cached_models():
    """Return all models available in the local cache.

    Combines user-trained models discovered via ``model_type.json`` markers
    with foundation-model blocks whose weights are fully cached.
    Results are cached for 30 seconds to avoid repeated filesystem scans.

    Each returned entry is a copy of the scanned model record with
    ``compatible_block_types`` and ``aliases`` filled in and the internal
    ``block_type`` key removed.

    Returns:
        JSONResponse whose body is ``{"models": [...]}``.
    """
    global _models_cache  # noqa: PLW0603

    # The lock is held for the cache check and the whole rebuild, so concurrent
    # requests wait for one scan instead of each scanning the filesystem.
    async with _models_lock:
        # Timestamp is taken before the scan, so the TTL counts from scan start.
        now = time.time()
        if _models_cache is not None:
            cached_at, cached_result = _models_cache
            if now - cached_at < _MODELS_CACHE_TTL:
                return JSONResponse(content={"models": cached_result})

        # Inline import: inference.models.aliases transitively imports the
        # inference_models package which may not be installed when
        # ENABLE_BUILDER=False.  Keeping the import lazy avoids breaking
        # the server for non-builder users.
        from inference.models.aliases import REGISTERED_ALIASES

        # Build reverse alias map: canonical_id → [alias1, alias2, ...]
        reverse_aliases: Dict[str, List[str]] = {}
        for alias, canonical in REGISTERED_ALIASES.items():
            reverse_aliases.setdefault(canonical, []).append(alias)

        # Load blocks once and pass to both helpers to avoid double-loading.
        # A failure here is deliberately non-fatal: the endpoint continues
        # with an empty block list.
        try:
            blocks = load_workflow_blocks()
        except Exception:
            logger.warning(
                "Failed to load workflow blocks — foundation model data will "
                "be unavailable. This may indicate a broken build or missing "
                "dependencies.",
                exc_info=True,
            )
            blocks = []

        # Scan the filesystem for cached models.
        user_models = scan_cached_models(MODEL_CACHE_DIR)
        foundation_models = get_cached_foundation_models(blocks=blocks)

        # De-duplicate by model_id (foundation models take precedence).
        # Precedence comes from insertion order: foundation entries are written
        # second and overwrite user entries that share the same model_id.
        seen: Dict[str, Dict[str, Any]] = {}
        for m in user_models:
            seen[m["model_id"]] = m
        for m in foundation_models:
            seen[m["model_id"]] = m

        # Enrich each model with compatible block types and aliases.
        task_to_blocks = get_task_type_to_block_mapping(blocks=blocks)
        models = []
        for m in seen.values():
            # Work on a shallow copy so the scanned record itself is not mutated.
            entry = dict(m)
            # For foundation models, use block_type for compatible_block_types
            # since they have empty task_type.
            block_type = entry.get("block_type")
            if block_type:
                # setdefault: a value already present on the record is kept.
                entry.setdefault("compatible_block_types", [block_type])
            else:
                entry.setdefault(
                    "compatible_block_types",
                    task_to_blocks.get(m.get("task_type", ""), []),
                )
            # Add known aliases for this model
            model_id = m.get("model_id", "")
            aliases = reverse_aliases.get(model_id, [])
            entry["aliases"] = aliases
            # Use the shortest alias as display name if available
            # (only when the name is missing/empty or is just the model_id).
            if aliases and (entry.get("name") == model_id or not entry.get("name")):
                entry["name"] = min(aliases, key=len)
            # Remove internal-only keys.
            entry.pop("block_type", None)
            models.append(entry)

        _models_cache = (now, models)

    # Reached only after a rebuild; cache hits return early inside the lock.
    return JSONResponse(content={"models": models})

get_workflow async

get_workflow(workflow_id)

Return JSON for workflow_id.json, or 404 if missing. IDs in _RESERVED_WORKFLOW_IDS are rejected to avoid shadowing explicit sub-routes like /api/models.

Source code in inference/core/interfaces/http/builder/routes.py
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
@router.get("/api/{workflow_id}", dependencies=[Depends(verify_csrf_token)])
@with_route_exceptions_async
async def get_workflow(workflow_id: str):
    """
    Return JSON for workflow_id.json, or 404 if missing.
    IDs in ``_RESERVED_WORKFLOW_IDS`` are rejected to avoid shadowing
    explicit sub-routes like ``/api/models``.

    Args:
        workflow_id (str): ID of the workflow to read, taken from the URL path.

    Returns:
        Response with status 200 and a JSON body holding the file's create and
        update times and the parsed config; JSONResponse with 400 for an invalid
        or reserved ID, 404 when the file is missing, or 500 when the stored
        file is not valid JSON.
    """
    # fullmatch is used instead of match(r"^...$"): "$" also matches just before
    # a trailing newline, which would let an ID such as "abc\n" pass validation.
    if not re.fullmatch(r"[\w\-]+", workflow_id):
        return JSONResponse({"error": "invalid id"}, status_code=HTTP_400_BAD_REQUEST)
    if workflow_id in _RESERVED_WORKFLOW_IDS:
        return JSONResponse(
            {"error": f"'{workflow_id}' is a reserved identifier"},
            status_code=HTTP_400_BAD_REQUEST,
        )

    # Files are stored under the SHA-256 hex digest of the workflow ID.
    workflow_hash = sha256(workflow_id.encode()).hexdigest()
    file_path = workflow_local_dir / f"{workflow_hash}.json"
    if not file_path.exists():
        return JSONResponse({"error": "not found"}, status_code=HTTP_404_NOT_FOUND)

    stat_info = file_path.stat()
    try:
        with file_path.open("r", encoding="utf-8") as f:
            config_contents = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Error reading JSON for {workflow_id} from '{file_path}': {e}")
        return JSONResponse({"error": "invalid JSON"}, status_code=500)

    return Response(
        content=json.dumps(
            {
                "data": {
                    "createTime": int(stat_info.st_ctime),
                    "updateTime": int(stat_info.st_mtime),
                    "config": config_contents,
                }
            },
            indent=4,
        ),
        media_type="application/json",
        status_code=200,
    )

core/interfaces/http

inference.core.interfaces.http.error_handlers

Classes

Functions:

with_route_exceptions

with_route_exceptions(route)

A decorator that wraps a FastAPI route to handle specific exceptions. If an exception is caught, it returns a JSON response with the error message.

Parameters:

Name Type Description Default
route Callable

The FastAPI route to be wrapped.

required

Returns:

Name Type Description
Callable

The wrapped route.

Source code in inference/core/interfaces/http/error_handlers.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
def with_route_exceptions(route):
    """
    A decorator that wraps a FastAPI route to handle specific exceptions. If an exception
    is caught, it returns a JSON response with the error message.

    Args:
        route (Callable): The FastAPI route to be wrapped.

    Returns:
        Callable: The wrapped route.
    """

    @wraps(route)
    def wrapped_route(*args, **kwargs):
        try:
            try:
                return route(*args, **kwargs)
            except Exception as error:
                record_error(error)
                record_error_metric(type(error).__name__)
                raise
        except ContentTypeInvalid as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": "Invalid Content-Type header provided with request."
                },
            )
        except ContentTypeMissing as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={"message": "Content-Type header not provided with request."},
            )
        except InputImageLoadError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": f"Could not load input image. Cause: {error.get_public_error_details()}"
                },
            )
        except ModelInputError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": f"Error with model input. Cause: {error}",
                    "help_url": error.help_url,
                },
            )
        except InvalidModelIDError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={"message": "Invalid Model ID sent in request."},
            )
        except InvalidMaskDecodeArgument as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": "Invalid mask decode argument sent. tradeoff_factor must be in [0.0, 1.0], "
                    "mask_decode_mode: must be one of ['accurate', 'fast', 'tradeoff']"
                },
            )
        except MissingApiKeyError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": "Required Roboflow API key is missing. Visit https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key "
                    "to learn how to retrieve one."
                },
            )
        except (
            WorkflowSyntaxError,
            InvalidReferenceTargetError,
            ExecutionGraphStructureError,
            StepInputDimensionalityError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = WorkflowErrorResponse(
                message=str(error.public_message),
                error_type=error.__class__.__name__,
                context=str(error.context),
                inner_error_type=str(error.inner_error_type),
                inner_error_message=str(error.inner_error),
                blocks_errors=error.blocks_errors,
            )
            resp = JSONResponse(status_code=400, content=content.model_dump())
        except DynamicBlockCodeError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = _build_execution_error_response(error)
            resp = JSONResponse(status_code=400, content=content.model_dump())
        except (
            WorkflowDefinitionError,
            ReferenceTypeError,
            RuntimeInputError,
            InvalidInputTypeError,
            OperationTypeNotRecognisedError,
            DynamicBlockError,
            WorkflowExecutionEngineVersionError,
            NotSupportedExecutionEngineError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "context": error.context,
                    "inner_error_type": error.inner_error_type,
                    "inner_error_message": str(error.inner_error),
                },
            )
        except (
            ProcessesManagerInvalidPayload,
            MalformedPayloadError,
            MessageToBigError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "inner_error_type": error.inner_error_type,
                },
            )
        except (
            RoboflowAPINotAuthorizedError,
            ProcessesManagerAuthorisationError,
            UnauthorizedModelAccessError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=401,
                content={
                    "message": "Unauthorized access to roboflow API - check API key and make sure the key is valid for "
                    "workspace you use. Visit https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key "
                    "to learn how to retrieve one."
                },
            )
        except PaymentRequiredError as error:
            logger.warning("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=402,
                content={
                    "message": "Not enough credits to perform this request. Verify your workspace billing page."
                },
            )
        except RoboflowAPIForbiddenError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=403,
                content={
                    "message": "Unauthorized access to roboflow API - check API key and make sure the key is valid and "
                    "have required scopes. Visit https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key "
                    "to learn how to retrieve one."
                },
            )
        except RoboflowAPIUsagePausedError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=423,
                content={
                    "message": "Roboflow API usage is paused. Please contact your workspace administrator to re-enable api keys."
                },
            )
        except (RoboflowAPINotNotFoundError, ModelNotFoundError) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=404,
                content={
                    "message": "Requested Roboflow resource not found. Make sure that workspace, project or model "
                    "you referred in request exists."
                },
            )
        except ProcessesManagerNotFoundError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=404,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "inner_error_type": error.inner_error_type,
                },
            )
        except ModelPackageNegotiationError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Could not negotiate model package - {error}",
                    "help_url": error.help_url,
                },
            )
        except ModelDeploymentNotSupportedError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=501,
                content={"message": str(error)},
            )
        except (
            InvalidEnvironmentVariableError,
            MissingServiceSecretError,
            ServiceConfigurationError,
            EnvironmentConfigurationError,
            InvalidEnvVariable,
            JetsonTypeResolutionError,
            MissingDependencyError,
            InvalidParameterError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500, content={"message": "Service misconfiguration."}
            )
        except (
            PreProcessingError,
            PostProcessingError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": "Model configuration related to pre- or post-processing is invalid."
                },
            )
        except ModelArtefactError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500, content={"message": "Model package is broken."}
            )
        except (
            CannotInitialiseModelDueToInputSizeError,
            ModelPackageRestrictedError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=507,
                content={
                    "message": "Model loading failed due to restrictions of server configuration - "
                    "usually due to excessive runtime memory requirement of the model (for instance "
                    "caused by large input size).",
                },
            )
        except ModelPackageAlternativesExhaustedError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            inner_errors = error.alternatives_errors or []
            if any(isinstance(e, ModelPackageRestrictedError) for e in inner_errors):
                resp = JSONResponse(
                    status_code=507,
                    content={
                        "message": "Model loading failed due to restrictions of server configuration - "
                        "usually due to excessive runtime memory requirement of the model (for instance "
                        "caused by large input size).",
                        "help_url": error.help_url,
                    },
                )
            else:
                resp = JSONResponse(
                    status_code=500,
                    content={
                        "message": f"Model loading failed: {error}",
                        "help_url": error.help_url,
                    },
                )
        except ModelLoadingError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Model loading failed: {error}",
                    "help_url": error.help_url,
                },
            )
        except (UntrustedFileError, FileHashSumMissmatch) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Issue with model package file: {error}",
                    "help_url": error.help_url,
                },
            )
        except ModelRetrievalError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Could not retrieve model {error}",
                    "help_url": error.help_url,
                },
            )
        except OnnxProviderNotAvailable as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=501,
                content={
                    "message": "Could not find requested ONNX Runtime Provider. Check that you are using "
                    "the correct docker image on a supported device."
                },
            )
        except (
            MalformedRoboflowAPIResponseError,
            RoboflowAPIUnsuccessfulRequestError,
            WorkspaceLoadError,
            MalformedWorkflowResponseError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=502,
                content={"message": "Internal error. Request to Roboflow API failed."},
            )
        except InferenceModelNotFound as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=503,
                content={"message": "Model is temporarily not ready - retry request."},
                headers={"Retry-After": "1"},
            )
        except RoboflowAPIConnectionError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=503,
                content={
                    "message": "Internal error. Could not connect to Roboflow API."
                },
            )
        except ModelManagerLockAcquisitionError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=503,
                content={
                    "message": "Could not acquire model manager lock due to other request performing "
                    "blocking operation. Try again...."
                },
                headers={"Retry-After": "1"},
            )
        except RoboflowAPITimeoutError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=504,
                content={
                    "message": "Timeout when attempting to connect to Roboflow API."
                },
            )
        except (
            ClientCausedStepExecutionError,
            RuntimeLimitsCausedStepExecutionError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = WorkflowErrorResponse(
                message=str(error.public_message),
                error_type=error.__class__.__name__,
                context=str(error.context),
                inner_error_type=str(error.inner_error_type),
                inner_error_message=str(error.inner_error),
                blocks_errors=[
                    WorkflowBlockError(
                        block_id=error.block_id,
                    ),
                ],
            )
            resp = JSONResponse(
                status_code=error.status_code,
                content=content.model_dump(),
            )
        except StepExecutionError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = _build_execution_error_response(error)
            resp = JSONResponse(
                status_code=500,
                content=content.model_dump(),
            )
        except WorkflowError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "context": error.context,
                    "inner_error_type": error.inner_error_type,
                    "inner_error_message": str(error.inner_error),
                },
            )
        except (
            ProcessesManagerClientError,
            CommunicationProtocolError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "inner_error_type": error.inner_error_type,
                },
            )
        except WebRTCConfigurationError as error:
            logger.error("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": str(error),
                    "error_type": "WebRTCConfigurationError",
                },
            )
        except CreditsExceededError as error:
            logger.error("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=402,
                content={
                    "message": "Not enough credits to perform this request.",
                    "error_type": "CreditsExceededError",
                },
            )
        except WorkspaceStreamQuotaError as error:
            logger.error("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=429,
                content={
                    "message": str(error),
                    "error_type": "WorkspaceStreamQuotaError",
                },
            )
        except FeatureDeprecatedError as error:
            logger.warning("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=410,
                content={
                    "message": str(error),
                    "error_type": "FeatureDeprecatedError",
                    **error.get_structured_public_error_details(),
                },
            )
        except Exception as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(status_code=500, content={"message": "Internal error."})
        return resp

    return wrapped_route

with_route_exceptions_async

with_route_exceptions_async(route)

A decorator that wraps a FastAPI route to handle specific exceptions. If an exception is caught, it returns a JSON response with the error message.

Parameters:

Name Type Description Default
route Callable

The FastAPI route to be wrapped.

required

Returns:

Name Type Description
Callable

The wrapped route.

Source code in inference/core/interfaces/http/error_handlers.py
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
def with_route_exceptions_async(route):
    """
    Wrap an async FastAPI route so that raised exceptions become JSON error responses.

    The wrapped coroutine awaits ``route`` and, on success, returns its result unchanged.
    When the route raises an ``Exception``, the error is first passed to ``record_error``
    and its class name to ``record_error_metric``; it is then matched against the
    ``except`` clauses below, logged, and converted into a ``JSONResponse`` carrying the
    HTTP status code chosen for that error category. Depending on the category the
    response body holds either a fixed, generic message or details taken from the error
    itself. Any ``Exception`` not matched by a more specific clause produces a generic
    500 "Internal error." response.

    NOTE(review): the clauses are evaluated top to bottom, so their order presumably
    matters wherever the handled classes are related by inheritance (e.g. the specific
    step-execution errors are listed before ``StepExecutionError``, which is listed
    before ``WorkflowError``) - confirm the class hierarchy before reordering anything.

    Args:
        route (Callable): The FastAPI route to be wrapped. Its return value is awaited.

    Returns:
        Callable: The wrapped route.
    """

    @wraps(route)
    async def wrapped_route(*args, **kwargs):
        try:
            # Inner try: hand every failure to record_error / record_error_metric first,
            # then re-raise so the outer except chain can translate it into a response.
            try:
                return await route(*args, **kwargs)
            except Exception as error:
                record_error(error)
                record_error_metric(type(error).__name__)
                raise
        # ---- 400: problems with the request itself ----
        except ContentTypeInvalid as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": "Invalid Content-Type header provided with request."
                },
            )
        except ContentTypeMissing as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={"message": "Content-Type header not provided with request."},
            )
        except InputImageLoadError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": f"Could not load input image. Cause: {error.get_public_error_details()}"
                },
            )
        except ModelInputError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": f"Error with model input. Cause: {error}",
                    "help_url": error.help_url,
                },
            )
        except InvalidModelIDError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={"message": "Invalid Model ID sent in request."},
            )
        except InvalidMaskDecodeArgument as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": "Invalid mask decode argument sent. tradeoff_factor must be in [0.0, 1.0], "
                    "mask_decode_mode: must be one of ['accurate', 'fast', 'tradeoff']"
                },
            )
        except MissingApiKeyError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": "Required Roboflow API key is missing. Visit https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key "
                    "to learn how to retrieve one."
                },
            )
        # 400 with a structured WorkflowErrorResponse body that includes the
        # per-block error list carried by the exception.
        except (
            WorkflowSyntaxError,
            InvalidReferenceTargetError,
            ExecutionGraphStructureError,
            StepInputDimensionalityError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = WorkflowErrorResponse(
                message=str(error.public_message),
                error_type=error.__class__.__name__,
                context=str(error.context),
                inner_error_type=str(error.inner_error_type),
                inner_error_message=str(error.inner_error),
                blocks_errors=error.blocks_errors,
            )
            resp = JSONResponse(status_code=400, content=content.model_dump())
        # Same response builder as the StepExecutionError handler further down,
        # but reported as 400 instead of 500.
        except DynamicBlockCodeError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = _build_execution_error_response(error)
            resp = JSONResponse(status_code=400, content=content.model_dump())
        # 400 with a plain dict body built from the error's public attributes.
        except (
            WorkflowDefinitionError,
            ReferenceTypeError,
            RuntimeInputError,
            InvalidInputTypeError,
            OperationTypeNotRecognisedError,
            DynamicBlockError,
            WorkflowExecutionEngineVersionError,
            NotSupportedExecutionEngineError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "context": error.context,
                    "inner_error_type": error.inner_error_type,
                    "inner_error_message": str(error.inner_error),
                },
            )
        except (
            ProcessesManagerInvalidPayload,
            MalformedPayloadError,
            MessageToBigError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "inner_error_type": error.inner_error_type,
                },
            )
        # ---- 401 / 402 / 403 / 423 / 404: authorization, billing and lookup failures ----
        except (
            RoboflowAPINotAuthorizedError,
            ProcessesManagerAuthorisationError,
            UnauthorizedModelAccessError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=401,
                content={
                    "message": "Unauthorized access to roboflow API - check API key and make sure the key is valid for "
                    "workspace you use. Visit https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key "
                    "to learn how to retrieve one."
                },
            )
        except PaymentRequiredError as error:
            logger.warning("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=402,
                content={
                    "message": "Not enough credits to perform this request. Verify your workspace billing page."
                },
            )
        except RoboflowAPIForbiddenError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=403,
                content={
                    "message": "Unauthorized access to roboflow API - check API key and make sure the key is valid and "
                    "have required scopes. Visit https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key "
                    "to learn how to retrieve one."
                },
            )
        except RoboflowAPIUsagePausedError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=423,
                content={
                    "message": "Roboflow API usage is paused. Please contact your workspace administrator to re-enable api keys."
                },
            )
        except (RoboflowAPINotNotFoundError, ModelNotFoundError) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=404,
                content={
                    "message": "Requested Roboflow resource not found. Make sure that workspace, project or model "
                    "you referred in request exists."
                },
            )
        except ProcessesManagerNotFoundError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=404,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "inner_error_type": error.inner_error_type,
                },
            )
        # ---- 500 / 501 / 507: model package, configuration and model loading failures ----
        except ModelPackageNegotiationError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Could not negotiate model package - {error}",
                    "help_url": error.help_url,
                },
            )
        except ModelDeploymentNotSupportedError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=501,
                content={"message": str(error)},
            )
        # Configuration-type errors: the response carries only a fixed message,
        # the error text itself goes to the log.
        except (
            InvalidEnvironmentVariableError,
            MissingServiceSecretError,
            ServiceConfigurationError,
            EnvironmentConfigurationError,
            InvalidEnvVariable,
            JetsonTypeResolutionError,
            MissingDependencyError,
            InvalidParameterError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500, content={"message": "Service misconfiguration."}
            )
        except (
            PreProcessingError,
            PostProcessingError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": "Model configuration related to pre- or post-processing is invalid."
                },
            )
        except ModelArtefactError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500, content={"message": "Model package is broken."}
            )
        except (
            CannotInitialiseModelDueToInputSizeError,
            ModelPackageRestrictedError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=507,
                content={
                    "message": "Model loading failed due to restrictions of server configuration - "
                    "usually due to excessive runtime memory requirement of the model (for instance "
                    "caused by large input size).",
                },
            )
        # 507 when any collected alternative error is a ModelPackageRestrictedError,
        # otherwise a 500 "Model loading failed" response.
        except ModelPackageAlternativesExhaustedError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            # alternatives_errors may be falsy; treat that as "no inner errors".
            inner_errors = error.alternatives_errors or []
            if any(isinstance(e, ModelPackageRestrictedError) for e in inner_errors):
                resp = JSONResponse(
                    status_code=507,
                    content={
                        "message": "Model loading failed due to restrictions of server configuration - "
                        "usually due to excessive runtime memory requirement of the model (for instance "
                        "caused by large input size).",
                        "help_url": error.help_url,
                    },
                )
            else:
                resp = JSONResponse(
                    status_code=500,
                    content={
                        "message": f"Model loading failed: {error}",
                        "help_url": error.help_url,
                    },
                )
        except ModelLoadingError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Model loading failed: {error}",
                    "help_url": error.help_url,
                },
            )
        except (UntrustedFileError, FileHashSumMissmatch) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Issue with model package file: {error}",
                    "help_url": error.help_url,
                },
            )
        except ModelRetrievalError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": f"Could not retrieve model {error}",
                    "help_url": error.help_url,
                },
            )
        except OnnxProviderNotAvailable as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=501,
                content={
                    "message": "Could not find requested ONNX Runtime Provider. Check that you are using "
                    "the correct docker image on a supported device."
                },
            )
        # ---- 502 / 503 / 504: Roboflow API and temporary availability failures ----
        except (
            MalformedRoboflowAPIResponseError,
            RoboflowAPIUnsuccessfulRequestError,
            WorkspaceLoadError,
            MalformedWorkflowResponseError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=502,
                content={"message": "Internal error. Request to Roboflow API failed."},
            )
        # 503 with a Retry-After header of 1.
        except InferenceModelNotFound as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=503,
                content={"message": "Model is temporarily not ready - retry request."},
                headers={"Retry-After": "1"},
            )
        except RoboflowAPIConnectionError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=503,
                content={
                    "message": "Internal error. Could not connect to Roboflow API."
                },
            )
        # 503 with a Retry-After header of 1.
        except ModelManagerLockAcquisitionError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=503,
                content={
                    "message": "Could not acquire model manager lock due to other request performing "
                    "blocking operation. Try again...."
                },
                headers={"Retry-After": "1"},
            )
        except RoboflowAPITimeoutError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=504,
                content={
                    "message": "Timeout when attempting to connect to Roboflow API."
                },
            )
        # ---- Workflow execution failures ----
        # These errors supply their own HTTP status code (error.status_code) and the
        # id of the block they relate to (error.block_id).
        except (
            ClientCausedStepExecutionError,
            RuntimeLimitsCausedStepExecutionError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = WorkflowErrorResponse(
                message=str(error.public_message),
                error_type=error.__class__.__name__,
                context=str(error.context),
                inner_error_type=str(error.inner_error_type),
                inner_error_message=str(error.inner_error),
                blocks_errors=[
                    WorkflowBlockError(
                        block_id=error.block_id,
                    ),
                ],
            )
            resp = JSONResponse(
                status_code=error.status_code,
                content=content.model_dump(),
            )
        except StepExecutionError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            content = _build_execution_error_response(error)
            resp = JSONResponse(
                status_code=500,
                content=content.model_dump(),
            )
        # NOTE(review): presumably the fallback for workflow errors not matched by any
        # clause above - confirm against the exception hierarchy.
        except WorkflowError as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "context": error.context,
                    "inner_error_type": error.inner_error_type,
                    "inner_error_message": str(error.inner_error),
                },
            )
        except (
            ProcessesManagerClientError,
            CommunicationProtocolError,
        ) as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=500,
                content={
                    "message": error.public_message,
                    "error_type": error.__class__.__name__,
                    "inner_error_type": error.inner_error_type,
                },
            )
        # ---- Errors logged via logger.error / logger.warning instead of logger.exception ----
        except WebRTCConfigurationError as error:
            logger.error("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=400,
                content={
                    "message": str(error),
                    "error_type": "WebRTCConfigurationError",
                },
            )
        except CreditsExceededError as error:
            logger.error("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=402,
                content={
                    "message": "Not enough credits to perform this request.",
                    "error_type": "CreditsExceededError",
                },
            )
        except WorkspaceStreamQuotaError as error:
            logger.error("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=429,
                content={
                    "message": str(error),
                    "error_type": "WorkspaceStreamQuotaError",
                },
            )
        except FeatureDeprecatedError as error:
            logger.warning("%s: %s", type(error).__name__, error)
            resp = JSONResponse(
                status_code=410,
                content={
                    "message": str(error),
                    "error_type": "FeatureDeprecatedError",
                    # Extra keys supplied by the error are merged into the response body.
                    **error.get_structured_public_error_details(),
                },
            )
        # Catch-all: any other Exception yields a generic 500 with no error details
        # in the response body (the details are only logged).
        except Exception as error:
            logger.exception("%s: %s", type(error).__name__, error)
            resp = JSONResponse(status_code=500, content={"message": "Internal error."})
        # Only reached when an exception was handled above; the success path
        # returns directly from inside the inner try.
        return resp

    return wrapped_route

inference.core.interfaces.http.http_api

Classes

HttpInterface

Bases: BaseInterface

Roboflow defined HTTP interface for a general-purpose inference server.

This class sets up the FastAPI application, adds the necessary middleware, and initializes the model manager and model registry for the inference server.

Attributes:

Name Type Description
app FastAPI

The FastAPI application instance.

model_manager ModelManager

The manager for handling different models.

Source code in inference/core/interfaces/http/http_api.py
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
class HttpInterface(BaseInterface):
    """Roboflow defined HTTP interface for a general-purpose inference server.

    This class sets up the FastAPI application and adds necessary middleware,
    as well as initializes the model manager and model registry for the inference server.

    Attributes:
        app (FastAPI): The FastAPI application instance.
        model_manager (ModelManager): The manager for handling different models.
    """

    def __init__(
        self,
        model_manager: ModelManager,
        root_path: Optional[str] = None,
    ):
        """
        Initializes the HttpInterface with given model manager and model registry.

        Args:
            model_manager (ModelManager): The manager for handling different models.
            root_path (Optional[str]): The root path for the FastAPI application.

        Description:
            Deploy Roboflow trained models to nearly any compute environment!
        """

        description = "Roboflow inference server"

        app = FastAPI(
            title="Roboflow Inference Server",
            description=description,
            version=__version__,
            terms_of_service="https://roboflow.com/terms",
            contact={
                "name": "Roboflow Inc.",
                "url": "https://roboflow.com/contact",
                "email": "help@roboflow.com",
            },
            license_info={
                "name": "Apache 2.0",
                "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
            },
            root_path=root_path,
        )
        # Ensure in-memory logging is initialized as early as possible for all runtimes
        try:
            from inference.core.logging.memory_handler import setup_memory_logging

            setup_memory_logging()
        except Exception:
            pass

        app.mount(
            "/static",
            StaticFiles(directory="./inference/landing/out/static", html=True),
            name="static",
        )
        app.mount(
            "/_next/static",
            StaticFiles(directory="./inference/landing/out/_next/static", html=True),
            name="_next_static",
        )

        # OpenTelemetry: must be set up before any middleware is added
        # so the FastAPI instrumentor wraps at the outermost ASGI layer.
        if OTEL_TRACING_ENABLED:
            setup_telemetry(app)

        @app.middleware("http")
        async def set_request_path_context(request: Request, call_next):
            """Publish the request's ASGI scope path in ``current_request_path``.

            The value is set before the rest of the stack handles the request and
            is reset once handling finishes, whether it returned or raised.

            Args:
                request (Request): The incoming request.
                call_next (Callable): The next middleware or endpoint to call.

            Returns:
                Response: The response from the next middleware or endpoint.
            """
            # CVE-2026-48710: prefer the raw ASGI scope path over
            # request.url.path. This ContextVar feeds downstream registry
            # metadata (_model_request_paths in ModelManagerBase), so a
            # Host-poisoned path would surface in model-info responses.
            raw_path = request.scope["path"]
            reset_token = current_request_path.set(raw_path)
            try:
                response = await call_next(request)
            finally:
                # Always restore the previous value so it cannot leak past this request.
                current_request_path.reset(reset_token)
            return response

        @app.on_event("shutdown")
        async def on_shutdown():
            """Run application shutdown tasks.

            Logs the shutdown, pushes pending usage payloads through
            ``usage_collector`` and, when ``OTEL_TRACING_ENABLED`` is set, calls
            ``shutdown_telemetry()``.

            The telemetry shutdown sits in a ``finally`` clause so that it still
            runs when pushing usage payloads raises; the original exception then
            propagates to the caller exactly as before.
            """
            logger.info("Shutting down %s", description)
            try:
                await usage_collector.async_push_usage_payloads()
            finally:
                # Do not let a failed usage push skip telemetry shutdown.
                if OTEL_TRACING_ENABLED:
                    shutdown_telemetry()

        self._instrumentator = InferenceInstrumentator(
            app, model_manager=model_manager, endpoint="/metrics"
        )
        if LAMBDA:
            app.add_middleware(LambdaMiddleware)
        if GCP_SERVERLESS:
            app.add_middleware(GCPServerlessMiddleware)

        if len(ALLOW_ORIGINS) > 0:
            # Add CORS Middleware (but not for /build**, which is controlled separately)
            app.add_middleware(
                PathAwareCORSMiddleware,
                match_paths=r"^(?!/build).*",
                allow_origins=ALLOW_ORIGINS,
                allow_credentials=True,
                allow_methods=["*"],
                allow_headers=["*"],
                expose_headers=[
                    PROCESSING_TIME_HEADER,
                    REMOTE_PROCESSING_TIME_HEADER,
                    REMOTE_PROCESSING_TIMES_HEADER,
                    MODEL_COLD_START_HEADER,
                    MODEL_COLD_START_COUNT_HEADER,
                    MODEL_LOAD_TIME_HEADER,
                    MODEL_LOAD_DETAILS_HEADER,
                    MODEL_ID_HEADER,
                    WORKFLOW_ID_HEADER,
                    WORKSPACE_ID_HEADER,
                    TRACE_ID_HEADER,
                ]
                + ([EXECUTION_ID_HEADER] if EXECUTION_ID_HEADER is not None else [])
                + ["traceparent", "tracestate"],
            )

        # Optionally add middleware for profiling the FastAPI server and underlying inference API code
        if PROFILE:
            app.add_middleware(
                CProfileMiddleware,
                enable=True,
                server_app=app,
                filename="/profile/output.pstats",
                strip_dirs=False,
                sort_by="cumulative",
            )
        if API_LOGGING_ENABLED:
            app.add_middleware(
                asgi_correlation_id.CorrelationIdMiddleware,
                header_name=CORRELATION_ID_HEADER,
                update_request_header=True,
                generator=lambda: uuid4().hex,
                validator=lambda a: True,
                transformer=lambda a: a,
            )
            if STRUCTURED_API_LOGGING:
                # Suppress uvicorn's default access log to avoid duplicate
                # unstructured entries — we replace it with a structured
                # access log middleware (see structured_access_log below).
                logging.getLogger("uvicorn.access").handlers = []
                logging.getLogger("uvicorn.access").propagate = False
        else:
            app.add_middleware(asgi_correlation_id.CorrelationIdMiddleware)

        if METRICS_ENABLED:

            @app.middleware("http")
            async def count_errors(request: Request, call_next):
                """Count error responses on the model manager.

                The request is always forwarded. ``num_errors`` on the model
                manager is incremented only when the manager's ``pingback`` is
                truthy and the response status code is 400 or above.

                Args:
                    request (Request): The incoming request.
                    call_next (Callable): The next middleware or endpoint to call.

                Returns:
                    Response: The response from the next middleware or endpoint.
                """
                response = await call_next(request)
                # Without pingback there is nothing to account for.
                if not self.model_manager.pingback:
                    return response
                if response.status_code >= 400:
                    self.model_manager.num_errors += 1
                return response

        if not (LAMBDA or GCP_SERVERLESS):

            @app.get("/device/stats")
            def device_stats():
                """Return container statistics read through the Docker socket.

                Returns:
                    JSONResponse: 404 with an explanatory payload when
                    ``DOCKER_SOCKET_PATH`` is not set, 500 with the same payload
                    when ``is_docker_socket_mounted`` reports the socket as not
                    mounted, otherwise 200 with the result of
                    ``get_container_stats``.
                """
                setup_hint = (
                    "Mount the Docker socket and point its location when running the docker "
                    "container to collect device stats "
                    "(i.e. `docker run ... -v /var/run/docker.sock:/var/run/docker.sock "
                    "-e DOCKER_SOCKET_PATH=/var/run/docker.sock ...`)."
                )
                not_configured_payload = {
                    "error": "Device statistics endpoint is not enabled.",
                    "hint": setup_hint,
                }

                # Pick the failure status first; None means the socket is usable.
                if not DOCKER_SOCKET_PATH:
                    failure_status = 404
                elif not is_docker_socket_mounted(
                    docker_socket_path=DOCKER_SOCKET_PATH
                ):
                    failure_status = 500
                else:
                    failure_status = None

                if failure_status is not None:
                    return JSONResponse(
                        status_code=failure_status,
                        content=not_configured_payload,
                    )

                stats = get_container_stats(docker_socket_path=DOCKER_SOCKET_PATH)
                return JSONResponse(status_code=200, content=stats)

        cached_api_keys: Dict[AuthorizationCacheKey, AuthorizationCacheEntry] = {}

        if GCP_SERVERLESS:

            @app.middleware("http")
            async def check_authorization_serverless(request: Request, call_next):
                request_id, execution_id_value = (
                    _prepare_serverless_observability_context(request=request)
                )
                _log_serverless_request_received(
                    request=request,
                    request_id=request_id,
                    execution_id_value=execution_id_value,
                )
                t1 = time.time()

                # exclusions
                # CVE-2026-48710: use the raw ASGI scope path so a malicious
                # Host header (e.g. `Host: x?/docs`) cannot poison request.url.path
                # and slip an authenticated route into the allowlist.
                scope_path = request.scope["path"]
                skip_check = (
                    request.method not in ["GET", "POST"]
                    or scope_path
                    in [
                        "/",
                        "/docs",
                        "/info",
                        "/healthz",  # health check endpoint for liveness probe
                        "/readiness",
                        "/metrics",
                        "/openapi.json",  # needed for /docs and /redoc
                        "/model/registry",  # dont auth this route, usually not used on serverlerless, but queue based serverless uses it internally (not accessible from outside)
                    ]
                    or scope_path.startswith("/static/")
                    or scope_path.startswith("/_next/")
                )

                # for these routes we only want to auth if dynamic python modules are provided
                if scope_path in [
                    "/workflows/blocks/describe",
                    "/workflows/definition/schema",
                ]:
                    if request.method == "GET":
                        skip_check = True

                    elif (
                        get_content_type(request) == "application/json"
                        and int(request.headers.get("content-length", 0)) > 0
                    ):
                        json_params = await request.json()
                        dynamic_blocks_definitions = json_params.get(
                            "dynamic_blocks_definitions", None
                        )
                        if not dynamic_blocks_definitions:
                            skip_check = True

                if skip_check:
                    return await call_next(request)

                def _authorization_error_response(
                    status_code: int,
                    msg: str,
                    workspace_id: Optional[str] = None,
                    cache_hit: bool = False,
                ):
                    """Build the JSON response for a denied request and log the denial.

                    The response body carries ``status`` and ``message``.
                    Observability headers are attached using the enclosing
                    request's ids and the time elapsed since ``t1``, and the
                    denial is logged before the response is returned.

                    Args:
                        status_code (int): HTTP status code of the denial.
                        msg (str): Message placed in the response body and the log.
                        workspace_id (Optional[str]): Workspace id, when known.
                        cache_hit (bool): Whether the decision came from the
                            authorization cache; passed to the denial log only.

                    Returns:
                        JSONResponse: The error response to send to the client.
                    """
                    payload = {
                        "status": status_code,
                        "message": msg,
                    }
                    denial = JSONResponse(status_code=status_code, content=payload)
                    # Time spent in this middleware up to the point of denial.
                    elapsed = time.time() - t1
                    _attach_observability_headers_to_early_response(
                        response=denial,
                        request_id=request_id,
                        execution_id_value=execution_id_value,
                        processing_time=elapsed,
                        workspace_id=workspace_id,
                    )
                    _log_serverless_authorization_denial(
                        request=request,
                        status_code=status_code,
                        message=msg,
                        request_id=request_id,
                        execution_id_value=execution_id_value,
                        workspace_id=workspace_id,
                        cache_hit=cache_hit,
                    )
                    return denial

                try:
                    with start_span(
                        "serverless.authorization.check",
                        attributes={
                            "http.method": request.method,
                            # CVE-2026-48710: log the real ASGI path. The span
                            # records the auth decision, so it must not be
                            # forgeable via Host header.
                            "http.target": scope_path,
                        },
                    ) as auth_span:
                        req_params = request.query_params
                        json_params = dict()
                        api_key = req_params.get("api_key", None)
                        if (
                            api_key is None
                            and get_content_type(request) == "application/json"
                            and int(request.headers.get("content-length", 0)) > 0
                        ):
                            # A try/except is needed here because some legacy endpoints abuse the Content-Type header but don't actually receive JSON
                            try:
                                json_params = await request.json()
                            except Exception:
                                pass
                        api_key = json_params.get("api_key", api_key)

                        if api_key is None:
                            if auth_span is not None:
                                auth_span.set_attribute("http.status_code", 401)
                                auth_span.set_attribute(
                                    "auth.result", "missing_api_key"
                                )
                            return _authorization_error_response(
                                401, "Unauthorized api_key"
                            )

                        enforce_credits_verification = (
                            not _is_non_billable_internal_request(
                                req_params=req_params,
                                json_params=json_params,
                            )
                        )
                        cache_key = (api_key, enforce_credits_verification)
                        cache_entry = cached_api_keys.get(cache_key)
                        workspace_id = None
                        if auth_span is not None:
                            auth_span.set_attribute(
                                "auth.enforce_credits_verification",
                                enforce_credits_verification,
                            )
                        if cache_entry and cache_entry.expires_at >= time.time():
                            if auth_span is not None:
                                auth_span.set_attribute("auth.cache_hit", True)
                            if cache_entry.status_code != 200:
                                if auth_span is not None:
                                    auth_span.set_attribute(
                                        "http.status_code", cache_entry.status_code
                                    )
                                    auth_span.set_attribute(
                                        "auth.result", "denied_from_cache"
                                    )
                                return _authorization_error_response(
                                    cache_entry.status_code,
                                    cache_entry.message or "Unauthorized api_key",
                                    workspace_id=cache_entry.workspace_id,
                                    cache_hit=True,
                                )
                            workspace_id = cache_entry.workspace_id
                        else:
                            if auth_span is not None:
                                auth_span.set_attribute("auth.cache_hit", False)
                            if not enforce_credits_verification:
                                try:
                                    workspace_id = await get_roboflow_workspace_async(
                                        api_key=api_key
                                    )
                                    cached_api_keys[cache_key] = (
                                        AuthorizationCacheEntry(
                                            expires_at=time.time()
                                            + AUTH_CACHE_TTL_SECONDS,
                                            workspace_id=workspace_id,
                                        )
                                    )
                                except (
                                    RoboflowAPINotAuthorizedError,
                                    WorkspaceLoadError,
                                ):
                                    cached_api_keys[cache_key] = (
                                        AuthorizationCacheEntry(
                                            expires_at=time.time()
                                            + SHORT_AUTH_CACHE_TTL_SECONDS,
                                            workspace_id=None,
                                            status_code=401,
                                            message="Unauthorized api_key",
                                        )
                                    )
                                    if auth_span is not None:
                                        auth_span.set_attribute("http.status_code", 401)
                                        auth_span.set_attribute(
                                            "auth.result", "unauthorized"
                                        )
                                    return _authorization_error_response(
                                        401,
                                        cached_api_keys[cache_key].message,
                                        cache_hit=False,
                                    )
                            else:
                                usage_check_result = (
                                    await get_serverless_usage_check_async(
                                        api_key=api_key
                                    )
                                )
                                if usage_check_result.status_code == 200:
                                    workspace_id = usage_check_result.workspace_id
                                    cached_api_keys[cache_key] = (
                                        AuthorizationCacheEntry(
                                            expires_at=time.time()
                                            + AUTH_CACHE_TTL_SECONDS,
                                            workspace_id=workspace_id,
                                        )
                                    )
                                elif usage_check_result.status_code == 401:
                                    cached_api_keys[cache_key] = (
                                        AuthorizationCacheEntry(
                                            expires_at=time.time()
                                            + SHORT_AUTH_CACHE_TTL_SECONDS,
                                            workspace_id=None,
                                            status_code=401,
                                            message=(
                                                "Unauthorized api_key. This key is not authorized "
                                                "for serverless inference."
                                            ),
                                        )
                                    )
                                    if auth_span is not None:
                                        auth_span.set_attribute("http.status_code", 401)
                                        auth_span.set_attribute(
                                            "auth.result",
                                            "serverless_inference_unauthorized",
                                        )
                                    return _authorization_error_response(
                                        401,
                                        cached_api_keys[cache_key].message,
                                        cache_hit=False,
                                    )
                                elif usage_check_result.status_code == 402:
                                    message = (
                                        "This workspace cannot currently spend credits for serverless inference. "
                                        "Verify billing or credit cap settings."
                                    )
                                    if usage_check_result.error:
                                        message = (
                                            f"{message} {usage_check_result.error}"
                                        )
                                    cached_api_keys[cache_key] = (
                                        AuthorizationCacheEntry(
                                            expires_at=time.time()
                                            + SHORT_AUTH_CACHE_TTL_SECONDS,
                                            workspace_id=usage_check_result.workspace_id,
                                            status_code=402,
                                            message=message,
                                        )
                                    )
                                    if auth_span is not None:
                                        auth_span.set_attribute("http.status_code", 402)
                                        auth_span.set_attribute(
                                            "auth.result",
                                            "credits_verification_failed",
                                        )
                                    return _authorization_error_response(
                                        402,
                                        cached_api_keys[cache_key].message,
                                        workspace_id=usage_check_result.workspace_id,
                                        cache_hit=False,
                                    )

                        if auth_span is not None:
                            auth_span.set_attribute("http.status_code", 200)
                            auth_span.set_attribute("auth.result", "authorized")
                            if workspace_id is not None:
                                auth_span.set_attribute("workspace.id", workspace_id)
                except Exception as error:
                    record_error(error)
                    raise

                response = await call_next(request)
                if workspace_id:
                    response.headers[WORKSPACE_ID_HEADER] = workspace_id
                return response

        if (
            DEDICATED_DEPLOYMENT_WORKSPACE_URL
            or WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT
        ):

            @app.middleware("http")
            async def check_authorization(request: Request, call_next):
                """Reject requests whose API key does not map to an allowed workspace.

                Requests that are neither GET nor POST, and requests for the
                documentation / health / static paths listed below, bypass the
                check. For every other request the API key is read from the
                `api_key` query parameter or, failing that, from a JSON request
                body. The workspace resolved for the key must be
                `DEDICATED_DEPLOYMENT_WORKSPACE_URL` or one of
                `WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT`; positive results
                are cached in `cached_api_keys` for `AUTH_CACHE_TTL_SECONDS`.

                Args:
                    request (Request): The incoming HTTP request.
                    call_next: Callable invoking the next handler in the chain.

                Returns:
                    The downstream response (with `WORKSPACE_ID_HEADER` set when
                    a workspace was resolved), or a 401 JSON response.
                """
                # exclusions
                # CVE-2026-48710: use the raw ASGI scope path so a malicious
                # Host header (e.g. `Host: x?/docs`) cannot poison request.url.path
                # and slip an authenticated route into the allowlist.
                scope_path = request.scope["path"]
                skip_check = (
                    request.method not in ["GET", "POST"]
                    or scope_path
                    in [
                        "/",
                        "/docs",
                        "/redoc",
                        "/info",
                        "/healthz",  # health check endpoint for liveness probe
                        "/readiness",
                        "/metrics",
                        "/openapi.json",  # needed for /docs and /redoc
                    ]
                    or scope_path.startswith("/static/")
                    or scope_path.startswith("/_next/")
                )
                if skip_check:
                    return await call_next(request)

                def _unauthorized_response(msg):
                    return JSONResponse(
                        status_code=401,
                        content={
                            "status": 401,
                            "message": msg,
                        },
                    )

                # check api_key
                req_params = request.query_params
                json_params = dict()
                api_key = req_params.get("api_key", None)
                if api_key is None and get_content_type(request) == "application/json":
                    # A malformed Content-Length must not crash the middleware;
                    # treat it as "no body to inspect".
                    try:
                        content_length = int(request.headers.get("content-length", 0))
                    except ValueError:
                        content_length = 0
                    if content_length > 0:
                        # Best-effort parse: some legacy endpoints abuse the
                        # Content-Type header but don't actually receive JSON.
                        try:
                            parsed_body = await request.json()
                        except Exception:
                            parsed_body = None
                        # Valid JSON is not necessarily an object (e.g. `[]`);
                        # only an object can carry an `api_key` field.
                        if isinstance(parsed_body, dict):
                            json_params = parsed_body
                api_key = json_params.get("api_key", api_key)

                # Also rejects non-string keys from a JSON body, which could be
                # unhashable and break the cache lookup below.
                if not isinstance(api_key, str):
                    return _unauthorized_response("Unauthorized api_key")

                cache_entry = cached_api_keys.get(api_key)
                workspace_id = None
                if cache_entry and cache_entry.expires_at >= time.time():
                    if cache_entry.status_code != 200:
                        return _unauthorized_response("Unauthorized api_key")
                    workspace_id = cache_entry.workspace_id
                else:
                    try:
                        workspace_id = await get_roboflow_workspace_async(
                            api_key=api_key
                        )
                    except (RoboflowAPINotAuthorizedError, WorkspaceLoadError):
                        return _unauthorized_response("Unauthorized api_key")

                    allowed_workspaces = set()
                    if DEDICATED_DEPLOYMENT_WORKSPACE_URL:
                        allowed_workspaces.add(DEDICATED_DEPLOYMENT_WORKSPACE_URL)
                    if WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT:
                        allowed_workspaces.update(
                            WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT
                        )
                    if workspace_id not in allowed_workspaces:
                        return _unauthorized_response("Unauthorized api_key")

                    # Only successful authorizations are cached here.
                    cached_api_keys[api_key] = AuthorizationCacheEntry(
                        expires_at=time.time() + AUTH_CACHE_TTL_SECONDS,
                        workspace_id=workspace_id,
                    )

                response = await call_next(request)
                if workspace_id:
                    response.headers[WORKSPACE_ID_HEADER] = workspace_id
                return response

        @app.middleware("http")
        async def add_inference_engine_headers(request: Request, call_next):
            """Stamp each response with the `x-inference-engine` header.

            The value is "inference-models" when `USE_INFERENCE_MODELS` is
            truthy and "old-inference" otherwise.
            """
            response = await call_next(request)
            if USE_INFERENCE_MODELS:
                engine_name = "inference-models"
            else:
                engine_name = "old-inference"
            response.headers["x-inference-engine"] = engine_name
            return response

        @app.middleware("http")
        async def track_model_load(request: Request, call_next):
            """Attach model-usage and cold-start headers to the response.

            Fresh per-request collectors are installed in the `model_load_info`
            and `request_model_ids` context variables before the request is
            handled. Afterwards their contents, together with whatever the
            optional `request.state.remote_processing_time_collector` recorded,
            are turned into headers by `build_model_response_headers`. The
            workflow ID header is added when a workflow ID was registered.
            """
            local_loads = ModelLoadCollector()
            model_load_info.set(local_loads)
            local_ids = RequestModelIds()
            request_model_ids.set(local_ids)

            response = await call_next(request)

            # Defaults used when no remote collector was attached to the request.
            remote_model_ids = set()
            remote_cold_start_entries = []
            remote_cold_start_count = 0
            remote_cold_start_total_load_time = 0.0
            remote_collector = getattr(
                request.state, "remote_processing_time_collector", None
            )
            if remote_collector is not None:
                remote_model_ids = remote_collector.snapshot_model_ids()
                remote_cold_start_entries = (
                    remote_collector.snapshot_cold_start_entries()
                )
                remote_cold_start_count = remote_collector.snapshot_cold_start_count()
                remote_cold_start_total_load_time = (
                    remote_collector.snapshot_cold_start_total_load_time()
                )

            model_headers = build_model_response_headers(
                local_model_ids=local_ids.get_ids(),
                local_cold_start_entries=local_loads.snapshot_entries(),
                remote_model_ids=remote_model_ids,
                remote_cold_start_entries=remote_cold_start_entries,
                remote_cold_start_count=remote_cold_start_count,
                remote_cold_start_total_load_time=remote_cold_start_total_load_time,
            )
            response.headers.update(model_headers)

            workflow_id = request_workflow_id.get(None)
            if workflow_id:
                response.headers[WORKFLOW_ID_HEADER] = workflow_id
            return response

        if API_LOGGING_ENABLED and STRUCTURED_API_LOGGING:

            @app.middleware("http")
            async def structured_access_log(request: Request, call_next):
                """Emit one structured log record per handled request.

                The record carries the method, path and status code, plus any
                of the correlation / timing / model / workflow / workspace
                response headers that are present, and the trace ID parsed
                from the request's `traceparent` header when available.
                """
                response = await call_next(request)

                # Values are read from response headers instead of
                # ContextVars — @app.middleware("http") uses
                # BaseHTTPMiddleware which runs the inner chain in a
                # separate asyncio task, so ContextVars set by inner
                # middlewares are not visible here.
                header_by_field = {
                    "request_id": CORRELATION_ID_HEADER,
                    "processing_time": PROCESSING_TIME_HEADER,
                    "model_cold_start": MODEL_COLD_START_HEADER,
                    "model_cold_start_count": MODEL_COLD_START_COUNT_HEADER,
                    "model_load_time": MODEL_LOAD_TIME_HEADER,
                    "model_id": MODEL_ID_HEADER,
                    "workflow_id": WORKFLOW_ID_HEADER,
                    "workspace_id": WORKSPACE_ID_HEADER,
                }
                if EXECUTION_ID_HEADER is not None:
                    header_by_field["execution_id"] = EXECUTION_ID_HEADER

                log_fields = {
                    "method": request.method,
                    "path": request.url.path,
                    "status_code": response.status_code,
                }
                for field_name, header_name in header_by_field.items():
                    header_value = response.headers.get(header_name)
                    if header_value is None:
                        continue
                    log_fields[field_name] = header_value

                # The trace ID is the second dash-separated part of traceparent
                # (read from the header for the same ContextVar-isolation reason).
                traceparent = request.headers.get("traceparent")
                trace_parts = traceparent.split("-") if traceparent else []
                if len(trace_parts) >= 3:
                    log_fields["trace_id"] = trace_parts[1]

                logger.info(
                    f"{request.method} {request.url.path} {response.status_code}",
                    **log_fields,
                )
                return response

        # Keep references to the application and the model manager on the
        # instance; the closures defined below read `self.model_manager`.
        self.app = app
        self.model_manager = model_manager
        # Optional collaborators start as None and are created only when the
        # corresponding feature flag is enabled.
        self.stream_manager_client: Optional[StreamManagerClient] = None
        self.shared_thread_pool_executor: Optional[ThreadPoolExecutor] = None
        if HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED:
            # Executor handed to the workflow ExecutionEngine further down.
            self.shared_thread_pool_executor = ThreadPoolExecutor(
                max_workers=HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_WORKERS
            )
        self.inference_models_cache_daemon: Optional[InferenceModelsCacheWatchdog] = (
            None
        )
        if USE_INFERENCE_MODELS and MAX_INFERENCE_MODELS_CACHE_SIZE_MB > 0:
            # Imported here rather than at module level, so `inference_models`
            # is only touched when the watchdog is actually enabled.
            from inference_models.configuration import INFERENCE_HOME

            self.inference_models_cache_daemon = InferenceModelsCacheWatchdog(
                inference_home=INFERENCE_HOME,
                max_cache_size_mb=MAX_INFERENCE_MODELS_CACHE_SIZE_MB,
                interval_minutes=INFERENCE_MODELS_CACHE_WATCHDOG_INTERVAL_MINUTES,
            )
            self.inference_models_cache_daemon.start()

        if ENABLE_STREAM_API:
            # STREAM_MANAGER_OPERATIONS_TIMEOUT is optional; when unset, None is
            # passed through to the client.
            operations_timeout = os.getenv("STREAM_MANAGER_OPERATIONS_TIMEOUT")
            if operations_timeout is not None:
                operations_timeout = float(operations_timeout)
            self.stream_manager_client = StreamManagerClient.init(
                host=os.getenv("STREAM_MANAGER_HOST", "127.0.0.1"),
                port=int(os.getenv("STREAM_MANAGER_PORT", "7070")),
                operations_timeout=operations_timeout,
            )
            self._instrumentator.set_stream_manager_client(self.stream_manager_client)

        def process_inference_request(
            inference_request: InferenceRequest,
            api_key: Optional[str] = None,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
            **kwargs,
        ) -> InferenceResponse:
            """Load the requested model and run inference on the request.

            Args:
                inference_request (InferenceRequest): The request containing model ID and other inference details.
                api_key (Optional[str]): When not None, overrides the API key stored on the request.
                countinference (Optional[bool]): Whether to count inference for usage.
                service_secret (Optional[str]): The service secret.
                **kwargs: Forwarded unchanged to the model manager's inference call.

            Returns:
                InferenceResponse: The inference result, passed through `orjson_response`.
            """
            if api_key is not None:
                inference_request.api_key = api_key

            original_id = inference_request.model_id
            resolved_id = resolve_roboflow_model_alias(model_id=original_id)
            # An alias is recorded only when resolution actually changed the ID.
            alias = None
            if resolved_id != original_id:
                alias = original_id

            self.model_manager.add_model(
                resolved_id,
                inference_request.api_key,
                model_id_alias=alias,
                countinference=countinference,
                service_secret=service_secret,
            )

            # Inference is addressed by the ID the caller used when it was an alias.
            target_id = resolved_id if alias is None else original_id
            return orjson_response(
                self.model_manager.infer_from_request_sync(
                    target_id,
                    inference_request,
                    **kwargs,
                )
            )

        def process_workflow_inference_request(
            workflow_request: WorkflowInferenceRequest,
            workflow_specification: dict,
            background_tasks: Optional[BackgroundTasks],
            profiler: WorkflowsProfiler,
        ) -> WorkflowInferenceResponse:
            """Build an execution engine for a workflow, run it and package the outputs.

            Args:
                workflow_request (WorkflowInferenceRequest): Request carrying the API key, inputs, excluded fields and optional workflow ID.
                workflow_specification (dict): The workflow definition to execute.
                background_tasks (Optional[BackgroundTasks]): Background task pool made available to the workflow.
                profiler (WorkflowsProfiler): Profiler used during execution; its trace is returned with the outputs.

            Returns:
                WorkflowInferenceResponse: Filtered outputs plus the profiler trace, passed through `orjson_response`.
            """
            workflow_id = workflow_request.workflow_id
            if workflow_id:
                # Publish the ID through the context variable for later readers.
                request_workflow_id.set(workflow_id)

            init_parameters = {
                "workflows_core.model_manager": model_manager,
                "workflows_core.api_key": workflow_request.api_key,
                "workflows_core.background_tasks": background_tasks,
            }
            with start_span("workflow.init", {"workflow.id": workflow_id or ""}):
                engine = ExecutionEngine.init(
                    workflow_definition=workflow_specification,
                    init_parameters=init_parameters,
                    max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
                    prevent_local_images_loading=True,
                    profiler=profiler,
                    executor=self.shared_thread_pool_executor,
                    workflow_id=workflow_id,
                )

            # Not every request type defines `is_preview`; default to False.
            preview_mode = getattr(workflow_request, "is_preview", False)
            raw_results = engine.run(
                runtime_parameters=workflow_request.inputs,
                serialize_results=True,
                _is_preview=preview_mode,
            )

            with profiler.profile_execution_phase(
                name="workflow_results_filtering",
                categories=["inference_package_operation"],
            ):
                filtered_outputs = filter_out_unwanted_workflow_outputs(
                    workflow_results=raw_results,
                    excluded_fields=workflow_request.excluded_fields,
                )

            return orjson_response(
                response=WorkflowInferenceResponse(
                    outputs=filtered_outputs,
                    profiler_trace=profiler.export_trace(),
                )
            )

        def load_core_model(
            inference_request: InferenceRequest,
            api_key: Optional[str] = None,
            core_model: Optional[str] = None,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ) -> str:
            """Loads a core model (e.g., "clip" or "sam") into the model manager.

            The model ID is built as `<core_model>/<version>`, where the version
            is read from the request attribute named `<core_model>_version_id`.

            Args:
                inference_request (InferenceRequest): The request containing version and other details.
                api_key (Optional[str]): The API key for the request. When truthy, it overrides the key stored on the request.
                core_model (Optional[str]): The core model type, e.g., "clip" or "sam". Normally bound via `functools.partial`.
                countinference (Optional[bool]): Whether to count inference or not.
                service_secret (Optional[str]): The service secret for the request.

            Returns:
                str: The core model ID.

            Raises:
                AttributeError: If the request has no `<core_model>_version_id` attribute.
            """
            if api_key:
                inference_request.api_key = api_key
            version_id_field = f"{core_model}_version_id"
            version_id = getattr(inference_request, version_id_field)
            core_model_id = f"{core_model}/{version_id}"
            self.model_manager.add_model(
                core_model_id,
                inference_request.api_key,
                endpoint_type=ModelEndpointType.CORE_MODEL,
                countinference=countinference,
                service_secret=service_secret,
            )
            return core_model_id

        # Convenience loaders: `load_core_model` with `core_model` pre-bound.
        load_clip_model = partial(load_core_model, core_model="clip")
        """Loads the CLIP model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The CLIP model ID.
        """

        load_pe_model = partial(load_core_model, core_model="perception_encoder")
        """Loads the Perception Encoder model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The Perception Encoder model ID.
        """

        load_sam_model = partial(load_core_model, core_model="sam")
        """Loads the SAM model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The SAM model ID.
        """

        load_sam2_model = partial(load_core_model, core_model="sam2")
        """Loads the SAM2 model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The SAM2 model ID.
        """

        load_doctr_model = partial(load_core_model, core_model="doctr")
        """Loads the DocTR model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The DocTR model ID.
        """

        load_easy_ocr_model = partial(load_core_model, core_model="easy_ocr")
        """Loads the EasyOCR model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The EasyOCR model ID.
        """

        load_paligemma_model = partial(load_core_model, core_model="paligemma")
        """Loads the PaliGemma model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The PaliGemma model ID.
        """

        load_grounding_dino_model = partial(
            load_core_model, core_model="grounding_dino"
        )
        """Loads the Grounding DINO model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The Grounding DINO model ID.
        """

        load_yolo_world_model = partial(load_core_model, core_model="yolo_world")
        """Loads the YOLO World model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The YOLO World model ID.
        """

        load_owlv2_model = partial(load_core_model, core_model="owlv2")
        """Loads the OWLv2 model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The OWLv2 model ID.
        """

        load_trocr_model = partial(load_core_model, core_model="trocr")
        """Loads the TrOCR model into the model manager.

        Args:
        Same as `load_core_model`.

        Returns:
        The TrOCR model ID.
        """

        @app.get(
            "/info",
            response_model=ServerVersionInfo,
            summary="Info",
            description="Get the server name and version number",
        )
        def root():
            """Report the server's name, version and unique identifier.

            Returns:
                ServerVersionInfo: Name, package version and the global server ID.
            """
            server_info = ServerVersionInfo(
                name="Roboflow Inference Server",
                version=__version__,
                uuid=GLOBAL_INFERENCE_SERVER_ID,
            )
            return server_info

        @app.get(
            "/logs",
            summary="Get Recent Logs",
            description="Get recent application logs for debugging",
        )
        def get_logs(
            limit: Optional[int] = Query(
                100, description="Maximum number of log entries to return"
            ),
            level: Optional[str] = Query(
                None,
                description="Filter by log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
            ),
            since: Optional[str] = Query(
                None, description="Return logs since this ISO timestamp"
            ),
        ):
            """Get recent application logs from memory.

            Only available when ENABLE_IN_MEMORY_LOGS environment variable is set to 'true'.

            Args:
                limit: Maximum number of log entries (default 100); a falsy
                    value (None or 0) falls back to 100.
                level: Filter by log level
                since: ISO timestamp to filter logs since

            Returns:
                Dict with `logs` (the list of log entries) and `total_count`
                (the number of entries returned).

            Raises:
                HTTPException: 404 when in-memory logging is disabled; 500 when
                    the logging module cannot be imported.
            """
            # The import is lazy and sits inside the `try`, so a broken logging
            # module yields the descriptive 500 instead of an unhandled error.
            try:
                from inference.core.logging.memory_handler import (
                    get_recent_logs,
                    is_memory_logging_enabled,
                )
            except ImportError as error:
                raise HTTPException(
                    status_code=500, detail="Logging system not properly initialized"
                ) from error

            if not is_memory_logging_enabled():
                raise HTTPException(
                    status_code=404, detail="Logs endpoint not available"
                )

            try:
                logs = get_recent_logs(limit=limit or 100, level=level, since=since)
            except ImportError as error:
                # ModuleNotFoundError is a subclass of ImportError, so this one
                # clause covers both.
                raise HTTPException(
                    status_code=500, detail="Logging system not properly initialized"
                ) from error
            return {"logs": logs, "total_count": len(logs)}

        if not LAMBDA and GET_MODEL_REGISTRY_ENABLED:

            @app.get(
                "/model/registry",
                response_model=ModelsDescriptions,
                summary="Get model keys",
                description="Get the ID of each loaded model",
            )
            def registry():
                """Describe every model currently held by the model manager.

                Returns:
                    ModelsDescriptions: The object containing models descriptions
                """
                logger.debug("Reached /model/registry")
                return ModelsDescriptions.from_models_descriptions(
                    models_descriptions=self.model_manager.describe_models()
                )

        # The current AWS Lambda authorizer only supports path parameters, therefore we can only use the legacy infer route. This case statement excludes routes which won't work for the current Lambda authorizer.
        if not (LAMBDA or GCP_SERVERLESS):

            @app.post(
                "/model/add",
                response_model=ModelsDescriptions,
                summary="Load a model",
                description="Load the model with the given model ID",
            )
            @with_route_exceptions
            def model_add(
                request: AddModelRequest,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Resolve the requested model ID and load that model.

                Args:
                    request (AddModelRequest): The request containing the model ID and optional API key.
                    countinference (Optional[bool]): Whether to count inference or not.
                    service_secret (Optional[str]): The service secret for the request.

                Returns:
                    ModelsDescriptions: Descriptions of all models loaded after the addition.
                """
                logger.debug("Reached /model/add")
                resolved_id = resolve_roboflow_model_alias(model_id=request.model_id)
                logger.info(f"Loading model: {resolved_id}")
                self.model_manager.add_model(
                    resolved_id,
                    request.api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                return ModelsDescriptions.from_models_descriptions(
                    models_descriptions=self.model_manager.describe_models()
                )

            @app.post(
                "/model/remove",
                response_model=ModelsDescriptions,
                summary="Remove a model",
                description="Remove the model with the given model ID",
            )
            @with_route_exceptions
            def model_remove(request: ClearModelRequest):
                """Resolve the requested model ID and remove that model.

                Args:
                    request (ClearModelRequest): The request containing the model ID to be removed.

                Returns:
                    ModelsDescriptions: Descriptions of the models still loaded after the removal.
                """
                logger.debug("Reached /model/remove")
                resolved_id = resolve_roboflow_model_alias(model_id=request.model_id)
                self.model_manager.remove(resolved_id)
                return ModelsDescriptions.from_models_descriptions(
                    models_descriptions=self.model_manager.describe_models()
                )

            @app.post(
                "/model/clear",
                response_model=ModelsDescriptions,
                summary="Remove all models",
                description="Remove all loaded models",
            )
            @with_route_exceptions
            def model_clear():
                """Clear the model manager and report what remains loaded.

                Returns:
                    ModelsDescriptions: The object containing models descriptions
                """
                logger.debug("Reached /model/clear")
                self.model_manager.clear()
                return ModelsDescriptions.from_models_descriptions(
                    models_descriptions=self.model_manager.describe_models()
                )

        # these NEW endpoints need authentication protection
        if not LAMBDA and not GCP_SERVERLESS:

            @app.post(
                "/infer/object_detection",
                response_model=Union[
                    ObjectDetectionInferenceResponse,
                    List[ObjectDetectionInferenceResponse],
                    StubResponse,
                ],
                summary="Object detection infer",
                description="Run inference with the specified object detection model",
                response_model_exclude_none=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_object_detection(
                inference_request: ObjectDetectionInferenceRequest,
                background_tasks: BackgroundTasks,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Handle an object detection request via `process_inference_request`.

                The request is flagged as eligible for active learning.

                Args:
                    inference_request (ObjectDetectionInferenceRequest): The request containing the necessary details for object detection.
                    background_tasks: (BackgroundTasks) pool of fastapi background tasks
                    countinference (Optional[bool]): Whether to count inference or not.
                    service_secret (Optional[str]): The service secret for the request.

                Returns:
                    Union[ObjectDetectionInferenceResponse, List[ObjectDetectionInferenceResponse]]: The response containing the inference results.
                """
                logger.debug("Reached /infer/object_detection")
                result = process_inference_request(
                    inference_request,
                    active_learning_eligible=True,
                    background_tasks=background_tasks,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                return result

            @app.post(
                "/infer/instance_segmentation",
                response_model=Union[
                    InstanceSegmentationInferenceResponse, StubResponse
                ],
                summary="Instance segmentation infer",
                description="Run inference with the specified instance segmentation model",
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_instance_segmentation(
                inference_request: InstanceSegmentationInferenceRequest,
                background_tasks: BackgroundTasks,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Run inference with the specified instance segmentation model.

                Args:
                    inference_request (InstanceSegmentationInferenceRequest): The request containing the necessary details for instance segmentation.
                    background_tasks: (BackgroundTasks) pool of fastapi background tasks
                    countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                    service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

                Returns:
                    InstanceSegmentationInferenceResponse: The response containing the inference results.
                """
                # The message has no placeholders, so a plain literal is used.
                logger.debug("Reached /infer/instance_segmentation")
                return process_inference_request(
                    inference_request,
                    active_learning_eligible=True,
                    background_tasks=background_tasks,
                    countinference=countinference,
                    service_secret=service_secret,
                )

            @app.post(
                "/infer/semantic_segmentation",
                response_model=Union[
                    SemanticSegmentationInferenceResponse, StubResponse
                ],
                summary="Semantic segmentation infer",
                description="Run inference with the specified semantic segmentation model",
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_semantic_segmentation(
                inference_request: SemanticSegmentationInferenceRequest,
                background_tasks: BackgroundTasks,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Run inference with the specified semantic segmentation model.

                Args:
                    inference_request (SemanticSegmentationInferenceRequest): The request containing the necessary details for semantic segmentation.
                    background_tasks: (BackgroundTasks) pool of fastapi background tasks
                    countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                    service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

                Returns:
                    SemanticSegmentationInferenceResponse: The response containing the inference results.
                """
                # The message has no placeholders, so a plain literal is used.
                logger.debug("Reached /infer/semantic_segmentation")
                return process_inference_request(
                    inference_request,
                    active_learning_eligible=True,
                    background_tasks=background_tasks,
                    countinference=countinference,
                    service_secret=service_secret,
                )

            @app.post(
                "/infer/classification",
                response_model=Union[
                    ClassificationInferenceResponse,
                    MultiLabelClassificationInferenceResponse,
                    StubResponse,
                ],
                summary="Classification infer",
                description="Run inference with the specified classification model",
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_classification(
                inference_request: ClassificationInferenceRequest,
                background_tasks: BackgroundTasks,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Run inference with the specified classification model.

                Args:
                    inference_request (ClassificationInferenceRequest): The request containing the necessary details for classification.
                    background_tasks: (BackgroundTasks) pool of fastapi background tasks
                    countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                    service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

                Returns:
                    Union[ClassificationInferenceResponse, MultiLabelClassificationInferenceResponse]: The response containing the inference results.
                """
                # The message has no placeholders, so a plain literal is used.
                logger.debug("Reached /infer/classification")
                return process_inference_request(
                    inference_request,
                    active_learning_eligible=True,
                    background_tasks=background_tasks,
                    countinference=countinference,
                    service_secret=service_secret,
                )

            @app.post(
                "/infer/keypoints_detection",
                response_model=Union[KeypointsDetectionInferenceResponse, StubResponse],
                summary="Keypoints detection infer",
                description="Run inference with the specified keypoints detection model",
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_keypoints(
                inference_request: KeypointsDetectionInferenceRequest,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Run inference with the specified keypoints detection model.

                Args:
                    inference_request (KeypointsDetectionInferenceRequest): The request containing the necessary details for keypoints detection.
                    countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                    service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

                Returns:
                    KeypointsDetectionInferenceResponse: The response containing the inference results.
                """
                # The message has no placeholders, so a plain literal is used.
                logger.debug("Reached /infer/keypoints_detection")
                # Unlike the sibling /infer routes, no background tasks or
                # active-learning flag are passed for keypoints detection.
                return process_inference_request(
                    inference_request,
                    countinference=countinference,
                    service_secret=service_secret,
                )

        if not LAMBDA and (LMM_ENABLED or MOONDREAM2_ENABLED):

            @app.post(
                "/infer/lmm",
                response_model=Union[
                    LMMInferenceResponse,
                    List[LMMInferenceResponse],
                    StubResponse,
                ],
                summary="Large multi-modal model infer",
                description="Run inference with the specified large multi-modal model",
                response_model_exclude_none=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_lmm(
                inference_request: LMMInferenceRequest,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Run inference with the specified large multi-modal model.

                Args:
                    inference_request (LMMInferenceRequest): The request containing the necessary details for LMM inference.
                    api_key (Optional[str]): Roboflow API key read from the query string; forwarded to `process_inference_request`.
                    countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                    service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

                Returns:
                    Union[LMMInferenceResponse, List[LMMInferenceResponse]]: The response containing the inference results.
                """
                # The message has no placeholders, so a plain literal is used.
                logger.debug("Reached /infer/lmm")
                return process_inference_request(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )

            @app.post(
                "/infer/lmm/{model_id:path}",
                response_model=Union[
                    LMMInferenceResponse,
                    List[LMMInferenceResponse],
                    StubResponse,
                ],
                summary="Large multi-modal model infer with model ID in path",
                description="Run inference with the specified large multi-modal model. Model ID is specified in the URL path (can contain slashes).",
                response_model_exclude_none=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_lmm_with_model_id(
                model_id: str,
                inference_request: LMMInferenceRequest,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Run inference with the specified large multi-modal model.

                The model_id can be specified in the URL path. If model_id is also provided
                in the request body, it must match the path parameter.

                Args:
                    model_id (str): The model identifier from the URL path.
                    inference_request (LMMInferenceRequest): The request containing the necessary details for LMM inference.
                    api_key (Optional[str]): Roboflow API key read from the query string; forwarded to `process_inference_request`.
                    countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                    service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

                Returns:
                    Union[LMMInferenceResponse, List[LMMInferenceResponse]]: The response containing the inference results.

                Raises:
                    HTTPException: If model_id in path and request body don't match.
                """
                # Lazy %-formatting: the client-supplied path value is only
                # interpolated when debug logging is actually enabled.
                logger.debug("Reached /infer/lmm/%s", model_id)

                # Validate model_id consistency between path and request body
                body_model_id = inference_request.model_id
                if body_model_id is not None and body_model_id != model_id:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Model ID mismatch: path specifies '{model_id}' but request body specifies '{body_model_id}'",
                    )

                # Past the check above the body either omitted model_id or repeated
                # the path value, so the path value is written unconditionally.
                inference_request.model_id = model_id

                return process_inference_request(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )

        if not DISABLE_WORKFLOW_ENDPOINTS:

            @app.post(
                "/{workspace_name}/workflows/{workflow_id}/describe_interface",
                response_model=DescribeInterfaceResponse,
                summary="Endpoint to describe interface of predefined workflow",
                description="Checks Roboflow API for workflow definition, once acquired - describes workflow inputs and outputs",
            )
            @with_route_exceptions
            def describe_predefined_workflow_interface(
                workspace_name: str,
                workflow_id: str,
                workflow_request: PredefinedWorkflowDescribeInterfaceRequest,
            ) -> DescribeInterfaceResponse:
                """Describe the interface of a workflow identified by workspace and id.

                The specification is obtained through `get_workflow_specification`
                using the API key, cache flag and version id carried by the request,
                then handed to `handle_describe_workflows_interface`.

                Args:
                    workspace_name: Path segment passed as `workspace_id`.
                    workflow_id: Path segment identifying the workflow.
                    workflow_request: Body providing `api_key`, `use_cache` and
                        `workflow_version_id`.

                Returns:
                    The result of `handle_describe_workflows_interface`.
                """
                return handle_describe_workflows_interface(
                    definition=get_workflow_specification(
                        api_key=workflow_request.api_key,
                        workspace_id=workspace_name,
                        workflow_id=workflow_id,
                        use_cache=workflow_request.use_cache,
                        workflow_version_id=workflow_request.workflow_version_id,
                    ),
                )

            @app.post(
                "/workflows/describe_interface",
                response_model=DescribeInterfaceResponse,
                summary="Endpoint to describe interface of workflow given in request",
                description="Parses workflow definition and retrieves describes inputs and outputs",
            )
            @with_route_exceptions
            def describe_workflow_interface(
                workflow_request: WorkflowSpecificationDescribeInterfaceRequest,
            ) -> DescribeInterfaceResponse:
                """Describe the interface of the workflow specification sent in the body.

                Args:
                    workflow_request: Body whose `specification` is described.

                Returns:
                    The result of `handle_describe_workflows_interface`.
                """
                inline_specification = workflow_request.specification
                return handle_describe_workflows_interface(
                    definition=inline_specification
                )

            @app.post(
                "/{workspace_name}/workflows/{workflow_id}",
                response_model=WorkflowInferenceResponse,
                summary="Endpoint to run predefined workflow",
                description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body",
            )
            @app.post(
                "/infer/workflows/{workspace_name}/{workflow_id}",
                response_model=WorkflowInferenceResponse,
                summary="[LEGACY] Endpoint to run predefined workflow",
                description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body. This endpoint is deprecated and will be removed end of Q2 2024",
                deprecated=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_from_predefined_workflow(
                workspace_name: str,
                workflow_id: str,
                workflow_request: PredefinedWorkflowInferenceRequest,
                background_tasks: BackgroundTasks,
            ) -> WorkflowInferenceResponse:
                """Fetch a predefined workflow specification and execute it.

                Args:
                    workspace_name: Path segment passed as `workspace_id` when
                        fetching the specification.
                    workflow_id: Path segment identifying the workflow; also copied
                        onto the request when the request carries no workflow id.
                    workflow_request: Body with API key, cache/version selectors and
                        the profiling switch.
                    background_tasks: FastAPI background tasks; replaced with `None`
                        when `LAMBDA` or `GCP_SERVERLESS` is set.

                Returns:
                    The result of `process_workflow_inference_request`.
                """
                # TODO: get rid of async: https://github.com/roboflow/inference/issues/569
                profiling_requested = (
                    ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling
                )
                # A real profiler is created only when profiling is enabled globally
                # and asked for by the request; otherwise the null profiler is used.
                profiler = (
                    BaseWorkflowsProfiler.init(
                        max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE,
                    )
                    if profiling_requested
                    else NullWorkflowsProfiler.init()
                )
                fetching_phase = profiler.profile_execution_phase(
                    name="workflow_definition_fetching",
                    categories=["inference_package_operation"],
                )
                with fetching_phase:
                    workflow_specification = get_workflow_specification(
                        api_key=workflow_request.api_key,
                        workspace_id=workspace_name,
                        workflow_id=workflow_id,
                        use_cache=workflow_request.use_cache,
                        workflow_version_id=workflow_request.workflow_version_id,
                    )
                if not workflow_request.workflow_id:
                    workflow_request.workflow_id = workflow_id
                if not workflow_specification.get("id"):
                    logger.warning(
                        "Internal workflow ID missing in specification for '%s'",
                        workflow_id,
                    )
                serverless_runtime = LAMBDA or GCP_SERVERLESS
                return process_workflow_inference_request(
                    workflow_request=workflow_request,
                    workflow_specification=workflow_specification,
                    background_tasks=None if serverless_runtime else background_tasks,
                    profiler=profiler,
                )

            @app.post(
                "/workflows/run",
                response_model=WorkflowInferenceResponse,
                summary="Endpoint to run workflow specification provided in payload",
                description="Parses and executes workflow specification, injecting runtime parameters from request body.",
            )
            @app.post(
                "/infer/workflows",
                response_model=WorkflowInferenceResponse,
                summary="[LEGACY] Endpoint to run workflow specification provided in payload",
                description="Parses and executes workflow specification, injecting runtime parameters from request body. This endpoint is deprecated and will be removed end of Q2 2024.",
                deprecated=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def infer_from_workflow(
                workflow_request: WorkflowSpecificationInferenceRequest,
                background_tasks: BackgroundTasks,
            ) -> WorkflowInferenceResponse:
                """Execute the workflow specification supplied in the request body.

                Args:
                    workflow_request: Body carrying the `specification` to run and
                        the profiling switch.
                    background_tasks: FastAPI background tasks; replaced with `None`
                        when `LAMBDA` or `GCP_SERVERLESS` is set.

                Returns:
                    The result of `process_workflow_inference_request`.
                """
                # TODO: get rid of async: https://github.com/roboflow/inference/issues/569
                profiling_requested = (
                    ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling
                )
                profiler = (
                    BaseWorkflowsProfiler.init(
                        max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE,
                    )
                    if profiling_requested
                    else NullWorkflowsProfiler.init()
                )
                serverless_runtime = LAMBDA or GCP_SERVERLESS
                return process_workflow_inference_request(
                    workflow_request=workflow_request,
                    workflow_specification=workflow_request.specification,
                    background_tasks=None if serverless_runtime else background_tasks,
                    profiler=profiler,
                )

            @app.get(
                "/workflows/execution_engine/versions",
                response_model=ExecutionEngineVersions,
                summary="Returns available Execution Engine versions sorted from oldest to newest",
                description="Returns available Execution Engine versions sorted from oldest to newest",
            )
            @with_route_exceptions
            def get_execution_engine_versions() -> ExecutionEngineVersions:
                """Wrap the output of `get_available_versions` in the response model."""
                # TODO: get rid of async: https://github.com/roboflow/inference/issues/569
                return ExecutionEngineVersions(versions=get_available_versions())

            @app.get(
                "/workflows/blocks/describe",
                response_model=WorkflowsBlocksDescription,
                summary="[LEGACY] Endpoint to get definition of workflows blocks that are accessible",
                description="Endpoint provides detailed information about workflows building blocks that are "
                "accessible in the inference server. This information could be used to programmatically "
                "build / display workflows.",
                deprecated=True,
            )
            @with_route_exceptions
            def describe_workflows_blocks(
                request: Request,
                air_gapped: bool = Query(False),
            ) -> Union[WorkflowsBlocksDescription, Response]:
                """Describe the available workflow blocks (GET variant).

                Args:
                    request: Incoming request, passed to `gzip_response_if_requested`.
                    air_gapped: Query flag forwarded to the describe handler.

                Returns:
                    Whatever `gzip_response_if_requested` produces for the description.
                """
                blocks_description = handle_describe_workflows_blocks_request(
                    air_gapped=air_gapped,
                )
                return gzip_response_if_requested(
                    request=request,
                    response=blocks_description,
                )

            @app.post(
                "/workflows/blocks/describe",
                response_model=WorkflowsBlocksDescription,
                summary="[EXPERIMENTAL] Endpoint to get definition of workflows blocks that are accessible",
                description="Endpoint provides detailed information about workflows building blocks that are "
                "accessible in the inference server. This information could be used to programmatically "
                "build / display workflows. Additionally - in request body one can specify list of "
                "dynamic blocks definitions which will be transformed into blocks and used to generate "
                "schemas and definitions of connections",
            )
            @with_route_exceptions
            def describe_workflows_blocks(
                request: Request,
                request_payload: Optional[DescribeBlocksRequest] = None,
                air_gapped: bool = Query(False),
            ) -> Union[WorkflowsBlocksDescription, Response]:
                """Describe the available workflow blocks (POST variant).

                Args:
                    request: Incoming request; its query string may supply `api_key`
                        and it is passed to `gzip_response_if_requested`.
                    request_payload: Optional body with dynamic block definitions,
                        the requested execution engine version and an API key.
                    air_gapped: Query flag forwarded to the describe handler.

                Returns:
                    Whatever `gzip_response_if_requested` produces for the description.
                """
                # TODO: get rid of async: https://github.com/roboflow/inference/issues/569
                if request_payload is None:
                    # Without a body nothing is customised; the query string is not consulted.
                    dynamic_blocks_definitions = None
                    requested_execution_engine_version = None
                    api_key = None
                else:
                    dynamic_blocks_definitions = (
                        request_payload.dynamic_blocks_definitions
                    )
                    requested_execution_engine_version = (
                        request_payload.execution_engine_version
                    )
                    # The body key takes precedence; the query string is the fallback.
                    api_key = request_payload.api_key or request.query_params.get(
                        "api_key", None
                    )
                blocks_description = handle_describe_workflows_blocks_request(
                    dynamic_blocks_definitions=dynamic_blocks_definitions,
                    requested_execution_engine_version=requested_execution_engine_version,
                    api_key=api_key,
                    air_gapped=air_gapped,
                )
                return gzip_response_if_requested(
                    request=request,
                    response=blocks_description,
                )

            @app.get(
                "/workflows/definition/schema",
                response_model=WorkflowsBlocksSchemaDescription,
                summary="Endpoint to fetch the workflows block schema",
                description="Endpoint to fetch the schema of all available blocks. This information can be "
                "used to validate workflow definitions and suggest syntax in the JSON editor.",
            )
            @with_route_exceptions
            def get_workflow_schema(
                request: Request,
            ) -> WorkflowsBlocksSchemaDescription:
                """Return the workflow blocks schema description.

                Args:
                    request: Incoming request, passed to `gzip_response_if_requested`.

                Returns:
                    Whatever `gzip_response_if_requested` produces for the schema.
                """
                return gzip_response_if_requested(
                    request,
                    response=get_workflow_schema_description(),
                )

            @app.post(
                "/workflows/blocks/dynamic_outputs",
                response_model=List[OutputDefinition],
                summary="[EXPERIMENTAL] Endpoint to get definition of dynamic output for workflow step",
                description="Endpoint to be used when step outputs can be discovered only after "
                "filling manifest with data.",
            )
            @with_route_exceptions
            def get_dynamic_block_outputs(
                step_manifest: Dict[str, Any],
            ) -> List[OutputDefinition]:
                """Resolve the actual outputs of a single step manifest.

                The manifest is wrapped in a one-step workflow definition, parsed
                against the loaded workflow blocks, and the parsed step is asked for
                its actual outputs.

                Args:
                    step_manifest: Raw manifest of the step to inspect.

                Returns:
                    Output definitions reported by the parsed step.
                """
                # TODO: get rid of async: https://github.com/roboflow/inference/issues/569
                # Potentially TODO: dynamic blocks do not support dynamic outputs, but if it changes
                # we need to provide dynamic blocks manifests here
                parsed_definition = parse_workflow_definition(
                    raw_workflow_definition={
                        "version": "1.0",
                        "inputs": [],
                        "steps": [step_manifest],
                        "outputs": [],
                    },
                    available_blocks=load_workflow_blocks(),
                )
                # The wrapper definition holds exactly one step: the manifest passed in.
                only_step = parsed_definition.steps[0]
                return only_step.get_actual_outputs()

            @app.post(
                "/workflows/validate",
                response_model=WorkflowValidationStatus,
                summary="[EXPERIMENTAL] Endpoint to validate",
                description="Endpoint provides a way to check validity of JSON workflow definition.",
            )
            @with_route_exceptions
            def validate_workflow(
                specification: dict,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
            ) -> WorkflowValidationStatus:
                """Validate a workflow definition by initialising an execution engine.

                Args:
                    specification: Workflow definition to check.
                    api_key: Optional API key placed in the engine init parameters.

                Returns:
                    A status of "ok" when `ExecutionEngine.init` completes without
                    raising; errors raised by it propagate to the caller.
                """
                # TODO: get rid of async: https://github.com/roboflow/inference/issues/569
                init_parameters = {
                    "workflows_core.model_manager": model_manager,
                    "workflows_core.api_key": api_key,
                    "workflows_core.background_tasks": None,
                    "workflows_core.step_execution_mode": StepExecutionMode(
                        WORKFLOWS_STEP_EXECUTION_MODE
                    ),
                }
                # The engine instance is discarded: only successful initialisation matters.
                ExecutionEngine.init(
                    workflow_definition=specification,
                    init_parameters=init_parameters,
                    max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
                    prevent_local_images_loading=True,
                )
                return WorkflowValidationStatus(status="ok")

        if WEBRTC_WORKER_ENABLED:

            @app.post(
                "/initialise_webrtc_worker",
                response_model=InitializeWebRTCResponse,
                summary="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function",
                description="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function",
            )
            @with_route_exceptions_async
            async def initialise_webrtc_worker(
                request: WebRTCWorkerRequest,
                r: Request,
            ) -> InitializeWebRTCResponse:
                """Start a WebRTC worker and translate its result into a response.

                Args:
                    request (WebRTCWorkerRequest): Payload handed to `start_worker`.
                        Its `is_preview` flag is set here when the `origin` header
                        equals `BUILDER_ORIGIN` and the URL matches the roboflow
                        pattern below.
                    r (Request): Raw request, used only for its `origin` header and URL.

                Returns:
                    InitializeWebRTCResponse: Success status with the `sdp` and
                        `type` of the worker's answer.

                Raises:
                    WorkflowSyntaxError: Rebuilt from the worker result, including the
                        inner error and block errors when the worker reported them.
                    WorkflowError: Rebuilt from the worker result.
                    Exception: Any other reported failure, re-created from the
                        `expected_exceptions` table (plain `Exception` when the
                        reported type name is not in the table).
                """
                if str(r.headers.get("origin")).lower() == BUILDER_ORIGIN.lower():
                    if re.search(
                        r"^https://[^.]+\.roboflow\.[^./]+/", str(r.url).lower()
                    ):
                        request.is_preview = True

                logger.debug("Received initialise_webrtc_worker request")
                worker_result: WebRTCWorkerResult = await start_worker(
                    webrtc_request=request,
                )
                if worker_result.exception_type is not None:
                    if worker_result.exception_type == "WorkflowSyntaxError":
                        # Reconstruct exception from serialized worker result.
                        # We dynamically create an exception class to preserve
                        # the original type name (e.g., "ValidationError") for
                        # the inner_error_type property, since exceptions can't
                        # be pickled across the worker process boundary.
                        inner_error = None
                        if worker_result.inner_error and worker_result.inner_error_type:
                            inner_error = type(
                                worker_result.inner_error_type,
                                (Exception,),
                                {},
                            )(worker_result.inner_error)

                        blocks_errors = None
                        if worker_result.blocks_errors:
                            blocks_errors = [
                                WorkflowBlockError(**be)
                                for be in worker_result.blocks_errors
                            ]
                        raise WorkflowSyntaxError(
                            public_message=worker_result.error_message,
                            context=worker_result.error_context,
                            inner_error=inner_error,
                            blocks_errors=blocks_errors,
                        )
                    if worker_result.exception_type == "WorkflowError":
                        raise WorkflowError(
                            public_message=worker_result.error_message,
                            context=worker_result.error_context,
                        )
                    # Log before building the exception so the failure is recorded
                    # even if the mapped class cannot be constructed from a message.
                    logger.error(
                        "Initialise webrtc worker failed with %s: %s",
                        worker_result.exception_type,
                        worker_result.error_message,
                    )
                    # Maps the type name reported by the worker back to a class.
                    expected_exceptions = {
                        "Exception": Exception,
                        "KeyError": KeyError,
                        "MissingApiKeyError": MissingApiKeyError,
                        "NotImplementedError": NotImplementedError,
                        "RoboflowAPINotAuthorizedError": RoboflowAPINotAuthorizedError,
                        "RoboflowAPINotNotFoundError": RoboflowAPINotNotFoundError,
                        "ValidationError": ValidationError,
                        "WebRTCConfigurationError": WebRTCConfigurationError,
                    }
                    exception_class = expected_exceptions.get(
                        worker_result.exception_type, Exception
                    )
                    # NOTE(review): every mapped class is assumed to accept a single
                    # message argument - confirm this holds for ValidationError.
                    raise exception_class(worker_result.error_message)
                logger.debug("Returning initialise_webrtc_worker response")
                return InitializeWebRTCResponse(
                    context=CommandContext(),
                    status=OperationStatus.SUCCESS,
                    sdp=worker_result.answer.sdp,
                    type=worker_result.answer.type,
                )

            @app.post(
                "/webrtc/session/heartbeat",
                summary="WebRTC session heartbeat",
            )
            @with_route_exceptions_async
            async def webrtc_session_heartbeat(
                request: WebRTCSessionHeartbeatRequest,
            ) -> dict:
                """Receive heartbeat for an active WebRTC session.

                Callers hit this endpoint periodically to indicate
                that their session is still active. The session will be removed from
                the quota count if no heartbeat is received within the TTL period.

                Requires api_key for authentication.
                """
                try:
                    workspace_id = await get_roboflow_workspace_async(
                        api_key=request.api_key
                    )
                except (RoboflowAPINotAuthorizedError, WorkspaceLoadError) as error:
                    # Authorization or workspace-load failures are reported as 401;
                    # the original error is kept as the explicit cause.
                    raise HTTPException(
                        status_code=401,
                        detail={"status": "error", "message": "unauthorized"},
                    ) from error
                if not workspace_id:
                    # The lookup did not raise but returned no workspace: reported as 500.
                    raise HTTPException(
                        status_code=500,
                        detail={
                            "status": "error",
                            "message": "failed to retrieve workspace",
                        },
                    )

                session_refreshed = refresh_webrtc_session(
                    workspace_id=workspace_id,
                    session_id=request.session_id,
                )
                if not session_refreshed:
                    # A falsy refresh result is reported as an unknown session (404).
                    raise HTTPException(
                        status_code=404,
                        detail={"status": "error", "message": "session not found"},
                    )
                return {"status": "ok"}

            @app.post(
                "/webrtc/session/heartbeat/end",
                summary="End WebRTC session",
            )
            @with_route_exceptions_async
            async def webrtc_session_end(
                request: WebRTCSessionHeartbeatRequest,
            ) -> dict:
                """End a WebRTC session and immediately free the quota slot.

                Requires api_key for authentication.
                """
                try:
                    workspace_id = await get_roboflow_workspace_async(
                        api_key=request.api_key
                    )
                except (RoboflowAPINotAuthorizedError, WorkspaceLoadError) as error:
                    # Authorization or workspace-load failures are reported as 401;
                    # the original error is kept as the explicit cause.
                    raise HTTPException(
                        status_code=401,
                        detail={"status": "error", "message": "unauthorized"},
                    ) from error
                if not workspace_id:
                    # The lookup did not raise but returned no workspace: reported as 500.
                    raise HTTPException(
                        status_code=500,
                        detail={
                            "status": "error",
                            "message": "failed to retrieve workspace",
                        },
                    )

                # The result of deregistration is not inspected: the route answers
                # "ok" whenever the call returns without raising.
                deregister_webrtc_session(
                    workspace_id=workspace_id,
                    session_id=request.session_id,
                )
                return {"status": "ok"}

        if ENABLE_STREAM_API:

            @app.get(
                "/inference_pipelines/list",
                response_model=ListPipelinesResponse,
                summary="[EXPERIMENTAL] List active InferencePipelines",
                description="[EXPERIMENTAL] Listing all active InferencePipelines processing videos",
            )
            @with_route_exceptions_async
            async def list_pipelines(_: Request) -> ListPipelinesResponse:
                """Return the pipeline listing reported by the stream manager client.

                The incoming request object is accepted but not used.
                """
                listing = await self.stream_manager_client.list_pipelines()
                return listing

            @app.get(
                "/inference_pipelines/{pipeline_id}/status",
                response_model=InferencePipelineStatusResponse,
                summary="[EXPERIMENTAL] Get status of InferencePipeline",
                description="[EXPERIMENTAL] Get status of InferencePipeline",
            )
            @with_route_exceptions_async
            async def get_status(pipeline_id: str) -> InferencePipelineStatusResponse:
                """Fetch the status of one pipeline from the stream manager client.

                Args:
                    pipeline_id: Identifier taken from the URL path.

                Returns:
                    The value produced by ``stream_manager_client.get_status``.
                """
                status = await self.stream_manager_client.get_status(
                    pipeline_id=pipeline_id
                )
                return status

            @app.post(
                "/inference_pipelines/initialise",
                response_model=CommandResponse,
                summary="[EXPERIMENTAL] Starts new InferencePipeline",
                description="[EXPERIMENTAL] Starts new InferencePipeline",
            )
            @with_route_exceptions_async
            async def initialise(request: InitialisePipelinePayload) -> CommandResponse:
                """Forward a pipeline initialisation request to the stream manager client.

                Args:
                    request: Initialisation payload, passed through unchanged.

                Returns:
                    The value produced by ``stream_manager_client.initialise_pipeline``.
                """
                command_result = await self.stream_manager_client.initialise_pipeline(
                    initialisation_request=request
                )
                return command_result

            @app.post(
                "/inference_pipelines/initialise_webrtc",
                response_model=InitializeWebRTCPipelineResponse,
                summary="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track",
                description="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track",
            )
            @with_route_exceptions_async
            async def initialise_webrtc_inference_pipeline(
                request: InitialiseWebRTCPipelinePayload,
            ) -> CommandResponse:
                """Forward a WebRTC pipeline initialisation request to the stream manager client.

                Debug log lines are emitted before the call and before returning.

                Args:
                    request: WebRTC initialisation payload, passed through unchanged.

                Returns:
                    The value produced by
                    ``stream_manager_client.initialise_webrtc_pipeline``.
                """
                # NOTE(review): the return annotation is CommandResponse while the route
                # declares InitializeWebRTCPipelineResponse as response_model - confirm
                # which one matches what the client actually returns.
                logger.debug("Received initialise webrtc inference pipeline request")
                webrtc_response = (
                    await self.stream_manager_client.initialise_webrtc_pipeline(
                        initialisation_request=request
                    )
                )
                logger.debug("Returning initialise webrtc inference pipeline response")
                return webrtc_response

            @app.post(
                "/inference_pipelines/{pipeline_id}/pause",
                response_model=CommandResponse,
                summary="[EXPERIMENTAL] Pauses the InferencePipeline",
                description="[EXPERIMENTAL] Pauses the InferencePipeline",
            )
            @with_route_exceptions_async
            async def pause(pipeline_id: str) -> CommandResponse:
                """Ask the stream manager client to pause the given pipeline.

                Args:
                    pipeline_id: Identifier taken from the URL path.

                Returns:
                    The value produced by ``stream_manager_client.pause_pipeline``.
                """
                command_result = await self.stream_manager_client.pause_pipeline(
                    pipeline_id=pipeline_id
                )
                return command_result

            @app.post(
                "/inference_pipelines/{pipeline_id}/resume",
                response_model=CommandResponse,
                summary="[EXPERIMENTAL] Resumes the InferencePipeline",
                description="[EXPERIMENTAL] Resumes the InferencePipeline",
            )
            @with_route_exceptions_async
            async def resume(pipeline_id: str) -> CommandResponse:
                """Ask the stream manager client to resume the given pipeline.

                Args:
                    pipeline_id: Identifier taken from the URL path.

                Returns:
                    The value produced by ``stream_manager_client.resume_pipeline``.
                """
                command_result = await self.stream_manager_client.resume_pipeline(
                    pipeline_id=pipeline_id
                )
                return command_result

            @app.post(
                "/inference_pipelines/{pipeline_id}/terminate",
                response_model=CommandResponse,
                summary="[EXPERIMENTAL] Terminates the InferencePipeline",
                description="[EXPERIMENTAL] Terminates the InferencePipeline",
            )
            @with_route_exceptions_async
            async def terminate(pipeline_id: str) -> CommandResponse:
                """Ask the stream manager client to terminate the given pipeline.

                Args:
                    pipeline_id: Identifier taken from the URL path.

                Returns:
                    The value produced by ``stream_manager_client.terminate_pipeline``.
                """
                command_result = await self.stream_manager_client.terminate_pipeline(
                    pipeline_id=pipeline_id
                )
                return command_result

            @app.get(
                "/inference_pipelines/{pipeline_id}/consume",
                response_model=ConsumePipelineResponse,
                summary="[EXPERIMENTAL] Consumes InferencePipeline result",
                description="[EXPERIMENTAL] Consumes InferencePipeline result",
            )
            @with_route_exceptions_async
            async def consume(
                pipeline_id: str,
                request: Optional[ConsumeResultsPayload] = None,
            ) -> ConsumePipelineResponse:
                """Fetch a result of the given pipeline through the stream manager client.

                Args:
                    pipeline_id: Identifier taken from the URL path.
                    request: Optional payload; when omitted, a default-constructed
                        ``ConsumeResultsPayload`` supplies ``excluded_fields``.

                Returns:
                    The value produced by
                    ``stream_manager_client.consume_pipeline_result``.
                """
                payload = ConsumeResultsPayload() if request is None else request
                return await self.stream_manager_client.consume_pipeline_result(
                    pipeline_id=pipeline_id,
                    excluded_fields=payload.excluded_fields,
                )

        class ModelInitState:
            """Mutable record of how startup model initialization is progressing."""

            def __init__(self):
                # Lock that writers/readers take around updates to the fields below.
                self.lock = Lock()
                # One entry per model whose load failed.
                self.initialization_errors = []
                # Starts False; flipped to True by the code that owns initialization.
                self.is_ready = False

        # Single shared state object; the readiness endpoint and the preload
        # routine defined below both close over it.
        model_init_state = ModelInitState()

        # Preloading is requested when either PRELOAD_MODELS or PINNED_MODELS is truthy.
        should_preload = PRELOAD_MODELS or PINNED_MODELS
        if not should_preload:
            # Nothing to load at startup, so report ready straight away.
            model_init_state.is_ready = True

        # Enable preloading models at startup
        if should_preload:

            def initialize_models(state: ModelInitState):
                """Preload the configured models, then flag the service as ready.

                Every model id found in PRELOAD_MODELS and PINNED_MODELS
                (de-duplicated, first occurrence wins) is loaded on a two-worker
                thread pool. Ids that also appear in PINNED_MODELS are pinned after a
                successful load when the model manager exposes ``pin_model``.
                Failures and timeouts are appended to ``state.initialization_errors``
                as ``(model_id, message)`` tuples; they do not prevent
                ``state.is_ready`` from being set to True at the end.

                Args:
                    state: Shared initialization state. All writes to it made here
                        happen while holding ``state.lock``.
                """
                # Upper bound, in seconds, on waiting for each individual future.
                load_timeout_seconds = 300

                def load_model(model_id):
                    """Load a single model and pin it when requested.

                    An exception raised by ``add_model`` is logged and recorded in
                    ``state.initialization_errors`` instead of propagating.
                    """
                    t_start = time.perf_counter()
                    de_aliased = resolve_roboflow_model_alias(model_id=model_id)
                    # An alias is only passed on when resolution changed the id.
                    model_id_alias = model_id if de_aliased != model_id else None
                    loaded_model_id = model_id_alias or de_aliased
                    logger.info(
                        f"Preload: starting model load for '{model_id}' (resolved: '{de_aliased}')"
                    )
                    try:
                        self.model_manager.add_model(
                            de_aliased,
                            PRELOAD_API_KEY,
                            model_id_alias=model_id_alias,
                        )
                        load_time = time.perf_counter() - t_start
                        logger.info(
                            f"Preload: model '{model_id}' loaded successfully in {load_time:.1f}s"
                        )
                    except Exception as e:
                        load_time = time.perf_counter() - t_start
                        error_msg = f"Preload: error loading model '{model_id}' after {load_time:.1f}s: {e}"
                        logger.error(error_msg)
                        with state.lock:
                            state.initialization_errors.append((model_id, str(e)))
                        return

                    # Pin if this model is in PINNED_MODELS
                    if (
                        PINNED_MODELS
                        and model_id in PINNED_MODELS
                        and hasattr(self.model_manager, "pin_model")
                    ):
                        self.model_manager.pin_model(loaded_model_id)

                # dict.fromkeys drops duplicates while keeping the original order.
                all_models = list(
                    dict.fromkeys((PRELOAD_MODELS or []) + (PINNED_MODELS or []))
                )
                if all_models:
                    model_loading_executor = ThreadPoolExecutor(max_workers=2)
                    try:
                        # Queue every load up front; the pool runs two at a time.
                        loaded_futures: List[Tuple[str, Future]] = [
                            (
                                model_id,
                                model_loading_executor.submit(
                                    load_model, model_id=model_id
                                ),
                            )
                            for model_id in all_models
                        ]

                        for model_id, future in loaded_futures:
                            try:
                                future.result(timeout=load_timeout_seconds)
                            except (
                                TimeoutError,
                                CancelledError,
                                concurrent.futures.TimeoutError,
                            ):
                                timeout_msg = (
                                    "Could not finalise model loading before timeout"
                                )
                                logger.error(
                                    f"Preload: model '{model_id}': {timeout_msg}"
                                )
                                # Worker threads may still be appending errors, so
                                # this write takes the lock like all the others.
                                with state.lock:
                                    state.initialization_errors.append(
                                        (model_id, timeout_msg)
                                    )
                                # Only prevents a not-yet-started load from running;
                                # a load already in progress keeps going.
                                future.cancel()
                            except Exception as e:
                                logger.error(
                                    f"Preload: unexpected error for model '{model_id}': {e}"
                                )
                                with state.lock:
                                    state.initialization_errors.append(
                                        (model_id, str(e))
                                    )
                    finally:
                        # Release the pool without blocking on loads that timed out.
                        model_loading_executor.shutdown(wait=False)

                # Update the readiness state in a thread-safe manner
                with state.lock:
                    state.is_ready = True

            @app.on_event("startup")
            def startup_model_init():
                """Launch model initialization on a daemon thread and return at once."""
                # The thread is started but never joined here, so this hook does not
                # wait for model initialization to finish.
                preload_thread = Thread(
                    target=initialize_models,
                    args=(model_init_state,),
                    daemon=True,
                )
                preload_thread.start()
                logger.info("Model initialization started in the background.")

        # Attach health/readiness endpoints
        @app.get("/readiness", status_code=200)
        def readiness(
            state: ModelInitState = Depends(lambda: model_init_state),
        ):
            """Readiness endpoint for Kubernetes readiness probe."""
            # Snapshot the flag while holding the lock, then build the reply outside it.
            with state.lock:
                ready = state.is_ready
            if not ready:
                return JSONResponse(content={"status": "not ready"}, status_code=503)
            return {"status": "ready"}

        @app.get("/healthz", status_code=200)
        def healthz():
            """Health endpoint for Kubernetes liveness probe.

            Verifies CUDA context health when running on GPU. Returns 503 if
            CUDA is corrupted (unrecoverable - requires process restart).
            """
            # Imported at call time rather than at module import.
            from inference.core.utils.cuda_health import check_cuda_health

            is_healthy, error = check_cuda_health()
            if not is_healthy:
                # The error detail goes to the log only; the response body carries
                # a fixed reason code.
                logger.error("CUDA health check failed: %s", error)
                unhealthy_body = {
                    "status": "unhealthy",
                    "reason": "cuda_error",
                }
                return JSONResponse(content=unhealthy_body, status_code=503)
            return {"status": "healthy"}

        if CORE_MODELS_ENABLED:
            if CORE_MODEL_CLIP_ENABLED:

                @app.post(
                    "/clip/embed_image",
                    response_model=ClipEmbeddingResponse,
                    summary="CLIP Image Embeddings",
                    description="Run the Open AI CLIP model to embed image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def clip_embed_image(
                    inference_request: ClipImageEmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Embed image data with the CLIP model.

                    Args:
                        inference_request: Request body describing the image to embed.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_clip_model``.
                        countinference: Forwarded unchanged to ``load_clip_model``.
                        service_secret: Forwarded unchanged to ``load_clip_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``ClipEmbeddingResponse`` as its response model.
                    """
                    logger.debug("Reached /clip/embed_image")
                    model_id = load_clip_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    embeddings = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return embeddings
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context and the call is recorded via trackUsage.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    return embeddings

                @app.post(
                    "/clip/embed_text",
                    response_model=ClipEmbeddingResponse,
                    summary="CLIP Text Embeddings",
                    description="Run the Open AI CLIP model to embed text data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def clip_embed_text(
                    inference_request: ClipTextEmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Embed text data with the CLIP model.

                    Args:
                        inference_request: Request body describing the text to embed.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_clip_model``.
                        countinference: Forwarded unchanged to ``load_clip_model``.
                        service_secret: Forwarded unchanged to ``load_clip_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``ClipEmbeddingResponse`` as its response model.
                    """
                    logger.debug("Reached /clip/embed_text")
                    model_id = load_clip_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    embeddings = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return embeddings
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context and the call is recorded via trackUsage.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    return embeddings

                @app.post(
                    "/clip/compare",
                    response_model=ClipCompareResponse,
                    summary="CLIP Compare",
                    description="Run the Open AI CLIP model to compute similarity scores.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def clip_compare(
                    inference_request: ClipCompareRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Compute similarity scores with the CLIP model.

                    Args:
                        inference_request: Request body describing the data to compare.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_clip_model``.
                        countinference: Forwarded unchanged to ``load_clip_model``.
                        service_secret: Forwarded unchanged to ``load_clip_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``ClipCompareResponse`` as its response model.
                    """
                    logger.debug("Reached /clip/compare")
                    model_id = load_clip_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    similarity = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return similarity
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context; a compare call is recorded with n=2.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor, n=2)
                    return similarity

            if CORE_MODEL_PE_ENABLED:

                @app.post(
                    "/perception_encoder/embed_image",
                    response_model=PerceptionEncoderEmbeddingResponse,
                    summary="PE Image Embeddings",
                    description="Run the Meta Perception Encoder model to embed image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def pe_embed_image(
                    inference_request: PerceptionEncoderImageEmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Embed image data with the Perception Encoder model.

                    Args:
                        inference_request: Request body describing the image to embed.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_pe_model``.
                        countinference: Forwarded unchanged to ``load_pe_model``.
                        service_secret: Forwarded unchanged to ``load_pe_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``PerceptionEncoderEmbeddingResponse`` as its
                        response model.
                    """
                    logger.debug("Reached /perception_encoder/embed_image")
                    model_id = load_pe_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    embeddings = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return embeddings
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context and the call is recorded via trackUsage.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    return embeddings

                @app.post(
                    "/perception_encoder/embed_text",
                    response_model=PerceptionEncoderEmbeddingResponse,
                    summary="Perception Encoder Text Embeddings",
                    description="Run the Meta Perception Encoder model to embed text data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def pe_embed_text(
                    inference_request: PerceptionEncoderTextEmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Embed text data with the Perception Encoder model.

                    Args:
                        inference_request: Request body describing the text to embed.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_pe_model``.
                        countinference: Forwarded unchanged to ``load_pe_model``.
                        service_secret: Forwarded unchanged to ``load_pe_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``PerceptionEncoderEmbeddingResponse`` as its
                        response model.
                    """
                    logger.debug("Reached /perception_encoder/embed_text")
                    model_id = load_pe_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    embeddings = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return embeddings
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context and the call is recorded via trackUsage.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    return embeddings

                @app.post(
                    "/perception_encoder/compare",
                    response_model=PerceptionEncoderCompareResponse,
                    summary="Perception Encoder Compare",
                    description="Run the Meta Perception Encoder model to compute similarity scores.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def pe_compare(
                    inference_request: PerceptionEncoderCompareRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Compute similarity scores with the Perception Encoder model.

                    Args:
                        inference_request: Request body describing the data to compare.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_pe_model``.
                        countinference: Forwarded unchanged to ``load_pe_model``.
                        service_secret: Forwarded unchanged to ``load_pe_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``PerceptionEncoderCompareResponse`` as its
                        response model.
                    """
                    logger.debug("Reached /perception_encoder/compare")
                    model_id = load_pe_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    similarity = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return similarity
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context; a compare call is recorded with n=2.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor, n=2)
                    return similarity

            if CORE_MODEL_GROUNDINGDINO_ENABLED:

                @app.post(
                    "/grounding_dino/infer",
                    response_model=ObjectDetectionInferenceResponse,
                    summary="Grounding DINO inference.",
                    description="Run the Grounding DINO zero-shot object detection model.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def grounding_dino_infer(
                    inference_request: GroundingDINOInferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Run object detection with the Grounding DINO model.

                    Args:
                        inference_request: Request body describing the image on which
                            to run object detection.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to
                            ``load_grounding_dino_model``.
                        countinference: Forwarded unchanged to
                            ``load_grounding_dino_model``.
                        service_secret: Forwarded unchanged to
                            ``load_grounding_dino_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``ObjectDetectionInferenceResponse`` as its
                        response model.
                    """
                    logger.debug("Reached /grounding_dino/infer")
                    model_id = load_grounding_dino_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    detections = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if not LAMBDA:
                        return detections
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context and the call is recorded via trackUsage.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    return detections

            if CORE_MODEL_YOLO_WORLD_ENABLED:

                @app.post(
                    "/yolo_world/infer",
                    response_model=ObjectDetectionInferenceResponse,
                    summary="YOLO-World inference.",
                    description="Run the YOLO-World zero-shot object detection model.",
                    response_model_exclude_none=True,
                )
                @with_route_exceptions
                @usage_collector("request")
                def yolo_world_infer(
                    inference_request: YOLOWorldInferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Run object detection with the YOLO-World model.

                    Debug log lines mark each stage (route reached, model loaded,
                    prediction available, usage recorded).

                    Args:
                        inference_request: Request body describing the image on which
                            to run object detection.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to
                            ``load_yolo_world_model``.
                        countinference: Forwarded unchanged to
                            ``load_yolo_world_model``.
                        service_secret: Forwarded unchanged to
                            ``load_yolo_world_model``.

                    Returns:
                        The result of ``model_manager.infer_from_request_sync``; the
                        route declares ``ObjectDetectionInferenceResponse`` as its
                        response model.
                    """
                    logger.debug("Reached /yolo_world/infer. Loading model")
                    model_id = load_yolo_world_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    # NOTE(review): "Staring" below looks like a typo for "Starting";
                    # the log text is left exactly as it was.
                    logger.debug("YOLOWorld model loaded. Staring the inference.")
                    detections = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    logger.debug("YOLOWorld prediction available.")
                    if not LAMBDA:
                        return detections
                    # Under LAMBDA the actor is read from the AWS event's authorizer
                    # context and the call is recorded via trackUsage.
                    request_context = request.scope["aws.event"]["requestContext"]
                    actor = request_context["authorizer"]["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    logger.debug("Usage of YOLOWorld denoted.")
                    return detections

            if CORE_MODEL_DOCTR_ENABLED:

                @app.post(
                    "/doctr/ocr",
                    response_model=Union[
                        OCRInferenceResponse, List[OCRInferenceResponse]
                    ],
                    summary="DocTR OCR response",
                    description="Run the DocTR OCR model to retrieve text in an image.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def doctr_retrieve_text(
                    inference_request: DoctrOCRInferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Retrieve text from an image with the DocTR OCR model.

                    Args:
                        inference_request: Request body describing the image from
                            which to retrieve text.
                        request: Raw HTTP request; read only when LAMBDA is set, to
                            find the calling actor.
                        api_key: Roboflow API key, forwarded to ``load_doctr_model``.
                        countinference: Forwarded unchanged to ``load_doctr_model``.
                        service_secret: Forwarded unchanged to ``load_doctr_model``.

                    Returns:
                        The inference result wrapped by
                        ``orjson_response_keeping_parent_id``.
                    """
                    logger.debug("Reached /doctr/ocr")
                    model_id = load_doctr_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    ocr_result = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if LAMBDA:
                        # Under LAMBDA the actor is read from the AWS event's
                        # authorizer context and the call is recorded via trackUsage.
                        request_context = request.scope["aws.event"]["requestContext"]
                        actor = request_context["authorizer"]["lambda"]["actor"]
                        trackUsage(model_id, actor)
                    return orjson_response_keeping_parent_id(ocr_result)

            if CORE_MODEL_EASYOCR_ENABLED:

                @app.post(
                    "/easy_ocr/ocr",
                    response_model=Union[
                        OCRInferenceResponse, List[OCRInferenceResponse]
                    ],
                    summary="EasyOCR OCR response",
                    description="Run the EasyOCR model to retrieve text in an image.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def easy_ocr_retrieve_text(
                    inference_request: EasyOCRInferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Retrieve text from an image using the EasyOCR model.

                    Args:
                        inference_request (EasyOCRInferenceRequest): The request containing the image from which to retrieve text.
                        request (Request): The HTTP request; only read when LAMBDA is set, to find the actor for usage tracking.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_easy_ocr_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_easy_ocr_model`.

                    Returns:
                        The model manager's result wrapped by `orjson_response_keeping_parent_id`.
                    """
                    logger.debug("Reached /easy_ocr/ocr")
                    model_id = load_easy_ocr_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    ocr_result = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if LAMBDA:
                        # On Lambda the caller identity lives in the authorizer context.
                        request_context = request.scope["aws.event"]["requestContext"]
                        caller = request_context["authorizer"]["lambda"]["actor"]
                        trackUsage(model_id, caller)
                    return orjson_response_keeping_parent_id(ocr_result)

            if CORE_MODEL_SAM_ENABLED:

                @app.post(
                    "/sam/embed_image",
                    response_model=SamEmbeddingResponse,
                    summary="SAM Image Embeddings",
                    description="Run the Meta AI Segmant Anything Model to embed image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam_embed_image(
                    inference_request: SamEmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Embed image data with the Meta AI Segment Anything Model (SAM).

                    Args:
                        inference_request (SamEmbeddingRequest): The request containing the image to be embedded.
                        request (Request): The HTTP request; only read when LAMBDA is set, to find the actor for usage tracking.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_sam_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_sam_model`.

                    Returns:
                        SamEmbeddingResponse or Response: The model response, or, when the request
                            format is "binary", an octet-stream `Response` carrying its `embeddings`.
                    """
                    logger.debug("Reached /sam/embed_image")
                    model_id = load_sam_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    embedding_result = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if LAMBDA:
                        authorizer = request.scope["aws.event"]["requestContext"][
                            "authorizer"
                        ]
                        trackUsage(model_id, authorizer["lambda"]["actor"])
                    if inference_request.format != "binary":
                        return embedding_result
                    # Binary callers get the raw embeddings rather than the response model.
                    return Response(
                        content=embedding_result.embeddings,
                        headers={"Content-Type": "application/octet-stream"},
                    )

                @app.post(
                    "/sam/segment_image",
                    response_model=SamSegmentationResponse,
                    summary="SAM Image Segmentation",
                    description="Run the Meta AI Segmant Anything Model to generate segmenations for image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam_segment_image(
                    inference_request: SamSegmentationRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Generate segmentations for image data with the Meta AI Segment Anything Model (SAM).

                    Args:
                        inference_request (SamSegmentationRequest): The request containing the image to be segmented.
                        request (Request): The HTTP request; only read when LAMBDA is set, to find the actor for usage tracking.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_sam_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_sam_model`.

                    Returns:
                        SamSegmentationResponse or Response: The model response, or, when the request
                            format is "binary", an octet-stream `Response` whose content is that result.
                    """
                    logger.debug("Reached /sam/segment_image")
                    model_id = load_sam_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    segmentation_result = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if LAMBDA:
                        authorizer = request.scope["aws.event"]["requestContext"][
                            "authorizer"
                        ]
                        trackUsage(model_id, authorizer["lambda"]["actor"])
                    if inference_request.format != "binary":
                        return segmentation_result
                    return Response(
                        content=segmentation_result,
                        headers={"Content-Type": "application/octet-stream"},
                    )

            if CORE_MODEL_SAM2_ENABLED:

                @app.post(
                    "/sam2/embed_image",
                    response_model=Sam2EmbeddingResponse,
                    summary="SAM2 Image Embeddings",
                    description="Run the Meta AI Segment Anything 2 Model to embed image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam2_embed_image(
                    inference_request: Sam2EmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Embed image data with the Meta AI Segment Anything 2 Model (SAM2).

                    Args:
                        inference_request (Sam2EmbeddingRequest): The request containing the image to be embedded.
                        request (Request): The HTTP request. Not read directly in this handler body.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_sam2_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_sam2_model`.

                    Returns:
                        The model manager's result for the request, declared on the route as `Sam2EmbeddingResponse`.
                    """
                    logger.debug("Reached /sam2/embed_image")
                    model_id = load_sam2_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    return self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )

                @app.post(
                    "/sam2/segment_image",
                    response_model=Sam2SegmentationResponse,
                    summary="SAM2 Image Segmentation",
                    description="Run the Meta AI Segment Anything 2 Model to generate segmenations for image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam2_segment_image(
                    inference_request: Sam2SegmentationRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Generate segmentations for image data with the Meta AI Segment Anything 2 Model (SAM2).

                    Args:
                        inference_request (Sam2SegmentationRequest): The request containing the image to be segmented.
                        request (Request): The HTTP request. Not read directly in this handler body.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_sam2_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_sam2_model`.

                    Returns:
                        Sam2SegmentationResponse or Response: The model response, or, when the request
                            format is "binary", an octet-stream `Response` whose content is that result.
                    """
                    logger.debug("Reached /sam2/segment_image")
                    model_id = load_sam2_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    segmentation_result = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )
                    if inference_request.format != "binary":
                        return segmentation_result
                    return Response(
                        content=segmentation_result,
                        headers={"Content-Type": "application/octet-stream"},
                    )

            if CORE_MODEL_SAM3_ENABLED:

                @app.post(
                    "/sam3/embed_image",
                    response_model=Sam3EmbeddingResponse,
                    summary="Seg preview Image Embeddings",
                    description="Run the  Model to embed image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam3_embed_image(
                    inference_request: Sam2EmbeddingRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Embed image data with the "sam3/sam3_interactive" model.

                    Args:
                        inference_request (Sam2EmbeddingRequest): The request containing the image to be embedded.
                        request (Request): The HTTP request. Not read directly in this handler body.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model manager when registering the model.
                        countinference (Optional[bool], default None): Forwarded unchanged to the model manager.
                        service_secret (Optional[str], default None): Forwarded unchanged to the model manager.

                    Returns:
                        The model manager's result for the request, declared on the route as `Sam3EmbeddingResponse`.

                    Raises:
                        HTTPException: 501 when `SAM3_EXEC_MODE` is "remote"; embedding only runs locally.
                    """
                    logger.debug("Reached /sam3/embed_image")

                    if SAM3_EXEC_MODE == "remote":
                        raise HTTPException(
                            status_code=501,
                            detail="SAM3 embedding is not supported in remote execution mode.",
                        )

                    interactive_model_id = "sam3/sam3_interactive"
                    self.model_manager.add_model(
                        interactive_model_id,
                        api_key=api_key,
                        endpoint_type=ModelEndpointType.CORE_MODEL,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    return self.model_manager.infer_from_request_sync(
                        interactive_model_id, inference_request
                    )

            if CORE_MODEL_SAM3_ENABLED:

                @app.post(
                    "/sam3/concept_segment",
                    response_model=Sam3SegmentationResponse,
                    summary="SAM3 PCS (promptable concept segmentation)",
                    description="Run the SAM3 PCS (promptable concept segmentation) to generate segmentations for image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam3_segment_image(
                    inference_request: Sam3SegmentationRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Run SAM3 promptable concept segmentation (PCS) on an image.

                    When `SAM3_EXEC_MODE` is "remote", the image, prompts and
                    `output_prob_thresh` are posted to the `inferenceproxy/seg-preview`
                    endpoint under `API_BASE_URL` and the JSON reply is parsed into a
                    `Sam3SegmentationResponse`. Otherwise the model named by
                    `inference_request.model_id` is registered with the model manager
                    and run locally.

                    Args:
                        inference_request (Sam3SegmentationRequest): The request containing the image and prompts.
                        request (Request): The HTTP request. Not read directly in this handler body.
                        api_key (Optional[str], default None): Roboflow API Key; sent as a query parameter in remote mode, passed to the model manager otherwise.
                        countinference (Optional[bool], default None): Forwarded unchanged to the model manager (local mode only).
                        service_secret (Optional[str], default None): Forwarded unchanged to the model manager (local mode only).

                    Returns:
                        Sam3SegmentationResponse or Response: The segmentation result. In local
                            mode with request format "binary", an octet-stream `Response`
                            whose content is the model result.

                    Raises:
                        HTTPException: 501 when fine-tuned models are disabled and `model_id`
                            does not start with "sam3/"; 500 when the remote request fails.
                    """
                    logger.debug("Reached /sam3/concept_segment")

                    # model_id is only inspected when the fine-tuned gate is closed;
                    # the short-circuit keeps that lazy evaluation.
                    if (
                        not SAM3_FINE_TUNED_MODELS_ENABLED
                        and not inference_request.model_id.startswith("sam3/")
                    ):
                        raise HTTPException(
                            status_code=501,
                            detail="Fine-tuned SAM3 models are not supported on this deployment. Please use a workflow or self-host the server.",
                        )

                    if SAM3_EXEC_MODE == "remote":
                        endpoint = f"{API_BASE_URL}/inferenceproxy/seg-preview"

                        # Payload sent to the remote API:
                        # {
                        #     "image": {"type": ..., "value": ...},
                        #     "prompts": [{"type": "text", "text": ...}, ...],
                        #     "output_prob_thresh": ...
                        # }
                        http_prompts = []
                        for prompt in inference_request.prompts:
                            p_dict = prompt.dict(exclude_none=True)
                            # Default the prompt type to "text" when only text was given.
                            if "type" not in p_dict and "text" in p_dict:
                                p_dict["type"] = "text"
                            http_prompts.append(p_dict)

                        # The image is forwarded with its declared type and value unchanged.
                        # NOTE(review): a "numpy" value is passed through as-is and may not
                        # be JSON-serializable; HTTP callers are presumed to send base64/url.
                        http_image = {
                            "type": inference_request.image.type,
                            "value": inference_request.image.value,
                        }

                        payload = {
                            "image": http_image,
                            "prompts": http_prompts,
                            "output_prob_thresh": inference_request.output_prob_thresh,
                        }

                        try:
                            headers = {"Content-Type": "application/json"}
                            if ROBOFLOW_INTERNAL_SERVICE_NAME:
                                headers["X-Roboflow-Internal-Service-Name"] = (
                                    ROBOFLOW_INTERNAL_SERVICE_NAME
                                )
                            if ROBOFLOW_INTERNAL_SERVICE_SECRET:
                                headers["X-Roboflow-Internal-Service-Secret"] = (
                                    ROBOFLOW_INTERNAL_SERVICE_SECRET
                                )

                            headers = build_roboflow_api_headers(
                                explicit_headers=headers
                            )

                            # NOTE(review): api_key is interpolated verbatim, so a missing
                            # key is sent as the literal "None", and the URL (with the key)
                            # may surface in the error text below - confirm this is intended.
                            response = requests.post(
                                wrap_url(f"{endpoint}?api_key={api_key}"),
                                json=payload,
                                headers=headers,
                                timeout=60,
                            )
                            response.raise_for_status()
                            resp_json = response.json()

                            # The reply is parsed straight into the response model.
                            return Sam3SegmentationResponse(**resp_json)

                        except Exception as e:
                            logger.error(f"SAM3 remote request failed: {e}")
                            # Chain the cause so the original failure stays in the traceback.
                            raise HTTPException(
                                status_code=500,
                                detail=f"SAM3 remote request failed: {str(e)}",
                            ) from e

                    # Local execution: "sam3/..." ids are core models, anything else is
                    # registered as an ORT endpoint.
                    endpoint_type = (
                        ModelEndpointType.CORE_MODEL
                        if inference_request.model_id.startswith("sam3/")
                        else ModelEndpointType.ORT
                    )
                    self.model_manager.add_model(
                        inference_request.model_id,
                        api_key=api_key,
                        endpoint_type=endpoint_type,
                        countinference=countinference,
                        service_secret=service_secret,
                    )

                    model_response = self.model_manager.infer_from_request_sync(
                        inference_request.model_id, inference_request
                    )
                    if inference_request.format == "binary":
                        return Response(
                            content=model_response,
                            headers={"Content-Type": "application/octet-stream"},
                        )
                    return model_response

                @app.post(
                    "/sam3/visual_segment",
                    response_model=Sam2SegmentationResponse,
                    summary="SAM3 PVS (promptable visual segmentation)",
                    description="Run the SAM3 PVS (promptable visual segmentation) to generate segmentations for image data.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam3_visual_segment(
                    inference_request: Sam2SegmentationRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Run SAM3 promptable visual segmentation (PVS) on an image.

                    When `SAM3_EXEC_MODE` is "remote", the image, prompts and
                    `multimask_output` are posted to the `inferenceproxy/sam3-pvs`
                    endpoint under `API_BASE_URL` and the JSON reply is parsed into a
                    `Sam2SegmentationResponse`. Otherwise the "sam3/sam3_interactive"
                    model is registered with the model manager and run locally.

                    Args:
                        inference_request (Sam2SegmentationRequest): The request containing the image and visual prompts.
                        request (Request): The HTTP request. Not read directly in this handler body.
                        api_key (Optional[str], default None): Roboflow API Key; sent as a query parameter in remote mode, passed to the model manager otherwise.
                        countinference (Optional[bool], default None): Forwarded unchanged to the model manager (local mode only).
                        service_secret (Optional[str], default None): Forwarded unchanged to the model manager (local mode only).

                    Returns:
                        Sam2SegmentationResponse: The parsed remote reply, or the model manager's result in local mode.

                    Raises:
                        HTTPException: 500 when the remote request fails.
                    """
                    logger.debug("Reached /sam3/visual_segment")

                    if SAM3_EXEC_MODE == "remote":
                        endpoint = f"{API_BASE_URL}/inferenceproxy/sam3-pvs"

                        http_image = {
                            "type": inference_request.image.type,
                            "value": inference_request.image.value,
                        }

                        # Falsy prompts are sent as an explicit null.
                        prompts_data = (
                            inference_request.prompts.dict(exclude_none=True)
                            if inference_request.prompts
                            else None
                        )

                        payload = {
                            "image": http_image,
                            "prompts": prompts_data,
                            "multimask_output": inference_request.multimask_output,
                        }

                        try:
                            headers = {"Content-Type": "application/json"}
                            if ROBOFLOW_INTERNAL_SERVICE_NAME:
                                headers["X-Roboflow-Internal-Service-Name"] = (
                                    ROBOFLOW_INTERNAL_SERVICE_NAME
                                )
                            if ROBOFLOW_INTERNAL_SERVICE_SECRET:
                                headers["X-Roboflow-Internal-Service-Secret"] = (
                                    ROBOFLOW_INTERNAL_SERVICE_SECRET
                                )

                            headers = build_roboflow_api_headers(
                                explicit_headers=headers
                            )

                            # NOTE(review): api_key is interpolated verbatim, so a missing
                            # key is sent as the literal "None", and the URL (with the key)
                            # may surface in the error text below - confirm this is intended.
                            response = requests.post(
                                wrap_url(f"{endpoint}?api_key={api_key}"),
                                json=payload,
                                headers=headers,
                                timeout=60,
                            )
                            response.raise_for_status()
                            resp_json = response.json()

                            return Sam2SegmentationResponse(**resp_json)

                        except Exception as e:
                            logger.error(
                                f"SAM3 visual_segment remote request failed: {e}"
                            )
                            # Chain the cause so the original failure stays in the traceback.
                            raise HTTPException(
                                status_code=500,
                                detail=f"SAM3 visual_segment remote request failed: {str(e)}",
                            ) from e

                    self.model_manager.add_model(
                        "sam3/sam3_interactive",
                        api_key=api_key,
                        endpoint_type=ModelEndpointType.CORE_MODEL,
                        countinference=countinference,
                        service_secret=service_secret,
                    )

                    model_response = self.model_manager.infer_from_request_sync(
                        "sam3/sam3_interactive", inference_request
                    )
                    return model_response

            if SAM3_3D_OBJECTS_ENABLED:

                @app.post(
                    "/sam3_3d/infer",
                    summary="SAM3 3D Object Generation",
                    description="Generate 3D meshes and Gaussian splatting from 2D images with mask prompts.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def sam3_3d_infer(
                    inference_request: Sam3_3D_Objects_InferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """Generate 3D meshes and Gaussian splatting from 2D images with mask prompts.

                    Args:
                        inference_request (Sam3_3D_Objects_InferenceRequest): The request containing
                            the image and mask input for 3D generation. Its `model_id` selects the
                            model; "sam3-3d-objects" is used when it is empty.
                        request (Request): The HTTP request; only read when LAMBDA is set, to find
                            the actor for usage tracking.
                        api_key (Optional[str]): Roboflow API Key for artifact retrieval.
                        countinference (Optional[bool]): Forwarded unchanged to the model manager.
                        service_secret (Optional[str]): Forwarded unchanged to the model manager.

                    Returns:
                        dict: Response containing base64-encoded 3D outputs:
                            - mesh_glb: Scene mesh (base64, or None when absent)
                            - gaussian_ply: Combined Gaussian splatting (base64, or None when absent)
                            - objects: List of per-object dicts with `mesh_glb`, `gaussian_ply`
                              and `metadata` (rotation, translation, scale)
                            - time: The `time` value reported by the model response
                    """
                    logger.debug("Reached /sam3_3d/infer")
                    model_id = inference_request.model_id or "sam3-3d-objects"

                    self.model_manager.add_model(
                        model_id,
                        api_key=api_key,
                        endpoint_type=ModelEndpointType.CORE_MODEL,
                        countinference=countinference,
                        service_secret=service_secret,
                    )

                    result = self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )

                    if LAMBDA:
                        authorizer = request.scope["aws.event"]["requestContext"][
                            "authorizer"
                        ]
                        trackUsage(model_id, authorizer["lambda"]["actor"])

                    def to_base64(blob):
                        # Bytes are base64-encoded so they survive JSON serialization;
                        # a missing artifact stays None.
                        return (
                            None
                            if blob is None
                            else base64.b64encode(blob).decode("utf-8")
                        )

                    encoded_objects = [
                        {
                            "mesh_glb": to_base64(item.mesh_glb),
                            "gaussian_ply": to_base64(item.gaussian_ply),
                            "metadata": {
                                "rotation": item.metadata.rotation,
                                "translation": item.metadata.translation,
                                "scale": item.metadata.scale,
                            },
                        }
                        for item in result.objects
                    ]

                    return {
                        "mesh_glb": to_base64(result.mesh_glb),
                        "gaussian_ply": to_base64(result.gaussian_ply),
                        "objects": encoded_objects,
                        "time": result.time,
                    }

            if CORE_MODEL_OWLV2_ENABLED:

                @app.post(
                    "/owlv2/infer",
                    response_model=ObjectDetectionInferenceResponse,
                    summary="Owlv2 image prompting",
                    description="Run the google owlv2 model to few-shot object detect",
                )
                @with_route_exceptions
                @usage_collector("request")
                def owlv2_infer(
                    inference_request: OwlV2InferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Run few-shot object detection with the OWLv2 model.

                    Args:
                        inference_request (OwlV2InferenceRequest): The request describing the detection to run.
                        request (Request): The HTTP request. Not read directly in this handler body.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_owlv2_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_owlv2_model`.

                    Returns:
                        The model manager's result for the request, declared on the route as `ObjectDetectionInferenceResponse`.
                    """
                    logger.debug("Reached /owlv2/infer")
                    model_id = load_owlv2_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    return self.model_manager.infer_from_request_sync(
                        model_id, inference_request
                    )

            if CORE_MODEL_GAZE_ENABLED:

                @app.post(
                    "/gaze/gaze_detection",
                    summary="Gaze Detection (deprecated)",
                    description=(
                        "Deprecated. Always returns HTTP 410 Gone. The endpoint stub "
                        "will be removed end of Q2 2026."
                    ),
                    deprecated=True,
                )
                @with_route_exceptions
                def gaze_detection_deprecated():
                    """Rejects every call: the gaze detection route is kept only as a deprecated stub.

                    Raises:
                        FeatureDeprecatedError: Always, carrying the route name, the
                            planned removal release and the deprecation reason.
                    """
                    deprecation_details = dict(
                        feature="/gaze/gaze_detection",
                        removal_release="end of Q2 2026",
                        reason="MediaPipe dependency removed from inference; endpoint is a 410 stub.",
                    )
                    raise FeatureDeprecatedError(**deprecation_details)

            if DEPTH_ESTIMATION_ENABLED:

                @app.post(
                    "/infer/depth-estimation",
                    response_model=DepthEstimationResponse,
                    summary="Depth Estimation",
                    description="Run the depth estimation model to generate a depth map.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def depth_estimation(
                    inference_request: DepthEstimationRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Runs the depth estimation model and repackages its output.

                    Args:
                        inference_request (DepthEstimationRequest): The request containing the image to estimate depth for; its `model_id` selects the model.
                        request (Request): The HTTP request; read only when LAMBDA is set, to look up the actor for usage tracking.
                        api_key (Optional[str], default None): Roboflow API Key query parameter.
                        countinference (Optional[bool], default None): Forwarded to `self.model_manager.add_model`.
                        service_secret (Optional[str], default None): Forwarded to `self.model_manager.add_model`.

                    Returns:
                        DepthEstimationResponse: Built from the `normalized_depth` and `image` entries of the model response.
                    """
                    logger.debug("Reached /infer/depth-estimation")
                    target_model_id = inference_request.model_id
                    # NOTE(review): the `api_key` query parameter is not read here; the key
                    # comes from the request body instead — confirm this is intended.
                    self.model_manager.add_model(
                        target_model_id,
                        inference_request.api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    inference_result = self.model_manager.infer_from_request_sync(
                        target_model_id, inference_request
                    )
                    if LAMBDA:
                        # The actor is taken from the AWS event attached to the ASGI scope.
                        authorizer = request.scope["aws.event"]["requestContext"][
                            "authorizer"
                        ]
                        trackUsage(target_model_id, authorizer["lambda"]["actor"])

                    # The useful data sits one level down, under `.response`.
                    payload = inference_result.response
                    return DepthEstimationResponse(
                        normalized_depth=payload["normalized_depth"].tolist(),
                        image=payload["image"].base64_image,
                    )

            if CORE_MODEL_TROCR_ENABLED:

                @app.post(
                    "/ocr/trocr",
                    response_model=OCRInferenceResponse,
                    summary="TrOCR OCR response",
                    description="Run the TrOCR model to retrieve text in an image.",
                )
                @with_route_exceptions
                @usage_collector("request")
                def trocr_retrieve_text(
                    inference_request: TrOCRInferenceRequest,
                    request: Request,
                    api_key: Optional[str] = Query(
                        None,
                        description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                    ),
                    countinference: Optional[bool] = None,
                    service_secret: Optional[str] = None,
                ):
                    """
                    Retrieves text from image data using the TrOCR model.

                    Args:
                        inference_request (TrOCRInferenceRequest): The request containing the image from which to retrieve text.
                        request (Request): The HTTP request; read only when LAMBDA is set, to look up the actor for usage tracking.
                        api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                        countinference (Optional[bool], default None): Forwarded unchanged to `load_trocr_model`.
                        service_secret (Optional[str], default None): Forwarded unchanged to `load_trocr_model`.

                    Returns:
                        The model response serialized by `orjson_response_keeping_parent_id`
                        (the route declares `OCRInferenceResponse` as its response model).
                    """
                    # Log the path the route is actually registered under ("/ocr/trocr").
                    logger.debug("Reached /ocr/trocr")
                    trocr_model_id = load_trocr_model(
                        inference_request,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )
                    response = self.model_manager.infer_from_request_sync(
                        trocr_model_id, inference_request
                    )
                    if LAMBDA:
                        # The actor is taken from the AWS event attached to the ASGI scope.
                        actor = request.scope["aws.event"]["requestContext"][
                            "authorizer"
                        ]["lambda"]["actor"]
                        trackUsage(trocr_model_id, actor)
                    return orjson_response_keeping_parent_id(response)

        if not (LAMBDA or GCP_SERVERLESS):

            @app.get(
                "/notebook/start",
                summary="Jupyter Lab Server Start",
                description="Starts a jupyter lab server for running development code",
            )
            @with_route_exceptions
            def notebook_start(browserless: bool = False):
                """Starts a jupyter lab server for running development code.

                Args:
                    browserless (bool, default False): When true, answer with a JSON
                        status payload instead of an HTTP redirect.

                Returns:
                    dict or RedirectResponse: With `browserless` set, a dict with
                    `success` and `message` keys. Otherwise a redirect to the
                    quickstart notebook when notebooks are enabled, or to
                    `/notebook-instructions.html` when they are not.
                """
                logger.debug("Reached /notebook/start")

                if not NOTEBOOK_ENABLED:
                    if browserless:
                        return {
                            "success": False,
                            "message": "Notebook server is not enabled. Enable notebooks via the NOTEBOOK_ENABLED environment variable.",
                        }
                    return RedirectResponse("/notebook-instructions.html")

                start_notebook()
                if browserless:
                    return {
                        "success": True,
                        "message": f"Jupyter Lab server started at http://localhost:{NOTEBOOK_PORT}?token={NOTEBOOK_PASSWORD}",
                    }
                # Pause briefly before redirecting the browser to the notebook server.
                sleep(2)
                return RedirectResponse(
                    f"http://localhost:{NOTEBOOK_PORT}/lab/tree/quickstart.ipynb?token={NOTEBOOK_PASSWORD}"
                )

        if ENABLE_BUILDER:
            from inference.core.interfaces.http.builder.routes import (
                router as builder_router,
            )

            # Allow CORS on builder API and workflow endpoints needed by the builder UI
            # Enables Private Network Access for Chrome 142+ (local development)
            app.add_middleware(
                PathAwareCORSMiddleware,
                match_paths=r"^/(build/api|workflows/).*",
                allow_origins=[BUILDER_ORIGIN],
                allow_methods=["*"],
                allow_headers=["*"],
                allow_credentials=True,
                allow_private_network=True,
            )

            # Attach all routes from builder to the /build prefix
            app.include_router(builder_router, prefix="/build", tags=["builder"])

        if LEGACY_ROUTE_ENABLED:
            # Legacy object detection inference path for backwards compatibility
            @app.get(
                "/{dataset_id}/{version_id:str}",
                # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work.
                response_model=Union[
                    InstanceSegmentationInferenceResponse,
                    KeypointsDetectionInferenceResponse,
                    ObjectDetectionInferenceResponse,
                    ClassificationInferenceResponse,
                    MultiLabelClassificationInferenceResponse,
                    SemanticSegmentationInferenceResponse,
                    StubResponse,
                    Any,
                ],
                response_model_exclude_none=True,
            )
            @app.post(
                "/{dataset_id}/{version_id:str}",
                # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work.
                response_model=Union[
                    InstanceSegmentationInferenceResponse,
                    KeypointsDetectionInferenceResponse,
                    ObjectDetectionInferenceResponse,
                    ClassificationInferenceResponse,
                    MultiLabelClassificationInferenceResponse,
                    SemanticSegmentationInferenceResponse,
                    StubResponse,
                    Any,
                ],
                response_model_exclude_none=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def legacy_infer_from_request(
                background_tasks: BackgroundTasks,
                request: Request,
                request_body: Annotated[
                    Optional[Union[bytes, UploadFile]],
                    Depends(parse_body_content_for_legacy_request_handler),
                ],
                dataset_id: str = Path(
                    description="ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID"
                ),
                version_id: str = Path(
                    description="ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID"
                ),
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                confidence: Confidence = Query(
                    0.4,
                    description=(
                        "The confidence threshold used to filter out predictions. "
                        'Pass a float in [0, 1], or "best" to use F1-optimal '
                        'thresholds from model evaluation, or "default" to use '
                        "the model's built-in default."
                    ),
                ),
                keypoint_confidence: float = Query(
                    0.0,
                    description="The confidence threshold used to filter out keypoints that are not visible based on model confidence",
                ),
                format: str = Query(
                    "json",
                    description="One of 'json' or 'image'. If 'json' prediction data is return as a JSON string. If 'image' prediction data is visualized and overlayed on the original input image.",
                ),
                image: Optional[str] = Query(
                    None,
                    description="The publically accessible URL of an image to use for inference.",
                ),
                image_type: Optional[str] = Query(
                    "base64",
                    description="One of base64 or numpy. Note, numpy input is not supported for Roboflow Hosted Inference.",
                ),
                labels: Optional[bool] = Query(
                    False,
                    description="If true, labels will be include in any inference visualization.",
                ),
                mask_decode_mode: Optional[str] = Query(
                    "accurate",
                    description="One of 'accurate' or 'fast'. If 'accurate' the mask will be decoded using the original image size. If 'fast' the mask will be decoded using the original mask size. 'accurate' is slower but more accurate.",
                ),
                tradeoff_factor: Optional[float] = Query(
                    0.0,
                    description="The amount to tradeoff between 0='fast' and 1='accurate'",
                ),
                max_detections: int = Query(
                    300,
                    description="The maximum number of detections to return. This is used to limit the number of predictions returned by the model. The model may return more predictions than this number, but only the top `max_detections` predictions will be returned.",
                ),
                overlap: float = Query(
                    0.3,
                    description="The IoU threhsold that must be met for a box pair to be considered duplicate during NMS",
                ),
                stroke: int = Query(
                    1, description="The stroke width used when visualizing predictions"
                ),
                countinference: Optional[bool] = Query(
                    True,
                    description="If false, does not track inference against usage.",
                    include_in_schema=False,
                ),
                service_secret: Optional[str] = Query(
                    None,
                    description="Shared secret used to authenticate requests to the inference server from internal services (e.g. to allow disabling inference usage tracking via the `countinference` query parameter)",
                    include_in_schema=False,
                ),
                disable_preproc_auto_orient: Optional[bool] = Query(
                    False, description="If true, disables automatic image orientation"
                ),
                disable_preproc_contrast: Optional[bool] = Query(
                    False, description="If true, disables automatic contrast adjustment"
                ),
                disable_preproc_grayscale: Optional[bool] = Query(
                    False,
                    description="If true, disables automatic grayscale conversion",
                ),
                disable_preproc_static_crop: Optional[bool] = Query(
                    False, description="If true, disables automatic static crop"
                ),
                disable_active_learning: Optional[bool] = Query(
                    default=False,
                    description="If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled)",
                ),
                active_learning_target_dataset: Optional[str] = Query(
                    default=None,
                    description="Parameter to be used when Active Learning data registration should happen against different dataset than the one pointed by model_id",
                ),
                source: Optional[str] = Query(
                    "external",
                    description="The source of the inference request",
                ),
                source_info: Optional[str] = Query(
                    "external",
                    description="The detailed source information of the inference request",
                ),
                disable_model_monitoring: Optional[bool] = Query(
                    False,
                    description="If true, disables model monitoring for this request",
                    include_in_schema=False,
                ),
                response_mask_format: Optional[Literal["polygon", "rle"]] = Query(
                    default="polygon",
                    description="The format of the prediction mask - polygon (default) or rle - applicable "
                    "for instance segmentation models.",
                ),
            ):
                """
                Legacy inference endpoint for object detection, instance segmentation, classification, keypoint detection, and semantic segmentation.

                Args:
                    background_tasks: (BackgroundTasks) pool of fastapi background tasks
                    request (Request): The HTTP request; its headers are checked for Content-Type and, when LAMBDA is set, its scope supplies the model endpoint and actor.
                    request_body (Optional[Union[bytes, UploadFile]]): Image payload parsed from the request body; used only when the `image` query parameter is not given.
                    dataset_id (str): ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID
                    version_id (str): ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID
                    api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                    # Other parameters described in the function signature...

                Returns:
                    Union[InstanceSegmentationInferenceResponse, KeypointsDetectionInferenceResponse, ObjectDetectionInferenceResponse, ClassificationInferenceResponse, MultiLabelClassificationInferenceResponse, SemanticSegmentationInferenceResponse, Any]: The response containing the inference results. When `format` is "image", a JPEG `Response` with the visualization is returned instead.

                Raises:
                    ContentTypeMissing: No `image` URL was given and the request has no Content-Type header.
                    InputImageLoadError: No `image` URL was given and no image was found in the request body.
                    ContentTypeInvalid: The parsed request body is neither bytes nor an uploaded file.
                    MissingServiceSecretError: `countinference` is falsy and `service_secret` does not match ROBOFLOW_SERVICE_SECRET.
                """
                logger.debug(
                    f"Reached legacy route /:dataset_id/:version_id with {dataset_id}/{version_id}"
                )
                model_id = f"{dataset_id}/{version_id}"
                # Non-numeric confidences ("best" / "default") are passed through untouched.
                if isinstance(confidence, (int, float)):
                    # Values >= 1 are treated as percentages.
                    if confidence >= 1:
                        confidence /= 100
                    if confidence < CONFIDENCE_LOWER_BOUND_OOM_PREVENTION:
                        # allowing lower confidence results in RAM usage explosion
                        confidence = CONFIDENCE_LOWER_BOUND_OOM_PREVENTION

                # Same percentage convention for the IoU threshold.
                if overlap >= 1:
                    overlap /= 100
                if image is not None:
                    request_image = InferenceRequestImage(type="url", value=image)
                else:
                    if "Content-Type" not in request.headers:
                        raise ContentTypeMissing(
                            "Request must include a Content-Type header"
                        )
                    if isinstance(request_body, UploadFile):
                        # Uploaded files arrive as raw bytes; re-encode them as base64 text.
                        raw_image_bytes = request_body.file.read()
                        encoded_image = base64.b64encode(raw_image_bytes)
                        request_image = InferenceRequestImage(
                            type="base64", value=encoded_image.decode("ascii")
                        )
                    elif isinstance(request_body, bytes):
                        request_image = InferenceRequestImage(
                            type=image_type, value=request_body
                        )
                    elif request_body is None:
                        raise InputImageLoadError(
                            message="Image not found in request body.",
                            public_message="Image not found in request body.",
                        )
                    else:
                        raise ContentTypeInvalid(
                            f"Invalid Content-Type: {request.headers['Content-Type']}"
                        )

                # Opting out of usage tracking is only allowed with the shared service secret.
                if not countinference and service_secret != ROBOFLOW_SERVICE_SECRET:
                    raise MissingServiceSecretError(
                        "Service secret is required to disable inference usage tracking"
                    )
                if LAMBDA:
                    logger.debug("request.scope: %s", request.scope)
                    # On Lambda the model id is taken from the authorizer context.
                    request_model_id = (
                        request.scope["aws.event"]["requestContext"]["authorizer"][
                            "lambda"
                        ]["model"]["endpoint"]
                        .replace("--", "/")
                        .replace("rf-", "")
                        .replace("nu-", "")
                    )
                    actor = request.scope["aws.event"]["requestContext"]["authorizer"][
                        "lambda"
                    ]["actor"]
                    if countinference:
                        trackUsage(request_model_id, actor)
                    else:
                        # The service secret was already validated above, so reaching
                        # this branch means the opt-out is authorized.
                        logger.info("Not counting inference for usage")
                else:
                    request_model_id = model_id
                logger.debug(
                    f"State of model registry: {self.model_manager.describe_models()}"
                )
                self.model_manager.add_model(
                    request_model_id,
                    api_key,
                    model_id_alias=model_id,
                    countinference=countinference,
                    service_secret=service_secret,
                )

                # Pick the request class (and its task-specific arguments) from the task type;
                # object detection is the fallback.
                task_type = self.model_manager.get_task_type(model_id, api_key=api_key)
                inference_request_type = ObjectDetectionInferenceRequest
                args = {}
                if task_type == "instance-segmentation":
                    inference_request_type = InstanceSegmentationInferenceRequest
                    args = {
                        "mask_decode_mode": mask_decode_mode,
                        "tradeoff_factor": tradeoff_factor,
                    }
                    if response_mask_format:
                        args["response_mask_format"] = response_mask_format
                elif task_type == "classification":
                    inference_request_type = ClassificationInferenceRequest
                elif task_type == "keypoint-detection":
                    inference_request_type = KeypointsDetectionInferenceRequest
                    args = {"keypoint_confidence": keypoint_confidence}
                elif task_type == "semantic-segmentation":
                    inference_request_type = SemanticSegmentationInferenceRequest
                inference_request = inference_request_type(
                    api_key=api_key,
                    model_id=model_id,
                    image=request_image,
                    confidence=confidence,
                    iou_threshold=overlap,
                    max_detections=max_detections,
                    visualization_labels=labels,
                    visualization_stroke_width=stroke,
                    visualize_predictions=format in ("image", "image_and_json"),
                    disable_preproc_auto_orient=disable_preproc_auto_orient,
                    disable_preproc_contrast=disable_preproc_contrast,
                    disable_preproc_grayscale=disable_preproc_grayscale,
                    disable_preproc_static_crop=disable_preproc_static_crop,
                    disable_active_learning=disable_active_learning,
                    active_learning_target_dataset=active_learning_target_dataset,
                    source=source,
                    source_info=source_info,
                    usage_billable=countinference,
                    disable_model_monitoring=disable_model_monitoring,
                    **args,
                )
                inference_response = self.model_manager.infer_from_request_sync(
                    inference_request.model_id,
                    inference_request,
                    active_learning_eligible=True,
                    background_tasks=background_tasks,
                )
                logger.debug("Response ready.")
                if format == "image":
                    return Response(
                        content=inference_response.visualization,
                        media_type="image/jpeg",
                    )
                else:
                    return orjson_response(inference_response)

        if not (LAMBDA or GCP_SERVERLESS):
            # Legacy clear cache endpoint for backwards compatibility
            @app.get("/clear_cache", response_model=str)
            def legacy_clear_cache():
                """
                Clears the model cache.

                Legacy route kept for backwards compatibility; it delegates the
                actual work to `model_clear`.

                Returns:
                    str: A string indicating that the cache has been cleared.
                """
                logger.debug("Reached /clear_cache")
                model_clear()
                confirmation = "Cache Cleared"
                return confirmation

            # Legacy add model endpoint for backwards compatibility
            @app.get("/start/{dataset_id}/{version_id}")
            def model_add_legacy(
                dataset_id: str,
                version_id: str,
                api_key: str = None,
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Registers a model with the model manager (legacy route).

                The model id is built as `{dataset_id}/{version_id}` and handed to
                `self.model_manager.add_model`.

                Args:
                    dataset_id (str): ID of a Roboflow dataset corresponding to the model.
                    version_id (str): ID of a Roboflow dataset version corresponding to the model.
                    api_key (str, optional): Roboflow API Key for artifact retrieval.
                    countinference (Optional[bool]): Whether to count inference or not.
                    service_secret (Optional[str]): The service secret for the request.

                Returns:
                    JSONResponse: A response object containing the status and a success message.
                """
                logger.debug(
                    f"Reached /start/{dataset_id}/{version_id} with {dataset_id}/{version_id}"
                )
                legacy_model_id = f"{dataset_id}/{version_id}"
                self.model_manager.add_model(
                    legacy_model_id,
                    api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )

                confirmation_body = {
                    "status": 200,
                    "message": "inference session started from local memory.",
                }
                return JSONResponse(confirmation_body)

        if not ENABLE_DASHBOARD:

            @app.get("/dashboard.html")
            @app.head("/dashboard.html")
            async def dashboard_guard():
                """Answers dashboard page requests with an empty HTTP 404 response."""
                not_found = Response(status_code=404)
                return not_found

        @app.exception_handler(InputImageLoadError)
        async def unicorn_exception_handler(request: Request, exc: InputImageLoadError):
            """Translates an `InputImageLoadError` into an HTTP 400 JSON response.

            Args:
                request (Request): The request being handled (not read here).
                exc (InputImageLoadError): The raised error; its public error
                    details are embedded in the response message.

            Returns:
                JSONResponse: Status 400 with a single `message` field.
            """
            error_body = {
                "message": f"Could not load input image. Cause: {exc.get_public_error_details()}"
            }
            return JSONResponse(status_code=400, content=error_body)

        app.mount(
            "/",
            StaticFiles(directory="./inference/landing/out", html=True),
            name="root",
        )

    def run(self):
        """Serves `self.app` with uvicorn, bound to 127.0.0.1 on port 8080."""
        server_options = {"host": "127.0.0.1", "port": 8080}
        uvicorn.run(self.app, **server_options)
Methods:
__init__
__init__(model_manager, root_path=None)

Initializes the HttpInterface with the given model manager and model registry.

Parameters:

Name Type Description Default
model_manager ModelManager

The manager for handling different models.

required
root_path Optional[str]

The root path for the FastAPI application.

None
Description

Deploy Roboflow trained models to nearly any compute environment!

Source code in inference/core/interfaces/http/http_api.py
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
def __init__(
    self,
    model_manager: ModelManager,
    root_path: Optional[str] = None,
):
    """
    Initializes the HttpInterface with the given model manager and optional root path.

    Args:
        model_manager (ModelManager): The manager for handling different models.
        root_path (Optional[str]): The root path for the FastAPI application.

    Description:
        Deploy Roboflow trained models to nearly any compute environment!
    """

    description = "Roboflow inference server"

    app = FastAPI(
        title="Roboflow Inference Server",
        description=description,
        version=__version__,
        terms_of_service="https://roboflow.com/terms",
        contact={
            "name": "Roboflow Inc.",
            "url": "https://roboflow.com/contact",
            "email": "help@roboflow.com",
        },
        license_info={
            "name": "Apache 2.0",
            "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
        },
        root_path=root_path,
    )
    # Ensure in-memory logging is initialized as early as possible for all runtimes
    try:
        from inference.core.logging.memory_handler import setup_memory_logging

        setup_memory_logging()
    except Exception:
        pass

    app.mount(
        "/static",
        StaticFiles(directory="./inference/landing/out/static", html=True),
        name="static",
    )
    app.mount(
        "/_next/static",
        StaticFiles(directory="./inference/landing/out/_next/static", html=True),
        name="_next_static",
    )

    # OpenTelemetry: must be set up before any middleware is added
    # so the FastAPI instrumentor wraps at the outermost ASGI layer.
    if OTEL_TRACING_ENABLED:
        setup_telemetry(app)

    @app.middleware("http")
    async def set_request_path_context(request: Request, call_next):
        # CVE-2026-48710: prefer the raw ASGI scope path over
        # request.url.path. This ContextVar feeds downstream registry
        # metadata (_model_request_paths in ModelManagerBase), so a
        # Host-poisoned path would surface in model-info responses.
        token = current_request_path.set(request.scope["path"])
        try:
            return await call_next(request)
        finally:
            current_request_path.reset(token)

    @app.on_event("shutdown")
    async def on_shutdown():
        logger.info("Shutting down %s", description)
        await usage_collector.async_push_usage_payloads()
        if OTEL_TRACING_ENABLED:
            shutdown_telemetry()

    self._instrumentator = InferenceInstrumentator(
        app, model_manager=model_manager, endpoint="/metrics"
    )
    if LAMBDA:
        app.add_middleware(LambdaMiddleware)
    if GCP_SERVERLESS:
        app.add_middleware(GCPServerlessMiddleware)

    if len(ALLOW_ORIGINS) > 0:
        # Add CORS Middleware (but not for /build**, which is controlled separately)
        app.add_middleware(
            PathAwareCORSMiddleware,
            match_paths=r"^(?!/build).*",
            allow_origins=ALLOW_ORIGINS,
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
            expose_headers=[
                PROCESSING_TIME_HEADER,
                REMOTE_PROCESSING_TIME_HEADER,
                REMOTE_PROCESSING_TIMES_HEADER,
                MODEL_COLD_START_HEADER,
                MODEL_COLD_START_COUNT_HEADER,
                MODEL_LOAD_TIME_HEADER,
                MODEL_LOAD_DETAILS_HEADER,
                MODEL_ID_HEADER,
                WORKFLOW_ID_HEADER,
                WORKSPACE_ID_HEADER,
                TRACE_ID_HEADER,
            ]
            + ([EXECUTION_ID_HEADER] if EXECUTION_ID_HEADER is not None else [])
            + ["traceparent", "tracestate"],
        )

    # Optionally add middleware for profiling the FastAPI server and underlying inference API code
    if PROFILE:
        app.add_middleware(
            CProfileMiddleware,
            enable=True,
            server_app=app,
            filename="/profile/output.pstats",
            strip_dirs=False,
            sort_by="cumulative",
        )
    if API_LOGGING_ENABLED:
        app.add_middleware(
            asgi_correlation_id.CorrelationIdMiddleware,
            header_name=CORRELATION_ID_HEADER,
            update_request_header=True,
            generator=lambda: uuid4().hex,
            validator=lambda a: True,
            transformer=lambda a: a,
        )
        if STRUCTURED_API_LOGGING:
            # Suppress uvicorn's default access log to avoid duplicate
            # unstructured entries — we replace it with a structured
            # access log middleware (see structured_access_log below).
            logging.getLogger("uvicorn.access").handlers = []
            logging.getLogger("uvicorn.access").propagate = False
    else:
        app.add_middleware(asgi_correlation_id.CorrelationIdMiddleware)

    if METRICS_ENABLED:

        @app.middleware("http")
        async def count_errors(request: Request, call_next):
            """Middleware to count errors.

            Args:
                request (Request): The incoming request.
                call_next (Callable): The next middleware or endpoint to call.

            Returns:
                Response: The response from the next middleware or endpoint.
            """
            response = await call_next(request)
            if self.model_manager.pingback and response.status_code >= 400:
                self.model_manager.num_errors += 1
            return response

    if not (LAMBDA or GCP_SERVERLESS):

        @app.get("/device/stats")
        def device_stats():
            not_configured_error_message = {
                "error": "Device statistics endpoint is not enabled.",
                "hint": "Mount the Docker socket and point its location when running the docker "
                "container to collect device stats "
                "(i.e. `docker run ... -v /var/run/docker.sock:/var/run/docker.sock "
                "-e DOCKER_SOCKET_PATH=/var/run/docker.sock ...`).",
            }
            if not DOCKER_SOCKET_PATH:
                return JSONResponse(
                    status_code=404,
                    content=not_configured_error_message,
                )
            if not is_docker_socket_mounted(docker_socket_path=DOCKER_SOCKET_PATH):
                return JSONResponse(
                    status_code=500,
                    content=not_configured_error_message,
                )
            container_stats = get_container_stats(
                docker_socket_path=DOCKER_SOCKET_PATH
            )
            return JSONResponse(status_code=200, content=container_stats)

    cached_api_keys: Dict[AuthorizationCacheKey, AuthorizationCacheEntry] = {}

    if GCP_SERVERLESS:

        @app.middleware("http")
        async def check_authorization_serverless(request: Request, call_next):
            request_id, execution_id_value = (
                _prepare_serverless_observability_context(request=request)
            )
            _log_serverless_request_received(
                request=request,
                request_id=request_id,
                execution_id_value=execution_id_value,
            )
            t1 = time.time()

            # exclusions
            # CVE-2026-48710: use the raw ASGI scope path so a malicious
            # Host header (e.g. `Host: x?/docs`) cannot poison request.url.path
            # and slip an authenticated route into the allowlist.
            scope_path = request.scope["path"]
            skip_check = (
                request.method not in ["GET", "POST"]
                or scope_path
                in [
                    "/",
                    "/docs",
                    "/info",
                    "/healthz",  # health check endpoint for liveness probe
                    "/readiness",
                    "/metrics",
                    "/openapi.json",  # needed for /docs and /redoc
                    "/model/registry",  # dont auth this route, usually not used on serverlerless, but queue based serverless uses it internally (not accessible from outside)
                ]
                or scope_path.startswith("/static/")
                or scope_path.startswith("/_next/")
            )

            # for these routes we only want to auth if dynamic python modules are provided
            if scope_path in [
                "/workflows/blocks/describe",
                "/workflows/definition/schema",
            ]:
                if request.method == "GET":
                    skip_check = True

                elif (
                    get_content_type(request) == "application/json"
                    and int(request.headers.get("content-length", 0)) > 0
                ):
                    json_params = await request.json()
                    dynamic_blocks_definitions = json_params.get(
                        "dynamic_blocks_definitions", None
                    )
                    if not dynamic_blocks_definitions:
                        skip_check = True

            if skip_check:
                return await call_next(request)

            def _authorization_error_response(
                status_code: int,
                msg: str,
                workspace_id: Optional[str] = None,
                cache_hit: bool = False,
            ):
                response = JSONResponse(
                    status_code=status_code,
                    content={
                        "status": status_code,
                        "message": msg,
                    },
                )
                _attach_observability_headers_to_early_response(
                    response=response,
                    request_id=request_id,
                    execution_id_value=execution_id_value,
                    processing_time=time.time() - t1,
                    workspace_id=workspace_id,
                )
                _log_serverless_authorization_denial(
                    request=request,
                    status_code=status_code,
                    message=msg,
                    request_id=request_id,
                    execution_id_value=execution_id_value,
                    workspace_id=workspace_id,
                    cache_hit=cache_hit,
                )
                return response

            try:
                with start_span(
                    "serverless.authorization.check",
                    attributes={
                        "http.method": request.method,
                        # CVE-2026-48710: log the real ASGI path. The span
                        # records the auth decision, so it must not be
                        # forgeable via Host header.
                        "http.target": scope_path,
                    },
                ) as auth_span:
                    req_params = request.query_params
                    json_params = dict()
                    api_key = req_params.get("api_key", None)
                    if (
                        api_key is None
                        and get_content_type(request) == "application/json"
                        and int(request.headers.get("content-length", 0)) > 0
                    ):
                        # try/except is needed here because some legacy endpoints abuse the Content-Type header but don't actually receive JSON
                        try:
                            json_params = await request.json()
                        except Exception:
                            pass
                    api_key = json_params.get("api_key", api_key)

                    if api_key is None:
                        if auth_span is not None:
                            auth_span.set_attribute("http.status_code", 401)
                            auth_span.set_attribute(
                                "auth.result", "missing_api_key"
                            )
                        return _authorization_error_response(
                            401, "Unauthorized api_key"
                        )

                    enforce_credits_verification = (
                        not _is_non_billable_internal_request(
                            req_params=req_params,
                            json_params=json_params,
                        )
                    )
                    cache_key = (api_key, enforce_credits_verification)
                    cache_entry = cached_api_keys.get(cache_key)
                    workspace_id = None
                    if auth_span is not None:
                        auth_span.set_attribute(
                            "auth.enforce_credits_verification",
                            enforce_credits_verification,
                        )
                    if cache_entry and cache_entry.expires_at >= time.time():
                        if auth_span is not None:
                            auth_span.set_attribute("auth.cache_hit", True)
                        if cache_entry.status_code != 200:
                            if auth_span is not None:
                                auth_span.set_attribute(
                                    "http.status_code", cache_entry.status_code
                                )
                                auth_span.set_attribute(
                                    "auth.result", "denied_from_cache"
                                )
                            return _authorization_error_response(
                                cache_entry.status_code,
                                cache_entry.message or "Unauthorized api_key",
                                workspace_id=cache_entry.workspace_id,
                                cache_hit=True,
                            )
                        workspace_id = cache_entry.workspace_id
                    else:
                        if auth_span is not None:
                            auth_span.set_attribute("auth.cache_hit", False)
                        if not enforce_credits_verification:
                            try:
                                workspace_id = await get_roboflow_workspace_async(
                                    api_key=api_key
                                )
                                cached_api_keys[cache_key] = (
                                    AuthorizationCacheEntry(
                                        expires_at=time.time()
                                        + AUTH_CACHE_TTL_SECONDS,
                                        workspace_id=workspace_id,
                                    )
                                )
                            except (
                                RoboflowAPINotAuthorizedError,
                                WorkspaceLoadError,
                            ):
                                cached_api_keys[cache_key] = (
                                    AuthorizationCacheEntry(
                                        expires_at=time.time()
                                        + SHORT_AUTH_CACHE_TTL_SECONDS,
                                        workspace_id=None,
                                        status_code=401,
                                        message="Unauthorized api_key",
                                    )
                                )
                                if auth_span is not None:
                                    auth_span.set_attribute("http.status_code", 401)
                                    auth_span.set_attribute(
                                        "auth.result", "unauthorized"
                                    )
                                return _authorization_error_response(
                                    401,
                                    cached_api_keys[cache_key].message,
                                    cache_hit=False,
                                )
                        else:
                            usage_check_result = (
                                await get_serverless_usage_check_async(
                                    api_key=api_key
                                )
                            )
                            if usage_check_result.status_code == 200:
                                workspace_id = usage_check_result.workspace_id
                                cached_api_keys[cache_key] = (
                                    AuthorizationCacheEntry(
                                        expires_at=time.time()
                                        + AUTH_CACHE_TTL_SECONDS,
                                        workspace_id=workspace_id,
                                    )
                                )
                            elif usage_check_result.status_code == 401:
                                cached_api_keys[cache_key] = (
                                    AuthorizationCacheEntry(
                                        expires_at=time.time()
                                        + SHORT_AUTH_CACHE_TTL_SECONDS,
                                        workspace_id=None,
                                        status_code=401,
                                        message=(
                                            "Unauthorized api_key. This key is not authorized "
                                            "for serverless inference."
                                        ),
                                    )
                                )
                                if auth_span is not None:
                                    auth_span.set_attribute("http.status_code", 401)
                                    auth_span.set_attribute(
                                        "auth.result",
                                        "serverless_inference_unauthorized",
                                    )
                                return _authorization_error_response(
                                    401,
                                    cached_api_keys[cache_key].message,
                                    cache_hit=False,
                                )
                            elif usage_check_result.status_code == 402:
                                message = (
                                    "This workspace cannot currently spend credits for serverless inference. "
                                    "Verify billing or credit cap settings."
                                )
                                if usage_check_result.error:
                                    message = (
                                        f"{message} {usage_check_result.error}"
                                    )
                                cached_api_keys[cache_key] = (
                                    AuthorizationCacheEntry(
                                        expires_at=time.time()
                                        + SHORT_AUTH_CACHE_TTL_SECONDS,
                                        workspace_id=usage_check_result.workspace_id,
                                        status_code=402,
                                        message=message,
                                    )
                                )
                                if auth_span is not None:
                                    auth_span.set_attribute("http.status_code", 402)
                                    auth_span.set_attribute(
                                        "auth.result",
                                        "credits_verification_failed",
                                    )
                                return _authorization_error_response(
                                    402,
                                    cached_api_keys[cache_key].message,
                                    workspace_id=usage_check_result.workspace_id,
                                    cache_hit=False,
                                )

                    if auth_span is not None:
                        auth_span.set_attribute("http.status_code", 200)
                        auth_span.set_attribute("auth.result", "authorized")
                        if workspace_id is not None:
                            auth_span.set_attribute("workspace.id", workspace_id)
            except Exception as error:
                record_error(error)
                raise

            response = await call_next(request)
            if workspace_id:
                response.headers[WORKSPACE_ID_HEADER] = workspace_id
            return response

    if (
        DEDICATED_DEPLOYMENT_WORKSPACE_URL
        or WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT
    ):

        @app.middleware("http")
        async def check_authorization(request: Request, call_next):
            """Authorize requests against the workspaces allowed for this deployment.

            Requests using a method other than GET/POST, or targeting one of the
            public paths listed below, are forwarded without any check. Every
            other request must carry an ``api_key`` - taken from the query
            string, or from the JSON body when the query string has none - that
            resolves to a workspace listed in ``DEDICATED_DEPLOYMENT_WORKSPACE_URL``
            or ``WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT``. Successful
            lookups are stored in ``cached_api_keys`` for
            ``AUTH_CACHE_TTL_SECONDS``.

            Args:
                request: The incoming request.
                call_next: Callable that invokes the rest of the middleware chain.

            Returns:
                The downstream response (with ``WORKSPACE_ID_HEADER`` set when a
                workspace was resolved), or a 401 JSON response when the key is
                missing, malformed or not authorized.
            """
            # exclusions
            # CVE-2026-48710: use the raw ASGI scope path so a malicious
            # Host header (e.g. `Host: x?/docs`) cannot poison request.url.path
            # and slip an authenticated route into the allowlist.
            scope_path = request.scope["path"]
            skip_check = (
                request.method not in ["GET", "POST"]
                or scope_path
                in [
                    "/",
                    "/docs",
                    "/redoc",
                    "/info",
                    "/healthz",  # health check endpoint for liveness probe
                    "/readiness",
                    "/metrics",
                    "/openapi.json",  # needed for /docs and /redoc
                ]
                or scope_path.startswith("/static/")
                or scope_path.startswith("/_next/")
            )
            if skip_check:
                return await call_next(request)

            def _unauthorized_response(msg):
                return JSONResponse(
                    status_code=401,
                    content={
                        "status": 401,
                        "message": msg,
                    },
                )

            # check api_key
            req_params = request.query_params
            json_params = dict()
            api_key = req_params.get("api_key", None)
            if (
                api_key is None
                and get_content_type(request) == "application/json"
                and int(request.headers.get("content-length", 0)) > 0
            ):
                # have to try catch here, because some legacy endpoints that abuse Content-Type header but dont actually receive json
                try:
                    json_params = await request.json()
                except Exception:
                    pass
            # A syntactically valid JSON body is not necessarily an object
            # (e.g. `[1, 2]` or `"text"`); only look a key up in a mapping so
            # such payloads end in a 401 instead of an unhandled error.
            if isinstance(json_params, dict):
                api_key = json_params.get("api_key", api_key)

            # A JSON-provided key may be of any type; anything that is not a
            # string cannot be a valid key (and an unhashable value would
            # break the cache lookup below).
            if api_key is None or not isinstance(api_key, str):
                return _unauthorized_response("Unauthorized api_key")

            cache_entry = cached_api_keys.get(api_key)
            workspace_id = None
            if cache_entry and cache_entry.expires_at >= time.time():
                if cache_entry.status_code != 200:
                    return _unauthorized_response("Unauthorized api_key")
                workspace_id = cache_entry.workspace_id
            else:
                try:
                    workspace_id = await get_roboflow_workspace_async(
                        api_key=api_key
                    )
                except (RoboflowAPINotAuthorizedError, WorkspaceLoadError):
                    return _unauthorized_response("Unauthorized api_key")

                allowed_workspaces = set()
                if DEDICATED_DEPLOYMENT_WORKSPACE_URL:
                    allowed_workspaces.add(DEDICATED_DEPLOYMENT_WORKSPACE_URL)
                if WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT:
                    allowed_workspaces.update(
                        WORKSPACES_WHITELISTED_FOR_LOCAL_DEPLOYMENT
                    )
                if workspace_id not in allowed_workspaces:
                    return _unauthorized_response("Unauthorized api_key")

                # Only positive results are cached by this middleware.
                cached_api_keys[api_key] = AuthorizationCacheEntry(
                    expires_at=time.time() + AUTH_CACHE_TTL_SECONDS,
                    workspace_id=workspace_id,
                )

            response = await call_next(request)
            if workspace_id:
                response.headers[WORKSPACE_ID_HEADER] = workspace_id
            return response

    @app.middleware("http")
    async def add_inference_engine_headers(request: Request, call_next):
        """Stamp each response with the name of the inference engine in use.

        The ``x-inference-engine`` header is set to ``inference-models`` when
        ``USE_INFERENCE_MODELS`` is truthy and to ``old-inference`` otherwise.
        """
        response = await call_next(request)
        if USE_INFERENCE_MODELS:
            engine_name = "inference-models"
        else:
            engine_name = "old-inference"
        response.headers["x-inference-engine"] = engine_name
        return response

    @app.middleware("http")
    async def track_model_load(request: Request, call_next):
        """Attach model-usage and cold-start details to the response headers.

        Fresh collectors are installed into ``model_load_info`` and
        ``request_model_ids`` before the request is processed. Afterwards the
        locally collected data is combined with whatever the optional
        ``request.state.remote_processing_time_collector`` gathered, and the
        resulting headers are added to the response. The workflow id header is
        set when ``request_workflow_id`` holds a truthy value.
        """
        load_collector = ModelLoadCollector()
        model_load_info.set(load_collector)
        ids_collector = RequestModelIds()
        request_model_ids.set(ids_collector)

        response = await call_next(request)

        remote_collector = getattr(
            request.state, "remote_processing_time_collector", None
        )
        if remote_collector is None:
            # Nothing was processed remotely: fall back to empty statistics.
            remote_details = {
                "remote_model_ids": set(),
                "remote_cold_start_entries": [],
                "remote_cold_start_count": 0,
                "remote_cold_start_total_load_time": 0.0,
            }
        else:
            remote_details = {
                "remote_model_ids": remote_collector.snapshot_model_ids(),
                "remote_cold_start_entries": (
                    remote_collector.snapshot_cold_start_entries()
                ),
                "remote_cold_start_count": (
                    remote_collector.snapshot_cold_start_count()
                ),
                "remote_cold_start_total_load_time": (
                    remote_collector.snapshot_cold_start_total_load_time()
                ),
            }

        model_headers = build_model_response_headers(
            local_model_ids=ids_collector.get_ids(),
            local_cold_start_entries=load_collector.snapshot_entries(),
            **remote_details,
        )
        response.headers.update(model_headers)

        workflow_id = request_workflow_id.get(None)
        if workflow_id:
            response.headers[WORKFLOW_ID_HEADER] = workflow_id
        return response

    if API_LOGGING_ENABLED and STRUCTURED_API_LOGGING:

        @app.middleware("http")
        async def structured_access_log(request: Request, call_next):
            """Emit one structured access-log record per handled request.

            The record carries the method, path and status code, plus any of
            the correlation / timing / model headers present on the response
            and the trace id taken from the request's ``traceparent`` header.
            """
            response = await call_next(request)

            # Values are read from response headers instead of ContextVars:
            # @app.middleware("http") uses BaseHTTPMiddleware, which runs the
            # inner chain in a separate asyncio task, so ContextVars set by
            # inner middlewares are not visible here.
            header_fields = {
                "request_id": CORRELATION_ID_HEADER,
                "processing_time": PROCESSING_TIME_HEADER,
                "model_cold_start": MODEL_COLD_START_HEADER,
                "model_cold_start_count": MODEL_COLD_START_COUNT_HEADER,
                "model_load_time": MODEL_LOAD_TIME_HEADER,
                "model_id": MODEL_ID_HEADER,
                "workflow_id": WORKFLOW_ID_HEADER,
                "workspace_id": WORKSPACE_ID_HEADER,
            }
            if EXECUTION_ID_HEADER is not None:
                header_fields["execution_id"] = EXECUTION_ID_HEADER

            log_fields = {
                "method": request.method,
                "path": request.url.path,
                "status_code": response.status_code,
            }
            for field_name, header_name in header_fields.items():
                header_value = response.headers.get(header_name)
                if header_value is None:
                    continue
                log_fields[field_name] = header_value

            # The trace id is the second dash-separated segment of the
            # traceparent header (read from the header for the same
            # ContextVar-isolation reason as above).
            traceparent = request.headers.get("traceparent")
            segments = traceparent.split("-") if traceparent else []
            if len(segments) >= 3:
                log_fields["trace_id"] = segments[1]

            logger.info(
                f"{request.method} {request.url.path} {response.status_code}",
                **log_fields,
            )
            return response

    # Publish the configured application and its collaborators on the instance.
    self.app = app
    self.model_manager = model_manager
    self.stream_manager_client: Optional[StreamManagerClient] = None

    # A shared executor is created only when the feature flag is enabled.
    self.shared_thread_pool_executor: Optional[ThreadPoolExecutor] = (
        ThreadPoolExecutor(
            max_workers=HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_WORKERS
        )
        if HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED
        else None
    )

    # The cache watchdog is started only for the inference-models engine with
    # a positive cache size limit.
    self.inference_models_cache_daemon: Optional[InferenceModelsCacheWatchdog] = (
        None
    )
    if USE_INFERENCE_MODELS and MAX_INFERENCE_MODELS_CACHE_SIZE_MB > 0:
        from inference_models.configuration import INFERENCE_HOME

        self.inference_models_cache_daemon = InferenceModelsCacheWatchdog(
            inference_home=INFERENCE_HOME,
            max_cache_size_mb=MAX_INFERENCE_MODELS_CACHE_SIZE_MB,
            interval_minutes=INFERENCE_MODELS_CACHE_WATCHDOG_INTERVAL_MINUTES,
        )
        self.inference_models_cache_daemon.start()

    if ENABLE_STREAM_API:
        # The timeout is optional; it is converted to float only when set.
        raw_operations_timeout = os.getenv("STREAM_MANAGER_OPERATIONS_TIMEOUT")
        operations_timeout = (
            None
            if raw_operations_timeout is None
            else float(raw_operations_timeout)
        )
        self.stream_manager_client = StreamManagerClient.init(
            host=os.getenv("STREAM_MANAGER_HOST", "127.0.0.1"),
            port=int(os.getenv("STREAM_MANAGER_PORT", "7070")),
            operations_timeout=operations_timeout,
        )
        self._instrumentator.set_stream_manager_client(self.stream_manager_client)

    def process_inference_request(
        inference_request: InferenceRequest,
        api_key: Optional[str] = None,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
        **kwargs,
    ) -> InferenceResponse:
        """Load the requested model and run inference for a single request.

        The model id from the request is de-aliased first; the model is
        registered in the model manager under the resolved id (remembering the
        alias when one was used) and inference is then executed.

        Args:
            inference_request (InferenceRequest): The request containing model ID and other inference details.
            api_key (Optional[str]): When not None, overrides the API key stored on the request.
            countinference (Optional[bool]): Whether to count inference for usage.
            service_secret (Optional[str]): The service secret.
            **kwargs: Extra keyword arguments forwarded to the model manager's
                ``infer_from_request_sync``.

        Returns:
            The model manager's result passed through ``orjson_response``.
        """
        if api_key is not None:
            inference_request.api_key = api_key

        requested_model_id = inference_request.model_id
        de_aliased_model_id = resolve_roboflow_model_alias(
            model_id=requested_model_id
        )
        # An alias is recorded only when resolution actually changed the id.
        if de_aliased_model_id != requested_model_id:
            model_id_alias = requested_model_id
        else:
            model_id_alias = None

        self.model_manager.add_model(
            de_aliased_model_id,
            inference_request.api_key,
            model_id_alias=model_id_alias,
            countinference=countinference,
            service_secret=service_secret,
        )

        # Inference is addressed by the alias when one exists.
        if model_id_alias is None:
            inference_model_id = de_aliased_model_id
        else:
            inference_model_id = requested_model_id
        inference_result = self.model_manager.infer_from_request_sync(
            inference_model_id,
            inference_request,
            **kwargs,
        )
        return orjson_response(inference_result)

    def process_workflow_inference_request(
        workflow_request: WorkflowInferenceRequest,
        workflow_specification: dict,
        background_tasks: Optional[BackgroundTasks],
        profiler: WorkflowsProfiler,
    ) -> WorkflowInferenceResponse:
        """Initialise and run a workflow, returning its filtered outputs.

        Args:
            workflow_request: Request carrying the API key, inputs, excluded
                output fields and (optionally) the workflow id.
            workflow_specification: Workflow definition handed to the
                execution engine.
            background_tasks: Background tasks pool made available to the
                workflow, or None.
            profiler: Profiler used during execution; its exported trace is
                included in the response.

        Returns:
            A ``WorkflowInferenceResponse`` passed through ``orjson_response``.
        """
        workflow_id = workflow_request.workflow_id
        if workflow_id:
            request_workflow_id.set(workflow_id)

        workflow_init_parameters = {
            "workflows_core.model_manager": model_manager,
            "workflows_core.api_key": workflow_request.api_key,
            "workflows_core.background_tasks": background_tasks,
        }
        span_attributes = {"workflow.id": workflow_request.workflow_id or ""}
        with start_span("workflow.init", span_attributes):
            execution_engine = ExecutionEngine.init(
                workflow_definition=workflow_specification,
                init_parameters=workflow_init_parameters,
                max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
                prevent_local_images_loading=True,
                profiler=profiler,
                executor=self.shared_thread_pool_executor,
                workflow_id=workflow_request.workflow_id,
            )

        # Not every request type defines `is_preview`; default to False.
        is_preview = getattr(workflow_request, "is_preview", False)
        workflow_results = execution_engine.run(
            runtime_parameters=workflow_request.inputs,
            serialize_results=True,
            _is_preview=is_preview,
        )

        with profiler.profile_execution_phase(
            name="workflow_results_filtering",
            categories=["inference_package_operation"],
        ):
            outputs = filter_out_unwanted_workflow_outputs(
                workflow_results=workflow_results,
                excluded_fields=workflow_request.excluded_fields,
            )

        response = WorkflowInferenceResponse(
            outputs=outputs,
            profiler_trace=profiler.export_trace(),
        )
        return orjson_response(response=response)

    def load_core_model(
        inference_request: InferenceRequest,
        api_key: Optional[str] = None,
        core_model: Optional[str] = None,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
    ) -> str:
        """Loads a core model (e.g., "clip" or "sam") into the model manager.

        The model ID is built as ``"<core_model>/<version>"``, where the version
        is read from the request attribute named ``"<core_model>_version_id"``.

        Args:
            inference_request (InferenceRequest): The request containing version and other details.
            api_key (Optional[str]): The API key for the request. When truthy, it
                overrides the key stored on the request.
            core_model (Optional[str]): The core model type, e.g., "clip" or "sam".
            countinference (Optional[bool]): Whether to count inference or not.
            service_secret (Optional[str]): The service secret for the request.

        Returns:
            str: The core model ID.

        Raises:
            AttributeError: If the request has no ``"<core_model>_version_id"``
                attribute.
        """
        if api_key:
            inference_request.api_key = api_key
        version_id_field = f"{core_model}_version_id"
        version_id = getattr(inference_request, version_id_field)
        core_model_id = f"{core_model}/{version_id}"
        self.model_manager.add_model(
            core_model_id,
            inference_request.api_key,
            endpoint_type=ModelEndpointType.CORE_MODEL,
            countinference=countinference,
            service_secret=service_secret,
        )
        return core_model_id

    load_clip_model = partial(load_core_model, core_model="clip")
    """Loads the CLIP model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The CLIP model ID.
    """

    load_pe_model = partial(load_core_model, core_model="perception_encoder")
    """Loads the Perception Encoder model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The Perception Encoder model ID.
    """

    load_sam_model = partial(load_core_model, core_model="sam")
    """Loads the SAM model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The SAM model ID.
    """

    load_sam2_model = partial(load_core_model, core_model="sam2")
    """Loads the SAM2 model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The SAM2 model ID.
    """

    load_doctr_model = partial(load_core_model, core_model="doctr")
    """Loads the DocTR model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The DocTR model ID.
    """

    load_easy_ocr_model = partial(load_core_model, core_model="easy_ocr")
    """Loads the EasyOCR model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The EasyOCR model ID.
    """

    load_paligemma_model = partial(load_core_model, core_model="paligemma")
    """Loads the PaliGemma model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The PaliGemma model ID.
    """

    load_grounding_dino_model = partial(
        load_core_model, core_model="grounding_dino"
    )
    """Loads the Grounding DINO model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The Grounding DINO model ID.
    """

    load_yolo_world_model = partial(load_core_model, core_model="yolo_world")
    """Loads the YOLO World model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The YOLO World model ID.
    """

    load_owlv2_model = partial(load_core_model, core_model="owlv2")
    """Loads the OWLv2 model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The OWLv2 model ID.
    """

    load_trocr_model = partial(load_core_model, core_model="trocr")
    """Loads the TrOCR model into the model manager.

    Args:
    Same as `load_core_model`.

    Returns:
    The TrOCR model ID.
    """

    @app.get(
        "/info",
        response_model=ServerVersionInfo,
        summary="Info",
        description="Get the server name and version number",
    )
    def root():
        """Report the identity of this inference server.

        Returns:
            ServerVersionInfo: Server name, package version and the global
            server UUID.
        """
        server_info = ServerVersionInfo(
            name="Roboflow Inference Server",
            version=__version__,
            uuid=GLOBAL_INFERENCE_SERVER_ID,
        )
        return server_info

    @app.get(
        "/logs",
        summary="Get Recent Logs",
        description="Get recent application logs for debugging",
    )
    def get_logs(
        limit: Optional[int] = Query(
            100, description="Maximum number of log entries to return"
        ),
        level: Optional[str] = Query(
            None,
            description="Filter by log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
        ),
        since: Optional[str] = Query(
            None, description="Return logs since this ISO timestamp"
        ),
    ):
        """Get recent application logs from memory.

        Only available when ENABLE_IN_MEMORY_LOGS environment variable is set to 'true'.

        Args:
            limit: Maximum number of log entries (default 100). A missing or
                zero value falls back to 100.
            level: Filter by log level
            since: ISO timestamp to filter logs since

        Returns:
            Dict with ``logs`` (the log entries) and ``total_count`` (their number).

        Raises:
            HTTPException: 404 when in-memory logging is disabled, 500 when the
                logging module cannot be imported.
        """
        # The memory handler is imported lazily; the import sits inside the
        # `try` so that a failing import is reported through the same 500
        # response as any import error raised while fetching the logs.
        try:
            from inference.core.logging.memory_handler import (
                get_recent_logs,
                is_memory_logging_enabled,
            )
        except ImportError as error:
            raise HTTPException(
                status_code=500, detail="Logging system not properly initialized"
            ) from error

        if not is_memory_logging_enabled():
            raise HTTPException(
                status_code=404, detail="Logs endpoint not available"
            )

        try:
            logs = get_recent_logs(limit=limit or 100, level=level, since=since)
        except ImportError as error:  # also covers ModuleNotFoundError
            raise HTTPException(
                status_code=500, detail="Logging system not properly initialized"
            ) from error
        return {"logs": logs, "total_count": len(logs)}

    if not LAMBDA and GET_MODEL_REGISTRY_ENABLED:

        @app.get(
            "/model/registry",
            response_model=ModelsDescriptions,
            summary="Get model keys",
            description="Get the ID of each loaded model",
        )
        def registry():
            """Describe every model currently held by the model manager.

            Returns:
                ModelsDescriptions: The object containing models descriptions
            """
            logger.debug("Reached /model/registry")
            return ModelsDescriptions.from_models_descriptions(
                models_descriptions=self.model_manager.describe_models()
            )

    # The current AWS Lambda authorizer only supports path parameters, so only
    # the legacy infer route can be used there. This condition excludes the
    # routes that would not work with the current Lambda authorizer.
    if not (LAMBDA or GCP_SERVERLESS):

        @app.post(
            "/model/add",
            response_model=ModelsDescriptions,
            summary="Load a model",
            description="Load the model with the given model ID",
        )
        @with_route_exceptions
        def model_add(
            request: AddModelRequest,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Register a model in the model manager and list what is loaded.

            The requested model id is de-aliased before the model is added.

            Args:
                request (AddModelRequest): The request containing the model ID and optional API key.
                countinference (Optional[bool]): Whether to count inference or not.
                service_secret (Optional[str]): The service secret for the request.

            Returns:
                ModelsDescriptions: The object containing models descriptions
            """
            logger.debug("Reached /model/add")
            resolved_model_id = resolve_roboflow_model_alias(
                model_id=request.model_id
            )
            logger.info(f"Loading model: {resolved_model_id}")
            self.model_manager.add_model(
                resolved_model_id,
                request.api_key,
                countinference=countinference,
                service_secret=service_secret,
            )
            return ModelsDescriptions.from_models_descriptions(
                models_descriptions=self.model_manager.describe_models()
            )

        @app.post(
            "/model/remove",
            response_model=ModelsDescriptions,
            summary="Remove a model",
            description="Remove the model with the given model ID",
        )
        @with_route_exceptions
        def model_remove(request: ClearModelRequest):
            """Drop one model from the model manager and list what remains.

            The requested model id is de-aliased before removal.

            Args:
                request (ClearModelRequest): The request containing the model ID to be removed.

            Returns:
                ModelsDescriptions: The object containing models descriptions
            """
            logger.debug("Reached /model/remove")
            resolved_model_id = resolve_roboflow_model_alias(
                model_id=request.model_id
            )
            self.model_manager.remove(resolved_model_id)
            return ModelsDescriptions.from_models_descriptions(
                models_descriptions=self.model_manager.describe_models()
            )

        @app.post(
            "/model/clear",
            response_model=ModelsDescriptions,
            summary="Remove all models",
            description="Remove all loaded models",
        )
        @with_route_exceptions
        def model_clear():
            """Clear the model manager and describe its state afterwards.

            Returns:
                ModelsDescriptions: The object containing models descriptions
            """
            logger.debug("Reached /model/clear")
            self.model_manager.clear()
            return ModelsDescriptions.from_models_descriptions(
                models_descriptions=self.model_manager.describe_models()
            )

    # these NEW endpoints need authentication protection
    if not LAMBDA and not GCP_SERVERLESS:

        @app.post(
            "/infer/object_detection",
            response_model=Union[
                ObjectDetectionInferenceResponse,
                List[ObjectDetectionInferenceResponse],
                StubResponse,
            ],
            summary="Object detection infer",
            description="Run inference with the specified object detection model",
            response_model_exclude_none=True,
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_object_detection(
            inference_request: ObjectDetectionInferenceRequest,
            background_tasks: BackgroundTasks,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Handle an object detection inference request.

            The request is delegated to ``process_inference_request`` and
            flagged as eligible for active learning.

            Args:
                inference_request (ObjectDetectionInferenceRequest): The request containing the necessary details for object detection.
                background_tasks: (BackgroundTasks) pool of fastapi background tasks
                countinference (Optional[bool]): Forwarded to ``process_inference_request``.
                service_secret (Optional[str]): Forwarded to ``process_inference_request``.

            Returns:
                The value returned by ``process_inference_request``.
            """
            logger.debug("Reached /infer/object_detection")
            forwarded_options = {
                "active_learning_eligible": True,
                "background_tasks": background_tasks,
                "countinference": countinference,
                "service_secret": service_secret,
            }
            return process_inference_request(inference_request, **forwarded_options)

        @app.post(
            "/infer/instance_segmentation",
            response_model=Union[
                InstanceSegmentationInferenceResponse, StubResponse
            ],
            summary="Instance segmentation infer",
            description="Run inference with the specified instance segmentation model",
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_instance_segmentation(
            inference_request: InstanceSegmentationInferenceRequest,
            background_tasks: BackgroundTasks,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Handle an instance segmentation inference request.

            The request is delegated to ``process_inference_request`` and
            flagged as eligible for active learning.

            Args:
                inference_request (InstanceSegmentationInferenceRequest): The request containing the necessary details for instance segmentation.
                background_tasks: (BackgroundTasks) pool of fastapi background tasks
                countinference (Optional[bool]): Forwarded to ``process_inference_request``.
                service_secret (Optional[str]): Forwarded to ``process_inference_request``.

            Returns:
                The value returned by ``process_inference_request``.
            """
            logger.debug("Reached /infer/instance_segmentation")
            forwarded_options = {
                "active_learning_eligible": True,
                "background_tasks": background_tasks,
                "countinference": countinference,
                "service_secret": service_secret,
            }
            return process_inference_request(inference_request, **forwarded_options)

        @app.post(
            "/infer/semantic_segmentation",
            response_model=Union[
                SemanticSegmentationInferenceResponse, StubResponse
            ],
            summary="Semantic segmentation infer",
            description="Run inference with the specified semantic segmentation model",
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_semantic_segmentation(
            inference_request: SemanticSegmentationInferenceRequest,
            background_tasks: BackgroundTasks,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Handle a semantic segmentation inference request.

            The request is delegated to ``process_inference_request`` and
            flagged as eligible for active learning.

            Args:
                inference_request (SemanticSegmentationInferenceRequest): The request containing the necessary details for semantic segmentation.
                background_tasks: (BackgroundTasks) pool of fastapi background tasks
                countinference (Optional[bool]): Forwarded to ``process_inference_request``.
                service_secret (Optional[str]): Forwarded to ``process_inference_request``.

            Returns:
                The value returned by ``process_inference_request``.
            """
            logger.debug("Reached /infer/semantic_segmentation")
            forwarded_options = {
                "active_learning_eligible": True,
                "background_tasks": background_tasks,
                "countinference": countinference,
                "service_secret": service_secret,
            }
            return process_inference_request(inference_request, **forwarded_options)

        @app.post(
            "/infer/classification",
            response_model=Union[
                ClassificationInferenceResponse,
                MultiLabelClassificationInferenceResponse,
                StubResponse,
            ],
            summary="Classification infer",
            description="Run inference with the specified classification model",
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_classification(
            inference_request: ClassificationInferenceRequest,
            background_tasks: BackgroundTasks,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Handle a classification inference request.

            The request is delegated to ``process_inference_request`` and
            flagged as eligible for active learning.

            Args:
                inference_request (ClassificationInferenceRequest): The request containing the necessary details for classification.
                background_tasks: (BackgroundTasks) pool of fastapi background tasks
                countinference (Optional[bool]): Forwarded to ``process_inference_request``.
                service_secret (Optional[str]): Forwarded to ``process_inference_request``.

            Returns:
                The value returned by ``process_inference_request``.
            """
            logger.debug("Reached /infer/classification")
            forwarded_options = {
                "active_learning_eligible": True,
                "background_tasks": background_tasks,
                "countinference": countinference,
                "service_secret": service_secret,
            }
            return process_inference_request(inference_request, **forwarded_options)

        @app.post(
            "/infer/keypoints_detection",
            response_model=Union[KeypointsDetectionInferenceResponse, StubResponse],
            summary="Keypoints detection infer",
            description="Run inference with the specified keypoints detection model",
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_keypoints(
            inference_request: KeypointsDetectionInferenceRequest,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Run inference with the specified keypoints detection model.

            Args:
                inference_request (KeypointsDetectionInferenceRequest): The request containing the necessary details for keypoints detection.
                countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

            Returns:
                Union[KeypointsDetectionInferenceResponse, StubResponse]: The response containing the inference results.
            """
            # NOTE(review): unlike the sibling /infer/* handlers in this file, this one
            # passes neither `active_learning_eligible` nor `background_tasks` - confirm
            # that is intentional.
            logger.debug("Reached /infer/keypoints_detection")
            return process_inference_request(
                inference_request,
                countinference=countinference,
                service_secret=service_secret,
            )

    if not LAMBDA and (LMM_ENABLED or MOONDREAM2_ENABLED):

        @app.post(
            "/infer/lmm",
            response_model=Union[
                LMMInferenceResponse,
                List[LMMInferenceResponse],
                StubResponse,
            ],
            summary="Large multi-modal model infer",
            description="Run inference with the specified large multi-modal model",
            response_model_exclude_none=True,
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_lmm(
            inference_request: LMMInferenceRequest,
            api_key: Optional[str] = Query(
                None,
                description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
            ),
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Handle a large multi-modal model inference request.

            Args:
                inference_request (LMMInferenceRequest): Request body describing the LMM inference to run.
                api_key (Optional[str]): API key taken from the query string and forwarded to the processor.
                countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

            Returns:
                The value produced by `process_inference_request`; the route declares a single
                LMM response, a list of them, or a stub response as its response model.
            """
            logger.debug(f"Reached /infer/lmm")
            processing_options = dict(
                api_key=api_key,
                countinference=countinference,
                service_secret=service_secret,
            )
            return process_inference_request(inference_request, **processing_options)

        @app.post(
            "/infer/lmm/{model_id:path}",
            response_model=Union[
                LMMInferenceResponse,
                List[LMMInferenceResponse],
                StubResponse,
            ],
            summary="Large multi-modal model infer with model ID in path",
            description="Run inference with the specified large multi-modal model. Model ID is specified in the URL path (can contain slashes).",
            response_model_exclude_none=True,
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_lmm_with_model_id(
            model_id: str,
            inference_request: LMMInferenceRequest,
            api_key: Optional[str] = Query(
                None,
                description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
            ),
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """Handle an LMM inference request whose model ID comes from the URL path.

            A model ID in the request body is optional, but when present it has to be
            equal to the one in the path.

            Args:
                model_id (str): Model identifier captured from the URL path.
                inference_request (LMMInferenceRequest): Request body describing the LMM inference to run.
                api_key (Optional[str]): API key taken from the query string and forwarded to the processor.
                countinference (Optional[bool]): Forwarded unchanged to `process_inference_request`.
                service_secret (Optional[str]): Forwarded unchanged to `process_inference_request`.

            Returns:
                The value produced by `process_inference_request` for this request.

            Raises:
                HTTPException: With status 400 when the body names a different model than the path.
            """
            logger.debug(f"Reached /infer/lmm/{model_id}")

            body_names_other_model = not (
                inference_request.model_id is None
                or inference_request.model_id == model_id
            )
            if body_names_other_model:
                raise HTTPException(
                    status_code=400,
                    detail=f"Model ID mismatch: path specifies '{model_id}' but request body specifies '{inference_request.model_id}'",
                )

            # Past the check the body either omitted the ID or agrees with the path,
            # so writing the path value is safe in both cases.
            inference_request.model_id = model_id

            processing_options = dict(
                api_key=api_key,
                countinference=countinference,
                service_secret=service_secret,
            )
            return process_inference_request(inference_request, **processing_options)

    if not DISABLE_WORKFLOW_ENDPOINTS:

        @app.post(
            "/{workspace_name}/workflows/{workflow_id}/describe_interface",
            response_model=DescribeInterfaceResponse,
            summary="Endpoint to describe interface of predefined workflow",
            description="Checks Roboflow API for workflow definition, once acquired - describes workflow inputs and outputs",
        )
        @with_route_exceptions
        def describe_predefined_workflow_interface(
            workspace_name: str,
            workflow_id: str,
            workflow_request: PredefinedWorkflowDescribeInterfaceRequest,
        ) -> DescribeInterfaceResponse:
            """Describe the interface of a workflow looked up by workspace and workflow ID.

            Args:
                workspace_name: Workspace segment of the URL path, used as the workspace ID for the lookup.
                workflow_id: Workflow segment of the URL path.
                workflow_request: Body carrying the API key, cache preference and optional workflow version.

            Returns:
                The result of `handle_describe_workflows_interface` for the fetched definition.
            """
            return handle_describe_workflows_interface(
                definition=get_workflow_specification(
                    api_key=workflow_request.api_key,
                    workspace_id=workspace_name,
                    workflow_id=workflow_id,
                    use_cache=workflow_request.use_cache,
                    workflow_version_id=workflow_request.workflow_version_id,
                ),
            )

        @app.post(
            "/workflows/describe_interface",
            response_model=DescribeInterfaceResponse,
            summary="Endpoint to describe interface of workflow given in request",
            description="Parses workflow definition and retrieves describes inputs and outputs",
        )
        @with_route_exceptions
        def describe_workflow_interface(
            workflow_request: WorkflowSpecificationDescribeInterfaceRequest,
        ) -> DescribeInterfaceResponse:
            """Describe the interface of the workflow definition sent in the request body.

            Args:
                workflow_request: Body whose `specification` field holds the workflow definition.

            Returns:
                The result of `handle_describe_workflows_interface` for that definition.
            """
            inline_definition = workflow_request.specification
            return handle_describe_workflows_interface(definition=inline_definition)

        @app.post(
            "/{workspace_name}/workflows/{workflow_id}",
            response_model=WorkflowInferenceResponse,
            summary="Endpoint to run predefined workflow",
            description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body",
        )
        @app.post(
            "/infer/workflows/{workspace_name}/{workflow_id}",
            response_model=WorkflowInferenceResponse,
            summary="[LEGACY] Endpoint to run predefined workflow",
            description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body. This endpoint is deprecated and will be removed end of Q2 2024",
            deprecated=True,
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_from_predefined_workflow(
            workspace_name: str,
            workflow_id: str,
            workflow_request: PredefinedWorkflowInferenceRequest,
            background_tasks: BackgroundTasks,
        ) -> WorkflowInferenceResponse:
            """Fetch a stored workflow definition and run it with the request's parameters.

            Args:
                workspace_name: Workspace segment of the URL path, used as the workspace ID for the lookup.
                workflow_id: Workflow segment of the URL path.
                workflow_request: Body with the API key, cache/version options and runtime parameters.
                background_tasks: FastAPI background task pool; withheld from the processor
                    when `LAMBDA` or `GCP_SERVERLESS` is set.

            Returns:
                The result of `process_workflow_inference_request`.
            """
            # NOTE(review): the original TODO "get rid of async"
            # (https://github.com/roboflow/inference/issues/569) looks stale here,
            # since this handler is already a plain `def` - confirm before removing.
            profiling_requested = (
                ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling
            )
            profiler = (
                BaseWorkflowsProfiler.init(
                    max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE,
                )
                if profiling_requested
                else NullWorkflowsProfiler.init()
            )

            # The definition download is recorded as its own profiling phase.
            with profiler.profile_execution_phase(
                name="workflow_definition_fetching",
                categories=["inference_package_operation"],
            ):
                workflow_specification = get_workflow_specification(
                    api_key=workflow_request.api_key,
                    workspace_id=workspace_name,
                    workflow_id=workflow_id,
                    use_cache=workflow_request.use_cache,
                    workflow_version_id=workflow_request.workflow_version_id,
                )

            # Fill in the workflow ID from the path only when the body left it empty.
            if not workflow_request.workflow_id:
                workflow_request.workflow_id = workflow_id
            if not workflow_specification.get("id"):
                logger.warning(
                    "Internal workflow ID missing in specification for '%s'",
                    workflow_id,
                )

            serverless_runtime = LAMBDA or GCP_SERVERLESS
            return process_workflow_inference_request(
                workflow_request=workflow_request,
                workflow_specification=workflow_specification,
                background_tasks=None if serverless_runtime else background_tasks,
                profiler=profiler,
            )

        @app.post(
            "/workflows/run",
            response_model=WorkflowInferenceResponse,
            summary="Endpoint to run workflow specification provided in payload",
            description="Parses and executes workflow specification, injecting runtime parameters from request body.",
        )
        @app.post(
            "/infer/workflows",
            response_model=WorkflowInferenceResponse,
            summary="[LEGACY] Endpoint to run workflow specification provided in payload",
            description="Parses and executes workflow specification, injecting runtime parameters from request body. This endpoint is deprecated and will be removed end of Q2 2024.",
            deprecated=True,
        )
        @with_route_exceptions
        @usage_collector("request")
        def infer_from_workflow(
            workflow_request: WorkflowSpecificationInferenceRequest,
            background_tasks: BackgroundTasks,
        ) -> WorkflowInferenceResponse:
            """Run the workflow definition supplied in the request body.

            Args:
                workflow_request: Body holding the workflow `specification` and runtime parameters.
                background_tasks: FastAPI background task pool; withheld from the processor
                    when `LAMBDA` or `GCP_SERVERLESS` is set.

            Returns:
                The result of `process_workflow_inference_request`.
            """
            # NOTE(review): the original TODO "get rid of async"
            # (https://github.com/roboflow/inference/issues/569) looks stale here,
            # since this handler is already a plain `def` - confirm before removing.
            profiling_requested = (
                ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling
            )
            profiler = (
                BaseWorkflowsProfiler.init(
                    max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE,
                )
                if profiling_requested
                else NullWorkflowsProfiler.init()
            )
            serverless_runtime = LAMBDA or GCP_SERVERLESS
            return process_workflow_inference_request(
                workflow_request=workflow_request,
                workflow_specification=workflow_request.specification,
                background_tasks=None if serverless_runtime else background_tasks,
                profiler=profiler,
            )

        @app.get(
            "/workflows/execution_engine/versions",
            response_model=ExecutionEngineVersions,
            summary="Returns available Execution Engine versions sorted from oldest to newest",
            description="Returns available Execution Engine versions sorted from oldest to newest",
        )
        @with_route_exceptions
        def get_execution_engine_versions() -> ExecutionEngineVersions:
            """Wrap the output of `get_available_versions` in the response model.

            Returns:
                An `ExecutionEngineVersions` whose `versions` field is that output.
            """
            return ExecutionEngineVersions(versions=get_available_versions())

        @app.get(
            "/workflows/blocks/describe",
            response_model=WorkflowsBlocksDescription,
            summary="[LEGACY] Endpoint to get definition of workflows blocks that are accessible",
            description="Endpoint provides detailed information about workflows building blocks that are "
            "accessible in the inference server. This information could be used to programmatically "
            "build / display workflows.",
            deprecated=True,
        )
        @with_route_exceptions
        def describe_workflows_blocks(
            request: Request,
            air_gapped: bool = Query(False),
        ) -> Union[WorkflowsBlocksDescription, Response]:
            """Describe the available workflow blocks (GET variant, no request body).

            Args:
                request: Incoming request, handed to `gzip_response_if_requested`.
                air_gapped: Query flag forwarded to `handle_describe_workflows_blocks_request`.

            Returns:
                Whatever `gzip_response_if_requested` produces for the blocks description.
            """
            blocks_description = handle_describe_workflows_blocks_request(
                air_gapped=air_gapped,
            )
            return gzip_response_if_requested(
                request=request,
                response=blocks_description,
            )

        @app.post(
            "/workflows/blocks/describe",
            response_model=WorkflowsBlocksDescription,
            summary="[EXPERIMENTAL] Endpoint to get definition of workflows blocks that are accessible",
            description="Endpoint provides detailed information about workflows building blocks that are "
            "accessible in the inference server. This information could be used to programmatically "
            "build / display workflows. Additionally - in request body one can specify list of "
            "dynamic blocks definitions which will be transformed into blocks and used to generate "
            "schemas and definitions of connections",
        )
        @with_route_exceptions
        def describe_workflows_blocks(
            request: Request,
            request_payload: Optional[DescribeBlocksRequest] = None,
            air_gapped: bool = Query(False),
        ) -> Union[WorkflowsBlocksDescription, Response]:
            """Describe the available workflow blocks (POST variant with optional body).

            Args:
                request: Incoming request; its query string may carry `api_key`, and it is
                    handed to `gzip_response_if_requested`.
                request_payload: Optional body with dynamic block definitions, a requested
                    Execution Engine version and an API key.
                air_gapped: Query flag forwarded to `handle_describe_workflows_blocks_request`.

            Returns:
                Whatever `gzip_response_if_requested` produces for the blocks description.
            """
            dynamic_blocks_definitions = None
            requested_execution_engine_version = None
            # The query-string key is the fallback in every case. Previously it was only
            # read when a body was present, so a body-less POST with `?api_key=...`
            # silently lost the key.
            api_key = request.query_params.get("api_key", None)
            if request_payload is not None:
                dynamic_blocks_definitions = (
                    request_payload.dynamic_blocks_definitions
                )
                requested_execution_engine_version = (
                    request_payload.execution_engine_version
                )
                # A key in the body takes precedence over the query string.
                api_key = request_payload.api_key or api_key
            result = handle_describe_workflows_blocks_request(
                dynamic_blocks_definitions=dynamic_blocks_definitions,
                requested_execution_engine_version=requested_execution_engine_version,
                api_key=api_key,
                air_gapped=air_gapped,
            )
            return gzip_response_if_requested(request=request, response=result)

        @app.get(
            "/workflows/definition/schema",
            response_model=WorkflowsBlocksSchemaDescription,
            summary="Endpoint to fetch the workflows block schema",
            description="Endpoint to fetch the schema of all available blocks. This information can be "
            "used to validate workflow definitions and suggest syntax in the JSON editor.",
        )
        @with_route_exceptions
        def get_workflow_schema(
            request: Request,
        ) -> WorkflowsBlocksSchemaDescription:
            """Return the workflows block schema description.

            Args:
                request: Incoming request, handed to `gzip_response_if_requested`.

            Returns:
                Whatever `gzip_response_if_requested` produces for the schema description.
            """
            # NOTE(review): the sibling describe-blocks handlers annotate the same helper's
            # result as possibly being a `Response`; the annotation here may be too narrow.
            return gzip_response_if_requested(
                request=request,
                response=get_workflow_schema_description(),
            )

        @app.post(
            "/workflows/blocks/dynamic_outputs",
            response_model=List[OutputDefinition],
            summary="[EXPERIMENTAL] Endpoint to get definition of dynamic output for workflow step",
            description="Endpoint to be used when step outputs can be discovered only after "
            "filling manifest with data.",
        )
        @with_route_exceptions
        def get_dynamic_block_outputs(
            step_manifest: Dict[str, Any],
        ) -> List[OutputDefinition]:
            """Resolve the outputs of a single step by parsing it inside a throwaway workflow.

            Args:
                step_manifest: Raw manifest of the step whose outputs should be discovered.

            Returns:
                The result of `get_actual_outputs()` on the parsed step manifest.
            """
            # Dynamic blocks do not support dynamic outputs; should that change, their
            # manifests would have to be supplied to the parser here as well.
            single_step_definition = {
                "version": "1.0",
                "inputs": [],
                "steps": [step_manifest],
                "outputs": [],
            }
            parsed_definition = parse_workflow_definition(
                raw_workflow_definition=single_step_definition,
                available_blocks=load_workflow_blocks(),
            )
            # The wrapper holds exactly one step, so it is always at index 0.
            return parsed_definition.steps[0].get_actual_outputs()

        @app.post(
            "/workflows/validate",
            response_model=WorkflowValidationStatus,
            summary="[EXPERIMENTAL] Endpoint to validate",
            description="Endpoint provides a way to check validity of JSON workflow definition.",
        )
        @with_route_exceptions
        def validate_workflow(
            specification: dict,
            api_key: Optional[str] = Query(
                None,
                description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
            ),
        ) -> WorkflowValidationStatus:
            """Validate a workflow definition by initialising an Execution Engine with it.

            The engine instance is discarded; an initialisation that completes without
            raising is what counts as success.

            Args:
                specification: Workflow definition to validate.
                api_key: API key from the query string, placed in the engine init parameters.

            Returns:
                `WorkflowValidationStatus(status="ok")` when initialisation does not raise.
            """
            engine_init_parameters = {
                "workflows_core.model_manager": model_manager,
                "workflows_core.api_key": api_key,
                "workflows_core.background_tasks": None,
                "workflows_core.step_execution_mode": StepExecutionMode(
                    WORKFLOWS_STEP_EXECUTION_MODE
                ),
            }
            ExecutionEngine.init(
                workflow_definition=specification,
                init_parameters=engine_init_parameters,
                max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
                prevent_local_images_loading=True,
            )
            return WorkflowValidationStatus(status="ok")

    if WEBRTC_WORKER_ENABLED:

        @app.post(
            "/initialise_webrtc_worker",
            response_model=InitializeWebRTCResponse,
            summary="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function",
            description="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function",
        )
        @with_route_exceptions_async
        async def initialise_webrtc_worker(
            request: WebRTCWorkerRequest,
            r: Request,
        ) -> InitializeWebRTCResponse:
            """Start a WebRTC worker and translate its result into a response or an exception.

            Args:
                request: WebRTC worker request; `is_preview` is set on it when the call is
                    recognised as coming from the builder.
                r: Raw incoming request, used for its `origin` header and URL.

            Returns:
                An `InitializeWebRTCResponse` carrying the worker's SDP answer.

            Raises:
                WorkflowSyntaxError: When the worker reported that exception type.
                WorkflowError: When the worker reported that exception type.
                Exception: One of the mapped exception types, or plain `Exception` for
                    any type that is not in the mapping.
            """
            # Preview mode needs both a matching Origin header and a matching request URL.
            origin_is_builder = (
                str(r.headers.get("origin")).lower() == BUILDER_ORIGIN.lower()
            )
            if origin_is_builder and re.search(
                r"^https://[^.]+\.roboflow\.[^./]+/", str(r.url).lower()
            ):
                request.is_preview = True

            logger.debug("Received initialise_webrtc_worker request")
            worker_result: WebRTCWorkerResult = await start_worker(
                webrtc_request=request,
            )

            # Success: nothing to rebuild, return the SDP answer straight away.
            if worker_result.exception_type is None:
                logger.debug("Returning initialise_webrtc_worker response")
                return InitializeWebRTCResponse(
                    context=CommandContext(),
                    status=OperationStatus.SUCCESS,
                    sdp=worker_result.answer.sdp,
                    type=worker_result.answer.type,
                )

            if worker_result.exception_type == "WorkflowSyntaxError":
                # Rebuild the exception from the serialised worker result. A class is
                # created on the fly so the original type name (e.g. "ValidationError")
                # survives for the inner_error_type property; exceptions can't be
                # pickled across the worker process boundary.
                inner_error = (
                    type(
                        worker_result.inner_error_type,
                        (Exception,),
                        {},
                    )(worker_result.inner_error)
                    if worker_result.inner_error and worker_result.inner_error_type
                    else None
                )
                blocks_errors = (
                    [WorkflowBlockError(**be) for be in worker_result.blocks_errors]
                    if worker_result.blocks_errors
                    else None
                )
                raise WorkflowSyntaxError(
                    public_message=worker_result.error_message,
                    context=worker_result.error_context,
                    inner_error=inner_error,
                    blocks_errors=blocks_errors,
                )

            if worker_result.exception_type == "WorkflowError":
                raise WorkflowError(
                    public_message=worker_result.error_message,
                    context=worker_result.error_context,
                )

            # Remaining failures: map the reported type name back to an exception class,
            # defaulting to plain Exception for names that are not listed.
            expected_exceptions = {
                "Exception": Exception,
                "KeyError": KeyError,
                "MissingApiKeyError": MissingApiKeyError,
                "NotImplementedError": NotImplementedError,
                "RoboflowAPINotAuthorizedError": RoboflowAPINotAuthorizedError,
                "RoboflowAPINotNotFoundError": RoboflowAPINotNotFoundError,
                "ValidationError": ValidationError,
                "WebRTCConfigurationError": WebRTCConfigurationError,
            }
            exception_class = expected_exceptions.get(
                worker_result.exception_type, Exception
            )
            # The exception is built before logging, matching the original order.
            reconstructed = exception_class(worker_result.error_message)
            logger.error(
                f"Initialise webrtc worker failed with %s: %s",
                worker_result.exception_type,
                worker_result.error_message,
            )
            raise reconstructed

        @app.post(
            "/webrtc/session/heartbeat",
            summary="WebRTC session heartbeat",
        )
        @with_route_exceptions_async
        async def webrtc_session_heartbeat(
            request: WebRTCSessionHeartbeatRequest,
        ) -> dict:
            """Receive heartbeat for an active WebRTC session.

            This endpoint is called periodically to indicate
            that their session is still active. The session will be removed from
            the quota count if no heartbeat is received within the TTL period.

            Requires api_key for authentication.
            """
            # The API key is resolved to a workspace; auth/load failures become a 401.
            try:
                workspace_id = await get_roboflow_workspace_async(
                    api_key=request.api_key
                )
            except (RoboflowAPINotAuthorizedError, WorkspaceLoadError):
                unauthorized = {"status": "error", "message": "unauthorized"}
                raise HTTPException(status_code=401, detail=unauthorized)

            # A falsy workspace ID without an exception is reported as a server error.
            if not workspace_id:
                workspace_missing = {
                    "status": "error",
                    "message": "failed to retrieve workspace",
                }
                raise HTTPException(status_code=500, detail=workspace_missing)

            # A falsy refresh result is reported to the caller as an unknown session.
            if refresh_webrtc_session(
                workspace_id=workspace_id,
                session_id=request.session_id,
            ):
                return {"status": "ok"}
            raise HTTPException(
                status_code=404,
                detail={"status": "error", "message": "session not found"},
            )

        @app.post(
            "/webrtc/session/heartbeat/end",
            summary="End WebRTC session",
        )
        @with_route_exceptions_async
        async def webrtc_session_end(
            request: WebRTCSessionHeartbeatRequest,
        ) -> dict:
            """End a WebRTC session and immediately free the quota slot.

            Requires api_key for authentication.
            """
            # The API key is resolved to a workspace; auth/load failures become a 401.
            try:
                workspace_id = await get_roboflow_workspace_async(
                    api_key=request.api_key
                )
            except (RoboflowAPINotAuthorizedError, WorkspaceLoadError):
                unauthorized = {"status": "error", "message": "unauthorized"}
                raise HTTPException(status_code=401, detail=unauthorized)

            # A falsy workspace ID without an exception is reported as a server error.
            if not workspace_id:
                workspace_missing = {
                    "status": "error",
                    "message": "failed to retrieve workspace",
                }
                raise HTTPException(status_code=500, detail=workspace_missing)

            # The result of deregistration is not inspected; the reply is always "ok".
            deregister_webrtc_session(
                workspace_id=workspace_id,
                session_id=request.session_id,
            )
            return {"status": "ok"}

    if ENABLE_STREAM_API:

        @app.get(
            "/inference_pipelines/list",
            response_model=ListPipelinesResponse,
            summary="[EXPERIMENTAL] List active InferencePipelines",
            description="[EXPERIMENTAL] Listing all active InferencePipelines processing videos",
        )
        @with_route_exceptions_async
        async def list_pipelines(_: Request) -> ListPipelinesResponse:
            """Return the stream manager client's listing of pipelines.

            Args:
                _: Incoming request; unused.

            Returns:
                The awaited result of `stream_manager_client.list_pipelines()`.
            """
            stream_client = self.stream_manager_client
            return await stream_client.list_pipelines()

        @app.get(
            "/inference_pipelines/{pipeline_id}/status",
            response_model=InferencePipelineStatusResponse,
            summary="[EXPERIMENTAL] Get status of InferencePipeline",
            description="[EXPERIMENTAL] Get status of InferencePipeline",
        )
        @with_route_exceptions_async
        async def get_status(pipeline_id: str) -> InferencePipelineStatusResponse:
            """Return the stream manager client's status report for one pipeline.

            Args:
                pipeline_id: Pipeline identifier taken from the URL path.

            Returns:
                The awaited result of `stream_manager_client.get_status(...)`.
            """
            stream_client = self.stream_manager_client
            return await stream_client.get_status(pipeline_id=pipeline_id)

        @app.post(
            "/inference_pipelines/initialise",
            response_model=CommandResponse,
            summary="[EXPERIMENTAL] Starts new InferencePipeline",
            description="[EXPERIMENTAL] Starts new InferencePipeline",
        )
        @with_route_exceptions_async
        async def initialise(request: InitialisePipelinePayload) -> CommandResponse:
            """Ask the stream manager client to initialise a new pipeline.

            Args:
                request: Initialisation payload, passed through unchanged.

            Returns:
                The awaited result of `stream_manager_client.initialise_pipeline(...)`.
            """
            stream_client = self.stream_manager_client
            return await stream_client.initialise_pipeline(
                initialisation_request=request,
            )

        @app.post(
            "/inference_pipelines/initialise_webrtc",
            response_model=InitializeWebRTCPipelineResponse,
            summary="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track",
            description="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track",
        )
        @with_route_exceptions_async
        async def initialise_webrtc_inference_pipeline(
            request: InitialiseWebRTCPipelinePayload,
        ) -> CommandResponse:
            """Ask the stream manager client to initialise a WebRTC-fed pipeline.

            Args:
                request: WebRTC initialisation payload, passed through unchanged.

            Returns:
                The awaited result of `stream_manager_client.initialise_webrtc_pipeline(...)`.
            """
            # NOTE(review): the route declares InitializeWebRTCPipelineResponse as its
            # response model while this annotation says CommandResponse - confirm which
            # one is intended.
            logger.debug("Received initialise webrtc inference pipeline request")
            stream_client = self.stream_manager_client
            pipeline_response = await stream_client.initialise_webrtc_pipeline(
                initialisation_request=request,
            )
            logger.debug("Returning initialise webrtc inference pipeline response")
            return pipeline_response

        @app.post(
            "/inference_pipelines/{pipeline_id}/pause",
            response_model=CommandResponse,
            summary="[EXPERIMENTAL] Pauses the InferencePipeline",
            description="[EXPERIMENTAL] Pauses the InferencePipeline",
        )
        @with_route_exceptions_async
        async def pause(pipeline_id: str) -> CommandResponse:
            """Ask the stream manager client to pause one pipeline.

            Args:
                pipeline_id: Pipeline identifier taken from the URL path.

            Returns:
                The awaited result of `stream_manager_client.pause_pipeline(...)`.
            """
            stream_client = self.stream_manager_client
            return await stream_client.pause_pipeline(pipeline_id=pipeline_id)

        @app.post(
            "/inference_pipelines/{pipeline_id}/resume",
            response_model=CommandResponse,
            summary="[EXPERIMENTAL] Resumes the InferencePipeline",
            description="[EXPERIMENTAL] Resumes the InferencePipeline",
        )
        @with_route_exceptions_async
        async def resume(pipeline_id: str) -> CommandResponse:
            """Ask the stream manager client to resume one pipeline.

            Args:
                pipeline_id: Pipeline identifier taken from the URL path.

            Returns:
                The awaited result of `stream_manager_client.resume_pipeline(...)`.
            """
            stream_client = self.stream_manager_client
            return await stream_client.resume_pipeline(pipeline_id=pipeline_id)

        @app.post(
            "/inference_pipelines/{pipeline_id}/terminate",
            response_model=CommandResponse,
            summary="[EXPERIMENTAL] Terminates the InferencePipeline",
            description="[EXPERIMENTAL] Terminates the InferencePipeline",
        )
        @with_route_exceptions_async
        async def terminate(pipeline_id: str) -> CommandResponse:
            """Ask the stream manager client to terminate one pipeline.

            Args:
                pipeline_id: Pipeline identifier taken from the URL path.

            Returns:
                The awaited result of `stream_manager_client.terminate_pipeline(...)`.
            """
            stream_client = self.stream_manager_client
            return await stream_client.terminate_pipeline(pipeline_id=pipeline_id)

        @app.get(
            "/inference_pipelines/{pipeline_id}/consume",
            response_model=ConsumePipelineResponse,
            summary="[EXPERIMENTAL] Consumes InferencePipeline result",
            description="[EXPERIMENTAL] Consumes InferencePipeline result",
        )
        @with_route_exceptions_async
        async def consume(
            pipeline_id: str,
            request: Optional[ConsumeResultsPayload] = None,
        ) -> ConsumePipelineResponse:
            """Fetch a result from one pipeline through the stream manager client.

            Args:
                pipeline_id: Pipeline identifier taken from the URL path.
                request: Optional payload; a default `ConsumeResultsPayload()` is used
                    when it is omitted.

            Returns:
                The awaited result of `stream_manager_client.consume_pipeline_result(...)`.
            """
            payload = ConsumeResultsPayload() if request is None else request
            stream_client = self.stream_manager_client
            return await stream_client.consume_pipeline_result(
                pipeline_id=pipeline_id,
                excluded_fields=payload.excluded_fields,
            )

    class ModelInitState:
        """Mutable record of how model initialization is progressing."""

        def __init__(self) -> None:
            # Failures collected while loading models; starts out empty.
            self.initialization_errors = []
            # Lock for thread-safe updates of this object.
            self.lock = Lock()
            # A fresh state always starts as "not ready".
            self.is_ready = False

    # Single state object created here; it starts with is_ready False.
    model_init_state = ModelInitState()

    # Preloading is wanted when either PRELOAD_MODELS or PINNED_MODELS is truthy.
    should_preload = PRELOAD_MODELS or PINNED_MODELS
    if not should_preload:
        # Nothing to preload, so the state is marked ready immediately.
        model_init_state.is_ready = True

    # Enable preloading models at startup
    if should_preload:

        def initialize_models(state: ModelInitState):
            """Perform asynchronous initialization tasks to load models."""

            def load_model(model_id):
                """Load one model into the model manager and pin it when configured.

                Failures are logged and appended to `state.initialization_errors`
                instead of being raised.

                Args:
                    model_id: Model identifier as configured; it may be an alias.
                """
                started_at = time.perf_counter()
                de_aliased = resolve_roboflow_model_alias(model_id=model_id)
                # The configured ID counts as an alias only when resolution changed it.
                alias = None if de_aliased == model_id else model_id
                pin_target = alias or de_aliased
                logger.info(
                    f"Preload: starting model load for '{model_id}' (resolved: '{de_aliased}')"
                )
                try:
                    self.model_manager.add_model(
                        de_aliased,
                        PRELOAD_API_KEY,
                        model_id_alias=alias,
                    )
                    load_time = time.perf_counter() - started_at
                    logger.info(
                        f"Preload: model '{model_id}' loaded successfully in {load_time:.1f}s"
                    )
                except Exception as e:
                    load_time = time.perf_counter() - started_at
                    failure_text = f"Preload: error loading model '{model_id}' after {load_time:.1f}s: {e}"
                    logger.error(failure_text)
                    # The error list is shared between loader threads; append under the lock.
                    with state.lock:
                        state.initialization_errors.append((model_id, str(e)))
                    return

                # Pinning applies only to models listed in PINNED_MODELS, and only when
                # the model manager exposes `pin_model`.
                if not PINNED_MODELS or model_id not in PINNED_MODELS:
                    return
                if hasattr(self.model_manager, "pin_model"):
                    self.model_manager.pin_model(pin_target)

            all_models = list(
                dict.fromkeys((PRELOAD_MODELS or []) + (PINNED_MODELS or []))
            )
            if all_models:
                # Create tasks for each model to be loaded
                model_loading_executor = ThreadPoolExecutor(max_workers=2)
                loaded_futures: List[Tuple[str, Future]] = []
                for model_id in all_models:
                    future = model_loading_executor.submit(
                        load_model, model_id=model_id
                    )
                    loaded_futures.append((model_id, future))

                for model_id, future in loaded_futures:
                    try:
                        future.result(timeout=300)
                    except (
                        TimeoutError,
                        CancelledError,
                        concurrent.futures.TimeoutError,
                    ):
                        state.initialization_errors.append(
                            (
                                model_id,
                                "Could not finalise model loading before timeout",
                            )
                        )
                        future.cancel()
                    except Exception as e:
                        logger.error(
                            f"Preload: unexpected error for model '{model_id}': {e}"
                        )
                        with state.lock:
                            state.initialization_errors.append((model_id, str(e)))

            # Update the readiness state in a thread-safe manner
            with state.lock:
                state.is_ready = True

        @app.on_event("startup")
        def startup_model_init():
            """Kick off model preloading on a background daemon thread."""
            # Run in a separate thread so this startup hook returns at once;
            # progress is reported through model_init_state.
            Thread(
                target=initialize_models,
                args=(model_init_state,),
                daemon=True,
            ).start()
            logger.info("Model initialization started in the background.")

    # Attach health/readiness endpoints
    @app.get("/readiness", status_code=200)
    def readiness(
        state: ModelInitState = Depends(lambda: model_init_state),
    ):
        """Readiness endpoint for Kubernetes readiness probe."""
        # Snapshot the flag under the lock, then build the reply outside it.
        with state.lock:
            ready = state.is_ready
        if not ready:
            # 503 while initialization is still in progress.
            return JSONResponse(content={"status": "not ready"}, status_code=503)
        return {"status": "ready"}

    @app.get("/healthz", status_code=200)
    def healthz():
        """Health endpoint for Kubernetes liveness probe.

        Verifies CUDA context health when running on GPU. Returns 503 if
        CUDA is corrupted (unrecoverable - requires process restart).
        """
        # Imported at call time, as in the original handler.
        from inference.core.utils.cuda_health import check_cuda_health

        is_healthy, error = check_cuda_health()
        if not is_healthy:
            # The detailed error goes to the log only; the response body
            # carries a fixed reason code.
            logger.error("CUDA health check failed: %s", error)
            unhealthy_body = {
                "status": "unhealthy",
                "reason": "cuda_error",
            }
            return JSONResponse(content=unhealthy_body, status_code=503)
        return {"status": "healthy"}

    if CORE_MODELS_ENABLED:
        if CORE_MODEL_CLIP_ENABLED:

            @app.post(
                "/clip/embed_image",
                response_model=ClipEmbeddingResponse,
                summary="CLIP Image Embeddings",
                description="Run the Open AI CLIP model to embed image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def clip_embed_image(
                inference_request: ClipImageEmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /clip/embed_image: embed image data with the CLIP model.

                Args:
                    inference_request (ClipImageEmbeddingRequest): Request carrying the image to embed.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_clip_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_clip_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_clip_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: ClipEmbeddingResponse).
                """
                logger.debug("Reached /clip/embed_image")
                model_id = load_clip_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                embedding_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return embedding_result

            @app.post(
                "/clip/embed_text",
                response_model=ClipEmbeddingResponse,
                summary="CLIP Text Embeddings",
                description="Run the Open AI CLIP model to embed text data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def clip_embed_text(
                inference_request: ClipTextEmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /clip/embed_text: embed text data with the CLIP model.

                Args:
                    inference_request (ClipTextEmbeddingRequest): Request carrying the text to embed.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_clip_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_clip_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_clip_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: ClipEmbeddingResponse).
                """
                logger.debug("Reached /clip/embed_text")
                model_id = load_clip_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                embedding_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return embedding_result

            @app.post(
                "/clip/compare",
                response_model=ClipCompareResponse,
                summary="CLIP Compare",
                description="Run the Open AI CLIP model to compute similarity scores.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def clip_compare(
                inference_request: ClipCompareRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /clip/compare: compute similarity scores with the CLIP model.

                Args:
                    inference_request (ClipCompareRequest): Request carrying the data to compare.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_clip_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_clip_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_clip_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: ClipCompareResponse).
                """
                logger.debug("Reached /clip/compare")
                model_id = load_clip_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                comparison_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    # Compare requests are tracked with n=2.
                    trackUsage(model_id, actor, n=2)
                return comparison_result

        if CORE_MODEL_PE_ENABLED:

            @app.post(
                "/perception_encoder/embed_image",
                response_model=PerceptionEncoderEmbeddingResponse,
                summary="PE Image Embeddings",
                description="Run the Meta Perception Encoder model to embed image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def pe_embed_image(
                inference_request: PerceptionEncoderImageEmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /perception_encoder/embed_image: embed image data with the Perception Encoder model.

                Args:
                    inference_request (PerceptionEncoderImageEmbeddingRequest): Request carrying the image to embed.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_pe_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_pe_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_pe_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: PerceptionEncoderEmbeddingResponse).
                """
                logger.debug("Reached /perception_encoder/embed_image")
                model_id = load_pe_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                embedding_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return embedding_result

            @app.post(
                "/perception_encoder/embed_text",
                response_model=PerceptionEncoderEmbeddingResponse,
                summary="Perception Encoder Text Embeddings",
                description="Run the Meta Perception Encoder model to embed text data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def pe_embed_text(
                inference_request: PerceptionEncoderTextEmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /perception_encoder/embed_text: embed text data with the Perception Encoder model.

                Args:
                    inference_request (PerceptionEncoderTextEmbeddingRequest): Request carrying the text to embed.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_pe_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_pe_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_pe_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: PerceptionEncoderEmbeddingResponse).
                """
                logger.debug("Reached /perception_encoder/embed_text")
                model_id = load_pe_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                embedding_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return embedding_result

            @app.post(
                "/perception_encoder/compare",
                response_model=PerceptionEncoderCompareResponse,
                summary="Perception Encoder Compare",
                description="Run the Meta Perception Encoder model to compute similarity scores.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def pe_compare(
                inference_request: PerceptionEncoderCompareRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /perception_encoder/compare: compute similarity scores with the Perception Encoder model.

                Args:
                    inference_request (PerceptionEncoderCompareRequest): Request carrying the data to compare.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_pe_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_pe_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_pe_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: PerceptionEncoderCompareResponse).
                """
                logger.debug("Reached /perception_encoder/compare")
                model_id = load_pe_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                comparison_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    # Compare requests are tracked with n=2.
                    trackUsage(model_id, actor, n=2)
                return comparison_result

        if CORE_MODEL_GROUNDINGDINO_ENABLED:

            @app.post(
                "/grounding_dino/infer",
                response_model=ObjectDetectionInferenceResponse,
                summary="Grounding DINO inference.",
                description="Run the Grounding DINO zero-shot object detection model.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def grounding_dino_infer(
                inference_request: GroundingDINOInferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /grounding_dino/infer: run the Grounding DINO zero-shot object detection model.

                Args:
                    inference_request (GroundingDINOInferenceRequest): Request carrying the image on which to run object detection.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_grounding_dino_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_grounding_dino_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_grounding_dino_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: ObjectDetectionInferenceResponse).
                """
                logger.debug("Reached /grounding_dino/infer")
                model_id = load_grounding_dino_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                detection_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return detection_result

        if CORE_MODEL_YOLO_WORLD_ENABLED:

            @app.post(
                "/yolo_world/infer",
                response_model=ObjectDetectionInferenceResponse,
                summary="YOLO-World inference.",
                description="Run the YOLO-World zero-shot object detection model.",
                response_model_exclude_none=True,
            )
            @with_route_exceptions
            @usage_collector("request")
            def yolo_world_infer(
                inference_request: YOLOWorldInferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /yolo_world/infer: run the YOLO-World zero-shot object detection model.

                Args:
                    inference_request (YOLOWorldInferenceRequest): Request carrying the image on which to run object detection.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_yolo_world_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_yolo_world_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_yolo_world_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: ObjectDetectionInferenceResponse).
                """
                logger.debug("Reached /yolo_world/infer. Loading model")
                model_id = load_yolo_world_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                logger.debug("YOLOWorld model loaded. Staring the inference.")
                detection_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                logger.debug("YOLOWorld prediction available.")
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                    logger.debug("Usage of YOLOWorld denoted.")
                return detection_result

        if CORE_MODEL_DOCTR_ENABLED:

            @app.post(
                "/doctr/ocr",
                response_model=Union[
                    OCRInferenceResponse, List[OCRInferenceResponse]
                ],
                summary="DocTR OCR response",
                description="Run the DocTR OCR model to retrieve text in an image.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def doctr_retrieve_text(
                inference_request: DoctrOCRInferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /doctr/ocr: retrieve text from an image with the DocTR OCR model.

                Args:
                    inference_request (DoctrOCRInferenceRequest): Request carrying the image from which to retrieve text.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_doctr_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_doctr_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_doctr_model`.

                Returns:
                    The model output wrapped by `orjson_response_keeping_parent_id`.
                """
                logger.debug("Reached /doctr/ocr")
                model_id = load_doctr_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                ocr_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return orjson_response_keeping_parent_id(ocr_result)

        if CORE_MODEL_EASYOCR_ENABLED:

            @app.post(
                "/easy_ocr/ocr",
                response_model=Union[
                    OCRInferenceResponse, List[OCRInferenceResponse]
                ],
                summary="EasyOCR OCR response",
                description="Run the EasyOCR model to retrieve text in an image.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def easy_ocr_retrieve_text(
                inference_request: EasyOCRInferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /easy_ocr/ocr: retrieve text from an image with the EasyOCR model.

                Args:
                    inference_request (EasyOCRInferenceRequest): Request carrying the image from which to retrieve text.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_easy_ocr_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_easy_ocr_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_easy_ocr_model`.

                Returns:
                    The model output wrapped by `orjson_response_keeping_parent_id`.
                """
                logger.debug("Reached /easy_ocr/ocr")
                model_id = load_easy_ocr_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                ocr_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                return orjson_response_keeping_parent_id(ocr_result)

        if CORE_MODEL_SAM_ENABLED:

            @app.post(
                "/sam/embed_image",
                response_model=SamEmbeddingResponse,
                summary="SAM Image Embeddings",
                description="Run the Meta AI Segmant Anything Model to embed image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam_embed_image(
                inference_request: SamEmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /sam/embed_image: embed image data with the Segment Anything Model (SAM).

                Args:
                    inference_request (SamEmbeddingRequest): Request carrying the image to embed; its `format` field selects the reply encoding.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_sam_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_sam_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_sam_model`.

                Returns:
                    A `Response` with content type application/octet-stream wrapping `embeddings` when the
                    requested format is "binary"; otherwise the model manager's result unchanged.
                """
                logger.debug("Reached /sam/embed_image")
                model_id = load_sam_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                embedding_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                if inference_request.format != "binary":
                    return embedding_result
                # Binary callers receive the raw embeddings payload.
                return Response(
                    content=embedding_result.embeddings,
                    headers={"Content-Type": "application/octet-stream"},
                )

            @app.post(
                "/sam/segment_image",
                response_model=SamSegmentationResponse,
                summary="SAM Image Segmentation",
                description="Run the Meta AI Segmant Anything Model to generate segmenations for image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam_segment_image(
                inference_request: SamSegmentationRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /sam/segment_image: generate segmentations with the Segment Anything Model (SAM).

                Args:
                    inference_request (SamSegmentationRequest): Request carrying the image to segment; its `format` field selects the reply encoding.
                    request (Request): Incoming HTTP request; read only when LAMBDA is set, to find the actor for usage tracking.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_sam_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_sam_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_sam_model`.

                Returns:
                    A `Response` with content type application/octet-stream wrapping the model output when the
                    requested format is "binary"; otherwise the model manager's result unchanged.
                """
                logger.debug("Reached /sam/segment_image")
                model_id = load_sam_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                segmentation_result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if LAMBDA:
                    # The actor comes from the Lambda authorizer context of the request.
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    actor = authorizer["lambda"]["actor"]
                    trackUsage(model_id, actor)
                if inference_request.format != "binary":
                    return segmentation_result
                # NOTE(review): the model output itself is used as the body here,
                # so it is presumably already bytes for binary requests - confirm.
                return Response(
                    content=segmentation_result,
                    headers={"Content-Type": "application/octet-stream"},
                )

        if CORE_MODEL_SAM2_ENABLED:

            @app.post(
                "/sam2/embed_image",
                response_model=Sam2EmbeddingResponse,
                summary="SAM2 Image Embeddings",
                description="Run the Meta AI Segment Anything 2 Model to embed image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam2_embed_image(
                inference_request: Sam2EmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Handle POST /sam2/embed_image: embed image data with the Segment Anything 2 model.

                Args:
                    inference_request (Sam2EmbeddingRequest): Request carrying the image to embed.
                    request (Request): Incoming HTTP request; not read by this handler's body.
                    api_key (Optional[str], default None): Roboflow API key forwarded to `load_sam2_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_sam2_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_sam2_model`.

                Returns:
                    The value returned by `model_manager.infer_from_request_sync` (declared response model: Sam2EmbeddingResponse).
                """
                logger.debug("Reached /sam2/embed_image")
                model_id = load_sam2_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                return self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )

            @app.post(
                "/sam2/segment_image",
                response_model=Sam2SegmentationResponse,
                summary="SAM2 Image Segmentation",
                description="Run the Meta AI Segment Anything 2 Model to generate segmenations for image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam2_segment_image(
                inference_request: Sam2SegmentationRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Generates segmentations for image data using the Meta AI Segment Anything 2 Model (SAM2).

                Args:
                    inference_request (Sam2SegmentationRequest): The request containing the image to be segmented.
                    request (Request): The HTTP request (not read by this handler's body).
                    api_key (Optional[str], default None): Roboflow API Key forwarded to `load_sam2_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_sam2_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_sam2_model`.

                Returns:
                    The model manager's response, or a `Response` carrying that payload as
                    `application/octet-stream` when `inference_request.format` is "binary".
                """
                logger.debug("Reached /sam2/segment_image")
                model_id = load_sam2_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                segmentation = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )
                if inference_request.format != "binary":
                    return segmentation
                # Binary output is returned as a raw octet stream instead of JSON.
                return Response(
                    content=segmentation,
                    headers={"Content-Type": "application/octet-stream"},
                )

        if CORE_MODEL_SAM3_ENABLED:

            @app.post(
                "/sam3/embed_image",
                response_model=Sam3EmbeddingResponse,
                summary="Seg preview Image Embeddings",
                description="Run the  Model to embed image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam3_embed_image(
                inference_request: Sam2EmbeddingRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Embeds image data using the "sam3/sam3_interactive" model.

                Args:
                    inference_request (Sam2EmbeddingRequest): The request containing the image to be embedded.
                    request (Request): The HTTP request (not read by this handler's body).
                    api_key (Optional[str], default None): Roboflow API Key passed to the model manager when registering the model.
                    countinference (Optional[bool], default None): Forwarded unchanged to `add_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `add_model`.

                Returns:
                    The model manager's response for the request.

                Raises:
                    HTTPException: With status 501 when `SAM3_EXEC_MODE` is "remote".
                """
                logger.debug("Reached /sam3/embed_image")

                # Embedding has no remote implementation in this handler; reject early.
                if SAM3_EXEC_MODE == "remote":
                    raise HTTPException(
                        status_code=501,
                        detail="SAM3 embedding is not supported in remote execution mode.",
                    )

                interactive_model_id = "sam3/sam3_interactive"
                self.model_manager.add_model(
                    interactive_model_id,
                    api_key=api_key,
                    endpoint_type=ModelEndpointType.CORE_MODEL,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                return self.model_manager.infer_from_request_sync(
                    interactive_model_id, inference_request
                )

        if CORE_MODEL_SAM3_ENABLED:

            @app.post(
                "/sam3/concept_segment",
                response_model=Sam3SegmentationResponse,
                summary="SAM3 PCS (promptable concept segmentation)",
                description="Run the SAM3 PCS (promptable concept segmentation) to generate segmentations for image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam3_segment_image(
                inference_request: Sam3SegmentationRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Generates segmentations for image data using SAM3 PCS (promptable concept segmentation).

                When `SAM3_EXEC_MODE` is "remote" the request is forwarded to the
                `/inferenceproxy/seg-preview` endpoint under `API_BASE_URL`; otherwise the
                model named by `inference_request.model_id` is loaded and run locally.

                Args:
                    inference_request (Sam3SegmentationRequest): The request containing the image and prompts.
                    request (Request): The HTTP request (not read by this handler's body).
                    api_key (Optional[str], default None): Roboflow API Key; sent as a query parameter in remote
                        mode, or passed to the model manager in local mode.
                    countinference (Optional[bool], default None): Forwarded unchanged to `add_model` (local mode only).
                    service_secret (Optional[str], default None): Forwarded unchanged to `add_model` (local mode only).

                Returns:
                    Sam3SegmentationResponse built from the remote JSON in remote mode. In local mode, the
                    model manager's response, or a `Response` carrying it as `application/octet-stream`
                    when `inference_request.format` is "binary".

                Raises:
                    HTTPException: 501 when fine-tuned models are disabled and the model id does not start
                        with "sam3/"; 500 when the remote request (or parsing its response) fails.
                """
                logger.debug("Reached /sam3/concept_segment")

                # Short-circuit keeps model_id untouched when fine-tuned models are enabled.
                if (
                    not SAM3_FINE_TUNED_MODELS_ENABLED
                    and not inference_request.model_id.startswith("sam3/")
                ):
                    raise HTTPException(
                        status_code=501,
                        detail="Fine-tuned SAM3 models are not supported on this deployment. Please use a workflow or self-host the server.",
                    )

                if SAM3_EXEC_MODE == "remote":
                    endpoint = f"{API_BASE_URL}/inferenceproxy/seg-preview"

                    # Payload sent to the remote API:
                    # {
                    #     "image": {"type": ..., "value": ...},
                    #     "prompts": [{"type": "text", "text": ...}, ...],
                    #     "output_prob_thresh": ...
                    # }
                    http_prompts = []
                    for prompt in inference_request.prompts:
                        p_dict = prompt.dict(exclude_none=True)
                        # Default the prompt type to "text" when only text was supplied.
                        if "type" not in p_dict and "text" in p_dict:
                            p_dict["type"] = "text"
                        http_prompts.append(p_dict)

                    # The image is forwarded as-is whatever its type (base64, url, numpy, ...).
                    # Every branch of the former if/elif chain produced this same dict.
                    http_image = {
                        "type": inference_request.image.type,
                        "value": inference_request.image.value,
                    }

                    payload = {
                        "image": http_image,
                        "prompts": http_prompts,
                        "output_prob_thresh": inference_request.output_prob_thresh,
                    }

                    try:
                        headers = {"Content-Type": "application/json"}
                        if ROBOFLOW_INTERNAL_SERVICE_NAME:
                            headers["X-Roboflow-Internal-Service-Name"] = (
                                ROBOFLOW_INTERNAL_SERVICE_NAME
                            )
                        if ROBOFLOW_INTERNAL_SERVICE_SECRET:
                            headers["X-Roboflow-Internal-Service-Secret"] = (
                                ROBOFLOW_INTERNAL_SERVICE_SECRET
                            )

                        headers = build_roboflow_api_headers(
                            explicit_headers=headers
                        )

                        # NOTE(review): when api_key is None the URL carries the literal
                        # "api_key=None" - confirm the remote side tolerates that.
                        response = requests.post(
                            wrap_url(f"{endpoint}?api_key={api_key}"),
                            json=payload,
                            headers=headers,
                            timeout=60,
                        )
                        response.raise_for_status()
                        resp_json = response.json()

                        # The remote API returns the same structure as Sam3SegmentationResponse
                        return Sam3SegmentationResponse(**resp_json)

                    except Exception as e:
                        # NOTE(review): the exception text may include the request URL (and
                        # therefore the api_key) - consider redacting before logging/returning.
                        logger.error(f"SAM3 remote request failed: {e}")
                        raise HTTPException(
                            status_code=500,
                            detail=f"SAM3 remote request failed: {str(e)}",
                        ) from e

                # Built-in "sam3/..." ids are core models; anything else is loaded as ORT.
                endpoint_type = (
                    ModelEndpointType.CORE_MODEL
                    if inference_request.model_id.startswith("sam3/")
                    else ModelEndpointType.ORT
                )
                self.model_manager.add_model(
                    inference_request.model_id,
                    api_key=api_key,
                    endpoint_type=endpoint_type,
                    countinference=countinference,
                    service_secret=service_secret,
                )

                model_response = self.model_manager.infer_from_request_sync(
                    inference_request.model_id, inference_request
                )
                if inference_request.format == "binary":
                    return Response(
                        content=model_response,
                        headers={"Content-Type": "application/octet-stream"},
                    )
                return model_response

            @app.post(
                "/sam3/visual_segment",
                response_model=Sam2SegmentationResponse,
                summary="SAM3 PVS (promptable visual segmentation)",
                description="Run the SAM3 PVS (promptable visual segmentation) to generate segmentations for image data.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam3_visual_segment(
                inference_request: Sam2SegmentationRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Generates segmentations for image data using SAM3 PVS (promptable visual segmentation).

                When `SAM3_EXEC_MODE` is "remote" the request is forwarded to the
                `/inferenceproxy/sam3-pvs` endpoint under `API_BASE_URL`; otherwise the
                "sam3/sam3_interactive" model is loaded and run locally.

                Args:
                    inference_request (Sam2SegmentationRequest): The request containing the image and prompts.
                    request (Request): The HTTP request (not read by this handler's body).
                    api_key (Optional[str], default None): Roboflow API Key; sent as a query parameter in remote
                        mode, or passed to the model manager in local mode.
                    countinference (Optional[bool], default None): Forwarded unchanged to `add_model` (local mode only).
                    service_secret (Optional[str], default None): Forwarded unchanged to `add_model` (local mode only).

                Returns:
                    Sam2SegmentationResponse built from the remote JSON in remote mode, otherwise the
                    model manager's response.

                Raises:
                    HTTPException: 500 when the remote request (or parsing its response) fails.
                """
                logger.debug("Reached /sam3/visual_segment")

                if SAM3_EXEC_MODE == "remote":
                    endpoint = f"{API_BASE_URL}/inferenceproxy/sam3-pvs"

                    http_image = {
                        "type": inference_request.image.type,
                        "value": inference_request.image.value,
                    }

                    # Prompts are optional; send null when none were provided.
                    prompts_data = (
                        inference_request.prompts.dict(exclude_none=True)
                        if inference_request.prompts
                        else None
                    )

                    payload = {
                        "image": http_image,
                        "prompts": prompts_data,
                        "multimask_output": inference_request.multimask_output,
                    }

                    try:
                        headers = {"Content-Type": "application/json"}
                        if ROBOFLOW_INTERNAL_SERVICE_NAME:
                            headers["X-Roboflow-Internal-Service-Name"] = (
                                ROBOFLOW_INTERNAL_SERVICE_NAME
                            )
                        if ROBOFLOW_INTERNAL_SERVICE_SECRET:
                            headers["X-Roboflow-Internal-Service-Secret"] = (
                                ROBOFLOW_INTERNAL_SERVICE_SECRET
                            )

                        headers = build_roboflow_api_headers(
                            explicit_headers=headers
                        )

                        # NOTE(review): when api_key is None the URL carries the literal
                        # "api_key=None" - confirm the remote side tolerates that.
                        response = requests.post(
                            wrap_url(f"{endpoint}?api_key={api_key}"),
                            json=payload,
                            headers=headers,
                            timeout=60,
                        )
                        response.raise_for_status()
                        resp_json = response.json()

                        return Sam2SegmentationResponse(**resp_json)

                    except Exception as e:
                        # NOTE(review): the exception text may include the request URL (and
                        # therefore the api_key) - consider redacting before logging/returning.
                        logger.error(
                            f"SAM3 visual_segment remote request failed: {e}"
                        )
                        # Chain explicitly so the original failure stays attached as the cause.
                        raise HTTPException(
                            status_code=500,
                            detail=f"SAM3 visual_segment remote request failed: {str(e)}",
                        ) from e

                self.model_manager.add_model(
                    "sam3/sam3_interactive",
                    api_key=api_key,
                    endpoint_type=ModelEndpointType.CORE_MODEL,
                    countinference=countinference,
                    service_secret=service_secret,
                )

                model_response = self.model_manager.infer_from_request_sync(
                    "sam3/sam3_interactive", inference_request
                )
                return model_response

        if SAM3_3D_OBJECTS_ENABLED:

            @app.post(
                "/sam3_3d/infer",
                summary="SAM3 3D Object Generation",
                description="Generate 3D meshes and Gaussian splatting from 2D images with mask prompts.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def sam3_3d_infer(
                inference_request: Sam3_3D_Objects_InferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """Generate 3D meshes and Gaussian splatting from 2D images with mask prompts.

                Args:
                    inference_request (Sam3_3D_Objects_InferenceRequest): The request containing
                        the image and mask input for 3D generation. Its `model_id` is used when
                        truthy; otherwise "sam3-3d-objects" is used.
                    request (Request): The HTTP request; read only when `LAMBDA` is set, to
                        extract the actor for usage tracking.
                    api_key (Optional[str]): Roboflow API Key for artifact retrieval.
                    countinference (Optional[bool]): Forwarded unchanged to `add_model`.
                    service_secret (Optional[str]): Forwarded unchanged to `add_model`.

                Returns:
                    dict: Response containing base64-encoded 3D outputs:
                        - mesh_glb: Scene mesh in GLB format (base64, or None)
                        - gaussian_ply: Combined Gaussian splatting in PLY format (base64, or None)
                        - objects: List of individual objects with their 3D data and metadata
                        - time: Inference time as reported by the model response
                """
                logger.debug("Reached /sam3_3d/infer")
                model_id = inference_request.model_id or "sam3-3d-objects"

                self.model_manager.add_model(
                    model_id,
                    api_key=api_key,
                    endpoint_type=ModelEndpointType.CORE_MODEL,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                result = self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )

                if LAMBDA:
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    trackUsage(model_id, authorizer["lambda"]["actor"])

                def encode_bytes(data):
                    # Raw bytes are not JSON serializable; emit base64 text and keep None as None.
                    return (
                        None
                        if data is None
                        else base64.b64encode(data).decode("utf-8")
                    )

                # Per-object payloads are built before the scene-level fields.
                objects_list = [
                    {
                        "mesh_glb": encode_bytes(item.mesh_glb),
                        "gaussian_ply": encode_bytes(item.gaussian_ply),
                        "metadata": {
                            "rotation": item.metadata.rotation,
                            "translation": item.metadata.translation,
                            "scale": item.metadata.scale,
                        },
                    }
                    for item in result.objects
                ]

                return {
                    "mesh_glb": encode_bytes(result.mesh_glb),
                    "gaussian_ply": encode_bytes(result.gaussian_ply),
                    "objects": objects_list,
                    "time": result.time,
                }

        if CORE_MODEL_OWLV2_ENABLED:

            @app.post(
                "/owlv2/infer",
                response_model=ObjectDetectionInferenceResponse,
                summary="Owlv2 image prompting",
                description="Run the google owlv2 model to few-shot object detect",
            )
            @with_route_exceptions
            @usage_collector("request")
            def owlv2_infer(
                inference_request: OwlV2InferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Runs inference for an OWLv2 request.

                Args:
                    inference_request (OwlV2InferenceRequest): The request to run against the OWLv2 model.
                    request (Request): The HTTP request (not read by this handler's body).
                    api_key (Optional[str], default None): Roboflow API Key forwarded to `load_owlv2_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_owlv2_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_owlv2_model`.

                Returns:
                    The model manager's response for the request, run against the model id returned by `load_owlv2_model`.
                """
                logger.debug("Reached /owlv2/infer")
                model_id = load_owlv2_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                return self.model_manager.infer_from_request_sync(
                    model_id, inference_request
                )

        if CORE_MODEL_GAZE_ENABLED:

            @app.post(
                "/gaze/gaze_detection",
                summary="Gaze Detection (deprecated)",
                description=(
                    "Deprecated. Always returns HTTP 410 Gone. The endpoint stub "
                    "will be removed end of Q2 2026."
                ),
                deprecated=True,
            )
            @with_route_exceptions
            def gaze_detection_deprecated():
                """
                Stub for the removed gaze detection endpoint.

                Raises:
                    FeatureDeprecatedError: Always; the handler performs no inference.
                """
                deprecation = FeatureDeprecatedError(
                    feature="/gaze/gaze_detection",
                    removal_release="end of Q2 2026",
                    reason="MediaPipe dependency removed from inference; endpoint is a 410 stub.",
                )
                raise deprecation

        if DEPTH_ESTIMATION_ENABLED:

            @app.post(
                "/infer/depth-estimation",
                response_model=DepthEstimationResponse,
                summary="Depth Estimation",
                description="Run the depth estimation model to generate a depth map.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def depth_estimation(
                inference_request: DepthEstimationRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Generate a depth map using the depth estimation model.

                Args:
                    inference_request (DepthEstimationRequest): The request containing the image to estimate depth for.
                        Its `model_id` selects the model and its `api_key` is the key handed to the model manager.
                    request (Request): The HTTP request; read only when `LAMBDA` is set, to extract the actor
                        for usage tracking.
                    api_key (Optional[str], default None): Roboflow API Key query parameter.
                    countinference (Optional[bool], default None): Forwarded unchanged to `add_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `add_model`.

                Returns:
                    DepthEstimationResponse: Built from the model output's "normalized_depth" (converted with
                    `.tolist()`) and the `base64_image` of its "image" entry.
                """
                logger.debug("Reached /infer/depth-estimation")
                depth_model_id = inference_request.model_id
                # NOTE(review): the model is registered with inference_request.api_key; the `api_key`
                # query parameter is not read in this handler - confirm this is intentional.
                self.model_manager.add_model(
                    depth_model_id,
                    inference_request.api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                model_output = self.model_manager.infer_from_request_sync(
                    depth_model_id, inference_request
                )
                if LAMBDA:
                    authorizer = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]
                    trackUsage(depth_model_id, authorizer["lambda"]["actor"])

                # The depth payload sits one level down, on the `response` attribute.
                depth_data = model_output.response
                return DepthEstimationResponse(
                    normalized_depth=depth_data["normalized_depth"].tolist(),
                    image=depth_data["image"].base64_image,
                )

        if CORE_MODEL_TROCR_ENABLED:

            @app.post(
                "/ocr/trocr",
                response_model=OCRInferenceResponse,
                summary="TrOCR OCR response",
                description="Run the TrOCR model to retrieve text in an image.",
            )
            @with_route_exceptions
            @usage_collector("request")
            def trocr_retrieve_text(
                inference_request: TrOCRInferenceRequest,
                request: Request,
                api_key: Optional[str] = Query(
                    None,
                    description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
                ),
                countinference: Optional[bool] = None,
                service_secret: Optional[str] = None,
            ):
                """
                Retrieves text from image data using the TrOCR model.

                Args:
                    inference_request (TrOCRInferenceRequest): The request containing the image from which to retrieve text.
                    request (Request): The HTTP request; read only when `LAMBDA` is set, to extract the actor
                        for usage tracking.
                    api_key (Optional[str], default None): Roboflow API Key forwarded to `load_trocr_model`.
                    countinference (Optional[bool], default None): Forwarded unchanged to `load_trocr_model`.
                    service_secret (Optional[str], default None): Forwarded unchanged to `load_trocr_model`.

                Returns:
                    The model response passed through `orjson_response_keeping_parent_id`.
                """
                # Log the path this handler is actually mounted on ("/ocr/trocr").
                logger.debug("Reached /ocr/trocr")
                trocr_model_id = load_trocr_model(
                    inference_request,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )
                response = self.model_manager.infer_from_request_sync(
                    trocr_model_id, inference_request
                )
                if LAMBDA:
                    actor = request.scope["aws.event"]["requestContext"][
                        "authorizer"
                    ]["lambda"]["actor"]
                    trackUsage(trocr_model_id, actor)
                return orjson_response_keeping_parent_id(response)

    if not (LAMBDA or GCP_SERVERLESS):

        @app.get(
            "/notebook/start",
            summary="Jupyter Lab Server Start",
            description="Starts a jupyter lab server for running development code",
        )
        @with_route_exceptions
        def notebook_start(browserless: bool = False):
            """Starts a jupyter lab server for running development code.

            Args:
                browserless (bool, default False): When True, a JSON status payload is returned
                    instead of a redirect.

            Returns:
                dict or RedirectResponse: With `browserless=True`, a dict with "success" and
                "message" keys. Otherwise a redirect to the quickstart notebook on the Jupyter
                Lab server, or to `/notebook-instructions.html` when notebooks are disabled.
            """
            logger.debug("Reached /notebook/start")

            if not NOTEBOOK_ENABLED:
                if not browserless:
                    return RedirectResponse("/notebook-instructions.html")
                return {
                    "success": False,
                    "message": "Notebook server is not enabled. Enable notebooks via the NOTEBOOK_ENABLED environment variable.",
                }

            start_notebook()
            if browserless:
                return {
                    "success": True,
                    "message": f"Jupyter Lab server started at http://localhost:{NOTEBOOK_PORT}?token={NOTEBOOK_PASSWORD}",
                }

            # presumably gives the notebook server time to come up before redirecting - TODO confirm
            sleep(2)
            return RedirectResponse(
                f"http://localhost:{NOTEBOOK_PORT}/lab/tree/quickstart.ipynb?token={NOTEBOOK_PASSWORD}"
            )

    if ENABLE_BUILDER:
        from inference.core.interfaces.http.builder.routes import (
            router as builder_router,
        )

        # CORS is opened only for the builder API and the workflow endpoints the
        # builder UI calls, and only for BUILDER_ORIGIN.
        # allow_private_network enables Private Network Access for Chrome 142+
        # (local development).
        builder_cors_options = {
            "match_paths": r"^/(build/api|workflows/).*",
            "allow_origins": [BUILDER_ORIGIN],
            "allow_methods": ["*"],
            "allow_headers": ["*"],
            "allow_credentials": True,
            "allow_private_network": True,
        }
        app.add_middleware(PathAwareCORSMiddleware, **builder_cors_options)

        # Mount every builder route under the /build prefix.
        app.include_router(builder_router, prefix="/build", tags=["builder"])

    if LEGACY_ROUTE_ENABLED:
        # Legacy object detection inference path for backwards compatibility
        @app.get(
            "/{dataset_id}/{version_id:str}",
            # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work.
            response_model=Union[
                InstanceSegmentationInferenceResponse,
                KeypointsDetectionInferenceResponse,
                ObjectDetectionInferenceResponse,
                ClassificationInferenceResponse,
                MultiLabelClassificationInferenceResponse,
                SemanticSegmentationInferenceResponse,
                StubResponse,
                Any,
            ],
            response_model_exclude_none=True,
        )
        @app.post(
            "/{dataset_id}/{version_id:str}",
            # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work.
            response_model=Union[
                InstanceSegmentationInferenceResponse,
                KeypointsDetectionInferenceResponse,
                ObjectDetectionInferenceResponse,
                ClassificationInferenceResponse,
                MultiLabelClassificationInferenceResponse,
                SemanticSegmentationInferenceResponse,
                StubResponse,
                Any,
            ],
            response_model_exclude_none=True,
        )
        @with_route_exceptions
        @usage_collector("request")
        def legacy_infer_from_request(
            background_tasks: BackgroundTasks,
            request: Request,
            request_body: Annotated[
                Optional[Union[bytes, UploadFile]],
                Depends(parse_body_content_for_legacy_request_handler),
            ],
            dataset_id: str = Path(
                description="ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID"
            ),
            version_id: str = Path(
                description="ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID"
            ),
            api_key: Optional[str] = Query(
                None,
                description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval",
            ),
            confidence: Confidence = Query(
                0.4,
                description=(
                    "The confidence threshold used to filter out predictions. "
                    'Pass a float in [0, 1], or "best" to use F1-optimal '
                    'thresholds from model evaluation, or "default" to use '
                    "the model's built-in default."
                ),
            ),
            keypoint_confidence: float = Query(
                0.0,
                description="The confidence threshold used to filter out keypoints that are not visible based on model confidence",
            ),
            format: str = Query(
                "json",
                description="One of 'json' or 'image'. If 'json' prediction data is return as a JSON string. If 'image' prediction data is visualized and overlayed on the original input image.",
            ),
            image: Optional[str] = Query(
                None,
                description="The publically accessible URL of an image to use for inference.",
            ),
            image_type: Optional[str] = Query(
                "base64",
                description="One of base64 or numpy. Note, numpy input is not supported for Roboflow Hosted Inference.",
            ),
            labels: Optional[bool] = Query(
                False,
                description="If true, labels will be include in any inference visualization.",
            ),
            mask_decode_mode: Optional[str] = Query(
                "accurate",
                description="One of 'accurate' or 'fast'. If 'accurate' the mask will be decoded using the original image size. If 'fast' the mask will be decoded using the original mask size. 'accurate' is slower but more accurate.",
            ),
            tradeoff_factor: Optional[float] = Query(
                0.0,
                description="The amount to tradeoff between 0='fast' and 1='accurate'",
            ),
            max_detections: int = Query(
                300,
                description="The maximum number of detections to return. This is used to limit the number of predictions returned by the model. The model may return more predictions than this number, but only the top `max_detections` predictions will be returned.",
            ),
            overlap: float = Query(
                0.3,
                description="The IoU threhsold that must be met for a box pair to be considered duplicate during NMS",
            ),
            stroke: int = Query(
                1, description="The stroke width used when visualizing predictions"
            ),
            countinference: Optional[bool] = Query(
                True,
                description="If false, does not track inference against usage.",
                include_in_schema=False,
            ),
            service_secret: Optional[str] = Query(
                None,
                description="Shared secret used to authenticate requests to the inference server from internal services (e.g. to allow disabling inference usage tracking via the `countinference` query parameter)",
                include_in_schema=False,
            ),
            disable_preproc_auto_orient: Optional[bool] = Query(
                False, description="If true, disables automatic image orientation"
            ),
            disable_preproc_contrast: Optional[bool] = Query(
                False, description="If true, disables automatic contrast adjustment"
            ),
            disable_preproc_grayscale: Optional[bool] = Query(
                False,
                description="If true, disables automatic grayscale conversion",
            ),
            disable_preproc_static_crop: Optional[bool] = Query(
                False, description="If true, disables automatic static crop"
            ),
            disable_active_learning: Optional[bool] = Query(
                default=False,
                description="If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled)",
            ),
            active_learning_target_dataset: Optional[str] = Query(
                default=None,
                description="Parameter to be used when Active Learning data registration should happen against different dataset than the one pointed by model_id",
            ),
            source: Optional[str] = Query(
                "external",
                description="The source of the inference request",
            ),
            source_info: Optional[str] = Query(
                "external",
                description="The detailed source information of the inference request",
            ),
            disable_model_monitoring: Optional[bool] = Query(
                False,
                description="If true, disables model monitoring for this request",
                include_in_schema=False,
            ),
            response_mask_format: Optional[Literal["polygon", "rle"]] = Query(
                default="polygon",
                description="The format of the prediction mask - polygon (default) or rle - applicable "
                "for instance segmentation models.",
            ),
        ):
            """
            Legacy inference endpoint for object detection, instance segmentation, keypoint detection, semantic segmentation and classification.

            Args:
                background_tasks: (BackgroundTasks) pool of fastapi background tasks
                request (Request): the incoming request; its headers (and, when running on Lambda, its ASGI scope) are inspected
                request_body (Optional[Union[bytes, UploadFile]]): image payload extracted from the request body, used when the `image` query parameter is not given
                dataset_id (str): ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID
                version_id (str): ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID
                api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval.
                # Other parameters described in the function signature...

            Returns:
                Union[InstanceSegmentationInferenceResponse, KeypointsDetectionInferenceResponse, ObjectDetectionInferenceResponse, ClassificationInferenceResponse, MultiLabelClassificationInferenceResponse, SemanticSegmentationInferenceResponse, StubResponse, Any]: The response containing the inference results. When `format` is "image", a raw JPEG response with the visualization is returned instead.
            """
            logger.debug(
                f"Reached legacy route /:dataset_id/:version_id with {dataset_id}/{version_id}"
            )
            model_id = f"{dataset_id}/{version_id}"
            # Only numeric thresholds are normalised here; any non-numeric value
            # is forwarded to the request object untouched.
            if isinstance(confidence, (int, float)):
                # Values >= 1 are rescaled by 1/100.
                # NOTE(review): a value of exactly 1 therefore becomes 0.01 - confirm intended.
                if confidence >= 1:
                    confidence /= 100
                if confidence < CONFIDENCE_LOWER_BOUND_OOM_PREVENTION:
                    # allowing lower confidence results in RAM usage explosion
                    confidence = CONFIDENCE_LOWER_BOUND_OOM_PREVENTION

            # Same rescaling rule as for confidence: values >= 1 are divided by 100.
            if overlap >= 1:
                overlap /= 100
            # The `image` query parameter (a URL) takes precedence over the request body.
            if image is not None:
                request_image = InferenceRequestImage(type="url", value=image)
            else:
                if "Content-Type" not in request.headers:
                    raise ContentTypeMissing(
                        f"Request must include a Content-Type header"
                    )
                if isinstance(request_body, UploadFile):
                    # Uploaded files are read in full and re-encoded as base64 text.
                    base64_image_str = request_body.file.read()
                    base64_image_str = base64.b64encode(base64_image_str)
                    request_image = InferenceRequestImage(
                        type="base64", value=base64_image_str.decode("ascii")
                    )
                elif isinstance(request_body, bytes):
                    # Raw body bytes are forwarded as-is, tagged with the
                    # caller-supplied `image_type`.
                    request_image = InferenceRequestImage(
                        type=image_type, value=request_body
                    )
                elif request_body is None:
                    raise InputImageLoadError(
                        message="Image not found in request body.",
                        public_message="Image not found in request body.",
                    )
                else:
                    raise ContentTypeInvalid(
                        f"Invalid Content-Type: {request.headers['Content-Type']}"
                    )

            # Opting out of usage tracking is only allowed with the matching service secret.
            if not countinference and service_secret != ROBOFLOW_SERVICE_SECRET:
                raise MissingServiceSecretError(
                    "Service secret is required to disable inference usage tracking"
                )
            if LAMBDA:
                logger.debug("request.scope: %s", request.scope)
                # The model id to load is taken from the Lambda authorizer context
                # (with "--" turned into "/" and the "rf-" / "nu-" markers removed),
                # not from the URL path.
                request_model_id = (
                    request.scope["aws.event"]["requestContext"]["authorizer"][
                        "lambda"
                    ]["model"]["endpoint"]
                    .replace("--", "/")
                    .replace("rf-", "")
                    .replace("nu-", "")
                )
                actor = request.scope["aws.event"]["requestContext"]["authorizer"][
                    "lambda"
                ]["actor"]
                if countinference:
                    trackUsage(request_model_id, actor)
                else:
                    # NOTE(review): this repeats the guard performed just before the
                    # LAMBDA branch, so this raise looks unreachable - confirm and
                    # consider removing.
                    if service_secret != ROBOFLOW_SERVICE_SECRET:
                        raise MissingServiceSecretError(
                            "Service secret is required to disable inference usage tracking"
                        )
                    logger.info("Not counting inference for usage")
            else:
                request_model_id = model_id
            logger.debug(
                f"State of model registry: {self.model_manager.describe_models()}"
            )
            # The model is registered under `request_model_id`, with the path-derived
            # `model_id` passed as its alias; later calls use `model_id`.
            self.model_manager.add_model(
                request_model_id,
                api_key,
                model_id_alias=model_id,
                countinference=countinference,
                service_secret=service_secret,
            )

            task_type = self.model_manager.get_task_type(model_id, api_key=api_key)
            # Object detection is the fallback request type for any task type that
            # is not matched explicitly below.
            inference_request_type = ObjectDetectionInferenceRequest
            args = dict()
            if task_type == "instance-segmentation":
                inference_request_type = InstanceSegmentationInferenceRequest
                args = {
                    "mask_decode_mode": mask_decode_mode,
                    "tradeoff_factor": tradeoff_factor,
                }
                if response_mask_format:
                    args["response_mask_format"] = response_mask_format
            elif task_type == "classification":
                inference_request_type = ClassificationInferenceRequest
            elif task_type == "keypoint-detection":
                inference_request_type = KeypointsDetectionInferenceRequest
                args = {"keypoint_confidence": keypoint_confidence}
            elif task_type == "semantic-segmentation":
                inference_request_type = SemanticSegmentationInferenceRequest
            inference_request = inference_request_type(
                api_key=api_key,
                model_id=model_id,
                image=request_image,
                confidence=confidence,
                iou_threshold=overlap,
                max_detections=max_detections,
                visualization_labels=labels,
                visualization_stroke_width=stroke,
                visualize_predictions=(
                    format == "image" or format == "image_and_json"
                ),
                disable_preproc_auto_orient=disable_preproc_auto_orient,
                disable_preproc_contrast=disable_preproc_contrast,
                disable_preproc_grayscale=disable_preproc_grayscale,
                disable_preproc_static_crop=disable_preproc_static_crop,
                disable_active_learning=disable_active_learning,
                active_learning_target_dataset=active_learning_target_dataset,
                source=source,
                source_info=source_info,
                usage_billable=countinference,
                disable_model_monitoring=disable_model_monitoring,
                **args,
            )
            inference_response = self.model_manager.infer_from_request_sync(
                inference_request.model_id,
                inference_request,
                active_learning_eligible=True,
                background_tasks=background_tasks,
            )
            logger.debug("Response ready.")
            # Only format == "image" yields the raw visualization bytes; every other
            # value (including "image_and_json") goes through the JSON response path.
            if format == "image":
                return Response(
                    content=inference_response.visualization,
                    media_type="image/jpeg",
                )
            else:
                return orjson_response(inference_response)

    if not (LAMBDA or GCP_SERVERLESS):
        # Legacy clear cache endpoint for backwards compatibility
        @app.get("/clear_cache", response_model=str)
        def legacy_clear_cache():
            """
            Clear the cache of loaded models (legacy route).

            Logs that the route was reached and delegates the actual work to
            `model_clear()`.

            Returns:
                str: The fixed confirmation text "Cache Cleared".
            """
            confirmation = "Cache Cleared"
            logger.debug("Reached /clear_cache")
            model_clear()
            return confirmation

        # Legacy add model endpoint for backwards compatibility
        @app.get("/start/{dataset_id}/{version_id}")
        def model_add_legacy(
            dataset_id: str,
            version_id: str,
            api_key: str = None,
            countinference: Optional[bool] = None,
            service_secret: Optional[str] = None,
        ):
            """
            Register a model with the model manager (legacy route).

            The model identifier is built as "<dataset_id>/<version_id>" and handed
            to the model manager together with the optional credentials.

            Args:
                dataset_id (str): ID of a Roboflow dataset corresponding to the model.
                version_id (str): ID of a Roboflow dataset version corresponding to the model.
                api_key (str, optional): Roboflow API Key for artifact retrieval.
                countinference (Optional[bool]): Whether to count inference or not.
                service_secret (Optional[str]): The service secret for the request.

            Returns:
                JSONResponse: A response whose body carries a status of 200 and a
                    success message.
            """
            model_id = f"{dataset_id}/{version_id}"
            # Both halves of the log line are the same "<dataset>/<version>" string.
            logger.debug(f"Reached /start/{model_id} with {model_id}")
            self.model_manager.add_model(
                model_id,
                api_key,
                countinference=countinference,
                service_secret=service_secret,
            )

            success_payload = {
                "status": 200,
                "message": "inference session started from local memory.",
            }
            return JSONResponse(success_payload)

    if not ENABLE_DASHBOARD:

        # Registered only when ENABLE_DASHBOARD is falsy: GET and HEAD requests for
        # /dashboard.html are answered with an empty 404 response.
        @app.get("/dashboard.html")
        @app.head("/dashboard.html")
        async def dashboard_guard():
            not_found = Response(status_code=404)
            return not_found

    @app.exception_handler(InputImageLoadError)
    async def unicorn_exception_handler(request: Request, exc: InputImageLoadError):
        """Translate an `InputImageLoadError` into an HTTP 400 JSON response.

        Only the public error details exposed by the exception are included in
        the response message; `request` is not used.
        """
        public_details = exc.get_public_error_details()
        body = {"message": f"Could not load input image. Cause: {public_details}"}
        return JSONResponse(status_code=400, content=body)

    # Serve the static files found in ./inference/landing/out at the root path
    # (StaticFiles is created with html=True); the mount is named "root".
    # NOTE(review): presumably kept after the route registrations above so that the
    # catch-all "/" mount does not shadow them - confirm before reordering.
    app.mount(
        "/",
        StaticFiles(directory="./inference/landing/out", html=True),
        name="root",
    )

Functions:

inference.core.interfaces.http.uvicorn_config

Helpers for translating SSL env vars into uvicorn configuration.

Classes

HTTPSConfigurationError

Bases: ValueError

Raised when HTTPS is enabled but the SSL configuration is incomplete.

Source code in inference/core/interfaces/http/uvicorn_config.py
6
7
class HTTPSConfigurationError(ValueError):
    """Raised when HTTPS is enabled but the SSL configuration is incomplete.

    Being a ``ValueError`` subclass, it is also caught by handlers that catch
    ``ValueError``.
    """

Functions:

build_ssl_uvicorn_cli_args

build_ssl_uvicorn_cli_args(
    enable_https,
    ssl_certfile,
    ssl_keyfile,
    ssl_keyfile_password=None,
    ssl_ca_certs=None,
)

Return a list of CLI flags for the uvicorn binary.

Mirrors build_ssl_uvicorn_kwargs for callers that shell out to the uvicorn binary rather than calling uvicorn.run directly.

Source code in inference/core/interfaces/http/uvicorn_config.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def build_ssl_uvicorn_cli_args(
    enable_https: bool,
    ssl_certfile: Optional[str],
    ssl_keyfile: Optional[str],
    ssl_keyfile_password: Optional[str] = None,
    ssl_ca_certs: Optional[str] = None,
) -> List[str]:
    """Build the SSL-related command-line flags for the ``uvicorn`` binary.

    The settings are first resolved through :func:`build_ssl_uvicorn_kwargs`
    (so the same validation applies), then every resolved setting is rendered
    as a ``--flag value`` pair. Intended for callers that shell out to
    ``uvicorn`` instead of calling ``uvicorn.run``.

    Returns:
        A flat list of alternating flags and stringified values; empty when no
        SSL setting was resolved.
    """
    resolved = build_ssl_uvicorn_kwargs(
        enable_https=enable_https,
        ssl_certfile=ssl_certfile,
        ssl_keyfile=ssl_keyfile,
        ssl_keyfile_password=ssl_keyfile_password,
        ssl_ca_certs=ssl_ca_certs,
    )
    # (setting name, CLI flag) pairs; their order fixes the order of the output.
    option_flags = (
        ("ssl_certfile", "--ssl-certfile"),
        ("ssl_keyfile", "--ssl-keyfile"),
        ("ssl_keyfile_password", "--ssl-keyfile-password"),
        ("ssl_ca_certs", "--ssl-ca-certs"),
    )
    return [
        token
        for option, flag in option_flags
        if option in resolved
        for token in (flag, str(resolved[option]))
    ]

build_ssl_uvicorn_kwargs

build_ssl_uvicorn_kwargs(
    enable_https,
    ssl_certfile,
    ssl_keyfile,
    ssl_keyfile_password=None,
    ssl_ca_certs=None,
)

Return a dict of SSL kwargs suitable for uvicorn.run.

Returns an empty dict when enable_https is falsy. When enabled, both ssl_certfile and ssl_keyfile are required.

Source code in inference/core/interfaces/http/uvicorn_config.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def build_ssl_uvicorn_kwargs(
    enable_https: bool,
    ssl_certfile: Optional[str],
    ssl_keyfile: Optional[str],
    ssl_keyfile_password: Optional[str] = None,
    ssl_ca_certs: Optional[str] = None,
) -> Dict[str, Any]:
    """Assemble the SSL keyword arguments to pass to ``uvicorn.run``.

    With ``enable_https`` falsy nothing is configured and an empty dict is
    returned. Otherwise the certificate and key file are mandatory, while the
    key password and CA bundle are included only when they are truthy.

    Raises:
        HTTPSConfigurationError: HTTPS is enabled but ``ssl_certfile`` or
            ``ssl_keyfile`` is missing or empty.
    """
    if not enable_https:
        return {}
    if not (ssl_certfile and ssl_keyfile):
        raise HTTPSConfigurationError(
            "ENABLE_HTTPS is set but SSL_CERTFILE and SSL_KEYFILE must both be "
            "configured to serve HTTPS."
        )
    ssl_settings: Dict[str, Any] = {
        "ssl_certfile": ssl_certfile,
        "ssl_keyfile": ssl_keyfile,
    }
    optional_settings = {
        "ssl_keyfile_password": ssl_keyfile_password,
        "ssl_ca_certs": ssl_ca_certs,
    }
    # Unset / empty optional values are left out entirely rather than passed as None.
    ssl_settings.update(
        {name: value for name, value in optional_settings.items() if value}
    )
    return ssl_settings

core/interfaces/http/handlers

inference.core.interfaces.http.handlers.workflows

Classes

Functions:

enrich_with_air_gapped_info

enrich_with_air_gapped_info(result)

Post-process block descriptions to include air-gapped availability info.

Deep-copies block schemas before mutating so the LRU-cached objects are not modified.

Source code in inference/core/interfaces/http/handlers/workflows.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def enrich_with_air_gapped_info(
    result: WorkflowsBlocksDescription,
) -> WorkflowsBlocksDescription:
    """Return a copy of ``result`` whose blocks advertise air-gapped availability.

    For every block, the air-gapped info of its manifest class is stored under
    ``json_schema_extra["air_gapped_info"]`` in a deep copy of the block
    schema, so the original (LRU-cached) schema objects are never mutated.
    """

    def _with_air_gapped_info(block):
        # Resolve the info first, then work exclusively on a private schema copy.
        info = _get_air_gapped_info_for_block(block.manifest_class)
        schema = copy.deepcopy(block.block_schema)
        schema.setdefault("json_schema_extra", {})["air_gapped_info"] = info.to_dict()
        return block.model_copy(update={"block_schema": schema})

    return result.model_copy(
        update={"blocks": [_with_air_gapped_info(block) for block in result.blocks]}
    )

core/interfaces/http/middlewares

inference.core.interfaces.http.middlewares.cors

Classes

PathAwareCORSMiddleware

Bases: CORSMiddleware

Extends Starlette's CORSMiddleware to allow specifying a regex of paths that this middleware should apply to. If 'match_paths' is given, only requests matching that regex will have CORS headers applied.

Also supports Private Network Access (PNA) for local development, allowing requests from public websites to localhost.

Source code in inference/core/interfaces/http/middlewares/cors.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class PathAwareCORSMiddleware(StarletteCORSMiddleware):
    """
    Starlette CORS middleware that can be limited to a subset of request paths.

    When 'match_paths' (a regular expression) is supplied, CORS handling is
    applied only to requests whose path matches it; all other requests are
    passed straight to the wrapped application.

    Optionally answers Private Network Access (PNA) preflight requests as well,
    which lets public websites call a server running on localhost during local
    development.
    """

    def __init__(
        self,
        app: ASGIApp,
        match_paths: str | None = None,
        allow_origins: typing.Sequence[str] = (),
        allow_methods: typing.Sequence[str] = ("GET",),
        allow_headers: typing.Sequence[str] = (),
        allow_credentials: bool = False,
        allow_origin_regex: str | None = None,
        expose_headers: typing.Sequence[str] = (),
        max_age: int = 600,
        allow_private_network: bool = False,
    ) -> None:
        super().__init__(
            app=app,
            allow_origins=allow_origins,
            allow_methods=allow_methods,
            allow_headers=allow_headers,
            allow_credentials=allow_credentials,
            allow_origin_regex=allow_origin_regex,
            expose_headers=expose_headers,
            max_age=max_age,
        )
        # Keep our own copies of the settings needed to build PNA preflight
        # responses (not exposed by parent class).
        self._allow_credentials = allow_credentials
        self._allow_headers = allow_headers
        self._allow_methods = allow_methods
        self._max_age = max_age
        self.allow_private_network = allow_private_network
        self.match_paths_regex = None if not match_paths else re.compile(match_paths)

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """
        ASGI entry point: run the CORS logic only for HTTP requests whose path
        matches self.match_paths_regex (when one is configured); everything
        else goes directly to the wrapped 'app'.
        """
        path_pattern = self.match_paths_regex
        skip_cors = scope["type"] != "http" or (
            path_pattern is not None
            and not path_pattern.match(scope.get("path", ""))
        )
        if skip_cors:
            # Non-HTTP traffic and non-matching paths bypass CORS entirely.
            await self.app(scope, receive, send)
            return

        if self.allow_private_network:
            request_headers = Headers(scope=scope)
            is_pna_preflight = (
                scope["method"] == "OPTIONS"
                and "access-control-request-private-network" in request_headers
            )
            if is_pna_preflight:
                await self._handle_pna_preflight(
                    scope, receive, send, request_headers
                )
                return

        # Everything else is handled by the regular Starlette implementation.
        await super().__call__(scope, receive, send)

    async def _handle_pna_preflight(
        self, scope: Scope, receive: Receive, send: Send, request_headers: Headers
    ) -> None:
        """
        Answer a preflight request that carries the Private Network Access header.

        Allowed origins get a 200 response with the PNA / CORS headers; any
        other origin gets a 400 response.
        """
        origin = request_headers.get("origin", "")
        if not self.is_allowed_origin(origin=origin):
            rejection = PlainTextResponse(
                "Disallowed CORS origin", status_code=400, headers={"vary": "Origin"}
            )
            await rejection(scope, receive, send)
            return

        preflight_headers = {
            "access-control-allow-origin": origin,
            "access-control-allow-private-network": "true",
            "access-control-allow-methods": ", ".join(self._allow_methods),
            "access-control-max-age": str(self._max_age),
        }
        configured_headers = self._allow_headers
        if "*" in configured_headers:
            # Wildcard configuration: echo back whatever headers were requested.
            requested = request_headers.get("access-control-request-headers", "")
            if requested:
                preflight_headers["access-control-allow-headers"] = requested
        elif configured_headers:
            preflight_headers["access-control-allow-headers"] = ", ".join(
                configured_headers
            )
        if self._allow_credentials:
            preflight_headers["access-control-allow-credentials"] = "true"

        approval = PlainTextResponse("OK", status_code=200, headers=preflight_headers)
        await approval(scope, receive, send)
Methods:
__call__ async
__call__(scope, receive, send)

Only apply the CORS logic if the path matches self.match_paths_regex (when provided). Otherwise, just call the wrapped 'app'.

Source code in inference/core/interfaces/http/middlewares/cors.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
    """
    ASGI entry point: run the CORS logic only for HTTP requests whose path
    matches self.match_paths_regex (when one is configured); everything else
    goes directly to the wrapped 'app'.
    """
    path_pattern = self.match_paths_regex
    skip_cors = scope["type"] != "http" or (
        path_pattern is not None and not path_pattern.match(scope.get("path", ""))
    )
    if skip_cors:
        # Non-HTTP traffic and non-matching paths bypass CORS entirely.
        await self.app(scope, receive, send)
        return

    if self.allow_private_network:
        request_headers = Headers(scope=scope)
        is_pna_preflight = (
            scope["method"] == "OPTIONS"
            and "access-control-request-private-network" in request_headers
        )
        if is_pna_preflight:
            # Private Network Access preflights get a dedicated response.
            await self._handle_pna_preflight(scope, receive, send, request_headers)
            return

    # Everything else is handled by the regular Starlette implementation.
    await super().__call__(scope, receive, send)

core/interfaces/stream

inference.core.interfaces.stream.sinks

Classes

UDPSink

Source code in inference/core/interfaces/stream/sinks.py
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
class UDPSink:
    def __init__(self, ip_address: str, port: int, udp_socket: socket.socket):
        self._ip_address = ip_address
        self._port = port
        self._socket = udp_socket

    @classmethod
    def init(cls, ip_address: str, port: int) -> "UDPSink":
        """
        Build an `InferencePipeline` predictions sink that sends model predictions
        over the network through a UDP socket.

        `inference` users should create instances through .init() rather than by
        calling the constructor directly.

        Args:
            ip_address (str): IP address to send predictions
            port (int): Port to send predictions

        Returns: Initialised object of `UDPSink` class.
        """
        datagram_socket = socket.socket(family=socket.AF_INET, type=socket.SOCK_DGRAM)
        # Socket-level options, applied in this order.
        socket_options = (
            (socket.SO_REUSEADDR, 1),
            (socket.SO_BROADCAST, 1),
            (socket.SO_RCVBUF, 1),
            (socket.SO_SNDBUF, 65536),
        )
        for option, value in socket_options:
            datagram_socket.setsockopt(socket.SOL_SOCKET, option, value)
        return cls(ip_address=ip_address, port=port, udp_socket=datagram_socket)

    def send_predictions(
        self,
        predictions: Union[dict, List[Optional[dict]]],
        video_frame: Union[VideoFrame, List[Optional[VideoFrame]]],
    ) -> None:
        """
        Send predictions through the UDP socket. Meant to be plugged into
        `InferencePipeline` as the sink for predictions.

        Args:
            predictions (Union[dict, List[Optional[dict]]]): Roboflow predictions. Both a single prediction
                and a batch are supported (batches since version `0.9.18`). Batch elements are optional,
                but each must sit at the same position as its frame in `video_frame`.
            video_frame (Union[VideoFrame, List[Optional[VideoFrame]]]): video frame with its basic metadata
                emitted by `VideoSource`, or a list of such frames (entries may be empty at positions
                corresponding to the `predictions` list). Order is expected to match `predictions`.

        Returns: None
        Side effects: For every non-empty frame, the matching `predictions` element is serialised to
            JSON and sent via the UDP socket. Before sending, an "inference_metadata" key is added to
            that `predictions` dict (mutating its state); it holds the source id, the frame id, the
            frame timestamp and the message emission time, both times in datetime iso format.

        Example:
            ```python
            import cv2
            from inference.core.interfaces.stream.inference_pipeline import InferencePipeline
            from inference.core.interfaces.stream.sinks import UDPSink

            udp_sink = UDPSink.init(ip_address="127.0.0.1", port=9090)

            pipeline = InferencePipeline.init(
                 model_id="your-model/3",
                 video_reference="./some_file.mp4",
                 on_prediction=udp_sink.send_predictions,
            )
            pipeline.start()
            pipeline.join()
            ```
            `UDPSink` used in this way will emit predictions to receiver automatically.
        """
        frames = wrap_in_list(element=video_frame)
        prediction_batch = wrap_in_list(element=predictions)
        for frame, frame_result in zip(frames, prediction_batch):
            if frame is None:
                # Empty batch slot: nothing to emit for this position.
                continue
            frame_result["inference_metadata"] = {
                "source_id": frame.source_id,
                "frame_id": frame.frame_id,
                "frame_decoding_time": frame.frame_timestamp.isoformat(),
                "emission_time": datetime.now().isoformat(),
            }
            payload = json.dumps(frame_result).encode("utf-8")
            self._socket.sendto(payload, (self._ip_address, self._port))
Methods:
init classmethod
init(ip_address, port)

Creates InferencePipeline predictions sink capable of sending model predictions over network using UDP socket.

As an inference user, please use the .init() method instead of the constructor to instantiate objects. Args: ip_address (str): IP address to send predictions to; port (int): port to send predictions to.

Returns: Initialised object of UDPSink class.

Source code in inference/core/interfaces/stream/sinks.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
@classmethod
def init(cls, ip_address: str, port: int) -> "UDPSink":
    """
    Build a `UDPSink` - an `InferencePipeline` predictions sink which pushes model predictions
    over the network through a UDP socket.

    As an `inference` user, please call `.init()` rather than the constructor to create instances.

    Args:
        ip_address (str): IP address the predictions are sent to
        port (int): Port the predictions are sent to

    Returns: Initialised object of `UDPSink` class.
    """
    sender_socket = socket.socket(family=socket.AF_INET, type=socket.SOCK_DGRAM)
    # (option, value) pairs applied at the SOL_SOCKET level, in exactly this order.
    socket_level_options = (
        (socket.SO_REUSEADDR, 1),
        (socket.SO_BROADCAST, 1),
        (socket.SO_RCVBUF, 1),
        (socket.SO_SNDBUF, 65536),
    )
    for option_name, option_value in socket_level_options:
        sender_socket.setsockopt(socket.SOL_SOCKET, option_name, option_value)
    return cls(ip_address=ip_address, port=port, udp_socket=sender_socket)
send_predictions
send_predictions(predictions, video_frame)

Method to send predictions via UDP socket. Useful in combination with InferencePipeline as a sink for predictions.

Parameters:

Name Type Description Default
predictions Union[dict, List[Optional[dict]]]

Roboflow predictions. The function supports both single-prediction processing and (since version 0.9.18) batch processing. Elements of a batch are optional, but must occupy the same positions as the corresponding entries of the video_frame list. The order is expected to match video_frame.

required
video_frame Union[VideoFrame, List[Optional[VideoFrame]]]

Frame of video with its basic metadata emitted by VideoSource, or a list of such frames (a batch may contain empty entries, at positions corresponding to the predictions list). The order is expected to match predictions.

required

Side effects: Sends serialised predictions and video_frame metadata via the UDP socket as a JSON string. It adds a key named "inference_metadata" to the predictions dict (mutating its state). "inference_metadata" contains the id of the source, the id of the frame, the frame grabbing timestamp and the message emission time in datetime ISO format.

Example

import cv2
from inference.core.interfaces.stream.inference_pipeline import InferencePipeline
from inference.core.interfaces.stream.sinks import UDPSink

udp_sink = UDPSink.init(ip_address="127.0.0.1", port=9090)

pipeline = InferencePipeline.init(
     model_id="your-model/3",
     video_reference="./some_file.mp4",
     on_prediction=udp_sink.send_predictions,
)
pipeline.start()
pipeline.join()
UDPSink used in this way will emit predictions to receiver automatically.

Source code in inference/core/interfaces/stream/sinks.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
def send_predictions(
    self,
    predictions: Union[dict, List[Optional[dict]]],
    video_frame: Union[VideoFrame, List[Optional[VideoFrame]]],
) -> None:
    """
    Emit predictions through the UDP socket. Meant to be plugged into `InferencePipeline`
    as a sink for predictions.

    Args:
        predictions (Union[dict, List[Optional[dict]]]): Roboflow predictions. Both a single prediction and
            (since version `0.9.18`) a batch are supported. Elements of a batch are optional, but must occupy
            the same positions as the corresponding entries of `video_frame`.
        video_frame (Union[VideoFrame, List[Optional[VideoFrame]]]): frame of video with its basic metadata
            emitted by `VideoSource`, or a list of such frames (a batch may contain empty entries at positions
            corresponding to the `predictions` list). The order is expected to match `predictions`.

    Returns: None
    Side effects: For every non-empty frame, a key named "inference_metadata" is added to the matching
        `predictions` dict (mutating it) and the dict is sent as a UTF-8 encoded JSON string via the
        UDP socket. "inference_metadata" holds the source id, the frame id, the frame timestamp and
        the message emission time, both timestamps in datetime ISO format.

    Example:
        ```python
        from inference.core.interfaces.stream.inference_pipeline import InferencePipeline
        from inference.core.interfaces.stream.sinks import UDPSink

        udp_sink = UDPSink.init(ip_address="127.0.0.1", port=9090)

        pipeline = InferencePipeline.init(
             model_id="your-model/3",
             video_reference="./some_file.mp4",
             on_prediction=udp_sink.send_predictions,
        )
        pipeline.start()
        pipeline.join()
        ```
        `UDPSink` used in this way will emit predictions to receiver automatically.
    """
    frames = wrap_in_list(element=video_frame)
    batch_predictions = wrap_in_list(element=predictions)
    for frame, payload in zip(frames, batch_predictions):
        # Empty slots of a batch carry nothing to send.
        if frame is not None:
            payload["inference_metadata"] = {
                "source_id": frame.source_id,
                "frame_id": frame.frame_id,
                "frame_decoding_time": frame.frame_timestamp.isoformat(),
                "emission_time": datetime.now().isoformat(),
            }
            datagram = json.dumps(payload).encode("utf-8")
            self._socket.sendto(datagram, (self._ip_address, self._port))

VideoFileSink

Source code in inference/core/interfaces/stream/sinks.py
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
class VideoFileSink:
    """
    `InferencePipeline` predictions sink which renders predictions with `render_boxes` and
    writes the rendered frames into a video file.

    The underlying `cv2.VideoWriter` is created lazily, when the first frame is about to be saved.
    Instances may be used as context managers - leaving the context calls `release()`.
    """

    @classmethod
    def init(
        cls,
        video_file_name: str,
        annotator: Optional[Union[BaseAnnotator, List[BaseAnnotator]]] = None,
        display_size: Optional[Tuple[int, int]] = (1280, 720),
        fps_monitor: Optional[sv.FPSMonitor] = DEFAULT_FPS_MONITOR,
        display_statistics: bool = False,
        output_fps: int = 25,
        quiet: bool = False,
        video_frame_size: Tuple[int, int] = (1280, 720),
    ) -> "VideoFileSink":
        """
        Build an `InferencePipeline` predictions sink which saves model predictions into a video file.
        Works for pipelines with a single input video as well as for multi-source pipelines.

        As an `inference` user, please call `.init()` rather than the constructor to create instances.

        Args:
            video_file_name (str): name of the video file to save predictions
            annotator (Union[BaseAnnotator, List[BaseAnnotator]]): instance of class inheriting from supervision
                BaseAnnotator or list of such instances. If nothing is passed chain of `sv.BoxAnnotator()` and
                `sv.LabelAnnotator()` is used.
            display_size (Tuple[int, int]): tuple in format (width, height) to resize visualisation output. Should
                be set to the same value as `display_size` for InferencePipeline with single video source,
                otherwise it represents the size of single visualisation tile (whole tiles mosaic will be
                scaled to `video_frame_size`)
            fps_monitor (Optional[sv.FPSMonitor]): FPS monitor used to monitor throughput
            display_statistics (bool): Flag to decide if throughput and latency can be displayed in the result
                image, if enabled, throughput will only be presented if `fps_monitor` is not None
            output_fps (int): desired FPS of output file
            quiet (bool): when True, the per-frame progress message is not printed
            video_frame_size (Tuple[int, int]): The size of frame in target video file.

        Attributes:
            on_prediction (Callable[[dict, VideoFrame], None]): callable to be used as a sink for predictions

        Returns: Initialized object of `VideoFileSink` class.

        Example:
            ```python
            from inference import InferencePipeline
            from inference.core.interfaces.stream.sinks import VideoFileSink

            video_sink = VideoFileSink.init(video_file_name="output.avi")

            pipeline = InferencePipeline.init(
                model_id="your-model/3",
                video_reference="./some_file.mp4",
                on_prediction=video_sink.on_prediction,
            )
            pipeline.start()
            pipeline.join()
            video_sink.release()
            ```

            `VideoFileSink` used in this way will save predictions to video file automatically.
        """
        # All arguments are forwarded to the constructor unchanged, by keyword.
        constructor_arguments = {
            "video_file_name": video_file_name,
            "annotator": annotator,
            "display_size": display_size,
            "fps_monitor": fps_monitor,
            "display_statistics": display_statistics,
            "output_fps": output_fps,
            "quiet": quiet,
            "video_frame_size": video_frame_size,
        }
        return cls(**constructor_arguments)

    def __init__(
        self,
        video_file_name: str,
        annotator: Union[BaseAnnotator, List[BaseAnnotator]],
        display_size: Optional[Tuple[int, int]],
        fps_monitor: Optional[sv.FPSMonitor],
        display_statistics: bool,
        output_fps: int,
        quiet: bool,
        video_frame_size: Tuple[int, int],
    ):
        # Settings of the output file and progress reporting.
        self._video_file_name = video_file_name
        self._output_fps = output_fps
        self._video_frame_size = video_frame_size
        self._quiet = quiet
        # Rendering settings handed over to `render_boxes`.
        self._annotator = annotator
        self._display_size = display_size
        self._fps_monitor = fps_monitor
        self._display_statistics = display_statistics
        # Mutable state: number of frames saved so far and the lazily created writer.
        self._frame_idx = 0
        self._video_writer: Optional[cv2.VideoWriter] = None
        # Public sink: renders predictions and passes each rendered frame to `_save_predictions`.
        self.on_prediction = partial(
            render_boxes,
            annotator=annotator,
            display_size=display_size,
            fps_monitor=fps_monitor,
            display_statistics=display_statistics,
            on_frame_rendered=self._save_predictions,
        )

    def release(self) -> None:
        """
        Release the VideoWriter object, provided it was created and is still opened.
        """
        writer = self._video_writer
        if writer is None or not writer.isOpened():
            return None
        writer.release()

    def _save_predictions(
        self,
        frame: Union[ImageWithSourceID, List[ImageWithSourceID]],
    ) -> None:
        # The writer is created on the first rendered frame.
        if self._video_writer is None:
            self._initialise_sink()
        # A list of (identifier, image) pairs is merged into one tiled image;
        # a single pair contributes just its image.
        if isinstance(frame, list):
            image = create_tiles(images=[entry[1] for entry in frame])
        else:
            image = frame[1]
        width_and_height = (image.shape[1], image.shape[0])
        if width_and_height != self._video_frame_size:
            image = letterbox_image(image=image, desired_size=self._video_frame_size)
        self._video_writer.write(image)
        if not self._quiet:
            print(f"Writing frame {self._frame_idx}", end="\r")
        self._frame_idx += 1

    def _initialise_sink(self) -> None:
        # Output is encoded with the MJPG codec at the configured FPS and frame size.
        codec = cv2.VideoWriter_fourcc(*"MJPG")
        self._video_writer = cv2.VideoWriter(
            self._video_file_name,
            codec,
            self._output_fps,
            self._video_frame_size,
        )

    def __enter__(self) -> "VideoFileSink":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Leaving the context releases the writer.
        self.release()
Methods:
init classmethod
init(
    video_file_name,
    annotator=None,
    display_size=(1280, 720),
    fps_monitor=DEFAULT_FPS_MONITOR,
    display_statistics=False,
    output_fps=25,
    quiet=False,
    video_frame_size=(1280, 720),
)

Creates InferencePipeline predictions sink capable of saving model predictions into video file. It works both for pipelines with single input video and multiple ones.

As an inference user, please use the .init() method instead of the constructor to instantiate objects.

Args:
    video_file_name (str): name of the video file to save predictions
    annotator (Union[BaseAnnotator, List[BaseAnnotator]]): instance of class inheriting from supervision BaseAnnotator or list of such instances. If nothing is passed, a chain of sv.BoxAnnotator() and sv.LabelAnnotator() is used.
    display_size (Tuple[int, int]): tuple in format (width, height) to resize visualisation output. Should be set to the same value as display_size for InferencePipeline with single video source, otherwise it represents the size of single visualisation tile (whole tiles mosaic will be scaled to video_frame_size)
    fps_monitor (Optional[sv.FPSMonitor]): FPS monitor used to monitor throughput
    display_statistics (bool): Flag to decide if throughput and latency can be displayed in the result image; if enabled, throughput will only be presented if fps_monitor is not None
    output_fps (int): desired FPS of output file
    quiet (bool): Flag to decide whether to log progress
    video_frame_size (Tuple[int, int]): The size of frame in target video file.

Attributes:

Name Type Description
on_prediction Callable[[dict, VideoFrame], None]

callable to be used as a sink for predictions

Example
import cv2
from inference import InferencePipeline
from inference.core.interfaces.stream.sinks import VideoFileSink

video_sink = VideoFileSink.init(video_file_name="output.avi")

pipeline = InferencePipeline.init(
    model_id="your-model/3",
    video_reference="./some_file.mp4",
    on_prediction=video_sink.on_prediction,
)
pipeline.start()
pipeline.join()
video_sink.release()

VideoFileSink used in this way will save predictions to video file automatically.

Source code in inference/core/interfaces/stream/sinks.py
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
@classmethod
def init(
    cls,
    video_file_name: str,
    annotator: Optional[Union[BaseAnnotator, List[BaseAnnotator]]] = None,
    display_size: Optional[Tuple[int, int]] = (1280, 720),
    fps_monitor: Optional[sv.FPSMonitor] = DEFAULT_FPS_MONITOR,
    display_statistics: bool = False,
    output_fps: int = 25,
    quiet: bool = False,
    video_frame_size: Tuple[int, int] = (1280, 720),
) -> "VideoFileSink":
    """
    Build an `InferencePipeline` predictions sink which saves model predictions into a video file.
    Works for pipelines with a single input video as well as for multi-source pipelines.

    As an `inference` user, please call `.init()` rather than the constructor to create instances.

    Args:
        video_file_name (str): name of the video file to save predictions
        annotator (Union[BaseAnnotator, List[BaseAnnotator]]): instance of class inheriting from supervision
            BaseAnnotator or list of such instances. If nothing is passed chain of `sv.BoxAnnotator()` and
            `sv.LabelAnnotator()` is used.
        display_size (Tuple[int, int]): tuple in format (width, height) to resize visualisation output. Should
            be set to the same value as `display_size` for InferencePipeline with single video source,
            otherwise it represents the size of single visualisation tile (whole tiles mosaic will be
            scaled to `video_frame_size`)
        fps_monitor (Optional[sv.FPSMonitor]): FPS monitor used to monitor throughput
        display_statistics (bool): Flag to decide if throughput and latency can be displayed in the result
            image, if enabled, throughput will only be presented if `fps_monitor` is not None
        output_fps (int): desired FPS of output file
        quiet (bool): Flag to decide whether to log progress
        video_frame_size (Tuple[int, int]): The size of frame in target video file.

    Attributes:
        on_prediction (Callable[[dict, VideoFrame], None]): callable to be used as a sink for predictions

    Returns: Initialized object of `VideoFileSink` class.

    Example:
        ```python
        from inference import InferencePipeline
        from inference.core.interfaces.stream.sinks import VideoFileSink

        video_sink = VideoFileSink.init(video_file_name="output.avi")

        pipeline = InferencePipeline.init(
            model_id="your-model/3",
            video_reference="./some_file.mp4",
            on_prediction=video_sink.on_prediction,
        )
        pipeline.start()
        pipeline.join()
        video_sink.release()
        ```

        `VideoFileSink` used in this way will save predictions to video file automatically.
    """
    # All arguments are forwarded to the constructor unchanged, by keyword.
    constructor_arguments = {
        "video_file_name": video_file_name,
        "annotator": annotator,
        "display_size": display_size,
        "fps_monitor": fps_monitor,
        "display_statistics": display_statistics,
        "output_fps": output_fps,
        "quiet": quiet,
        "video_frame_size": video_frame_size,
    }
    return cls(**constructor_arguments)
release
release()

Releases VideoWriter object.

Source code in inference/core/interfaces/stream/sinks.py
505
506
507
508
509
510
def release(self) -> None:
    """
    Release the VideoWriter object, provided it was created and is still opened.
    """
    writer = self._video_writer
    # Nothing to do when the writer was never created or is no longer opened.
    if writer is None or not writer.isOpened():
        return None
    writer.release()

Functions:

active_learning_sink

active_learning_sink(
    predictions,
    video_frame,
    active_learning_middleware,
    model_type,
    disable_preproc_auto_orient=False,
)

Function to serve as Active Learning sink for InferencePipeline.

Parameters:

Name Type Description Default
predictions Union[dict, List[Optional[dict]]]

Roboflow predictions. The function supports both single-prediction processing and (since version 0.9.18) batch processing. Elements of a batch are optional, but must occupy the same positions as the corresponding entries of the video_frame list. The order is expected to match video_frame.

required
video_frame Union[VideoFrame, List[Optional[VideoFrame]]]

Frame of video with its basic metadata emitted by VideoSource, or a list of such frames (a batch may contain empty entries, at positions corresponding to the predictions list). The order is expected to match predictions.

required
active_learning_middleware ActiveLearningMiddleware

instance of middleware to register data.

required
model_type str

Type of Roboflow model in use

required
disable_preproc_auto_orient bool

Flag to denote how image is preprocessed which is important in Active Learning.

False

Side effects: May register data and predictions in the Roboflow backend, if the sampling engine decides to do so.

Source code in inference/core/interfaces/stream/sinks.py
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
def active_learning_sink(
    predictions: Union[dict, List[Optional[dict]]],
    video_frame: Union[VideoFrame, List[Optional[VideoFrame]]],
    active_learning_middleware: ActiveLearningMiddleware,
    model_type: str,
    disable_preproc_auto_orient: bool = False,
) -> None:
    """
    Active Learning sink for `InferencePipeline`.

    Args:
        predictions (Union[dict, List[Optional[dict]]]): Roboflow predictions. Both a single prediction and
            (since version `0.9.18`) a batch are supported. Elements of a batch are optional, but must occupy
            the same positions as the corresponding entries of `video_frame`.
        video_frame (Union[VideoFrame, List[Optional[VideoFrame]]]): frame of video with its basic metadata
            emitted by `VideoSource`, or a list of such frames (a batch may contain empty entries at positions
            corresponding to the `predictions` list). The order is expected to match `predictions`.
        active_learning_middleware (ActiveLearningMiddleware): instance of middleware to register data.
        model_type (str): Type of Roboflow model in use
        disable_preproc_auto_orient (bool): Flag to denote how image is preprocessed which is important in
            Active Learning.

    Returns: None
    Side effects: Passes the non-empty images and the non-empty predictions to
        `active_learning_middleware.register_batch(...)`, which can register them in Roboflow backend
        if that is the decision of the sampling engine.
    """
    frames = wrap_in_list(element=video_frame)
    batch_predictions = wrap_in_list(element=predictions)
    # Empty slots are dropped from each list independently; the two lists stay
    # aligned as long as empty slots occupy matching positions in both inputs.
    present_images = [frame.image for frame in frames if frame is not None]
    present_predictions = [
        prediction for prediction in batch_predictions if prediction is not None
    ]
    active_learning_middleware.register_batch(
        inference_inputs=present_images,
        predictions=present_predictions,
        prediction_type=model_type,
        disable_preproc_auto_orient=disable_preproc_auto_orient,
    )

multi_sink

multi_sink(predictions, video_frame, sinks)

Helper util useful to combine multiple sinks together, while using InferencePipeline.

Parameters:

Name Type Description Default
video_frame VideoFrame

frame of video with its basic metadata emitted by VideoSource

required
predictions dict

Roboflow object detection predictions with Bounding Boxes

required
sinks List[Callable[[VideoFrame, dict], None]]

list of sinks to be used. Each will be executed one-by-one in the order given in the input list; all errors will be caught and reported via logger, without re-raising.

required

Side effects: Calls every sink from sinks with the (predictions, video_frame) input.

Example
from functools import partial
import cv2
from inference import InferencePipeline
from inference.core.interfaces.stream.sinks import UDPSink, render_boxes

udp_sink = UDPSink.init(ip_address="127.0.0.1", port=9090)
on_prediction = partial(multi_sink, sinks=[udp_sink.send_predictions, render_boxes])

pipeline = InferencePipeline.init(
    model_id="your-model/3",
    video_reference="./some_file.mp4",
    on_prediction=on_prediction,
)
pipeline.start()
pipeline.join()

As a result, predictions will both be sent via UDP socket and displayed in the screen.

Source code in inference/core/interfaces/stream/sinks.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
def multi_sink(
    predictions: Union[dict, List[Optional[dict]]],
    video_frame: Union[VideoFrame, List[Optional[VideoFrame]]],
    sinks: List[SinkHandler],
) -> None:
    """
    Helper which combines multiple sinks into one, to be used with `InferencePipeline`.

    Args:
        predictions (Union[dict, List[Optional[dict]]]): Roboflow predictions (single element or batch),
            handed over to every sink unchanged
        video_frame (Union[VideoFrame, List[Optional[VideoFrame]]]): frame of video with its basic metadata
            emitted by `VideoSource` (single element or batch), handed over to every sink unchanged
        sinks (List[SinkHandler]): list of sinks to be used. Each one is called as
            `sink(predictions, video_frame)`, one-by-one, in the order given in the list. Exceptions raised
            by a sink are caught and reported via logger, without re-raising, so the remaining sinks
            are still executed.

    Returns: None
    Side effects: Calls every sink from `sinks` with the (predictions, video_frame) input.

    Example:
        ```python
        from functools import partial
        from inference import InferencePipeline
        from inference.core.interfaces.stream.sinks import UDPSink, multi_sink, render_boxes

        udp_sink = UDPSink.init(ip_address="127.0.0.1", port=9090)
        on_prediction = partial(multi_sink, sinks=[udp_sink.send_predictions, render_boxes])

        pipeline = InferencePipeline.init(
            model_id="your-model/3",
            video_reference="./some_file.mp4",
            on_prediction=on_prediction,
        )
        pipeline.start()
        pipeline.join()
        ```

        As a result, predictions will both be sent via UDP socket and displayed in the screen.
    """
    for handler in sinks:
        try:
            handler(predictions, video_frame)
        except Exception as sink_error:
            # Best effort: a failing sink must not prevent the remaining ones from running.
            failure_message = f"Could not send prediction and/or frame to sink due to error: {sink_error}."
            logger.error(failure_message)

render_boxes

render_boxes(
    predictions,
    video_frame,
    annotator=None,
    display_size=(1280, 720),
    fps_monitor=DEFAULT_FPS_MONITOR,
    display_statistics=False,
    on_frame_rendered=display_image,
)

Helper tool to render object detection predictions on top of video frame. It is designed to be used with InferencePipeline, as sink for predictions. By default, it uses standard sv.BoxAnnotator() chained with sv.LabelAnnotator() to draw bounding boxes and resizes prediction to 1280x720 (keeping aspect ratio and adding black padding). One may configure default behaviour, for instance to display latency and throughput statistics. In batch mode it will display tiles of frames and overlay predictions.

This sink is only partially compatible with stubs and classification models (it will not fail, although predictions will not be displayed).

Since version 0.9.18, when multi-source InferencePipeline was introduced, it supports batch input, without changes to the old behaviour when a single (predictions, video_frame) pair is used.

Parameters:

Name Type Description Default
predictions Union[dict, List[Optional[dict]]]

Roboflow predictions. The function supports both single-prediction processing and (since version 0.9.18) batch processing. Elements of a batch are optional, but must occupy the same positions as the corresponding entries of the video_frame list. The order is expected to match video_frame.

required
video_frame Union[VideoFrame, List[Optional[VideoFrame]]]

Frame of video with its basic metadata emitted by VideoSource, or a list of such frames (a batch may contain empty entries, at positions corresponding to the predictions list). The order is expected to match predictions.

required
annotator Union[BaseAnnotator, List[BaseAnnotator]]

instance of class inheriting from supervision BaseAnnotator or list of such instances. If nothing is passed chain of sv.BoxAnnotator() and sv.LabelAnnotator() is used.

None
display_size Tuple[int, int]

tuple in format (width, height) to resize visualisation output

(1280, 720)
fps_monitor Optional[FPSMonitor]

FPS monitor used to monitor throughput

DEFAULT_FPS_MONITOR
display_statistics bool

Flag to decide if throughput and latency can be displayed in the result image, if enabled, throughput will only be presented if fps_monitor is not None

False
on_frame_rendered Callable[[Union[ImageWithSourceID, List[ImageWithSourceID]]], None]

callback to be called once frame is rendered - by default, function will display OpenCV window. It expects optional integer identifier with np.ndarray or list of those elements. Identifier is supposed to refer to either source_id (for sequential input) or position in the batch (from 0 to batch_size-1).

display_image

Side effects: on_frame_rendered() is called against the tuple (stream_id, np.ndarray) produced from video frame and predictions.

Example
from functools import partial
import cv2
from inference import InferencePipeline
from inference.core.interfaces.stream.sinks import render_boxes

output_size = (640, 480)
video_sink = cv2.VideoWriter("output.avi", cv2.VideoWriter_fourcc(*"MJPG"), 25.0, output_size)
on_prediction = partial(
    render_boxes,
    display_size=output_size,
    on_frame_rendered=lambda frame_data: video_sink.write(frame_data[1])
)

pipeline = InferencePipeline.init(
     model_id="your-model/3",
     video_reference="./some_file.mp4",
     on_prediction=on_prediction,
)
pipeline.start()
pipeline.join()
video_sink.release()

In this example, render_boxes() is used as a sink for InferencePipeline predictions, so that the frames rendered with predictions are saved into a video file. Please note that this is an oversimplified example of usage which will not be robust against multiple streams - a better implementation is available in the VideoFileSink class.

Source code in inference/core/interfaces/stream/sinks.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def render_boxes(
    predictions: Union[dict, List[Optional[dict]]],
    video_frame: Union[VideoFrame, List[Optional[VideoFrame]]],
    annotator: Union[BaseAnnotator, List[BaseAnnotator]] = None,
    display_size: Optional[Tuple[int, int]] = (1280, 720),
    fps_monitor: Optional[sv.FPSMonitor] = DEFAULT_FPS_MONITOR,
    display_statistics: bool = False,
    on_frame_rendered: Callable[
        [Union[ImageWithSourceID, List[ImageWithSourceID]]], None
    ] = display_image,
) -> None:
    """
    Render object detection predictions on top of video frame(s) and hand the result to a callback.

    The function is meant to be plugged into `InferencePipeline` as a prediction sink. Unless
    configured otherwise, boxes are drawn with `sv.BoxAnnotator()` followed by `sv.LabelAnnotator()`
    and the output is resized to 1280x720 (aspect ratio kept, black padding added). Latency and
    throughput statistics can optionally be drawn. In batch mode tiles of frames with overlaid
    predictions are displayed.

    Stubs and classification models are only partially supported: the sink does not fail, but
    no predictions are drawn for them.

    Batch input is accepted since version `0.9.18` (introduction of multi-source
    `InferencePipeline`); the single `(predictions, video_frame)` calling convention keeps
    working exactly as before.

    Args:
        predictions (Union[dict, List[Optional[dict]]]): Roboflow predictions - a single prediction or,
            since `0.9.18`, a batch. Batch elements may be `None`; their positions must line up with
            the `video_frame` list.
        video_frame (Union[VideoFrame, List[Optional[VideoFrame]]]): video frame with metadata emitted by
            `VideoSource`, or a list of such frames (elements may be `None`), ordered consistently
            with `predictions`.
        annotator (Union[BaseAnnotator, List[BaseAnnotator]]): supervision annotator instance or a list
            of them. When omitted, `sv.BoxAnnotator()` chained with `sv.LabelAnnotator()` is used.
        display_size (Tuple[int, int]): `(width, height)` of the rendered visualisation.
        fps_monitor (Optional[sv.FPSMonitor]): monitor used to measure throughput; it is ticked once
            per non-empty frame.
        display_statistics (bool): whether throughput and latency may be drawn on the output image.
            Throughput is only available when `fps_monitor` is not None.
        on_frame_rendered (Callable[[Union[ImageWithSourceID, List[ImageWithSourceID]]], None]): callback
            invoked with the rendered output - by default it shows an OpenCV window. It receives an
            identifier paired with `np.ndarray`, or a list of such pairs. The identifier is the
            `source_id` for sequential input, or the position in the batch (0 to batch_size-1).

    Returns: None
    Side effects: on_frame_rendered() is called with the tuple (stream_id, np.ndarray) built from the
        video frame and predictions (or with a list of such tuples for batch input).

    Example:
        ```python
        from functools import partial
        import cv2
        from inference import InferencePipeline
        from inference.core.interfaces.stream.sinks import render_boxes

        output_size = (640, 480)
        video_sink = cv2.VideoWriter("output.avi", cv2.VideoWriter_fourcc(*"MJPG"), 25.0, output_size)
        on_prediction = partial(
            render_boxes,
            display_size=output_size,
            on_frame_rendered=lambda frame_data: video_sink.write(frame_data[1])
        )

        pipeline = InferencePipeline.init(
             model_id="your-model/3",
             video_reference="./some_file.mp4",
             on_prediction=on_prediction,
        )
        pipeline.start()
        pipeline.join()
        video_sink.release()
        ```

        Here `render_boxes()` acts as the `InferencePipeline` sink and the rendered frames end up in
        a video file. This is an oversimplified example that is not robust against multiple
        streams - see the `VideoFileSink` class for a better implementation.
    """
    # The calling convention must be captured before both inputs are normalised to lists.
    batch_input_provided = isinstance(video_frame, list)
    frames = wrap_in_list(element=video_frame)
    frame_predictions = wrap_in_list(element=predictions)

    if annotator is None:
        annotators = [DEFAULT_BBOX_ANNOTATOR, DEFAULT_LABEL_ANNOTATOR]
    elif isinstance(annotator, list):
        annotators = annotator
    else:
        annotators = [annotator]

    fps_value = None
    if fps_monitor is not None:
        # One tick per frame that is actually present in the batch.
        for candidate in frames:
            if candidate is not None:
                fps_monitor.tick()
        # The monitor exposes throughput either as an `fps` attribute or by being called.
        fps_value = fps_monitor.fps if hasattr(fps_monitor, "fps") else fps_monitor()

    images: List[ImageWithSourceID] = [
        (
            position,
            _handle_frame_rendering(
                frame=current_frame,
                prediction=current_prediction,
                annotators=annotators,
                display_size=display_size,
                display_statistics=display_statistics,
                fps_value=fps_value,
            ),
        )
        for position, (current_frame, current_prediction) in enumerate(
            zip(frames, frame_predictions)
        )
    ]

    if batch_input_provided:
        on_frame_rendered(images)
        return
    # Sequential input: report the real source id rather than the batch position.
    on_frame_rendered((frames[0].source_id, images[0][1]))

inference.core.interfaces.stream.stream

Classes

Stream

Bases: BaseInterface

Roboflow defined stream interface for a general-purpose inference server.

Attributes:

Name Type Description
model_manager ModelManager

The manager that handles model inference tasks.

model_registry RoboflowModelRegistry

The registry to fetch model instances.

api_key str

The API key for accessing models.

class_agnostic_nms bool

Flag for class-agnostic non-maximum suppression.

confidence float

Confidence threshold for inference.

iou_threshold float

The intersection-over-union threshold for detection.

json_response bool

Flag to toggle JSON response format.

max_candidates float

The maximum number of candidates for detection.

max_detections float

The maximum number of detections.

model str | Callable

The model to be used.

stream_id str

The ID of the stream to be used.

use_bytetrack bool

Flag to enable ByteTrack object tracking.

Methods:

Name Description
init_infer

Initialize the inference with a test frame.

preprocess_thread

Preprocess incoming frames for inference.

inference_request_thread

Manage the inference requests.

run_thread

Run the preprocessing and inference threads.

Source code in inference/core/interfaces/stream/stream.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
class Stream(BaseInterface):
    """Roboflow defined stream interface for a general-purpose inference server.

    Construction is eager: the constructor loads the model, opens the video source,
    runs a warm-up inference and then starts the worker threads via `run_thread()`.
    When `use_main_thread` is set, the inference loop runs on the calling thread, so
    the constructor only returns once the stream has stopped.

    Attributes:
        api_key (str): The API key for accessing models.
        class_agnostic_nms (bool): Flag for class-agnostic non-maximum suppression.
        confidence (float): Confidence threshold for inference.
        iou_threshold (float): The intersection-over-union threshold for detection.
        max_candidates (float): The maximum number of candidates for detection.
        max_detections (float): The maximum number of detections.
        model (str|Callable): The model to be used.
        stream_id (str): The ID of the stream to be used.
        use_bytetrack (bool): Flag to enable ByteTrack object tracking.
        use_main_thread (bool): Run the inference loop on the calling thread.
        output_channel_order (str): "BGR" passes the raw OpenCV frame to prediction
            callbacks; any other value passes the RGB-converted frame.

    NOTE(review): earlier documentation also listed `model_manager`, `model_registry`
    and `json_response`; none of them is assigned in this class - confirm whether
    `BaseInterface` provides them.

    Methods:
        init_infer: Initialize the inference with a test frame.
        preprocess_thread: Preprocess incoming frames for inference.
        inference_request_thread: Manage the inference requests.
        run_thread: Run the preprocessing and inference threads.
    """

    def __init__(
        self,
        api_key: str = API_KEY,
        class_agnostic_nms: bool = CLASS_AGNOSTIC_NMS,
        confidence: float = CONFIDENCE,
        enforce_fps: bool = ENFORCE_FPS,
        iou_threshold: float = IOU_THRESHOLD,
        max_candidates: float = MAX_CANDIDATES,
        max_detections: float = MAX_DETECTIONS,
        model: Union[str, Callable] = MODEL_ID,
        source: Union[int, str] = STREAM_ID,
        use_bytetrack: bool = ENABLE_BYTE_TRACK,
        use_main_thread: bool = False,
        output_channel_order: str = "RGB",
        on_prediction: Callable = None,
        on_start: Callable = None,
        on_stop: Callable = None,
    ):
        """Initialize the stream with the given parameters.

        Logs the server settings, initializes the inference with a test frame and
        starts the processing threads.

        Raises:
            ValueError: If `source` is None or `model` is empty.
        """
        logger.info("Initializing server")

        self.frame_count = 0
        self.byte_tracker = sv.ByteTrack() if use_bytetrack else None
        self.use_bytetrack = use_bytetrack

        # "webcam" is an alias for device index 0.
        if source == "webcam":
            stream_id = 0
        else:
            stream_id = source

        self.stream_id = stream_id
        if self.stream_id is None:
            raise ValueError("STREAM_ID is not defined")
        self.model_id = model
        if not self.model_id:
            raise ValueError("MODEL_ID is not defined")
        self.api_key = api_key

        # Active learning is only wired up for models referenced by id; a model
        # object passed in directly keeps the no-op middleware.
        self.active_learning_middleware = NullActiveLearningMiddleware()
        if isinstance(model, str):
            self.model = get_model(model, self.api_key)
            if ACTIVE_LEARNING_ENABLED:
                self.active_learning_middleware = (
                    ThreadingActiveLearningMiddleware.init(
                        api_key=self.api_key,
                        model_id=self.model_id,
                        cache=cache,
                    )
                )
            self.task_type = get_model_type(
                model_id=self.model_id, api_key=self.api_key
            )[0]
        else:
            self.model = model
            self.task_type = "unknown"

        self.class_agnostic_nms = class_agnostic_nms
        self.confidence = confidence
        self.iou_threshold = iou_threshold
        self.max_candidates = max_candidates
        self.max_detections = max_detections
        self.use_main_thread = use_main_thread
        self.output_channel_order = output_channel_order

        self.inference_request_type = (
            inference.core.entities.requests.inference.ObjectDetectionInferenceRequest
        )

        self.webcam_stream = WebcamStream(
            stream_id=self.stream_id, enforce_fps=enforce_fps
        )
        logger.info(
            f"Streaming from device with resolution: {self.webcam_stream.width} x {self.webcam_stream.height}"
        )

        self.on_start_callbacks = []
        # The registration thread started in init_infer() is always stopped on shutdown.
        self.on_stop_callbacks = [
            lambda: self.active_learning_middleware.stop_registration_thread()
        ]
        self.on_prediction_callbacks = []

        if on_prediction:
            self.on_prediction_callbacks.append(on_prediction)

        if on_start:
            self.on_start_callbacks.append(on_start)

        if on_stop:
            self.on_stop_callbacks.append(on_stop)

        self.init_infer()
        self.preproc_result = None
        self.inference_request_obj = None
        # Set by the preprocessing thread when a new frame is ready for inference.
        self.queue_control = False
        self.inference_response = None
        # Setting this flag to True makes both worker loops exit.
        self.stop = False

        self.frame = None
        self.frame_cv = None
        self.frame_id = None
        logger.info("Server initialized with settings:")
        logger.info(f"Stream ID: {self.stream_id}")
        logger.info(f"Model ID: {self.model_id}")
        logger.info(f"Enforce FPS: {enforce_fps}")
        logger.info(f"Confidence: {self.confidence}")
        logger.info(f"Class Agnostic NMS: {self.class_agnostic_nms}")
        logger.info(f"IOU Threshold: {self.iou_threshold}")
        logger.info(f"Max Candidates: {self.max_candidates}")
        logger.info(f"Max Detections: {self.max_detections}")

        self.run_thread()

    def on_start(self, callback):
        """Register a callback run once, before the first frame is processed.

        Returns:
            A zero-argument function that removes the callback again.
        """
        self.on_start_callbacks.append(callback)

        unsubscribe = lambda: self.on_start_callbacks.remove(callback)
        return unsubscribe

    def on_stop(self, callback):
        """Register a callback run once when the inference loop shuts down.

        Returns:
            A zero-argument function that removes the callback again.
        """
        self.on_stop_callbacks.append(callback)

        unsubscribe = lambda: self.on_stop_callbacks.remove(callback)
        return unsubscribe

    def on_prediction(self, callback):
        """Register a callback invoked as `callback(predictions, frame)` for every processed frame.

        Returns:
            A zero-argument function that removes the callback again.
        """
        self.on_prediction_callbacks.append(callback)

        unsubscribe = lambda: self.on_prediction_callbacks.remove(callback)
        return unsubscribe

    def init_infer(self):
        """Initialize the inference with a test frame.

        Runs a black 640x640 test frame through `model.infer()` as a warm-up and
        starts the active learning registration thread.
        """
        frame = Image.new("RGB", (640, 640), color="black")
        self.model.infer(
            frame, confidence=self.confidence, iou_threshold=self.iou_threshold
        )
        self.active_learning_middleware.start_registration_thread()

    def preprocess_thread(self):
        """Preprocess incoming frames for inference.

        Reads frames from the webcam stream, converts them into the proper format, and preprocesses them for
        inference. Any exception raised inside the loop is logged and ends the loop.
        """
        webcam_stream = self.webcam_stream
        webcam_stream.start()
        # processing frames in input stream
        try:
            while True:
                if webcam_stream.stopped is True or self.stop:
                    break
                else:
                    self.frame_cv, frame_id = webcam_stream.read_opencv()
                    # Only frames not seen before are preprocessed.
                    if frame_id > 0 and frame_id != self.frame_id:
                        self.frame_id = frame_id
                        self.frame = cv2.cvtColor(self.frame_cv, cv2.COLOR_BGR2RGB)
                        self.preproc_result = self.model.preprocess(self.frame_cv)
                        self.img_in, self.img_dims = self.preproc_result
                        # Signal the inference loop that a fresh frame is available.
                        self.queue_control = True

        except Exception as e:
            logger.exception(e)

    def inference_request_thread(self):
        """Manage the inference requests.

        Processes preprocessed frames for inference, post-processes the predictions, and sends the results
        to registered callbacks. The loop ends once the webcam stream is stopped or `self.stop` is set;
        on-stop callbacks are drained at that point.
        """
        while True:
            if self.webcam_stream.stopped is True or self.stop:
                while len(self.on_stop_callbacks) > 0:
                    # run each onStop callback only once from this thread
                    cb = self.on_stop_callbacks.pop()
                    cb()
                break
            if self.queue_control:
                while len(self.on_start_callbacks) > 0:
                    # run each onStart callback only once from this thread
                    cb = self.on_start_callbacks.pop()
                    cb()

                self.queue_control = False
                frame_id = self.frame_id
                # Copy so the middleware does not share the buffer with the reader thread.
                inference_input = np.copy(self.frame_cv)
                start = time.perf_counter()
                predictions = self.model.predict(
                    self.img_in,
                )
                predictions = self.model.postprocess(
                    predictions,
                    self.img_dims,
                    class_agnostic_nms=self.class_agnostic_nms,
                    confidence=self.confidence,
                    iou_threshold=self.iou_threshold,
                    max_candidates=self.max_candidates,
                    max_detections=self.max_detections,
                )[0]

                self.active_learning_middleware.register(
                    inference_input=inference_input,
                    prediction=predictions.dict(by_alias=True, exclude_none=True),
                    prediction_type=self.task_type,
                )
                if self.use_bytetrack:
                    if hasattr(sv.Detections, "from_inference"):
                        detections = sv.Detections.from_inference(
                            predictions.dict(by_alias=True, exclude_none=True)
                        )
                    else:
                        # supervision releases that predate `from_inference` expose
                        # the same constructor under the name `from_roboflow`.
                        detections = sv.Detections.from_roboflow(
                            predictions.dict(by_alias=True, exclude_none=True)
                        )
                    detections = self.byte_tracker.update_with_detections(detections)

                    if detections.tracker_id is None:
                        detections.tracker_id = np.array([], dtype=int)

                    # NOTE(review): index 4 of an iterated detection is presumably the
                    # tracker id - confirm against the installed supervision version.
                    for pred, detect in zip(predictions.predictions, detections):
                        pred.tracker_id = int(detect[4])
                predictions.frame_id = frame_id
                predictions = predictions.dict(by_alias=True, exclude_none=True)

                self.inference_response = predictions
                self.frame_count += 1

                for cb in self.on_prediction_callbacks:
                    if self.output_channel_order == "BGR":
                        cb(predictions, self.frame_cv)
                    else:
                        cb(predictions, np.asarray(self.frame))

                current = time.perf_counter()
                # Feed the measured processing rate back to the webcam stream.
                self.webcam_stream.max_fps = 1 / (current - start)
                logger.debug(f"FPS: {self.webcam_stream.max_fps:.2f}")

    def run_thread(self):
        """Run the preprocessing and inference threads.

        Starts the preprocessing thread, then runs the inference loop either on the
        calling thread (when `use_main_thread` is set, blocking until the stream stops)
        or on a second background thread.
        """
        preprocess_thread = threading.Thread(target=self.preprocess_thread)
        preprocess_thread.start()

        if self.use_main_thread:
            self.inference_request_thread()
        else:
            # start a thread that looks for the predictions
            # and call the callbacks
            inference_request_thread = threading.Thread(
                target=self.inference_request_thread
            )
            inference_request_thread.start()
Methods:
__init__
__init__(
    api_key=API_KEY,
    class_agnostic_nms=CLASS_AGNOSTIC_NMS,
    confidence=CONFIDENCE,
    enforce_fps=ENFORCE_FPS,
    iou_threshold=IOU_THRESHOLD,
    max_candidates=MAX_CANDIDATES,
    max_detections=MAX_DETECTIONS,
    model=MODEL_ID,
    source=STREAM_ID,
    use_bytetrack=ENABLE_BYTE_TRACK,
    use_main_thread=False,
    output_channel_order="RGB",
    on_prediction=None,
    on_start=None,
    on_stop=None,
)

Initialize the stream with the given parameters. Prints the server settings and initializes the inference with a test frame.

Source code in inference/core/interfaces/stream/stream.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def __init__(
    self,
    api_key: str = API_KEY,
    class_agnostic_nms: bool = CLASS_AGNOSTIC_NMS,
    confidence: float = CONFIDENCE,
    enforce_fps: bool = ENFORCE_FPS,
    iou_threshold: float = IOU_THRESHOLD,
    max_candidates: float = MAX_CANDIDATES,
    max_detections: float = MAX_DETECTIONS,
    model: Union[str, Callable] = MODEL_ID,
    source: Union[int, str] = STREAM_ID,
    use_bytetrack: bool = ENABLE_BYTE_TRACK,
    use_main_thread: bool = False,
    output_channel_order: str = "RGB",
    on_prediction: Callable = None,
    on_start: Callable = None,
    on_stop: Callable = None,
):
    """Initialize the stream with the given parameters.

    Logs the server settings, initializes the inference with a test frame and
    finally calls `run_thread()`, so construction also starts frame processing.

    Args:
        api_key: API key handed to the model loader and the active learning middleware.
        class_agnostic_nms: Forwarded to model post-processing.
        confidence: Confidence threshold used for inference.
        enforce_fps: Forwarded to `WebcamStream`.
        iou_threshold: Intersection-over-union threshold used for inference.
        max_candidates: Forwarded to model post-processing.
        max_detections: Forwarded to model post-processing.
        model: Model id (loaded via `get_model`) or an already constructed model object.
        source: Video source; the string "webcam" is mapped to device index 0.
        use_bytetrack: When true, a `sv.ByteTrack` tracker is created.
        use_main_thread: Stored for `run_thread()` to decide where the inference loop runs.
        output_channel_order: Stored for use when frames are handed to prediction callbacks.
        on_prediction: Optional callback registered for predictions.
        on_start: Optional callback registered for stream start.
        on_stop: Optional callback registered for stream stop.

    Raises:
        ValueError: If `source` is None or `model` is empty.
    """
    logger.info("Initializing server")

    self.frame_count = 0
    self.byte_tracker = sv.ByteTrack() if use_bytetrack else None
    self.use_bytetrack = use_bytetrack

    # "webcam" is an alias for device index 0; anything else is used verbatim.
    if source == "webcam":
        stream_id = 0
    else:
        stream_id = source

    self.stream_id = stream_id
    if self.stream_id is None:
        raise ValueError("STREAM_ID is not defined")
    self.model_id = model
    if not self.model_id:
        raise ValueError("MODEL_ID is not defined")
    self.api_key = api_key

    # Default to the no-op middleware; it is only replaced for models given by id
    # and only when active learning is enabled.
    self.active_learning_middleware = NullActiveLearningMiddleware()
    if isinstance(model, str):
        self.model = get_model(model, self.api_key)
        if ACTIVE_LEARNING_ENABLED:
            self.active_learning_middleware = (
                ThreadingActiveLearningMiddleware.init(
                    api_key=self.api_key,
                    model_id=self.model_id,
                    cache=cache,
                )
            )
        self.task_type = get_model_type(
            model_id=self.model_id, api_key=self.api_key
        )[0]
    else:
        # A model object was passed in directly, so its task type cannot be looked up.
        self.model = model
        self.task_type = "unknown"

    self.class_agnostic_nms = class_agnostic_nms
    self.confidence = confidence
    self.iou_threshold = iou_threshold
    self.max_candidates = max_candidates
    self.max_detections = max_detections
    self.use_main_thread = use_main_thread
    self.output_channel_order = output_channel_order

    self.inference_request_type = (
        inference.core.entities.requests.inference.ObjectDetectionInferenceRequest
    )

    # The video source is constructed here, before the warm-up inference below.
    self.webcam_stream = WebcamStream(
        stream_id=self.stream_id, enforce_fps=enforce_fps
    )
    logger.info(
        f"Streaming from device with resolution: {self.webcam_stream.width} x {self.webcam_stream.height}"
    )

    self.on_start_callbacks = []
    # Stopping the active learning registration thread is always registered as a stop callback.
    self.on_stop_callbacks = [
        lambda: self.active_learning_middleware.stop_registration_thread()
    ]
    self.on_prediction_callbacks = []

    if on_prediction:
        self.on_prediction_callbacks.append(on_prediction)

    if on_start:
        self.on_start_callbacks.append(on_start)

    if on_stop:
        self.on_stop_callbacks.append(on_stop)

    # Warm-up inference; the state below is initialised only after it has run.
    self.init_infer()
    self.preproc_result = None
    self.inference_request_obj = None
    self.queue_control = False
    self.inference_response = None
    self.stop = False

    self.frame = None
    self.frame_cv = None
    self.frame_id = None
    logger.info("Server initialized with settings:")
    logger.info(f"Stream ID: {self.stream_id}")
    logger.info(f"Model ID: {self.model_id}")
    logger.info(f"Enforce FPS: {enforce_fps}")
    logger.info(f"Confidence: {self.confidence}")
    logger.info(f"Class Agnostic NMS: {self.class_agnostic_nms}")
    logger.info(f"IOU Threshold: {self.iou_threshold}")
    logger.info(f"Max Candidates: {self.max_candidates}")
    logger.info(f"Max Detections: {self.max_detections}")

    # Must stay last: starts the worker threads that rely on all state above.
    self.run_thread()
inference_request_thread
inference_request_thread()

Manage the inference requests.

Processes preprocessed frames for inference, post-processes the predictions, and sends the results to registered callbacks.

Source code in inference/core/interfaces/stream/stream.py
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def inference_request_thread(self):
    """Manage the inference requests.

    Processes preprocessed frames for inference, post-processes the predictions, and sends the results
    to registered callbacks.

    The loop runs until the webcam stream reports `stopped` or `self.stop` is set. On exit, every
    on-stop callback is popped and invoked once (last registered first). On-start callbacks are
    drained the same way right before the first frame is processed.
    """
    while True:
        if self.webcam_stream.stopped is True or self.stop:
            while len(self.on_stop_callbacks) > 0:
                # run each onStop callback only once from this thread
                cb = self.on_stop_callbacks.pop()
                cb()
            break
        if self.queue_control:
            while len(self.on_start_callbacks) > 0:
                # run each onStart callback only once from this thread
                cb = self.on_start_callbacks.pop()
                cb()

            self.queue_control = False
            frame_id = self.frame_id
            # Copy so the middleware does not share the buffer with the reader thread.
            inference_input = np.copy(self.frame_cv)
            start = time.perf_counter()
            predictions = self.model.predict(
                self.img_in,
            )
            predictions = self.model.postprocess(
                predictions,
                self.img_dims,
                class_agnostic_nms=self.class_agnostic_nms,
                confidence=self.confidence,
                iou_threshold=self.iou_threshold,
                max_candidates=self.max_candidates,
                max_detections=self.max_detections,
            )[0]

            self.active_learning_middleware.register(
                inference_input=inference_input,
                prediction=predictions.dict(by_alias=True, exclude_none=True),
                prediction_type=self.task_type,
            )
            if self.use_bytetrack:
                if hasattr(sv.Detections, "from_inference"):
                    detections = sv.Detections.from_inference(
                        predictions.dict(by_alias=True, exclude_none=True)
                    )
                else:
                    # supervision releases that predate `from_inference` expose
                    # the same constructor under the name `from_roboflow`.
                    detections = sv.Detections.from_roboflow(
                        predictions.dict(by_alias=True, exclude_none=True)
                    )
                detections = self.byte_tracker.update_with_detections(detections)

                if detections.tracker_id is None:
                    detections.tracker_id = np.array([], dtype=int)

                # NOTE(review): index 4 of an iterated detection is presumably the
                # tracker id - confirm against the installed supervision version.
                for pred, detect in zip(predictions.predictions, detections):
                    pred.tracker_id = int(detect[4])
            predictions.frame_id = frame_id
            predictions = predictions.dict(by_alias=True, exclude_none=True)

            self.inference_response = predictions
            self.frame_count += 1

            for cb in self.on_prediction_callbacks:
                if self.output_channel_order == "BGR":
                    cb(predictions, self.frame_cv)
                else:
                    cb(predictions, np.asarray(self.frame))

            current = time.perf_counter()
            # Feed the measured processing rate back to the webcam stream.
            self.webcam_stream.max_fps = 1 / (current - start)
            logger.debug(f"FPS: {self.webcam_stream.max_fps:.2f}")
init_infer
init_infer()

Initialize the inference with a test frame.

Creates a test frame and runs it through the entire inference process to ensure everything is working.

Source code in inference/core/interfaces/stream/stream.py
196
197
198
199
200
201
202
203
204
205
def init_infer(self):
    """Warm up the model with a synthetic frame.

    A black 640x640 RGB image is pushed through `self.model.infer()` using the configured
    confidence and IoU thresholds; afterwards the active learning registration thread is started.
    """
    blank_image = Image.new("RGB", (640, 640), color="black")
    thresholds = {
        "confidence": self.confidence,
        "iou_threshold": self.iou_threshold,
    }
    self.model.infer(blank_image, **thresholds)
    self.active_learning_middleware.start_registration_thread()
preprocess_thread
preprocess_thread()

Preprocess incoming frames for inference.

Reads frames from the webcam stream, converts them into the proper format, and preprocesses them for inference.

Source code in inference/core/interfaces/stream/stream.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
def preprocess_thread(self):
    """Feed the inference loop with preprocessed frames.

    Starts the webcam stream and keeps reading frames until the stream reports `stopped` or
    `self.stop` is set. Each previously unseen frame is converted from BGR to RGB, preprocessed
    by the model and published through `self.img_in` / `self.img_dims`; `self.queue_control` is
    then raised to signal that a frame is ready. Exceptions raised inside the loop are logged
    and end the loop.
    """
    stream = self.webcam_stream
    stream.start()
    try:
        # Keep consuming frames for as long as nobody asked us to stop.
        while not (stream.stopped is True or self.stop):
            self.frame_cv, latest_frame_id = stream.read_opencv()
            if not (latest_frame_id > 0 and latest_frame_id != self.frame_id):
                # Nothing new to process yet.
                continue
            self.frame_id = latest_frame_id
            self.frame = cv2.cvtColor(self.frame_cv, cv2.COLOR_BGR2RGB)
            self.preproc_result = self.model.preprocess(self.frame_cv)
            self.img_in, self.img_dims = self.preproc_result
            self.queue_control = True
    except Exception as error:
        logger.exception(error)
run_thread
run_thread()

Run the preprocessing and inference threads.

Starts the preprocessing thread, then runs the inference loop either on the calling thread (when use_main_thread is set) or on a second background thread.

Source code in inference/core/interfaces/stream/stream.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
def run_thread(self):
    """Launch frame preprocessing and the inference loop.

    The preprocessing loop always gets its own thread. The inference loop runs on the calling
    thread when `self.use_main_thread` is set (this call then blocks until the loop ends);
    otherwise it is started on a second thread and this method returns immediately.
    """
    threading.Thread(target=self.preprocess_thread).start()

    if self.use_main_thread:
        self.inference_request_thread()
        return

    # Background mode: a dedicated thread watches for predictions and fires the callbacks.
    threading.Thread(target=self.inference_request_thread).start()

Functions:

inference.core.interfaces.stream.watchdog

This module contains components intended to be used in combination with InferencePipeline to ensure observability. Please consider them internal implementation details.

Classes

BasePipelineWatchDog

Bases: PipelineWatchDog

Implementation to be used from single inference thread, as it keeps state assumed to represent status of consecutive stage of prediction process in latency monitor.

Source code in inference/core/interfaces/stream/watchdog.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
class BasePipelineWatchDog(PipelineWatchDog):
    """
    Watchdog that gathers status updates, per-source latency measurements and
    inference throughput, and exposes them as one `PipelineStateReport`.

    Meant to be driven from a single inference thread: the latency monitors
    keep state that is assumed to reflect consecutive stages of the prediction
    process.
    """

    def __init__(self):
        super().__init__()
        # Bounded history: only the latest MAX_UPDATES_CONTEXT updates are kept.
        self._stream_updates = deque(maxlen=MAX_UPDATES_CONTEXT)
        # One latency monitor per source id, filled in by register_video_sources().
        self._latency_monitors: Dict[Optional[int], LatencyMonitor] = {}
        self._inference_throughput_monitor = sv.FPSMonitor()
        self._video_sources: Optional[List[VideoSource]] = None

    def register_video_sources(self, video_sources: List[VideoSource]) -> None:
        """Remember the sources and create a latency monitor for each source id."""
        self._video_sources = video_sources
        for video_source in video_sources:
            source_id = video_source.source_id
            self._latency_monitors[source_id] = LatencyMonitor(source_id=source_id)

    def on_status_update(self, status_update: StatusUpdate) -> None:
        """Store the update unless its severity is DEBUG or lower."""
        if status_update.severity.value > UpdateSeverity.DEBUG.value:
            self._stream_updates.append(status_update)

    def on_model_inference_started(self, frames: List[VideoFrame]) -> None:
        """Tell each frame's latency monitor that inference has started."""
        for video_frame in frames:
            latency_monitor = self._latency_monitors[video_frame.source_id]
            latency_monitor.register_inference_start(
                frame_timestamp=video_frame.frame_timestamp,
                frame_id=video_frame.frame_id,
            )

    def on_model_prediction_ready(self, frames: List[VideoFrame]) -> None:
        """Tell each frame's latency monitor a prediction is ready; tick throughput once per frame."""
        for video_frame in frames:
            latency_monitor = self._latency_monitors[video_frame.source_id]
            latency_monitor.register_prediction_ready(
                frame_timestamp=video_frame.frame_timestamp,
                frame_id=video_frame.frame_id,
            )
            self._inference_throughput_monitor.tick()

    def get_report(self) -> PipelineStateReport:
        """Assemble a report from the stored updates, latency monitors and throughput monitor."""
        if self._video_sources is None:
            sources_metadata = []
        else:
            sources_metadata = [
                video_source.describe_source() for video_source in self._video_sources
            ]

        latency_reports = []
        for latency_monitor in self._latency_monitors.values():
            latency_reports.append(latency_monitor.summarise_reports())

        throughput_monitor = self._inference_throughput_monitor
        # Read the `fps` attribute when the monitor has one; otherwise the
        # monitor object itself is called to obtain the value.
        throughput_fps = (
            throughput_monitor.fps
            if hasattr(throughput_monitor, "fps")
            else throughput_monitor()
        )

        return PipelineStateReport(
            video_source_status_updates=list(self._stream_updates),
            latency_reports=latency_reports,
            inference_throughput=throughput_fps,
            sources_metadata=sources_metadata,
        )

core/interfaces/udp

inference.core.interfaces.udp.udp_stream

Classes

UdpStream

Bases: BaseInterface

Roboflow defined UDP interface for a general-purpose inference server.

Attributes:

Name Type Description
model_manager ModelManager

The manager that handles model inference tasks.

model_registry RoboflowModelRegistry

The registry to fetch model instances.

api_key str

The API key for accessing models.

class_agnostic_nms bool

Flag for class-agnostic non-maximum suppression.

confidence float

Confidence threshold for inference.

ip_broadcast_addr str

The IP address to broadcast to.

ip_broadcast_port int

The port to broadcast on.

iou_threshold float

The intersection-over-union threshold for detection.

max_candidates float

The maximum number of candidates for detection.

max_detections float

The maximum number of detections.

model_id str

The ID of the model to be used.

stream_id str

The ID of the stream to be used.

use_bytetrack bool

Flag to use ByteTrack.

Methods:

Name Description
init_infer

Initialize the inference with a test frame.

preprocess_thread

Preprocess incoming frames for inference.

inference_request_thread

Manage the inference requests.

run_thread

Run the preprocessing and inference threads.

Source code in inference/core/interfaces/udp/udp_stream.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
class UdpStream(BaseInterface):
    """Roboflow defined UDP interface for a general-purpose inference server.

    Frames are read from a webcam stream and preprocessed on one thread, while
    a second thread runs the model and sends each prediction, serialised as
    JSON, in a UDP datagram to ``(ip_broadcast_addr, ip_broadcast_port)``.

    Attributes:
        model_manager (ModelManager): The manager that handles model inference tasks.
        model_registry (RoboflowModelRegistry): The registry to fetch model instances.
        api_key (str): The API key for accessing models.
        class_agnostic_nms (bool): Flag for class-agnostic non-maximum suppression.
        confidence (float): Confidence threshold for inference.
        ip_broadcast_addr (str): The IP address to broadcast to.
        ip_broadcast_port (int): The port to broadcast on.
        iou_threshold (float): The intersection-over-union threshold for detection.
        max_candidates (float): The maximum number of candidates for detection.
        max_detections (float): The maximum number of detections.
        model_id (str): The ID of the model to be used.
        stream_id (str): The ID of the stream to be used.
        use_bytetrack (bool): Flag to use ByteTrack.

    Methods:
        init_infer: Initialize the inference with a test frame.
        preprocess_thread: Preprocess incoming frames for inference.
        inference_request_thread: Manage the inference requests.
        run_thread: Run the preprocessing and inference threads.
    """

    def __init__(
        self,
        api_key: str = API_KEY,
        class_agnostic_nms: bool = CLASS_AGNOSTIC_NMS,
        confidence: float = CONFIDENCE,
        enforce_fps: bool = ENFORCE_FPS,
        ip_broadcast_addr: str = IP_BROADCAST_ADDR,
        ip_broadcast_port: int = IP_BROADCAST_PORT,
        iou_threshold: float = IOU_THRESHOLD,
        max_candidates: float = MAX_CANDIDATES,
        max_detections: float = MAX_DETECTIONS,
        model_id: str = MODEL_ID,
        stream_id: Union[int, str] = STREAM_ID,
        use_bytetrack: bool = ENABLE_BYTE_TRACK,
    ):
        """Initialize the UDP stream with the given parameters.

        Loads the model, opens the UDP socket and the webcam stream, runs a
        warm-up inference on a test frame and logs the server settings.

        Raises:
            ValueError: If ``stream_id`` is None, or ``model_id`` or
                ``api_key`` is empty.
        """
        logger.info("Initializing server")

        self.frame_count = 0
        self.byte_tracker = sv.ByteTrack() if use_bytetrack else None
        self.use_bytetrack = use_bytetrack

        self.stream_id = stream_id
        # `is None` rather than a falsiness test, so a stream id of 0 is accepted.
        if self.stream_id is None:
            raise ValueError("STREAM_ID is not defined")
        self.model_id = model_id
        if not self.model_id:
            raise ValueError("MODEL_ID is not defined")
        self.api_key = api_key
        if not self.api_key:
            raise ValueError(
                f"API key is missing. Either pass it explicitly to constructor, or use one of env variables: "
                f"{API_KEY_ENV_NAMES}. Visit "
                f"https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key to learn how to generate "
                f"the key."
            )

        self.model = get_model(self.model_id, self.api_key)
        self.task_type = get_model_type(model_id=self.model_id, api_key=self.api_key)[0]
        # No-op middleware unless active learning is switched on.
        self.active_learning_middleware = NullActiveLearningMiddleware()
        if ACTIVE_LEARNING_ENABLED:
            self.active_learning_middleware = ThreadingActiveLearningMiddleware.init(
                api_key=self.api_key,
                model_id=self.model_id,
                cache=cache,
            )
        self.class_agnostic_nms = class_agnostic_nms
        self.confidence = confidence
        self.iou_threshold = iou_threshold
        self.max_candidates = max_candidates
        self.max_detections = max_detections
        self.ip_broadcast_addr = ip_broadcast_addr
        self.ip_broadcast_port = ip_broadcast_port

        self.inference_request_type = (
            inference.core.entities.requests.inference.ObjectDetectionInferenceRequest
        )

        self.UDPServerSocket = socket.socket(
            family=socket.AF_INET, type=socket.SOCK_DGRAM
        )
        self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        # NOTE(review): a 1-byte receive buffer is requested - presumably
        # because this socket is only ever used for sending; confirm.
        self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 1)
        self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 65536)

        self.webcam_stream = WebcamStream(
            stream_id=self.stream_id, enforce_fps=enforce_fps
        )
        logger.info(
            f"Streaming from device with resolution: {self.webcam_stream.width} x {self.webcam_stream.height}"
        )

        # Warm-up inference runs before the per-frame state below is created.
        self.init_infer()
        self.preproc_result = None
        self.inference_request_obj = None
        self.queue_control = False
        self.inference_response = None
        self.stop = False

        self.frame_cv = None
        self.frame_id = None
        logger.info("Server initialized with settings:")
        logger.info(f"Stream ID: {self.stream_id}")
        logger.info(f"Model ID: {self.model_id}")
        logger.info(f"Confidence: {self.confidence}")
        logger.info(f"Class Agnostic NMS: {self.class_agnostic_nms}")
        logger.info(f"IOU Threshold: {self.iou_threshold}")
        logger.info(f"Max Candidates: {self.max_candidates}")
        logger.info(f"Max Detections: {self.max_detections}")

    def init_infer(self):
        """Initialize the inference with a test frame.

        Runs a black 640x640 RGB test frame through ``model.infer`` and then
        starts the active learning registration thread.
        """
        frame = Image.new("RGB", (640, 640), color="black")
        self.model.infer(
            frame, confidence=self.confidence, iou_threshold=self.iou_threshold
        )
        self.active_learning_middleware.start_registration_thread()

    def preprocess_thread(self):
        """Preprocess incoming frames for inference.

        Starts the webcam stream and keeps reading frames until the stream
        reports it has stopped or ``self.stop`` is set. Each frame with a new
        frame id is preprocessed by the model, and ``self.queue_control`` is
        raised to tell the inference thread that fresh input is available.
        Any exception ends the loop and is logged with its traceback.
        """
        webcam_stream = self.webcam_stream
        webcam_stream.start()
        # processing frames in input stream
        try:
            while True:
                if webcam_stream.stopped is True or self.stop:
                    break
                else:
                    self.frame_cv, frame_id = webcam_stream.read_opencv()
                    if frame_id != self.frame_id:
                        self.frame_id = frame_id
                        self.preproc_result = self.model.preprocess(self.frame_cv)
                        self.img_in, self.img_dims = self.preproc_result
                        self.queue_control = True

        except Exception as e:
            # logger.exception keeps the traceback that logger.error(e) dropped.
            logger.exception(e)

    def inference_request_thread(self):
        """Manage the inference requests.

        Loops until ``self.stop`` is set. Whenever ``self.queue_control`` is
        raised, the preprocessed frame is run through ``model.predict`` and
        ``model.postprocess``, registered with the active learning middleware,
        optionally annotated with ByteTrack tracker ids, serialised to JSON and
        sent as a UDP datagram to ``(ip_broadcast_addr, ip_broadcast_port)``.
        """
        last_print = time.perf_counter()
        print_ind = 0
        print_chars = ["|", "/", "-", "\\"]
        while True:
            if self.stop:
                break
            if self.queue_control:
                self.queue_control = False
                frame_id = self.frame_id
                # Copy the frame so the middleware gets its own array.
                inference_input = np.copy(self.frame_cv)
                predictions = self.model.predict(
                    self.img_in,
                )
                predictions = self.model.postprocess(
                    predictions,
                    self.img_dims,
                    class_agnostic_nms=self.class_agnostic_nms,
                    confidence=self.confidence,
                    iou_threshold=self.iou_threshold,
                    max_candidates=self.max_candidates,
                    max_detections=self.max_detections,
                )[0]
                self.active_learning_middleware.register(
                    inference_input=inference_input,
                    prediction=predictions.dict(by_alias=True, exclude_none=True),
                    prediction_type=self.task_type,
                )
                if self.use_bytetrack:
                    # The former `hasattr(sv.Detections, "from_inference")`
                    # check called this same method in both of its arms, so
                    # the fallback could never succeed where this call fails.
                    detections = sv.Detections.from_inference(
                        predictions.dict(by_alias=True), self.model.class_names
                    )
                    detections = self.byte_tracker.update_with_detections(detections)
                    for pred, detect in zip(predictions.predictions, detections):
                        pred.tracker_id = int(detect[4])
                predictions.frame_id = frame_id
                predictions = predictions.json(exclude_none=True, by_alias=True)

                self.inference_response = predictions
                self.frame_count += 1

                bytesToSend = predictions.encode("utf-8")
                self.UDPServerSocket.sendto(
                    bytesToSend,
                    (
                        self.ip_broadcast_addr,
                        self.ip_broadcast_port,
                    ),
                )
                # Redraw the console spinner at most once per second.
                if time.perf_counter() - last_print > 1:
                    print(f"Streaming {print_chars[print_ind]}", end="\r")
                    print_ind = (print_ind + 1) % 4
                    last_print = time.perf_counter()

    def run_thread(self):
        """Run the preprocessing and inference threads.

        Starts both worker threads, then blocks the calling thread. On
        KeyboardInterrupt it sets ``self.stop``, stops the active learning
        registration thread, waits three seconds and exits the process with
        status 0.
        """
        preprocess_thread = threading.Thread(target=self.preprocess_thread)
        inference_request_thread = threading.Thread(
            target=self.inference_request_thread
        )

        preprocess_thread.start()
        inference_request_thread.start()

        while True:
            try:
                time.sleep(10)
            except KeyboardInterrupt:
                logger.info("Stopping server...")
                self.stop = True
                self.active_learning_middleware.stop_registration_thread()
                time.sleep(3)
                sys.exit(0)
Methods:
__init__
__init__(
    api_key=API_KEY,
    class_agnostic_nms=CLASS_AGNOSTIC_NMS,
    confidence=CONFIDENCE,
    enforce_fps=ENFORCE_FPS,
    ip_broadcast_addr=IP_BROADCAST_ADDR,
    ip_broadcast_port=IP_BROADCAST_PORT,
    iou_threshold=IOU_THRESHOLD,
    max_candidates=MAX_CANDIDATES,
    max_detections=MAX_DETECTIONS,
    model_id=MODEL_ID,
    stream_id=STREAM_ID,
    use_bytetrack=ENABLE_BYTE_TRACK,
)

Initialize the UDP stream with the given parameters. Prints the server settings and initializes the inference with a test frame.

Source code in inference/core/interfaces/udp/udp_stream.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def __init__(
    self,
    api_key: str = API_KEY,
    class_agnostic_nms: bool = CLASS_AGNOSTIC_NMS,
    confidence: float = CONFIDENCE,
    enforce_fps: bool = ENFORCE_FPS,
    ip_broadcast_addr: str = IP_BROADCAST_ADDR,
    ip_broadcast_port: int = IP_BROADCAST_PORT,
    iou_threshold: float = IOU_THRESHOLD,
    max_candidates: float = MAX_CANDIDATES,
    max_detections: float = MAX_DETECTIONS,
    model_id: str = MODEL_ID,
    stream_id: Union[int, str] = STREAM_ID,
    use_bytetrack: bool = ENABLE_BYTE_TRACK,
):
    """Initialize the UDP stream with the given parameters.

    Loads the model, opens the UDP socket and the webcam stream, runs a
    warm-up inference on a test frame and logs the server settings.

    Args:
        api_key: API key passed to ``get_model`` / ``get_model_type``.
        class_agnostic_nms: Forwarded to ``model.postprocess``.
        confidence: Confidence threshold forwarded to the model.
        enforce_fps: Forwarded to ``WebcamStream``.
        ip_broadcast_addr: Destination address of the UDP datagrams.
        ip_broadcast_port: Destination port of the UDP datagrams.
        iou_threshold: IoU threshold forwarded to the model.
        max_candidates: Forwarded to ``model.postprocess``.
        max_detections: Forwarded to ``model.postprocess``.
        model_id: ID of the model to load.
        stream_id: ID of the stream handed to ``WebcamStream``.
        use_bytetrack: When truthy, an ``sv.ByteTrack`` tracker is created.

    Raises:
        ValueError: If ``stream_id`` is None, or ``model_id`` or ``api_key``
            is empty.
    """
    logger.info("Initializing server")

    self.frame_count = 0
    # The tracker is only instantiated when tracking was requested.
    self.byte_tracker = sv.ByteTrack() if use_bytetrack else None
    self.use_bytetrack = use_bytetrack

    self.stream_id = stream_id
    # `is None` rather than a falsiness test, so a stream id of 0 is accepted.
    if self.stream_id is None:
        raise ValueError("STREAM_ID is not defined")
    self.model_id = model_id
    if not self.model_id:
        raise ValueError("MODEL_ID is not defined")
    self.api_key = api_key
    if not self.api_key:
        raise ValueError(
            f"API key is missing. Either pass it explicitly to constructor, or use one of env variables: "
            f"{API_KEY_ENV_NAMES}. Visit "
            f"https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key to learn how to generate "
            f"the key."
        )

    self.model = get_model(self.model_id, self.api_key)
    # Only the first element of the get_model_type() result is kept.
    self.task_type = get_model_type(model_id=self.model_id, api_key=self.api_key)[0]
    # Default middleware; replaced below when active learning is enabled.
    self.active_learning_middleware = NullActiveLearningMiddleware()
    if ACTIVE_LEARNING_ENABLED:
        self.active_learning_middleware = ThreadingActiveLearningMiddleware.init(
            api_key=self.api_key,
            model_id=self.model_id,
            cache=cache,
        )
    self.class_agnostic_nms = class_agnostic_nms
    self.confidence = confidence
    self.iou_threshold = iou_threshold
    self.max_candidates = max_candidates
    self.max_detections = max_detections
    self.ip_broadcast_addr = ip_broadcast_addr
    self.ip_broadcast_port = ip_broadcast_port

    self.inference_request_type = (
        inference.core.entities.requests.inference.ObjectDetectionInferenceRequest
    )

    # IPv4 datagram socket with address reuse and broadcast enabled.
    self.UDPServerSocket = socket.socket(
        family=socket.AF_INET, type=socket.SOCK_DGRAM
    )
    self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    # NOTE(review): a 1-byte receive buffer is requested - presumably because
    # this socket is only ever used for sending; confirm.
    self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 1)
    self.UDPServerSocket.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 65536)

    self.webcam_stream = WebcamStream(
        stream_id=self.stream_id, enforce_fps=enforce_fps
    )
    logger.info(
        f"Streaming from device with resolution: {self.webcam_stream.width} x {self.webcam_stream.height}"
    )

    # Warm-up inference runs before the per-frame state below is created.
    self.init_infer()
    self.preproc_result = None
    self.inference_request_obj = None
    # Set to True by the preprocessing loop when a new frame is ready.
    self.queue_control = False
    self.inference_response = None
    # Set to True to ask both worker loops to exit.
    self.stop = False

    self.frame_cv = None
    self.frame_id = None
    logger.info("Server initialized with settings:")
    logger.info(f"Stream ID: {self.stream_id}")
    logger.info(f"Model ID: {self.model_id}")
    logger.info(f"Confidence: {self.confidence}")
    logger.info(f"Class Agnostic NMS: {self.class_agnostic_nms}")
    logger.info(f"IOU Threshold: {self.iou_threshold}")
    logger.info(f"Max Candidates: {self.max_candidates}")
    logger.info(f"Max Detections: {self.max_detections}")
inference_request_thread
inference_request_thread()

Manage the inference requests.

Processes preprocessed frames for inference, post-processes the predictions, and sends the results as a UDP broadcast.

Source code in inference/core/interfaces/udp/udp_stream.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def inference_request_thread(self):
    """Manage the inference requests.

    Loops until ``self.stop`` is set. Whenever ``self.queue_control`` is
    raised, the preprocessed frame is run through ``model.predict`` and
    ``model.postprocess``, registered with the active learning middleware,
    optionally annotated with ByteTrack tracker ids, serialised to JSON and
    sent as a UDP datagram to ``(ip_broadcast_addr, ip_broadcast_port)``.
    """
    last_print = time.perf_counter()
    print_ind = 0
    print_chars = ["|", "/", "-", "\\"]
    while True:
        if self.stop:
            break
        if self.queue_control:
            self.queue_control = False
            frame_id = self.frame_id
            # Copy the frame so the middleware gets its own array.
            inference_input = np.copy(self.frame_cv)
            predictions = self.model.predict(
                self.img_in,
            )
            predictions = self.model.postprocess(
                predictions,
                self.img_dims,
                class_agnostic_nms=self.class_agnostic_nms,
                confidence=self.confidence,
                iou_threshold=self.iou_threshold,
                max_candidates=self.max_candidates,
                max_detections=self.max_detections,
            )[0]
            self.active_learning_middleware.register(
                inference_input=inference_input,
                prediction=predictions.dict(by_alias=True, exclude_none=True),
                prediction_type=self.task_type,
            )
            if self.use_bytetrack:
                # The former `hasattr(sv.Detections, "from_inference")` check
                # called this same method in both of its arms, so the fallback
                # could never succeed where this call fails.
                detections = sv.Detections.from_inference(
                    predictions.dict(by_alias=True), self.model.class_names
                )
                detections = self.byte_tracker.update_with_detections(detections)
                for pred, detect in zip(predictions.predictions, detections):
                    pred.tracker_id = int(detect[4])
            predictions.frame_id = frame_id
            predictions = predictions.json(exclude_none=True, by_alias=True)

            self.inference_response = predictions
            self.frame_count += 1

            bytesToSend = predictions.encode("utf-8")
            self.UDPServerSocket.sendto(
                bytesToSend,
                (
                    self.ip_broadcast_addr,
                    self.ip_broadcast_port,
                ),
            )
            # Redraw the console spinner at most once per second.
            if time.perf_counter() - last_print > 1:
                print(f"Streaming {print_chars[print_ind]}", end="\r")
                print_ind = (print_ind + 1) % len(print_chars)
                last_print = time.perf_counter()
init_infer
init_infer()

Initialize the inference with a test frame.

Creates a test frame and runs it through the entire inference process to ensure everything is working.

Source code in inference/core/interfaces/udp/udp_stream.py
161
162
163
164
165
166
167
168
169
170
def init_infer(self):
    """Warm up the model with a synthetic frame.

    A black 640x640 RGB image is pushed through ``model.infer`` using the
    configured confidence and IoU thresholds; afterwards the active learning
    registration thread is started.
    """
    warmup_frame = Image.new("RGB", (640, 640), color="black")
    thresholds = {
        "confidence": self.confidence,
        "iou_threshold": self.iou_threshold,
    }
    self.model.infer(warmup_frame, **thresholds)
    self.active_learning_middleware.start_registration_thread()
preprocess_thread
preprocess_thread()

Preprocess incoming frames for inference.

Reads frames from the webcam stream, converts them into the proper format, and preprocesses them for inference.

Source code in inference/core/interfaces/udp/udp_stream.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def preprocess_thread(self):
    """Preprocess incoming frames for inference.

    Starts the webcam stream and keeps reading frames until the stream reports
    it has stopped or ``self.stop`` is set. Each frame with a new frame id is
    preprocessed by the model, and ``self.queue_control`` is raised to tell the
    inference thread that fresh input is available. Any exception ends the loop
    and is logged with its traceback.
    """
    webcam_stream = self.webcam_stream
    webcam_stream.start()
    # processing frames in input stream
    try:
        while not (webcam_stream.stopped is True or self.stop):
            self.frame_cv, frame_id = webcam_stream.read_opencv()
            if frame_id == self.frame_id:
                # Same frame as last time: nothing new to preprocess.
                continue
            self.frame_id = frame_id
            self.preproc_result = self.model.preprocess(self.frame_cv)
            self.img_in, self.img_dims = self.preproc_result
            self.queue_control = True

    except Exception as e:
        # logger.exception keeps the traceback that logger.error(e) dropped.
        logger.exception(e)
run_thread
run_thread()

Run the preprocessing and inference threads.

Starts the preprocessing and inference threads, and handles graceful shutdown on KeyboardInterrupt.

Source code in inference/core/interfaces/udp/udp_stream.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def run_thread(self):
    """Start both worker threads and block until interrupted.

    The preprocessing and inference loops each get their own thread. The
    calling thread then sleeps in ten-second slices; on KeyboardInterrupt it
    sets ``self.stop``, stops the active learning registration thread, waits
    three seconds and exits the process with status 0.
    """
    workers = (
        threading.Thread(target=self.preprocess_thread),
        threading.Thread(target=self.inference_request_thread),
    )
    for worker in workers:
        worker.start()

    while True:
        try:
            time.sleep(10)
        except KeyboardInterrupt:
            logger.info("Stopping server...")
            # Ask both worker loops to exit before shutting down.
            self.stop = True
            self.active_learning_middleware.stop_registration_thread()
            time.sleep(3)
            sys.exit(0)

Functions:

core/interfaces/webrtc_worker

inference.core.interfaces.webrtc_worker.entities

Classes

VideoFileUploadState

Bases: str, Enum

State of video file upload.

Source code in inference/core/interfaces/webrtc_worker/entities.py
124
125
126
127
128
129
130
131
class VideoFileUploadState(str, Enum):
    """State of video file upload.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (for example ``VideoFileUploadState.IDLE == "idle"``).
    """

    IDLE = "idle"
    UPLOADING = "uploading"
    COMPLETE = "complete"
    PROCESSING = "processing"
    ERROR = "error"

WebRTCOutput

Bases: BaseModel

Output sent via WebRTC data channel.

serialized_output_data contains a dictionary with workflow outputs: - If data_output is None or []: no data sent (only metadata) - If data_output is ["*"]: all workflow outputs (excluding images, unless explicitly named) - If data_output is ["field1", "field2"]: only those fields (including images if explicitly named)

Source code in inference/core/interfaces/webrtc_worker/entities.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
class WebRTCOutput(BaseModel):
    """Output sent via WebRTC data channel.

    serialized_output_data contains a dictionary with workflow outputs:
    - If data_output is None or []: no data sent (only metadata)
    - If data_output is ["*"]: all workflow outputs (excluding images, unless explicitly named)
    - If data_output is ["field1", "field2"]: only those fields (including images if explicitly named)

    Every field has a default, so an instance can be created without arguments.
    """

    # Workflow outputs selected as described above; None by default.
    serialized_output_data: Optional[Dict[str, Any]] = None
    video_metadata: Optional[WebRTCVideoMetadata] = None
    # default_factory gives each instance its own empty list.
    errors: List[str] = Field(default_factory=list)
    processing_complete: bool = False  # Signals end of video file processing
    termination_reason: Optional[str] = None

WebRTCSessionHeartbeatRequest

Bases: BaseModel

Request body for WebRTC session heartbeat and end endpoints.

Source code in inference/core/interfaces/webrtc_worker/entities.py
100
101
102
103
104
class WebRTCSessionHeartbeatRequest(BaseModel):
    """Request body for WebRTC session heartbeat and end endpoints.

    Both fields are declared without defaults.
    """

    # Identifier of the WebRTC session the request refers to.
    session_id: str
    # API key sent along with the request.
    api_key: str

inference.core.interfaces.webrtc_worker.serializers

Classes

Functions:

compress_image_for_webrtc

compress_image_for_webrtc(image)

Serialize image with low JPEG quality for efficient WebRTC transmission.

Source code in inference/core/interfaces/webrtc_worker/serializers.py
12
13
14
15
16
17
18
19
20
21
def compress_image_for_webrtc(image: WorkflowImageData) -> Dict[str, Any]:
    """Build a base64 JPEG payload for a workflow image.

    The image's numpy frame is JPEG-encoded at
    ``WEBRTC_PREVIEW_FRAME_JPEG_QUALITY``. The returned dict carries the
    base64 text under ``"value"``, the literal ``"base64"`` under ``"type"``,
    and the image's video metadata as a dict (or None when the image has none)
    under ``"video_metadata"``.
    """
    encoded_frame = encode_image_to_jpeg_bytes(
        image.numpy_image, jpeg_quality=WEBRTC_PREVIEW_FRAME_JPEG_QUALITY
    )
    payload = {
        "type": "base64",
        "value": base64.b64encode(encoded_frame).decode("ascii"),
    }
    if image.video_metadata:
        payload["video_metadata"] = image.video_metadata.dict()
    else:
        payload["video_metadata"] = None
    return payload

serialize_for_webrtc

serialize_for_webrtc(value)

Serialize for WebRTC, compressing images with low JPEG quality.

Source code in inference/core/interfaces/webrtc_worker/serializers.py
24
25
26
27
28
29
30
31
32
def serialize_for_webrtc(value: Any) -> Any:
    """Recursively serialize a value for WebRTC transmission.

    Workflow images are compressed with ``compress_image_for_webrtc``; dicts
    and lists are rebuilt with every element serialized the same way; anything
    else is delegated to ``serialize_wildcard_kind``.
    """
    if isinstance(value, WorkflowImageData):
        return compress_image_for_webrtc(value)

    if isinstance(value, dict):
        serialized_mapping = {}
        for key, item in value.items():
            serialized_mapping[key] = serialize_for_webrtc(item)
        return serialized_mapping

    if isinstance(value, list):
        serialized_items = []
        for item in value:
            serialized_items.append(serialize_for_webrtc(item))
        return serialized_items

    return serialize_wildcard_kind(value)

inference.core.interfaces.webrtc_worker.utils

Classes

Functions:

deregister_webrtc_session

deregister_webrtc_session(workspace_id, session_id)

Remove a WebRTC session from the concurrent sessions set.

Should be called when a session ends to immediately free the quota slot, rather than waiting for TTL expiry.

Parameters:

Name Type Description Default
workspace_id str

The workspace identifier

required
session_id str

The session identifier to remove

required
Source code in inference/core/interfaces/webrtc_worker/utils.py
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def deregister_webrtc_session(workspace_id: str, session_id: str) -> None:
    """Drop a WebRTC session from the workspace's concurrent-sessions set.

    Intended to be called as soon as a session ends, so its quota slot is
    released immediately instead of after TTL expiry. When the cache is not a
    Redis cache the call only logs a warning; failures while talking to Redis
    are logged and swallowed.

    Args:
        workspace_id: The workspace identifier
        session_id: The session identifier to remove
    """
    if isinstance(cache, RedisCache):
        sessions_key = _get_concurrent_sessions_key(workspace_id)
        try:
            removed = cache.client.zrem(sessions_key, session_id)
            logger.info(
                "Deregistered session: workspace=%s, session=%s, removed=%s",
                workspace_id,
                session_id,
                removed,
            )
        except Exception as error:
            logger.error("Failed to deregister session: %s", error)
        return

    logger.warning(
        "[REDIS] Redis not available (cache is %s), skipping session deregistration",
        type(cache).__name__,
    )

detect_image_output

detect_image_output(workflow_output)

Detect the first available image output field in workflow output.

Source code in inference/core/interfaces/webrtc_worker/utils.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def detect_image_output(
    workflow_output: Dict[str, Union[WorkflowImageData, Any]],
) -> Optional[str]:
    """Return the name of the first output field that yields a frame.

    Fields are probed in the mapping's iteration order with
    ``get_frame_from_workflow_output``; None is returned when no field
    produces a frame.
    """
    for candidate in workflow_output.keys():
        frame = get_frame_from_workflow_output(
            workflow_output=workflow_output,
            frame_output_key=candidate,
        )
        if frame is None:
            continue
        return candidate
    return None

get_concurrent_session_count

get_concurrent_session_count(workspace_id, ttl_seconds)

Get the count of concurrent sessions for a workspace.

Cleans up expired entries (older than TTL) before counting.

Parameters:

Name Type Description Default
workspace_id str

The workspace identifier

required
ttl_seconds int

TTL in seconds - entries older than this are considered expired

required

Returns:

Type Description
int

Number of concurrent sessions for the workspace

Source code in inference/core/interfaces/webrtc_worker/utils.py
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
def get_concurrent_session_count(workspace_id: str, ttl_seconds: int) -> int:
    """Count the non-expired WebRTC sessions tracked for a workspace.

    Members of the workspace's sorted set whose score is at or below
    ``now - ttl_seconds`` are removed first; the remaining cardinality is
    then reported.

    Args:
        workspace_id: The workspace identifier
        ttl_seconds: Age in seconds after which an entry counts as expired

    Returns:
        Number of sessions still tracked for the workspace. ``0`` is also
        returned when the cache is not Redis-backed or a Redis call fails.
    """
    if not isinstance(cache, RedisCache):
        logger.warning(
            "Redis not available, cannot count concurrent sessions - allowing request"
        )
        return 0

    sessions_key = _get_concurrent_sessions_key(workspace_id)
    expiry_threshold = time.time() - ttl_seconds

    try:
        redis_client = cache.client
        # Purge stale members first so the cardinality covers live sessions only.
        purged = redis_client.zremrangebyscore(
            sessions_key, "-inf", expiry_threshold
        )
        logger.info("[REDIS] Removed %s expired entries from %s", purged, sessions_key)
        return redis_client.zcard(sessions_key)
    except Exception as e:
        logger.error(
            "[REDIS] Failed to get concurrent session count: %s", e, exc_info=True
        )
        return 0

get_cv2_rotation_code

get_cv2_rotation_code(rotation)

Get OpenCV rotation code to correct a given rotation.

Parameters:

Name Type Description Default
rotation int

Rotation angle in degrees from metadata

required

Returns:

Type Description
Optional[int]

cv2 rotation constant or None if no correction needed

Source code in inference/core/interfaces/webrtc_worker/utils.py
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
def get_cv2_rotation_code(rotation: int) -> Optional[int]:
    """Map a metadata rotation angle to the OpenCV code that undoes it.

    The angle describes how the stored video is rotated, so the returned
    code applies the opposite rotation.

    Args:
        rotation: Rotation angle in degrees from metadata

    Returns:
        A cv2 ``ROTATE_*`` constant, or ``None`` when the angle is not one of
        the handled values (+/-90, +/-180, +/-270) and no correction applies.
    """
    if rotation == -90 or rotation == 270:
        correction = cv.ROTATE_90_CLOCKWISE
    elif rotation == 90 or rotation == -270:
        correction = cv.ROTATE_90_COUNTERCLOCKWISE
    elif rotation == 180 or rotation == -180:
        correction = cv.ROTATE_180
    else:
        correction = None
    return correction

get_total_concurrent_sessions

get_total_concurrent_sessions(ttl_seconds)

Get total concurrent WebRTC sessions across all workspaces.

Parameters:

Name Type Description Default
ttl_seconds int

TTL in seconds - entries older than this are considered expired

required

Returns:

Type Description
int

Total number of active sessions

Source code in inference/core/interfaces/webrtc_worker/utils.py
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
def get_total_concurrent_sessions(ttl_seconds: int) -> int:
    """Sum the non-expired WebRTC sessions over every workspace key.

    Every key matching ``webrtc:concurrent_sessions:*`` is visited; expired
    members are removed from it and its remaining size is added to the total.

    Args:
        ttl_seconds: Age in seconds after which an entry counts as expired

    Returns:
        Total number of tracked sessions, or ``0`` when the cache is not
        Redis-backed or any Redis call fails.
    """
    if not isinstance(cache, RedisCache):
        logger.warning(
            "[REDIS] Redis not available, cannot count total concurrent sessions"
        )
        return 0

    key_pattern = "webrtc:concurrent_sessions:*"
    expiry_threshold = time.time() - ttl_seconds
    active_sessions = 0

    try:
        for workspace_key in cache.client.scan_iter(match=key_pattern):
            # Drop stale members before measuring the set.
            cache.client.zremrangebyscore(workspace_key, "-inf", expiry_threshold)
            active_sessions += cache.client.zcard(workspace_key)
    except Exception as e:
        logger.error(
            "[REDIS] Failed to get total concurrent sessions: %s", e, exc_info=True
        )
        return 0
    return active_sessions

get_video_fps

get_video_fps(filepath)

Detect video FPS from container metadata.

Parameters:

Name Type Description Default
filepath str

Path to the video file

required

Returns:

Type Description
Optional[float]

FPS as float, or None if detection fails

Source code in inference/core/interfaces/webrtc_worker/utils.py
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
def get_video_fps(filepath: str) -> Optional[float]:
    """Detect video FPS from container metadata.

    Runs ``ffprobe`` on the first video stream and reads its
    ``avg_frame_rate`` and ``r_frame_rate`` entries (``"num/den"`` strings).
    Each entry is parsed independently, so a missing, malformed or
    zero-denominator ``avg_frame_rate`` no longer prevents falling back to
    ``r_frame_rate``.

    Args:
        filepath: Path to the video file

    Returns:
        FPS as float, or None if detection fails (ffprobe missing, timing
        out, exiting non-zero, or reporting no usable positive rate).
    """
    import json
    import subprocess

    try:
        result = subprocess.run(
            [
                "ffprobe",
                "-v",
                "error",
                "-select_streams",
                "v:0",
                "-show_entries",
                "stream=r_frame_rate,avg_frame_rate",
                "-of",
                "json",
                filepath,
            ],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if result.returncode != 0:
            logger.warning("ffprobe FPS detection failed: %s", result.stderr.strip())
            return None

        streams = json.loads(result.stdout).get("streams", [])
        if not streams:
            return None

        stream = streams[0]
        # Prefer avg_frame_rate (actual average) over r_frame_rate (container rate)
        for rate_key in ("avg_frame_rate", "r_frame_rate"):
            try:
                num, den = str(stream.get(rate_key, "0/1")).split("/")
                fps = int(num) / int(den)
            except (ValueError, ZeroDivisionError):
                # Not a "num/den" pair, non-integer parts, or den == 0:
                # this entry is unusable, try the next one.
                continue
            if fps > 0:
                logger.info("Video FPS detected: %.2f from %s", fps, rate_key)
                return fps
    except FileNotFoundError:
        logger.warning("ffprobe not available for FPS detection")
    except subprocess.TimeoutExpired:
        logger.warning("ffprobe timed out during FPS detection")
    except Exception as e:
        logger.warning("ffprobe FPS detection failed: %s", e)

    return None

get_video_rotation

get_video_rotation(filepath)

Detect video rotation from metadata (displaymatrix or rotate tag).

Parameters:

Name Type Description Default
filepath str

Path to the video file

required

Returns:

Type Description
int

Rotation in degrees (-90, 0, 90, 180, 270) or 0 if not found.

int

Negative values indicate counter-clockwise rotation.

Source code in inference/core/interfaces/webrtc_worker/utils.py
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
def get_video_rotation(filepath: str) -> int:
    """Read the rotation of a video's first video stream via ``ffprobe``.

    The stream's ``side_data_list`` is checked first; the first entry that
    carries a ``rotation`` field decides the result. Otherwise the
    ``rotate`` entry of the stream tags is used.

    Args:
        filepath: Path to the video file

    Returns:
        Rotation in degrees as reported by the metadata, or 0 when nothing is
        found or ffprobe is missing, times out, or fails.
    """
    import json
    import subprocess

    # -show_streams is used because it is compatible with all ffprobe versions
    probe_command = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_streams",
        "-of",
        "json",
        filepath,
    ]

    try:
        probe = subprocess.run(
            probe_command,
            capture_output=True,
            text=True,
            timeout=5,
        )
        if probe.returncode != 0:
            logger.warning("ffprobe failed: %s", probe.stderr.strip())
            return 0

        streams = json.loads(probe.stdout).get("streams", [])
        if not streams:
            return 0
        video_stream = streams[0]

        # Side data takes precedence over the stream tag.
        for side_data in video_stream.get("side_data_list", []):
            if "rotation" not in side_data:
                continue
            angle = int(side_data["rotation"])
            logger.info("Video rotation detected: %d°", angle)
            return angle

        # Fall back to the rotate tag in the stream tags.
        angle = int(video_stream.get("tags", {}).get("rotate", "0"))
        if angle != 0:
            logger.info("Video rotation detected: %d°", angle)
            return angle
    except FileNotFoundError:
        logger.warning("ffprobe not available")
    except subprocess.TimeoutExpired:
        logger.warning("ffprobe timed out")
    except Exception as e:
        logger.warning("ffprobe rotation detection failed: %s", e)

    return 0

is_over_workspace_session_quota

is_over_workspace_session_quota(
    workspace_id, quota, ttl_seconds
)

Check if a workspace has reached or exceeded its concurrent session quota.

Parameters:

Name Type Description Default
workspace_id str

The workspace identifier

required
quota int

Maximum number of concurrent sessions allowed

required
ttl_seconds int

TTL for considering sessions as active

required

Returns:

Type Description
bool

True if the workspace has reached or exceeded the quota

Source code in inference/core/interfaces/webrtc_worker/utils.py
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
def is_over_workspace_session_quota(
    workspace_id: str, quota: int, ttl_seconds: int
) -> bool:
    """Tell whether a workspace's live session count has hit its quota.

    The count comes from ``get_concurrent_session_count`` and is logged
    together with the quota before the comparison is made.

    Args:
        workspace_id: The workspace identifier
        quota: Maximum number of concurrent sessions allowed
        ttl_seconds: TTL for considering sessions as active

    Returns:
        True if the workspace has reached or exceeded the quota
    """
    active_sessions = get_concurrent_session_count(workspace_id, ttl_seconds)
    logger.info(
        "Workspace %s has %d concurrent sessions (quota: %d)",
        workspace_id,
        active_sessions,
        quota,
    )
    quota_reached = active_sessions >= quota
    return quota_reached

parse_video_file_chunk

parse_video_file_chunk(message)

Parse video file chunk message.

Returns: (chunk_index, total_chunks, payload)

Source code in inference/core/interfaces/webrtc_worker/utils.py
198
199
200
201
202
203
204
205
206
def parse_video_file_chunk(message: bytes) -> Tuple[int, int, bytes]:
    """Split a video file chunk message into its header fields and payload.

    The first 8 bytes are decoded as two little-endian unsigned 32-bit
    integers; everything after them is the payload.

    Args:
        message: Raw chunk message.

    Returns: (chunk_index, total_chunks, payload)

    Raises:
        ValueError: If the message is shorter than ``VIDEO_FILE_HEADER_SIZE``.
    """
    message_size = len(message)
    if message_size < VIDEO_FILE_HEADER_SIZE:
        raise ValueError(f"Message too short: {message_size} bytes")
    header, payload = message[:8], message[8:]
    chunk_index, total_chunks = struct.unpack("<II", header)
    return chunk_index, total_chunks, payload

refresh_webrtc_session

refresh_webrtc_session(workspace_id, session_id)

Refresh the timestamp for a concurrent WebRTC session.

Should be called periodically to keep the session marked as active. If not refreshed, the session will be considered expired after TTL.

Parameters:

Name Type Description Default
workspace_id str

The workspace identifier

required
session_id str

The session identifier to refresh

required

Returns:

Type Description
bool

True if session was refreshed (existed), False otherwise

Source code in inference/core/interfaces/webrtc_worker/utils.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
def refresh_webrtc_session(workspace_id: str, session_id: str) -> bool:
    """Bump the timestamp of an already-registered WebRTC session.

    Intended to be called periodically as a heartbeat; a session that is not
    refreshed is treated as expired once its timestamp is older than the TTL
    applied by the counting functions.

    Args:
        workspace_id: The workspace identifier
        session_id: The session identifier to refresh

    Returns:
        True if the session existed and was refreshed. False if it was not
        found, the cache is not Redis-backed, or a Redis call failed.
    """
    cache_type_name = type(cache).__name__
    logger.debug(
        "[REDIS] refresh_webrtc_session called: workspace=%s, session=%s, cache_type=%s",
        workspace_id,
        session_id,
        cache_type_name,
    )
    if not isinstance(cache, RedisCache):
        logger.warning(
            "[REDIS] Redis not available (cache is %s), cannot refresh session",
            cache_type_name,
        )
        return False

    sessions_key = _get_concurrent_sessions_key(workspace_id)
    refreshed_at = time.time()
    try:
        # Only sessions that are already registered may be refreshed, so an
        # authenticated heartbeat endpoint cannot be used to inject
        # arbitrary session IDs.
        session_known = cache.client.zscore(sessions_key, session_id) is not None
        if session_known:
            cache.client.zadd(sessions_key, {session_id: refreshed_at})
            cache.client.expire(sessions_key, 600)  # Extend TTL on each heartbeat
            logger.info(
                "[REDIS] Refreshed session: workspace=%s, session=%s",
                workspace_id,
                session_id,
            )
            return True

        logger.warning(
            "[REDIS] Session not found: workspace=%s, session=%s",
            workspace_id,
            session_id,
        )
        return False
    except Exception as e:
        logger.error("[REDIS] Failed to refresh session: %s", e, exc_info=True)
        return False

register_webrtc_session

register_webrtc_session(workspace_id, session_id)

Register a new concurrent WebRTC session for a workspace.

Adds the session to a Redis sorted set with current timestamp as score. Expired entries are cleaned up on read via ZREMRANGEBYSCORE (O(log N + M)).

Parameters:

Name Type Description Default
workspace_id str

The workspace identifier

required
session_id str

Unique identifier for this session

required
Source code in inference/core/interfaces/webrtc_worker/utils.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
def register_webrtc_session(workspace_id: str, session_id: str) -> None:
    """Record a new concurrent WebRTC session for a workspace.

    The session id is added to the workspace's Redis sorted set with the
    current time as its score, and the key's expiry is set to 600 seconds.
    Registration is best-effort: Redis errors are logged, not raised.

    Args:
        workspace_id: The workspace identifier
        session_id: Unique identifier for this session
    """
    if isinstance(cache, RedisCache):
        sessions_key = _get_concurrent_sessions_key(workspace_id)
        try:
            registered_at = time.time()
            cache.client.zadd(sessions_key, {session_id: registered_at})
            # TTL 600 seconds, extended on each heartbeat
            cache.client.expire(sessions_key, 600)
            logger.info(
                "Registered session: workspace=%s, session=%s",
                workspace_id,
                session_id,
            )
        except Exception as e:
            logger.error("Failed to register session: %s", e)
        return

    logger.warning(
        "[REDIS] Redis not available (cache is %s), skipping session registration",
        type(cache).__name__,
    )

rotate_video_frame

rotate_video_frame(frame, rotation_code)

Apply rotation to a video frame using OpenCV.

Parameters:

Name Type Description Default
frame VideoFrame

Input VideoFrame

required
rotation_code int

cv2 rotation constant (ROTATE_90_CLOCKWISE, etc.)

required

Returns:

Type Description
VideoFrame

Rotated VideoFrame

Source code in inference/core/interfaces/webrtc_worker/utils.py
591
592
593
594
595
596
597
598
599
600
601
602
603
def rotate_video_frame(frame: VideoFrame, rotation_code: int) -> VideoFrame:
    """Apply rotation to a video frame using OpenCV.

    The frame is converted to a BGR array, rotated, and wrapped in a new
    ``VideoFrame``. The source frame's ``pts`` and ``time_base`` are copied
    onto the result so the rotated frame keeps its timing information.

    Args:
        frame: Input VideoFrame
        rotation_code: cv2 rotation constant (ROTATE_90_CLOCKWISE, etc.)

    Returns:
        Rotated VideoFrame carrying the input frame's timing information
    """
    img = frame.to_ndarray(format="bgr24")
    img = cv.rotate(img, rotation_code)
    rotated = VideoFrame.from_ndarray(img, format="bgr24")
    # from_ndarray builds a brand-new frame without timing information, so
    # carry the source timestamp over explicitly.
    rotated.pts = frame.pts
    if frame.time_base is not None:
        rotated.time_base = frame.time_base
    return rotated

inference.core.interfaces.webrtc_worker.webrtc

Classes

VideoFrameProcessor

Base class for processing video frames through workflow.

Can be used independently for data-only processing (no video track output) or as a base for VideoTransformTrackWithLoop when video output is needed.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
class VideoFrameProcessor:
    """Base class for processing video frames through workflow.

    Can be used independently for data-only processing (no video track output)
    or as a base for VideoTransformTrackWithLoop when video output is needed.
    """

    def __init__(
        self,
        asyncio_loop: asyncio.AbstractEventLoop,
        workflow_configuration: WorkflowConfiguration,
        api_key: str,
        model_manager: Optional[ModelManager] = None,
        data_output: Optional[List[str]] = None,
        stream_output: Optional[str] = None,
        has_video_track: bool = True,
        declared_fps: float = 30,
        termination_date: Optional[datetime.datetime] = None,
        terminate_event: Optional[asyncio.Event] = None,
        heartbeat_callback: Optional[Callable[[], None]] = None,
        realtime_processing: bool = True,
        is_preview: bool = False,
    ):
        """Set up processor state and build the workflow inference pipeline.

        Args:
            asyncio_loop: Event loop stored on the instance as ``_loop``.
            workflow_configuration: Workflow settings; checked by
                ``_validate_output_fields`` and forwarded to
                ``InferencePipeline.init_with_workflow``.
            api_key: API key forwarded to the inference pipeline.
            model_manager: Optional model manager forwarded to the pipeline.
            data_output: Names of workflow outputs to send as data. ``None``
                or a list with no non-empty entries selects
                ``DataOutputMode.NONE``; exactly ``["*"]`` selects
                ``DataOutputMode.ALL``; any other list selects
                ``DataOutputMode.SPECIFIC``. Falsy entries are dropped.
            stream_output: Stored as ``stream_output`` (presumably the output
                rendered to the video track; confirm against callers).
            has_video_track: Stored as ``has_video_track``.
            declared_fps: FPS value stored as ``_declared_fps``.
            termination_date: Optional deadline stored as
                ``_termination_date``.
            terminate_event: Optional event stored as ``_terminate_event``.
            heartbeat_callback: Optional zero-argument callable stored as
                ``heartbeat_callback``.
            realtime_processing: Stored as ``realtime_processing``.
            is_preview: Forwarded to the pipeline as ``_is_preview``.

        Raises:
            WebRTCConfigurationError: If ``data_output`` is neither a list
                nor ``None``.
        """
        self._file_processing = False
        self._loop = asyncio_loop
        self._termination_date = termination_date
        self._terminate_event = terminate_event
        self.track: Optional[MediaStreamTrack] = None
        self._track_active: bool = False
        self._av_logging_set: bool = False
        self._received_frames = 0
        self._declared_fps = declared_fps
        self._fps_monitor = sv.FPSMonitor()
        self._stop_processing = False
        self._termination_reason: Optional[str] = None
        self._processing_complete_sent = False
        self.heartbeat_callback = heartbeat_callback

        self.has_video_track = has_video_track
        self.stream_output = stream_output
        self.data_channel: Optional[RTCDataChannel] = None

        # Video file upload support
        self.video_upload_handler: Optional[VideoFileUploadHandler] = None
        self._track_ready_event: asyncio.Event = asyncio.Event()
        self.realtime_processing = realtime_processing
        self._rotation_code: Optional[int] = None

        # Optional receiver-paced flow control (enabled only after first ACK is received)
        self._ack_last: int = 0
        # If ack=1 and window=4, server may produce/send up to frame 5.
        # Configurable via WEBRTC_DATACHANNEL_ACK_WINDOW env var.
        self._ack_window: int = WEBRTC_DATA_CHANNEL_ACK_WINDOW
        self._ack_event: asyncio.Event = asyncio.Event()

        # Derive the data output mode from the requested field list.
        if data_output is None:
            self.data_output = None
            self._data_mode = DataOutputMode.NONE
        elif isinstance(data_output, list):
            # Drop empty/falsy field names before deciding the mode.
            self.data_output = [f for f in data_output if f]
            if self.data_output == ["*"]:
                self._data_mode = DataOutputMode.ALL
            elif len(self.data_output) == 0:
                self._data_mode = DataOutputMode.NONE
            else:
                self._data_mode = DataOutputMode.SPECIFIC
        else:
            raise WebRTCConfigurationError(
                f"data_output must be list or None, got {type(data_output).__name__}"
            )

        # Runs after the output attributes above are assigned and before the
        # pipeline is built.
        self._validate_output_fields(workflow_configuration)

        self._inference_pipeline = InferencePipeline.init_with_workflow(
            video_reference=VideoFrameProducer,
            workflow_specification=workflow_configuration.workflow_specification,
            workspace_name=workflow_configuration.workspace_name,
            workflow_id=workflow_configuration.workflow_id,
            api_key=api_key,
            image_input_name=workflow_configuration.image_input_name,
            workflows_parameters=workflow_configuration.workflows_parameters,
            workflows_thread_pool_workers=workflow_configuration.workflows_thread_pool_workers,
            cancel_thread_pool_tasks_on_exit=workflow_configuration.cancel_thread_pool_tasks_on_exit,
            video_metadata_input_name=workflow_configuration.video_metadata_input_name,
            model_manager=model_manager,
            _is_preview=is_preview,
            workflow_version_id=workflow_configuration.workflow_version_id,
        )

    def set_track(self, track: MediaStreamTrack, rotation_code: Optional[int] = None):
        if not self.track:
            self.track = track
            self._rotation_code = rotation_code
            self._track_ready_event.set()

    async def close(self):
        self._track_active = False
        self._stop_processing = True
        # Clean up video upload handler if present
        if self.video_upload_handler is not None:
            await self.video_upload_handler.cleanup()

    def record_ack(self, ack: int) -> None:
        """Record cumulative ACK from the client.

        ACK semantics: client has fully handled all frames <= ack.
        Backwards compatible: pacing is disabled until we receive the first ACK.
        """
        try:
            ack_int = int(ack)
        except (TypeError, ValueError):
            logger.warning("Invalid ACK value: %s", ack)
            return
        if ack_int < 0:
            logger.warning("Invalid ACK value: %s", ack)
            return
        if ack_int > self._ack_last:
            if ack_int % 100 == 1:
                logger.info("ACK received: %s", ack_int)
            self._ack_last = ack_int
            self._ack_event.set()

    async def _wait_for_ack_window(self, next_frame_id: int) -> None:
        """Block frame production when too far ahead of client ACKs."""
        if self.realtime_processing or self._ack_last == 0:
            return

        wait_counter = 0
        while not self._stop_processing and next_frame_id > (
            self._ack_last + self._ack_window
        ):
            if self._check_termination():
                return
            if self.heartbeat_callback:
                self.heartbeat_callback()

            self._ack_event.clear()
            try:
                await asyncio.wait_for(self._ack_event.wait(), timeout=0.2)
            except asyncio.TimeoutError:
                wait_counter += 1
                if wait_counter % 5 == 1:
                    logger.info(
                        "Waiting for ACK window (next=%d, ack_last=%d, window=%d)",
                        next_frame_id,
                        self._ack_last,
                        self._ack_window,
                    )

    def _check_termination(self):
        """Check if we should terminate based on timeout.

        Does NOT set terminate_event — callers must call _signal_termination()
        after sending final data-channel messages to avoid a race with the
        cleanup task closing the peer connection.
        """
        if self._termination_date and self._termination_date < datetime.datetime.now():
            logger.info("Timeout reached, terminating inference pipeline")
            self._termination_reason = "timeout_reached"
            return True
        if self._terminate_event and self._terminate_event.is_set():
            logger.info("Terminate event set, terminating inference pipeline")
            return True
        return False

    def _signal_termination(self):
        if self._terminate_event:
            self._terminate_event.set()

    @staticmethod
    def serialize_outputs_sync(
        fields_to_send: List[str],
        workflow_output: Dict[str, Any],
        data_output_mode: DataOutputMode,
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Serialize workflow outputs for WebRTC transmission."""
        serialized = {}
        serialization_errors = []

        for field_name in fields_to_send:
            if field_name not in workflow_output:
                serialization_errors.append(f"Output '{field_name}' not found")
                continue

            output_data = workflow_output[field_name]

            if data_output_mode == DataOutputMode.ALL and isinstance(
                output_data, WorkflowImageData
            ):
                continue

            try:
                serialized[field_name] = serialize_for_webrtc(output_data)
            except Exception as e:
                serialization_errors.append(f"{field_name}: {e}")
                serialized[field_name] = {"__serialization_error__": str(e)}
                logger.error("[SERIALIZE] Error: %s - %s", field_name, e)

        return serialized, serialization_errors

    async def _send_data_output(
        self,
        workflow_output: Dict[str, Any],
        frame_timestamp: datetime.datetime,
        frame: VideoFrame,
        errors: List[str],
    ):
        """Send one frame's metadata and selected outputs over the data channel.

        Does nothing when there is no data channel or it is not open. In
        ``DataOutputMode.NONE`` only the video metadata and errors are sent.
        Otherwise the selected outputs are serialized in a worker thread,
        encoded with orjson, optionally gzip-compressed (when
        ``WEBRTC_GZIP_PREVIEW_FRAME_COMPRESSION`` is truthy) and sent in
        chunks.

        Args:
            workflow_output: Workflow result keyed by output name.
            frame_timestamp: Time the frame was received; sent as ISO text.
            frame: Source frame; its pts, time base and dimensions are
                reported in the metadata.
            errors: Processing errors to attach; the list is copied, not
                mutated.
        """
        # The current received-frame counter doubles as the message frame id.
        frame_id = self._received_frames

        if not self.data_channel or self.data_channel.readyState != "open":
            return

        video_metadata = WebRTCVideoMetadata(
            frame_id=frame_id,
            received_at=frame_timestamp.isoformat(),
            pts=frame.pts,
            time_base=frame.time_base,
            declared_fps=self._declared_fps,
            height=frame.height,
            width=frame.width,
        )

        webrtc_output = WebRTCOutput(
            serialized_output_data=None,
            video_metadata=video_metadata,
            errors=errors.copy(),
        )

        # Metadata-only path: no workflow outputs are serialized.
        if self._data_mode == DataOutputMode.NONE:
            json_bytes = await asyncio.to_thread(
                lambda: json.dumps(webrtc_output.model_dump()).encode("utf-8")
            )
            await send_chunked_data(
                self.data_channel,
                frame_id,
                json_bytes,
                heartbeat_callback=self.heartbeat_callback,
            )
            return

        if self._data_mode == DataOutputMode.ALL:
            fields_to_send = list(workflow_output.keys())
        else:
            fields_to_send = self.data_output

        # Serialization runs in a worker thread via asyncio.to_thread.
        serialized_outputs, serialization_errors = await asyncio.to_thread(
            VideoFrameProcessor.serialize_outputs_sync,
            fields_to_send,
            workflow_output,
            self._data_mode,
        )

        webrtc_output.errors.extend(serialization_errors)
        if serialized_outputs:
            webrtc_output.serialized_output_data = serialized_outputs

        # Encode with orjson; default_encoder handles values orjson cannot
        # serialize natively.
        json_bytes = await asyncio.to_thread(
            lambda: orjson.dumps(
                webrtc_output.model_dump(),
                default=default_encoder,
                option=orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY,
            )
        )

        if WEBRTC_GZIP_PREVIEW_FRAME_COMPRESSION:

            def compress_json():
                return gzip.compress(json_bytes, compresslevel=6)

            output_bytes = await asyncio.to_thread(compress_json)
        else:
            output_bytes = json_bytes

        success = await send_chunked_data(
            self.data_channel,
            frame_id,
            output_bytes,
            heartbeat_callback=self.heartbeat_callback,
        )
        if not success:
            logger.error("[SEND_OUTPUT] Frame %d failed", frame_id)

    async def _send_processing_complete(self):
        """Send final message indicating processing is complete.

        Also drains the data channel buffer to ensure delivery before the
        connection is closed.
        """
        if self._processing_complete_sent:
            return
        if not self.data_channel or self.data_channel.readyState != "open":
            return

        self._processing_complete_sent = True
        completion_output = WebRTCOutput(
            processing_complete=True,
            termination_reason=self._termination_reason,
            video_metadata=WebRTCVideoMetadata(
                frame_id=self._received_frames,
                received_at=datetime.datetime.now().isoformat(),
            ),
        )
        json_bytes = json.dumps(completion_output.model_dump()).encode("utf-8")
        await send_chunked_data(
            self.data_channel, self._received_frames + 1, json_bytes
        )
        if not await wait_for_buffer_drain(
            self.data_channel, timeout=2.0, low_threshold=0
        ):
            logger.warning(
                "Buffer drain timed out, processing_complete may not reach client"
            )

    async def process_frames_data_only(self):
        """Process frames for data extraction only, without video track output.

        Runs until processing is stopped, termination is detected, or the
        track is missing or ended. Each iteration waits for the ACK window,
        receives one frame, runs it through the workflow without rendering,
        and sends the result over the data channel. Cancellation, media
        stream errors and any other exception are logged and not re-raised.
        The processing-complete message is always attempted on exit.
        """
        # Lower libav log verbosity once per processor.
        if not self._av_logging_set:
            av_logging.set_libav_level(av_logging.ERROR)
            self._av_logging_set = True

        try:
            while not self._stop_processing:
                await self._wait_for_ack_window(next_frame_id=self._received_frames + 1)
                if self._check_termination():
                    # Send the final message before signalling termination.
                    await self._send_processing_complete()
                    self._signal_termination()
                    break
                if self.heartbeat_callback:
                    self.heartbeat_callback()
                if not self.track or self.track.readyState == "ended":
                    break

                # Drain queue for realtime RTSP
                if (
                    isinstance(self.track, PlayerStreamTrack)
                    and self.realtime_processing
                ):
                    # Discard queued items while more than 30 are waiting.
                    while self.track._queue.qsize() > 30:
                        self.track._queue.get_nowait()

                frame = await self.track.recv()
                self._received_frames += 1
                self._fps_monitor.tick()
                frame_timestamp = datetime.datetime.now()

                workflow_output, _, errors = await self._process_frame_async(
                    frame=frame,
                    frame_id=self._received_frames,
                    render_output=False,
                    include_errors_on_frame=False,
                )

                await self._send_data_output(
                    workflow_output, frame_timestamp, frame, errors
                )

        except asyncio.CancelledError as exc:
            # No one will catch this exception as it's executed in a create_task
            logger.info("[DATA_ONLY] Processing cancelled: %s", exc)
        except MediaStreamError as exc:
            logger.info("[DATA_ONLY] Media stream ended: %s", exc)
        except Exception as exc:
            logger.error(
                "[DATA_ONLY] Error at frame %d: %s", self._received_frames, exc
            )
        finally:
            # Also called on the termination path above; the helper returns
            # early if the message was already sent.
            await self._send_processing_complete()

    @staticmethod
    def _ensure_workflow_specification(
        workflow_configuration: WorkflowConfiguration, api_key: str
    ) -> None:
        """Make sure ``workflow_configuration`` carries a workflow specification.

        When no specification is set but both ``workspace_name`` and
        ``workflow_id`` are, the specification is fetched with
        ``get_workflow_specification`` and stored on the configuration; the
        workspace name and workflow id are then reset to ``None``.

        Args:
            workflow_configuration: Configuration to validate and, if needed,
                fill in. It is mutated in place.
            api_key: API key passed to ``get_workflow_specification``.

        Raises:
            WebRTCConfigurationError: If neither a specification nor the
                workspace/workflow pair is provided, or if fetching the
                specification fails (the original exception is chained as the
                cause).
        """
        has_specification = workflow_configuration.workflow_specification is not None
        has_workspace_and_workflow_id = (
            workflow_configuration.workspace_name is not None
            and workflow_configuration.workflow_id is not None
        )

        if has_specification:
            # Nothing to fetch: an explicit specification always wins.
            return

        if not has_workspace_and_workflow_id:
            raise WebRTCConfigurationError(
                "Either 'workflow_specification' or both 'workspace_name' and 'workflow_id' must be provided"
            )

        try:
            workflow_configuration.workflow_specification = (
                get_workflow_specification(
                    api_key=api_key,
                    workspace_id=workflow_configuration.workspace_name,
                    workflow_id=workflow_configuration.workflow_id,
                    workflow_version_id=workflow_configuration.workflow_version_id,
                )
            )
            workflow_configuration.workspace_name = None
            workflow_configuration.workflow_id = None
        except Exception as e:
            # Chain explicitly so the underlying failure is reported as the
            # direct cause of the configuration error.
            raise WebRTCConfigurationError(
                f"Failed to fetch workflow specification from API: {str(e)}"
            ) from e

    def _validate_output_fields(
        self, workflow_configuration: WorkflowConfiguration
    ) -> None:
        """Check requested output names against the workflow's declared outputs.

        Does nothing when the configuration has no workflow specification.

        Raises:
            WebRTCConfigurationError: If, in ``DataOutputMode.SPECIFIC`` mode, any
                entry of ``self.data_output`` is not a declared output, or if a
                non-empty ``self.stream_output`` is not a declared output.
        """
        specification = workflow_configuration.workflow_specification
        if specification is None:
            return

        # Kept as a list: it is interpolated into the error messages below.
        available_output_names = [
            declared.get("name") for declared in specification.get("outputs", [])
        ]

        if self._data_mode == DataOutputMode.SPECIFIC:
            invalid_fields = []
            for requested in self.data_output:
                if requested not in available_output_names:
                    invalid_fields.append(requested)
            if invalid_fields:
                raise WebRTCConfigurationError(
                    f"Invalid data_output fields: {invalid_fields}. "
                    f"Available workflow outputs: {available_output_names}"
                )

        requested_stream = self.stream_output
        if not requested_stream:
            return
        if requested_stream not in available_output_names:
            raise WebRTCConfigurationError(
                f"Invalid stream_output field: '{requested_stream}'. "
                f"Available workflow outputs: {available_output_names}"
            )

    async def _process_frame_async(
        self,
        frame: VideoFrame,
        frame_id: int,
        stream_output: Optional[str] = None,
        render_output: bool = True,
        include_errors_on_frame: bool = True,
    ) -> Tuple[Dict[str, Any], Optional[VideoFrame], List[str]]:
        """Run ``process_frame`` for a single frame via the loop's executor.

        The frame is rotated first when a rotation code is configured. The
        measured FPS is passed along once the FPS monitor holds more than one
        timestamp; until then the declared FPS is used in its place.
        """
        rotation = self._rotation_code
        if rotation is not None:
            frame = rotate_video_frame(frame, rotation)

        if len(self._fps_monitor.all_timestamps) > 1:
            measured_fps = self._fps_monitor.fps
        else:
            measured_fps = self._declared_fps

        # Positional order must match the parameters of ``process_frame``.
        call_args = (
            frame,
            frame_id,
            self._declared_fps,
            measured_fps,
            self._file_processing,
            self._inference_pipeline,
            stream_output,
            render_output,
            include_errors_on_frame,
        )
        running_loop = asyncio.get_running_loop()
        return await running_loop.run_in_executor(None, process_frame, *call_args)
Methods:
process_frames_data_only async
process_frames_data_only()

Process frames for data extraction only, without video track output.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
async def process_frames_data_only(self):
    """Process frames for data extraction only, without video track output.

    Repeatedly receives a frame from ``self.track``, runs it through
    ``_process_frame_async`` with rendering disabled, and forwards the
    workflow output via ``_send_data_output``.

    The loop ends when ``_stop_processing`` is set, when
    ``_check_termination()`` reports termination, or when the track is
    missing or its ``readyState`` is ``"ended"``. Cancellation, media stream
    errors and any other exception are logged and not re-raised.
    ``_send_processing_complete`` is always awaited on exit.
    """
    # Lower libav log verbosity only once per processor instance.
    if not self._av_logging_set:
        av_logging.set_libav_level(av_logging.ERROR)
        self._av_logging_set = True

    try:
        while not self._stop_processing:
            # ACK pacing: wait before producing the next frame id.
            await self._wait_for_ack_window(next_frame_id=self._received_frames + 1)
            if self._check_termination():
                # NOTE(review): _send_processing_complete is awaited again in
                # the finally clause below; presumably it is idempotent - confirm.
                await self._send_processing_complete()
                self._signal_termination()
                break
            if self.heartbeat_callback:
                self.heartbeat_callback()
            if not self.track or self.track.readyState == "ended":
                break

            # Drain queue for realtime RTSP
            # Older frames are dropped until at most 30 remain queued.
            if (
                isinstance(self.track, PlayerStreamTrack)
                and self.realtime_processing
            ):
                while self.track._queue.qsize() > 30:
                    self.track._queue.get_nowait()

            frame = await self.track.recv()
            # The running frame count doubles as the frame id.
            self._received_frames += 1
            self._fps_monitor.tick()
            frame_timestamp = datetime.datetime.now()

            # The rendered frame (second tuple element) is not used here.
            workflow_output, _, errors = await self._process_frame_async(
                frame=frame,
                frame_id=self._received_frames,
                render_output=False,
                include_errors_on_frame=False,
            )

            await self._send_data_output(
                workflow_output, frame_timestamp, frame, errors
            )

    except asyncio.CancelledError as exc:
        # No one will catch this exception as it's executed in a create_task
        logger.info("[DATA_ONLY] Processing cancelled: %s", exc)
    except MediaStreamError as exc:
        logger.info("[DATA_ONLY] Media stream ended: %s", exc)
    except Exception as exc:
        logger.error(
            "[DATA_ONLY] Error at frame %d: %s", self._received_frames, exc
        )
    finally:
        # Runs on every exit path, including the handled exceptions above.
        await self._send_processing_complete()
record_ack
record_ack(ack)

Record cumulative ACK from the client.

ACK semantics: client has fully handled all frames <= ack. Backwards compatible: pacing is disabled until we receive the first ACK.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
def record_ack(self, ack: int) -> None:
    """Record cumulative ACK from the client.

    ACK semantics: client has fully handled all frames <= ack.
    Backwards compatible: pacing is disabled until we receive the first ACK.

    Values that cannot be converted to a non-negative integer (wrong type,
    non-numeric text, NaN, infinity, negative numbers) are logged and ignored.
    An ACK that does not advance past the last recorded one is a no-op.
    """
    try:
        ack_int = int(ack)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")), which is neither a TypeError
        # nor a ValueError and would otherwise escape to the caller.
        logger.warning("Invalid ACK value: %s", ack)
        return
    if ack_int < 0:
        logger.warning("Invalid ACK value: %s", ack)
        return
    if ack_int > self._ack_last:
        # Sample the info log so a steady ACK stream does not flood it.
        if ack_int % 100 == 1:
            logger.info("ACK received: %s", ack_int)
        self._ack_last = ack_int
        # Wake up whoever is waiting for the ACK window to advance.
        self._ack_event.set()
serialize_outputs_sync staticmethod
serialize_outputs_sync(
    fields_to_send, workflow_output, data_output_mode
)

Serialize workflow outputs for WebRTC transmission.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
@staticmethod
def serialize_outputs_sync(
    fields_to_send: List[str],
    workflow_output: Dict[str, Any],
    data_output_mode: DataOutputMode,
) -> Tuple[Dict[str, Any], List[str]]:
    """Serialize the selected workflow outputs for WebRTC transmission.

    Returns:
        A ``(serialized, errors)`` pair. ``serialized`` maps each field name to
        its serialized value, or to a ``{"__serialization_error__": ...}``
        marker when serialization raised. ``errors`` lists one message per
        missing or failed field. In ``DataOutputMode.ALL`` mode, image outputs
        are left out without an error.
    """
    payload = {}
    problems = []
    drop_images = data_output_mode == DataOutputMode.ALL

    for name in fields_to_send:
        if name not in workflow_output:
            problems.append(f"Output '{name}' not found")
            continue

        value = workflow_output[name]
        if drop_images and isinstance(value, WorkflowImageData):
            # Images are not sent over the data channel when everything is requested.
            continue

        try:
            encoded = serialize_for_webrtc(value)
        except Exception as failure:
            problems.append(f"{name}: {failure}")
            payload[name] = {"__serialization_error__": str(failure)}
            logger.error("[SERIALIZE] Error: %s - %s", name, failure)
        else:
            payload[name] = encoded

    return payload, problems

VideoTransformTrackWithLoop

Bases: VideoStreamTrack, VideoFrameProcessor

Video track that processes frames through workflow and sends video back.

Inherits from both VideoStreamTrack (for WebRTC video track functionality) and VideoFrameProcessor (for workflow processing logic).

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
class VideoTransformTrackWithLoop(VideoStreamTrack, VideoFrameProcessor):
    """Video track that processes frames through workflow and sends video back.

    Inherits from both VideoStreamTrack (for WebRTC video track functionality)
    and VideoFrameProcessor (for workflow processing logic).
    """

    def __init__(
        self,
        asyncio_loop: asyncio.AbstractEventLoop,
        workflow_configuration: WorkflowConfiguration,
        api_key: str,
        model_manager: Optional[ModelManager] = None,
        data_output: Optional[List[str]] = None,
        stream_output: Optional[str] = None,
        has_video_track: bool = True,
        declared_fps: float = 30,
        termination_date: Optional[datetime.datetime] = None,
        terminate_event: Optional[asyncio.Event] = None,
        heartbeat_callback: Optional[Callable[[], None]] = None,
        realtime_processing: bool = True,
        is_preview: bool = False,
        *args,
        **kwargs,
    ):
        """Initialise both base classes explicitly.

        Extra positional and keyword arguments are forwarded to
        ``VideoStreamTrack``; the named parameters are forwarded to
        ``VideoFrameProcessor``.
        """
        # Each base is initialised by hand because they take different arguments.
        VideoStreamTrack.__init__(self, *args, **kwargs)
        VideoFrameProcessor.__init__(
            self,
            asyncio_loop=asyncio_loop,
            workflow_configuration=workflow_configuration,
            api_key=api_key,
            data_output=data_output,
            stream_output=stream_output,
            has_video_track=has_video_track,
            declared_fps=declared_fps,
            termination_date=termination_date,
            terminate_event=terminate_event,
            model_manager=model_manager,
            heartbeat_callback=heartbeat_callback,
            realtime_processing=realtime_processing,
            is_preview=is_preview,
        )

    async def _auto_detect_stream_output(
        self, frame: VideoFrame, frame_id: int
    ) -> None:
        """Pick ``stream_output`` by running the workflow once on ``frame``.

        The frame is processed without rendering and ``detect_image_output`` is
        applied to the result. When nothing is detected ``stream_output`` is set
        to an empty string, so detection is not attempted again by ``recv``
        (which only triggers it while ``stream_output`` is ``None``).
        """
        workflow_output_for_detect, _, _ = await self._process_frame_async(
            frame=frame,
            frame_id=frame_id,
            render_output=False,
            include_errors_on_frame=False,
        )
        detected_output = detect_image_output(workflow_output_for_detect)
        if detected_output:
            self.stream_output = detected_output
            logger.info(f"Auto-detected stream_output: {detected_output}")
        else:
            logger.warning("No image output detected, will use fallback")
            self.stream_output = ""

    async def recv(self):
        """Receive one source frame, run the workflow on it, return the result.

        Returns:
            The frame produced by ``_process_frame_async``, carrying the ``pts``
            and ``time_base`` of the source frame.

        Raises:
            MediaStreamError: If no track is available after waiting, if
                termination was triggered, or if the source track ended.
        """
        # Silencing swscaler warnings in multi-threading environment
        if not self._av_logging_set:
            av_logging.set_libav_level(av_logging.ERROR)
            self._av_logging_set = True

        if self.heartbeat_callback:
            self.heartbeat_callback()

        # Wait for track to be ready (video file upload case)
        if self.track is None:
            logger.info("[RECV] Track is None, waiting for track_ready_event...")
            await self._track_ready_event.wait()
            if self.track is None:
                logger.error("[RECV] Track still None after wait!")
                raise MediaStreamError("Track not available after wait")

        # Optional ACK pacing: block producing the next frame if we're too far ahead.
        await self._wait_for_ack_window(next_frame_id=self._received_frames + 1)

        if self._check_termination():
            logger.warning("[RECV] Termination triggered, closing gracefully")
            await self._send_processing_complete()
            self._signal_termination()
            reason = self._termination_reason or "terminate_event"
            raise MediaStreamError(f"Processing terminated: {reason}")

        # Drain queue if using PlayerStreamTrack (RTSP/video file)
        # Older frames are dropped until at most 30 remain queued.
        if isinstance(self.track, PlayerStreamTrack) and self.realtime_processing:
            queue_size = self.track._queue.qsize()
            if queue_size > 30:
                drained = 0
                while self.track._queue.qsize() > 30:
                    self.track._queue.get_nowait()
                    drained += 1
                logger.info(
                    "[RECV] Drained %d frames from queue (was %d)", drained, queue_size
                )

        try:
            frame: VideoFrame = await self.track.recv()
        except MediaStreamError:
            # Source ended: report completion, then let the error propagate.
            logger.info("[RECV] Track ended after %d frames", self._received_frames)
            await self._send_processing_complete()
            raise

        # The running frame count doubles as the frame id.
        self._received_frames += 1
        self._fps_monitor.tick()
        frame_id = self._received_frames
        frame_timestamp = datetime.datetime.now()

        # Only the very first frame is used to auto-detect the stream output.
        if self.stream_output is None and frame_id == 1:
            await self._auto_detect_stream_output(frame, frame_id)

        workflow_output, new_frame, errors = await self._process_frame_async(
            frame=frame,
            frame_id=frame_id,
            stream_output=self.stream_output,
            render_output=True,
            include_errors_on_frame=True,
        )

        # Keep the source frame's timing on the outgoing frame.
        new_frame.pts = frame.pts
        new_frame.time_base = frame.time_base

        await self._send_data_output(workflow_output, frame_timestamp, frame, errors)

        if errors:
            logger.warning("[RECV] Frame %d errors: %s", frame_id, errors)

        return new_frame

Functions:

create_chunked_binary_message

create_chunked_binary_message(
    frame_id, chunk_index, total_chunks, payload
)

Create a binary message with standard 12-byte header.

Format: [frame_id: 4][chunk_index: 4][total_chunks: 4][payload: N]. All integers are uint32 little-endian.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
86
87
88
89
90
91
92
93
94
95
def create_chunked_binary_message(
    frame_id: int, chunk_index: int, total_chunks: int, payload: bytes
) -> bytes:
    """Prefix ``payload`` with the standard 12-byte chunk header.

    Layout: [frame_id: 4][chunk_index: 4][total_chunks: 4][payload: N]
    The three header fields are uint32 values in little-endian byte order.
    """
    header_fields = (frame_id, chunk_index, total_chunks)
    prefix = struct.pack("<III", *header_fields)
    return b"".join((prefix, payload))

send_chunked_data async

send_chunked_data(
    data_channel,
    frame_id,
    payload_bytes,
    chunk_size=CHUNK_SIZE,
    heartbeat_callback=None,
    buffer_timeout=120.0,
)

Send payload via data channel with chunking and backpressure.

We chunk large payloads because WebRTC data channels have message size limits. We apply backpressure (wait for buffer to drain) to avoid overwhelming the network and causing ICE connection failures.

Heads up: buffer_timeout needs to be higher than WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY! Otherwise we will timeout ourselves.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
async def send_chunked_data(
    data_channel: RTCDataChannel,
    frame_id: int,
    payload_bytes: bytes,
    chunk_size: int = CHUNK_SIZE,
    heartbeat_callback: Optional[Callable[[], None]] = None,
    buffer_timeout: float = 120.0,
) -> bool:
    """Send payload via data channel with chunking and backpressure.

    We chunk large payloads because WebRTC data channels have message size limits.
    We apply backpressure (wait for buffer to drain) to avoid overwhelming the
    network and causing ICE connection failures.

    Heads up: buffer_timeout needs to be higher than WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY!
    Otherwise we will timeout ourselves.

    Args:
        data_channel: Channel to send on; must be in the ``"open"`` state.
        frame_id: Frame identifier written into every chunk header.
        payload_bytes: Full payload to split into chunks.
        chunk_size: Maximum number of payload bytes per chunk.
        heartbeat_callback: Optional callable invoked after each chunk and
            passed on to ``wait_for_buffer_drain``.
        buffer_timeout: Timeout, in seconds, for each buffer drain wait.

    Returns:
        ``True`` when every chunk was handed to the channel, ``False`` when the
        channel was not open or a buffer drain failed. An empty payload yields
        zero chunks, so nothing is sent and ``True`` is returned.
    """

    # Misconfiguration is only reported, not rejected.
    if buffer_timeout <= WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY:
        logger.warning(
            "[SEND_CHUNKED] buffer_timeout (%.2fs) <= WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY (%.2fs), "
            "this will likely cause immediate timeouts during buffer drain",
            buffer_timeout,
            WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY,
        )

    if data_channel.readyState != "open":
        return False

    payload_size = len(payload_bytes)
    # Ceiling division: the last chunk may be shorter than chunk_size.
    total_chunks = (payload_size + chunk_size - 1) // chunk_size
    # Slice through a memoryview so chunks are not copied out of the payload.
    view = memoryview(payload_bytes)
    high_threshold = WEBRTC_DATA_CHANNEL_BUFFER_SIZE_LIMIT

    for chunk_index in range(total_chunks):
        # The channel can close between chunks, so re-check on every iteration.
        if data_channel.readyState != "open":
            logger.error(
                "[SEND_CHUNKED] Channel closed at chunk %d/%d",
                chunk_index,
                total_chunks,
            )
            return False

        start = chunk_index * chunk_size
        end = min(start + chunk_size, payload_size)
        chunk_data = view[start:end]

        message = create_chunked_binary_message(
            frame_id, chunk_index, total_chunks, chunk_data
        )

        # Backpressure: once the buffer exceeds the limit, wait for it to drain
        # before queueing this chunk.
        if data_channel.bufferedAmount > high_threshold:
            if not await wait_for_buffer_drain(
                data_channel, buffer_timeout, heartbeat_callback
            ):
                logger.error(
                    "[SEND_CHUNKED] Buffer drain failed at chunk %d/%d",
                    chunk_index,
                    total_chunks,
                )
                return False

        data_channel.send(message)

        if heartbeat_callback:
            heartbeat_callback()
        # Brief pause that yields control back to the event loop between chunks.
        await asyncio.sleep(0.001)

    return True

wait_for_buffer_drain async

wait_for_buffer_drain(
    data_channel,
    timeout=30.0,
    heartbeat_callback=None,
    low_threshold=None,
)

Wait for data channel buffer to drain below threshold, with timeout.

We wait until the buffer falls to a low threshold (1/4 of the limit) rather than just below the limit. This gap between the high and low thresholds provides hysteresis; without it, the wait would be triggered again after sending just a few chunks.

And we wait WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY to avoid starving the event loop.

Source code in inference/core/interfaces/webrtc_worker/webrtc.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
async def wait_for_buffer_drain(
    data_channel: RTCDataChannel,
    timeout: float = 30.0,
    heartbeat_callback: Optional[Callable[[], None]] = None,
    low_threshold: Optional[int] = None,
) -> bool:
    """Wait for data channel buffer to drain below threshold, with timeout.

    We wait until the buffer falls to a low threshold (1/4 of the limit by
    default) rather than just below the limit. The gap between the two
    thresholds provides hysteresis: without it this wait would be triggered
    again after sending just a few chunks.

    Between checks we sleep WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY to avoid
    starving the event loop.

    Args:
        data_channel: Channel whose ``bufferedAmount`` is monitored.
        timeout: Maximum time to wait, in seconds.
        heartbeat_callback: Optional callable invoked on every polling round.
        low_threshold: Buffered amount at or below which the wait ends;
            defaults to a quarter of WEBRTC_DATA_CHANNEL_BUFFER_SIZE_LIMIT.

    Returns:
        ``True`` once the buffer is at or below the threshold, ``False`` on
        timeout or when the channel is no longer open.
    """
    if low_threshold is None:
        low_threshold = WEBRTC_DATA_CHANNEL_BUFFER_SIZE_LIMIT // 4

    # Inside a coroutine the running loop is the one to use for timing.
    loop = asyncio.get_running_loop()
    start_time = loop.time()

    while data_channel.bufferedAmount > low_threshold:
        elapsed = loop.time() - start_time
        if elapsed > timeout:
            logger.error("[BUFFER_DRAIN] Timeout after %.1fs", timeout)
            return False
        if data_channel.readyState != "open":
            logger.error("[BUFFER_DRAIN] Channel closed: %s", data_channel.readyState)
            return False
        if heartbeat_callback:
            heartbeat_callback()
        await asyncio.sleep(WEBRTC_DATA_CHANNEL_BUFFER_DRAINING_DELAY)

    return True

core/interfaces/webrtc_worker/sources

inference.core.interfaces.webrtc_worker.sources.file

Video file source for WebRTC - handles uploaded video files.

Classes

ThreadedVideoFileTrack

Bases: MediaStreamTrack

Video track that decodes frames from a file in a background thread.

Uses a dedicated thread with a queue to avoid deadlocks with the event loop.

Source code in inference/core/interfaces/webrtc_worker/sources/file.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
class ThreadedVideoFileTrack(MediaStreamTrack):
    """Video track fed by a background thread that decodes a file.

    Decoded items travel from the dedicated decode thread to ``recv`` through
    a bounded queue, which avoids deadlocks with the event loop.
    """

    kind = "video"

    def __init__(self, filepath: str, queue_size: int = 60):
        # TODO: add parameter queue size in settings
        super().__init__()
        self._stop_event = threading.Event()
        self._queue = queue.Queue(maxsize=queue_size)
        worker = threading.Thread(
            target=_decode_worker,
            args=(filepath, self._queue, self._stop_event),
            daemon=True,
        )
        self._decode_thread = worker
        worker.start()

    async def recv(self) -> VideoFrame:
        """Return the next decoded frame, polling the queue without blocking.

        Raises:
            MediaStreamError: When the end-of-file marker (``None``) or a
                decode error (a ``dict``) is taken from the queue.
        """
        while True:
            try:
                item = self._queue.get_nowait()
            except queue.Empty:
                # Nothing decoded yet: yield to the event loop and retry.
                await asyncio.sleep(0.001)
                continue
            break

        if item is None:
            self.stop()
            raise MediaStreamError("End of video file")
        if isinstance(item, dict):
            logger.error("[ThreadedVideoTrack] Decode error: %s", item)
            self.stop()
            raise MediaStreamError(item.get("error", "Unknown decode error"))

        return item

    def stop(self):
        """Stop the track and set the stop event shared with the decode thread."""
        super().stop()
        self._stop_event.set()

VideoFileUploadHandler

Handles video file uploads via data channel.

Protocol: [chunk_index:u32][total_chunks:u32][payload]. Auto-completes when all chunks have been received.

Source code in inference/core/interfaces/webrtc_worker/sources/file.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
class VideoFileUploadHandler:
    """Handles video file uploads via data channel.

    Protocol: [chunk_index:u32][total_chunks:u32][payload]
    Auto-completes when all chunks received.

    The chunk count announced by the first chunk is the one used for the
    whole upload. Chunks that arrive after the upload completed, or whose
    index is outside ``[0, total_chunks)``, are ignored.
    """

    def __init__(self):
        # Received chunk payloads keyed by chunk index.
        self._chunks: Dict[int, bytes] = {}
        # Chunk count taken from the first chunk; None until one arrives.
        self._total_chunks: Optional[int] = None
        # Path of the reassembled file; None before completion / after cleanup.
        self._temp_file_path: Optional[str] = None
        self._state = VideoFileUploadState.IDLE
        # Set once the reassembled file has been written.
        self.upload_complete_event = asyncio.Event()

    @property
    def temp_file_path(self) -> Optional[str]:
        """Path of the reassembled temp file, or ``None`` if not available."""
        return self._temp_file_path

    def handle_chunk(self, chunk_index: int, total_chunks: int, data: bytes) -> None:
        """Handle a chunk. Auto-completes when all chunks received.

        Args:
            chunk_index: Zero-based position of this chunk in the file.
            total_chunks: Number of chunks announced by the sender. Only the
                value from the first chunk is used.
            data: Chunk payload.
        """
        # TODO: we need to refactor this...
        if self._state in (
            VideoFileUploadState.COMPLETE,
            VideoFileUploadState.PROCESSING,
        ):
            # A late or duplicate chunk must not rewrite the temp file or push
            # the state back from PROCESSING to COMPLETE.
            return

        if self._total_chunks is None:
            self._total_chunks = total_chunks
            self._state = VideoFileUploadState.UPLOADING

        if not 0 <= chunk_index < self._total_chunks:
            # Storing it would make the chunk count reach the total while an
            # in-range index is still missing, breaking reassembly.
            logger.warning(
                "[VideoFileUpload] Ignoring chunk %s outside 0..%s",
                chunk_index,
                self._total_chunks - 1,
            )
            return

        self._chunks[chunk_index] = data

        # Compare against the stored total, the same value that
        # _write_to_temp_file iterates over.
        if len(self._chunks) == self._total_chunks:
            self._write_to_temp_file()
            self._state = VideoFileUploadState.COMPLETE
            self.upload_complete_event.set()

    def _write_to_temp_file(self) -> None:
        """Reassemble chunks and write to temp file."""
        import tempfile

        # TODO: we need to refactor this...
        # delete=False: the file must outlive this block; cleanup() removes it.
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".mp4", delete=False) as f:
            for i in range(self._total_chunks):
                f.write(self._chunks[i])
            self._temp_file_path = f.name

        # The payloads are on disk now; release the in-memory copies.
        self._chunks.clear()

    def try_start_processing(self) -> Optional[str]:
        """Check if upload complete and transition to PROCESSING. Returns path or None."""
        if self._state == VideoFileUploadState.COMPLETE:
            self._state = VideoFileUploadState.PROCESSING
            return self._temp_file_path
        return None

    async def cleanup(self) -> None:
        """Clean up temp file."""
        # TODO: we need to refactor this...
        if self._temp_file_path:
            import os

            path = self._temp_file_path
            # Forget the path first so a second cleanup() call is a no-op.
            self._temp_file_path = None
            try:
                await asyncio.to_thread(os.unlink, path)
            except Exception:
                # Best effort: a failed removal must not break shutdown.
                pass
Methods:
cleanup async
cleanup()

Clean up temp file.

Source code in inference/core/interfaces/webrtc_worker/sources/file.py
161
162
163
164
165
166
167
168
169
170
171
172
async def cleanup(self) -> None:
    """Remove the temp file, if one is recorded, and forget its path.

    The removal runs in a worker thread and any failure is ignored.
    """
    # TODO: we need to refactor this...
    import os

    stale_path = self._temp_file_path
    if not stale_path:
        return

    # Forget the path before deleting so a repeated call does nothing.
    self._temp_file_path = None
    try:
        await asyncio.to_thread(os.unlink, stale_path)
    except Exception:
        pass
handle_chunk
handle_chunk(chunk_index, total_chunks, data)

Handle a chunk. Auto-completes when all chunks received.

Source code in inference/core/interfaces/webrtc_worker/sources/file.py
128
129
130
131
132
133
134
135
136
137
138
139
140
def handle_chunk(self, chunk_index: int, total_chunks: int, data: bytes) -> None:
    """Handle a chunk. Auto-completes when all chunks received.

    The chunk count announced by the first chunk is the one used for the
    whole upload. Chunks that arrive after the upload completed, or whose
    index is outside ``[0, total_chunks)``, are ignored.

    Args:
        chunk_index: Zero-based position of this chunk in the file.
        total_chunks: Number of chunks announced by the sender. Only the value
            from the first chunk is used.
        data: Chunk payload.
    """
    # TODO: we need to refactor this...
    if self._state in (
        VideoFileUploadState.COMPLETE,
        VideoFileUploadState.PROCESSING,
    ):
        # A late or duplicate chunk must not rewrite the temp file or push the
        # state back from PROCESSING to COMPLETE.
        return

    if self._total_chunks is None:
        self._total_chunks = total_chunks
        self._state = VideoFileUploadState.UPLOADING

    if not 0 <= chunk_index < self._total_chunks:
        # Storing it would make the chunk count reach the total while an
        # in-range index is still missing, breaking reassembly.
        logger.warning(
            "[VideoFileUpload] Ignoring chunk %s outside 0..%s",
            chunk_index,
            self._total_chunks - 1,
        )
        return

    self._chunks[chunk_index] = data

    # Compare against the stored total, the same value the reassembly step
    # iterates over.
    if len(self._chunks) == self._total_chunks:
        self._write_to_temp_file()
        self._state = VideoFileUploadState.COMPLETE
        self.upload_complete_event.set()
try_start_processing
try_start_processing()

Check if upload complete and transition to PROCESSING. Returns path or None.

Source code in inference/core/interfaces/webrtc_worker/sources/file.py
154
155
156
157
158
159
def try_start_processing(self) -> Optional[str]:
    """Move a completed upload to PROCESSING and return the temp file path.

    Returns ``None`` and leaves the state untouched unless the upload is
    currently in the COMPLETE state.
    """
    if self._state != VideoFileUploadState.COMPLETE:
        return None

    self._state = VideoFileUploadState.PROCESSING
    return self._temp_file_path

core/logging

inference.core.logging.memory_handler

In-memory logging handler for dashboard log viewing.

This module provides a custom logging handler that stores log records in memory for retrieval via the /logs API endpoint. It's designed to be used when ENABLE_IN_MEMORY_LOGS environment variable is set to 'true'.

Classes

MemoryLogHandler

Bases: Handler

Custom log handler that stores log records in memory for dashboard access

Source code in inference/core/logging/memory_handler.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
class MemoryLogHandler(logging.Handler):
    """Logging handler that keeps formatted records in memory for the dashboard."""

    def emit(self, record):
        """Append a JSON-friendly dict describing ``record`` to the shared store.

        Any exception raised while building or storing the entry is swallowed.
        """
        try:
            created_at = datetime.fromtimestamp(record.created)
            # Plain strings and ints only, so the entry serializes to JSON as-is.
            entry = dict(
                timestamp=created_at.isoformat(),
                level=record.levelname,
                logger=record.name,
                message=self.format(record),
                module=record.module or "",
                line=record.lineno,
            )
            with _log_lock:
                _log_entries.append(entry)
        except Exception:
            # Silently handle any errors in logging to prevent recursion
            pass

Functions:

get_recent_logs

get_recent_logs(limit=100, level=None, since=None)

Get recent log entries from memory

Source code in inference/core/logging/memory_handler.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def get_recent_logs(
    limit: int = 100, level: str = None, since: str = None
) -> List[Dict[str, Any]]:
    """Get recent log entries from memory.

    Args:
        limit: Maximum number of entries to return, taken from the end of the
            filtered list. A falsy value (``0`` / ``None``) returns them all.
        level: Optional level name; compared case-insensitively against each
            entry's ``"level"``.
        since: Optional ISO-8601 timestamp; only entries strictly newer are
            kept. A trailing ``Z`` or an explicit UTC offset is accepted. An
            unparseable value disables this filter.

    Returns:
        The matching log entries, oldest first.
    """
    # Copy under the lock; all filtering below works on the snapshot.
    with _log_lock:
        logs = list(_log_entries)

    # Filter by log level if specified
    if level:
        level_upper = level.upper()
        logs = [log for log in logs if log["level"] == level_upper]

    # Filter by timestamp if specified
    if since:
        try:
            since_dt = datetime.fromisoformat(since.replace("Z", "+00:00"))
            if since_dt.tzinfo is not None:
                # Stored timestamps are naive local times. Comparing them with
                # an aware datetime raises TypeError, so convert to naive
                # local time first.
                since_dt = since_dt.astimezone().replace(tzinfo=None)
            logs = [
                log
                for log in logs
                if datetime.fromisoformat(log["timestamp"]) > since_dt
            ]
        except (ValueError, OverflowError, OSError):
            # Invalid or out-of-range since timestamp, ignore filter
            pass

    # Limit results
    return logs[-limit:] if limit else logs

setup_memory_logging

setup_memory_logging()

Set up memory logging handler for the current logger hierarchy

Source code in inference/core/logging/memory_handler.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def setup_memory_logging() -> "Optional[MemoryLogHandler]":
    """Set up memory logging handler for the current logger hierarchy.

    Attaches a ``MemoryLogHandler`` to the root logger and to the
    ``uvicorn.access`` logger, then patches uvicorn's default
    ``LOGGING_CONFIG`` so the handler is declared there as well. Calling the
    function again reuses the handler already attached to the root logger
    instead of stacking a second one (which would record every message twice).

    Returns:
        The ``MemoryLogHandler`` attached to the root logger, or ``None`` when
        memory logging is disabled.
    """
    if not is_memory_logging_enabled():
        return None
    logger.info("Setting up memory logging")

    # Add to root logger to capture all logs immediately. A newly constructed
    # handler is never a member of ``handlers``, so look for an existing
    # instance of the handler class rather than testing object membership.
    root_logger = logging.getLogger()
    memory_handler = next(
        (
            handler
            for handler in root_logger.handlers
            if isinstance(handler, MemoryLogHandler)
        ),
        None,
    )
    if memory_handler is None:
        memory_handler = MemoryLogHandler()
        memory_handler.setLevel(logging.DEBUG)  # Capture all levels
        memory_formatter = logging.Formatter(
            "%(asctime)s %(levelname)s %(name)s: %(message)s"
        )
        memory_handler.setFormatter(memory_formatter)
        root_logger.addHandler(memory_handler)

    # Specifically add to uvicorn.access logger to ensure access logs are captured now
    access_logger = logging.getLogger("uvicorn.access")
    if not any(
        isinstance(handler, MemoryLogHandler) for handler in access_logger.handlers
    ):
        access_logger.addHandler(memory_handler)

    # Also patch uvicorn's default LOGGING_CONFIG so when uvicorn applies dictConfig,
    # our in-memory handler remains attached
    global _uvicorn_config_patched
    if not _uvicorn_config_patched:
        try:
            from uvicorn.config import LOGGING_CONFIG as UVICORN_LOGGING_CONFIG

            # Modify in-place (safe: uvicorn makes a deep copy later)
            log_config = UVICORN_LOGGING_CONFIG

            log_config.setdefault("formatters", {})
            if "default" not in log_config["formatters"]:
                log_config["formatters"]["default"] = {
                    "()": "uvicorn.logging.DefaultFormatter",
                    "fmt": "%(levelprefix)s %(message)s",
                    "use_colors": None,
                }

            log_config.setdefault("handlers", {})["inmemory"] = {
                "class": "inference.core.logging.memory_handler.MemoryLogHandler",
                "level": "DEBUG",
                "formatter": "default",
            }

            log_config.setdefault("loggers", {})
            access_cfg = log_config["loggers"].setdefault(
                "uvicorn.access",
                {
                    "handlers": ["default"],
                    "level": "INFO",
                    "propagate": False,
                },
            )
            # An existing entry may lack a "handlers" list; create it on demand.
            access_handlers = access_cfg.setdefault("handlers", [])
            if "inmemory" not in access_handlers:
                access_handlers.append("inmemory")

            log_config["loggers"].setdefault(
                "uvicorn", {"handlers": ["default"], "level": "INFO"}
            )
            log_config["loggers"].setdefault("uvicorn.error", {"level": "INFO"})

            root_cfg = log_config.setdefault(
                "root", {"handlers": ["default"], "level": "INFO"}
            )
            if "inmemory" not in root_cfg.get("handlers", []):
                root_cfg.setdefault("handlers", []).append("inmemory")

            _uvicorn_config_patched = True
            logger.info("Patched uvicorn LOGGING_CONFIG to include MemoryLogHandler")
        except ImportError:
            # Avoid hard failure if uvicorn is not available
            logger.debug("uvicorn is not available, LOGGING_CONFIG was not patched")
        except Exception:
            # Patching is best-effort; keep going but leave a trace for debugging.
            logger.debug("Could not patch uvicorn LOGGING_CONFIG", exc_info=True)

    return memory_handler

core/managers

Model lifecycle managers: loading, unloading, registry, and resolution.

inference.core.managers.base

Classes

ModelManager

Model managers keep track of a dictionary of Model objects and are responsible for passing requests to the right model using the infer method.

Source code in inference/core/managers/base.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
class ModelManager:
    """Model managers keep track of a dictionary of Model objects and are responsible for passing requests to the right model using the infer method.

    Loaded models are stored by their resolved identifier (the alias when one
    is given, the model id otherwise). Loading and unloading of a given
    identifier is serialised with a per-model lock; the dictionary of those
    locks is itself guarded by ``_state_lock``.
    """

    def __init__(self, model_registry: ModelRegistry, models: Optional[dict] = None):
        """Create a manager.

        Args:
            model_registry (ModelRegistry): Registry used to resolve the model
                class for an identifier.
            models (Optional[dict]): Pre-populated mapping of identifier to
                model. A new empty mapping is used when omitted.
        """
        self.model_registry = model_registry
        self._models: Dict[str, Model] = models if models is not None else {}
        # Per-identifier bookkeeping surfaced by ``describe_models``.
        self._model_request_aliases: Dict[str, set] = {}
        self._model_request_paths: Dict[str, set] = {}
        self.pingback = None
        self._state_lock = Lock()
        self._models_state_locks: Dict[str, Lock] = {}
        # torch.jit.load/script mutate a process-global, non-thread-safe TorchScript
        # registry; loaders acquire this so concurrent loads cannot corrupt it.
        self.torchscript_state_global_lock = Lock()

    def init_pingback(self):
        """Initializes pingback mechanism.

        Sets ``num_errors`` and ``uuid`` on the manager and, when metrics are
        enabled, creates and starts the ``PingbackInfo`` object.
        """
        self.num_errors = 0  # in the device
        self.uuid = ROBOFLOW_SERVER_UUID
        if METRICS_ENABLED:
            self.pingback = PingbackInfo(self)
            self.pingback.start()

    def add_model(
        self,
        model_id: str,
        api_key: str,
        model_id_alias: Optional[str] = None,
        endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
    ) -> None:
        """Adds a new model to the manager.

        The model is stored under ``model_id_alias`` when it is given and under
        ``model_id`` otherwise. Nothing is loaded if that identifier is already
        present.

        Args:
            model_id (str): The identifier of the model.
            api_key (str): API key used to resolve and load the model.
            model_id_alias (Optional[str]): Alternative identifier under which
                the model is registered.
            endpoint_type (ModelEndpointType, optional): The endpoint type to use for the model.
            countinference (Optional[bool]): Forwarded to the access check, the
                registry lookup and the model constructor.
            service_secret (Optional[str]): Forwarded to the access check, the
                registry lookup and the model constructor.

        Raises:
            RoboflowAPINotAuthorizedError: If cache authorization is enabled
                and the API key has no access to the model.
            ModelManagerLockAcquisitionError: If the lock for the model could
                not be acquired.
        """
        if MODELS_CACHE_AUTH_ENABLED:
            if not _check_if_api_key_has_access_to_model(
                api_key=api_key,
                model_id=model_id,
                endpoint_type=endpoint_type,
                countinference=countinference,
                service_secret=service_secret,
            ):
                # NOTE(review): this message embeds the raw API key; confirm it
                # can never reach logs or HTTP responses, or redact it.
                raise RoboflowAPINotAuthorizedError(
                    f"API key {api_key} does not have access to model {model_id}"
                )

        logger.debug(
            f"ModelManager - Adding model with model_id={model_id}, model_id_alias={model_id_alias}"
        )
        resolved_identifier = model_id if model_id_alias is None else model_id_alias
        self.record_request_metadata(
            model_id=resolved_identifier,
            original_model_id=model_id,
            model_id_alias=model_id_alias,
        )
        ids_collector = request_model_ids.get(None)
        if ids_collector is not None:
            ids_collector.add(resolved_identifier)
        model_lock = self._get_lock_for_a_model(model_id=resolved_identifier)
        with acquire_with_timeout(lock=model_lock) as acquired:
            if not acquired:
                # if failed to acquire - then in use, no need to purge lock
                raise ModelManagerLockAcquisitionError(
                    f"Could not acquire lock for model with id={resolved_identifier}."
                )
            if resolved_identifier in self._models:
                logger.debug(
                    f"ModelManager - model with model_id={resolved_identifier} is already loaded."
                )
                return
            try:
                with start_span("model.load", {"model.id": resolved_identifier}):
                    logger.debug("ModelManager - model initialisation...")
                    t_load_start = time.perf_counter()
                    vram_before = _get_cuda_memory_allocated()
                    model_class = self.model_registry.get_model(
                        resolved_identifier,
                        api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                    )

                    extra_init_kwargs = {}
                    if USE_INFERENCE_MODELS:
                        extra_init_kwargs["torchscript_state_global_lock"] = (
                            self.torchscript_state_global_lock
                        )
                    model = model_class(
                        model_id=model_id,
                        api_key=api_key,
                        countinference=countinference,
                        service_secret=service_secret,
                        **extra_init_kwargs,
                    )
                    vram_after = _get_cuda_memory_allocated()
                    if vram_before is not None and vram_after is not None:
                        # Difference in allocated CUDA memory around construction.
                        model._vram_bytes = vram_after - vram_before

                    # Pass countinference and service_secret to download_model_artifacts_from_roboflow_api if available
                    if (
                        hasattr(model, "download_model_artifacts_from_roboflow_api")
                        and INTERNAL_WEIGHTS_URL_SUFFIX == "serverless"
                    ):
                        # Only pass these parameters if INTERNAL_WEIGHTS_URL_SUFFIX is "serverless"
                        if (
                            hasattr(model, "cache_model_artefacts")
                            and not model.has_model_metadata
                        ):
                            # Override the download_model_artifacts_from_roboflow_api method with parameters
                            original_method = (
                                model.download_model_artifacts_from_roboflow_api
                            )
                            model.download_model_artifacts_from_roboflow_api = (
                                lambda: original_method(
                                    countinference=countinference,
                                    service_secret=service_secret,
                                )
                            )

                    load_time = time.perf_counter() - t_load_start
                    vram_delta = getattr(model, "_vram_bytes", None)
                    set_span_attribute("model.load_time_seconds", load_time)
                    record_model_loaded(resolved_identifier, load_time)
                    logger.info(
                        "Model loaded: model_id=%s, load_time=%.2fs, task_type=%s, vram_bytes=%s",
                        resolved_identifier,
                        load_time,
                        getattr(model, "task_type", "unknown"),
                        vram_delta,
                    )
                    self._models[resolved_identifier] = model
                    collector = model_load_info.get(None)
                    if collector is not None:
                        collector.record(
                            model_id=resolved_identifier, load_time=load_time
                        )
            except Exception as error:
                record_error(error)
                # Loading failed, so drop the lock entry created for this identifier.
                self._dispose_model_lock(model_id=resolved_identifier)
                raise

    def record_request_metadata(
        self,
        model_id: str,
        original_model_id: Optional[str] = None,
        model_id_alias: Optional[str] = None,
    ) -> None:
        """Record request path and aliases for an already-loaded model.

        Decorators call this when they short-circuit ``add_model()`` for warm
        models so registry metadata stays in sync with the base manager path.

        Args:
            model_id (str): Identifier under which the model is registered.
            original_model_id (Optional[str]): Model id used in the request;
                stored as an alias when it differs from ``model_id``.
            model_id_alias (Optional[str]): Alias used in the request; stored
                when it differs from ``model_id``.
        """
        resolved_identifier = model_id
        if resolved_identifier not in self._model_request_aliases:
            self._model_request_aliases[resolved_identifier] = set()
        if original_model_id is not None and original_model_id != resolved_identifier:
            self._model_request_aliases[resolved_identifier].add(original_model_id)
        if model_id_alias is not None and model_id_alias != resolved_identifier:
            self._model_request_aliases[resolved_identifier].add(model_id_alias)
        req_path = current_request_path.get(None)
        if req_path:
            if resolved_identifier not in self._model_request_paths:
                self._model_request_paths[resolved_identifier] = set()
            self._model_request_paths[resolved_identifier].add(req_path)

    def check_for_model(self, model_id: str) -> None:
        """Checks whether the model with the given ID is in the manager.

        Args:
            model_id (str): The identifier of the model.

        Raises:
            InferenceModelNotFound: If the model is not found in the manager.
        """
        if model_id not in self:
            raise InferenceModelNotFound(f"Model with id {model_id} not loaded.")

    def _cache_inference_result(
        self, model_id: str, request: InferenceRequest, response
    ) -> None:
        """Store a successful inference in the model monitoring cache.

        Shared by the async and sync inference entry points. Failures while
        caching are logged as warnings and never propagated.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The processed request. A ``numpy``
                image value is replaced by its string form before caching.
            response: The value returned by the model for ``request``.
        """
        finish_time = time.time()
        with start_span("model.infer.cache"):
            try:
                logger.debug(
                    f"ModelManager - caching inference request started for model_id={model_id}"
                )
                model_monitoring_cache_module.model_monitoring_cache.zadd(
                    "models",
                    value=f"{GLOBAL_INFERENCE_SERVER_ID}:{request.api_key}:{model_id}",
                    score=finish_time,
                    expire=METRICS_INTERVAL * 2,
                )
                if (
                    hasattr(request, "image")
                    and hasattr(request.image, "type")
                    and request.image.type == "numpy"
                ):
                    request.image.value = str(request.image.value)
                model_monitoring_cache_module.model_monitoring_cache.zadd(
                    f"inference:{GLOBAL_INFERENCE_SERVER_ID}:{model_id}",
                    value=to_cachable_inference_item(request, response),
                    score=finish_time,
                    expire=METRICS_INTERVAL * 2,
                )
                logger.debug(
                    f"ModelManager - caching inference request finished for model_id={model_id}"
                )
            except Exception as cache_error:
                logger.warning(
                    f"Failed to cache inference data for model {model_id}: {cache_error}"
                )

    def _cache_inference_error(
        self, model_id: str, request: InferenceRequest, error: Exception
    ) -> None:
        """Store a failed inference in the model monitoring cache.

        Shared by the async and sync inference entry points. Failures while
        caching are logged as warnings and never propagated.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request that failed; it is cached
                without its ``image``, ``subject`` and ``prompt`` fields.
            error (Exception): The exception raised by inference.
        """
        finish_time = time.time()
        with start_span("model.infer.cache_error"):
            try:
                model_monitoring_cache_module.model_monitoring_cache.zadd(
                    "models",
                    value=f"{GLOBAL_INFERENCE_SERVER_ID}:{request.api_key}:{model_id}",
                    score=finish_time,
                    expire=METRICS_INTERVAL * 2,
                )
                model_monitoring_cache_module.model_monitoring_cache.zadd(
                    f"error:{GLOBAL_INFERENCE_SERVER_ID}:{model_id}",
                    value={
                        "request": jsonable_encoder(
                            request.dict(exclude={"image", "subject", "prompt"})
                        ),
                        "error": str(error),
                    },
                    score=finish_time,
                    expire=METRICS_INTERVAL * 2,
                )
            except Exception as cache_error:
                logger.warning(
                    f"Failed to cache error data for model {model_id}: {cache_error}"
                )

    async def infer_from_request(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> InferenceResponse:
        """Runs inference on the specified model with the given request.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to process.

        Returns:
            InferenceResponse: The response from the inference.

        Raises:
            Exception: Any error raised by inference is recorded and re-raised.
        """
        logger.debug(
            f"ModelManager - inference from request started for model_id={model_id}."
        )
        enable_model_monitoring = not getattr(
            request, "disable_model_monitoring", False
        )
        if METRICS_ENABLED and self.pingback and enable_model_monitoring:
            logger.debug("ModelManager - setting pingback fallback api key...")
            self.pingback.fallback_api_key = request.api_key
        with start_span(
            "model.infer",
            {"model.id": model_id, "model.infer.caller": "infer_from_request"},
        ):
            try:
                t_infer_start = time.perf_counter()
                rtn_val = await self.model_infer(
                    model_id=model_id, request=request, **kwargs
                )
                record_inference(model_id, time.perf_counter() - t_infer_start)
                logger.debug(
                    f"ModelManager - inference from request finished for model_id={model_id}."
                )
                if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                    self._cache_inference_result(
                        model_id=model_id, request=request, response=rtn_val
                    )
                return rtn_val
            except Exception as e:
                record_error(e)
                if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                    self._cache_inference_error(
                        model_id=model_id, request=request, error=e
                    )
                raise

    def infer_from_request_sync(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> InferenceResponse:
        """Runs inference on the specified model with the given request.

        Synchronous counterpart of ``infer_from_request``.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to process.

        Returns:
            InferenceResponse: The response from the inference.

        Raises:
            Exception: Any error raised by inference is recorded and re-raised.
        """
        logger.debug(
            f"ModelManager - inference from request started for model_id={model_id}."
        )
        enable_model_monitoring = not getattr(
            request, "disable_model_monitoring", False
        )
        if METRICS_ENABLED and self.pingback and enable_model_monitoring:
            logger.debug("ModelManager - setting pingback fallback api key...")
            self.pingback.fallback_api_key = request.api_key
        with start_span(
            "model.infer",
            {"model.id": model_id, "model.infer.caller": "infer_from_request_sync"},
        ):
            try:
                t_infer_start = time.perf_counter()
                rtn_val = self.model_infer_sync(
                    model_id=model_id, request=request, **kwargs
                )
                record_inference(model_id, time.perf_counter() - t_infer_start)
                logger.debug(
                    f"ModelManager - inference from request finished for model_id={model_id}."
                )
                if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                    self._cache_inference_result(
                        model_id=model_id, request=request, response=rtn_val
                    )
                return rtn_val
            except Exception as e:
                record_error(e)
                if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                    self._cache_inference_error(
                        model_id=model_id, request=request, error=e
                    )
                raise

    async def model_infer(self, model_id: str, request: InferenceRequest, **kwargs):
        """Look up the model and call its ``infer_from_request``.

        The model call itself is a plain (non-awaited) call.

        Raises:
            InferenceModelNotFound: If the model is not loaded.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.infer_from_request(request)

    def model_infer_sync(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> Union[List[InferenceResponse], InferenceResponse]:
        """Look up the model and call its ``infer_from_request``.

        Raises:
            InferenceModelNotFound: If the model is not loaded.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.infer_from_request(request)

    def make_response(
        self, model_id: str, predictions: List[List[float]], *args, **kwargs
    ) -> InferenceResponse:
        """Creates a response object from the model's predictions.

        Args:
            model_id (str): The identifier of the model.
            predictions (List[List[float]]): The model's predictions.

        Returns:
            InferenceResponse: The created response object.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.make_response(predictions, *args, **kwargs)

    def postprocess(
        self,
        model_id: str,
        predictions: Tuple[np.ndarray, ...],
        preprocess_return_metadata: PreprocessReturnMetadata,
        *args,
        **kwargs,
    ) -> List[List[float]]:
        """Processes the model's predictions after inference.

        Args:
            model_id (str): The identifier of the model.
            predictions (Tuple[np.ndarray, ...]): The model's predictions.
            preprocess_return_metadata (PreprocessReturnMetadata): Metadata
                passed through to the model's ``postprocess``.

        Returns:
            List[List[float]]: The post-processed predictions.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.postprocess(
            predictions, preprocess_return_metadata, *args, **kwargs
        )

    def predict(self, model_id: str, *args, **kwargs) -> Tuple[np.ndarray, ...]:
        """Runs prediction on the specified model.

        Also updates ``model.metrics``: ``"num_inferences"`` is incremented and
        the elapsed time of the call is added to ``"avg_inference_time"``.

        Args:
            model_id (str): The identifier of the model.

        Returns:
            Tuple[np.ndarray, ...]: The predictions from the model.
        """
        model = self._get_model_reference(model_id=model_id)
        model.metrics["num_inferences"] += 1
        tic = time.perf_counter()
        res = model.predict(*args, **kwargs)
        toc = time.perf_counter()
        model.metrics["avg_inference_time"] += toc - tic
        return res

    def preprocess(
        self, model_id: str, request: InferenceRequest
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Preprocesses the request before inference.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to preprocess; its fields
                are passed to the model as keyword arguments.

        Returns:
            Tuple[np.ndarray, PreprocessReturnMetadata]: The preprocessed data.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.preprocess(**request.dict())

    def get_class_names(self, model_id):
        """Retrieves the class names for a given model.

        Args:
            model_id (str): The identifier of the model.

        Returns:
            List[str]: The class names of the model.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.class_names

    def get_task_type(self, model_id: str, api_key: Optional[str] = None) -> str:
        """Retrieves the task type for a given model.

        Args:
            model_id (str): The identifier of the model.
            api_key (Optional[str]): Not used by this implementation.

        Returns:
            str: The task type of the model.
        """
        model = self._get_model_reference(model_id=model_id)
        return model.task_type

    def remove(self, model_id: str, delete_from_disk: bool = True) -> None:
        """Removes a model from the manager.

        Does nothing when the model is not loaded.

        Args:
            model_id (str): The identifier of the model.
            delete_from_disk (bool): Forwarded to the model's ``clear_cache``.

        Raises:
            ModelManagerLockAcquisitionError: If the lock for the model could
                not be acquired.
        """
        try:
            logger.debug(f"Removing model {model_id} from base model manager")
            model_lock = self._get_lock_for_a_model(model_id=model_id)
            with acquire_with_timeout(lock=model_lock) as acquired:
                if not acquired:
                    raise ModelManagerLockAcquisitionError(
                        f"Could not acquire lock for model with id={model_id}."
                    )
                if model_id not in self._models:
                    return None
                model = self._models[model_id]
                # Read the values needed for logging before the model is dropped.
                vram_bytes = getattr(model, "_vram_bytes", None)
                task_type = getattr(model, "task_type", "unknown")
                model.clear_cache(delete_from_disk=delete_from_disk)
                del self._models[model_id]
                logger.info(
                    "Model unloaded: model_id=%s, task_type=%s, vram_bytes=%s, remaining_models=%d",
                    model_id,
                    task_type,
                    vram_bytes,
                    len(self._models),
                )
                record_model_unloaded(model_id)
                self._model_request_aliases.pop(model_id, None)
                self._model_request_paths.pop(model_id, None)
                self._dispose_model_lock(model_id=model_id)
                try_releasing_cuda_memory()
        except InferenceModelNotFound:
            logger.warning(
                f"Attempted to remove model with id {model_id}, but it is not loaded. Skipping..."
            )

    def clear(self) -> None:
        """Removes all models from the manager."""
        # Copy the keys first: ``remove`` mutates the underlying dictionary.
        model_ids = list(self.keys())
        for model_id in model_ids:
            self.remove(model_id)

    def _get_model_reference(self, model_id: str) -> Model:
        """Return the loaded model for ``model_id``.

        Raises:
            InferenceModelNotFound: If the model is not loaded.
        """
        try:
            return self._models[model_id]
        except KeyError as error:
            raise InferenceModelNotFound(
                f"Model with id {model_id} not loaded."
            ) from error

    def __contains__(self, model_id: str) -> bool:
        """Checks if the model is contained in the manager.

        Args:
            model_id (str): The identifier of the model.

        Returns:
            bool: Whether the model is in the manager.
        """
        return model_id in self._models

    def __getitem__(self, key: str) -> Model:
        """Retrieve a model from the manager by key.

        Args:
            key (str): The identifier of the model.

        Returns:
            Model: The model corresponding to the key.

        Raises:
            InferenceModelNotFound: If the model is not loaded.
        """
        return self._get_model_reference(model_id=key)

    def __len__(self) -> int:
        """Retrieve the number of models in the manager.

        Returns:
            int: The number of models in the manager.
        """
        return len(self._models)

    def keys(self):
        """Retrieve the keys (model identifiers) from the manager.

        Returns:
            The keys view of the underlying models dictionary.
        """
        return self._models.keys()

    def models(self) -> Dict[str, Model]:
        """Retrieve the models dictionary from the manager.

        Returns:
            Dict[str, Model]: The manager's own mapping of identifier to model
                (not a copy).
        """
        return self._models

    def describe_models(self) -> List[ModelDescription]:
        """Build a ``ModelDescription`` for every loaded model.

        Optional model attributes that are missing are reported as ``None``;
        recorded request aliases and paths are returned sorted.
        """
        return [
            ModelDescription(
                model_id=model_id,
                task_type=model.task_type,
                batch_size=getattr(model, "batch_size", None),
                input_width=getattr(model, "img_size_w", None),
                input_height=getattr(model, "img_size_h", None),
                vram_bytes=getattr(model, "_vram_bytes", None),
                request_aliases=sorted(
                    self._model_request_aliases.get(model_id, set())
                ),
                request_paths=sorted(self._model_request_paths.get(model_id, set())),
            )
            for model_id, model in self._models.items()
        ]

    def _get_lock_for_a_model(self, model_id: str) -> Lock:
        """Return the lock for ``model_id``, creating it on first use.

        Raises:
            ModelManagerLockAcquisitionError: If the manager state lock could
                not be acquired.
        """
        with acquire_with_timeout(lock=self._state_lock) as acquired:
            if not acquired:
                raise ModelManagerLockAcquisitionError(
                    "Could not acquire lock on Model Manager state to retrieve model lock."
                )
            if model_id not in self._models_state_locks:
                self._models_state_locks[model_id] = Lock()
            return self._models_state_locks[model_id]

    def _dispose_model_lock(self, model_id: str) -> None:
        """Forget the lock for ``model_id``; a missing entry is ignored.

        Raises:
            ModelManagerLockAcquisitionError: If the manager state lock could
                not be acquired.
        """
        with acquire_with_timeout(lock=self._state_lock) as acquired:
            if not acquired:
                raise ModelManagerLockAcquisitionError(
                    "Could not acquire lock on Model Manager state to dispose model lock."
                )
            if model_id not in self._models_state_locks:
                return None
            del self._models_state_locks[model_id]
Methods:
__contains__
__contains__(model_id)

Checks if the model is contained in the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Name Type Description
bool bool

Whether the model is in the manager.

Source code in inference/core/managers/base.py
586
587
588
589
590
591
592
593
594
595
def __contains__(self, model_id: str) -> bool:
    """Report whether a model is registered under the given identifier.

    Args:
        model_id (str): The identifier of the model.

    Returns:
        bool: ``True`` when ``model_id`` is a key of the loaded models mapping.
    """
    loaded_models = self._models
    return model_id in loaded_models
__getitem__
__getitem__(key)

Retrieve a model from the manager by key.

Parameters:

Name Type Description Default
key str

The identifier of the model.

required

Returns:

Name Type Description
Model Model

The model corresponding to the key.

Source code in inference/core/managers/base.py
597
598
599
600
601
602
603
604
605
606
def __getitem__(self, key: str) -> Model:
    """Look up a model by its identifier using subscript syntax.

    Args:
        key (str): The identifier of the model.

    Returns:
        Model: Whatever ``_get_model_reference`` returns for ``key``.
    """
    model = self._get_model_reference(model_id=key)
    return model
__len__
__len__()

Retrieve the number of models in the manager.

Returns:

Name Type Description
int int

The number of models in the manager.

Source code in inference/core/managers/base.py
608
609
610
611
612
613
614
def __len__(self) -> int:
    """Count the models currently held by the manager.

    Returns:
        int: The number of entries in the loaded models mapping.
    """
    loaded_models = self._models
    return len(loaded_models)
add_model
add_model(
    model_id,
    api_key,
    model_id_alias=None,
    endpoint_type=ModelEndpointType.ORT,
    countinference=None,
    service_secret=None,
)

Adds a new model to the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
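api_key str

The API key used to authorize access to the model and to load it.

required
model_id_alias Optional[str]

Optional alias; when provided, the model is registered under this identifier instead of model_id.

None
endpoint_type ModelEndpointType

The endpoint type to use for the model.

ORT
countinference Optional[bool]

Optional flag forwarded to the access check, the model registry and the model constructor.

None
service_secret Optional[str]

Optional secret forwarded to the access check, the model registry and the model constructor.

None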
Source code in inference/core/managers/base.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def add_model(
    self,
    model_id: str,
    api_key: str,
    model_id_alias: Optional[str] = None,
    endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
    countinference: Optional[bool] = None,
    service_secret: Optional[str] = None,
) -> None:
    """Adds a new model to the manager.

    The model is registered under ``model_id_alias`` when an alias is given,
    otherwise under ``model_id``. If a model is already registered under that
    identifier, the call returns without loading anything.

    Args:
        model_id (str): The identifier of the model; passed to the model
            class constructor.
        api_key (str): The API key. Checked for access to the model when
            ``MODELS_CACHE_AUTH_ENABLED`` is set, and forwarded to the model
            registry and the model constructor.
        model_id_alias (Optional[str], optional): Alternative identifier to
            register the model under. Defaults to None.
        endpoint_type (ModelEndpointType, optional): The endpoint type to use for the model.
        countinference (Optional[bool], optional): Forwarded to the access
            check, the model registry and the model constructor.
        service_secret (Optional[str], optional): Forwarded to the access
            check, the model registry and the model constructor.

    Raises:
        RoboflowAPINotAuthorizedError: If the access check is enabled and
            fails for ``api_key``.
        ModelManagerLockAcquisitionError: If the per-model lock could not be
            acquired.
        Exception: Any error raised while loading the model is recorded and
            re-raised after the per-model lock entry has been disposed.
    """
    if MODELS_CACHE_AUTH_ENABLED:
        if not _check_if_api_key_has_access_to_model(
            api_key=api_key,
            model_id=model_id,
            endpoint_type=endpoint_type,
            countinference=countinference,
            service_secret=service_secret,
        ):
            # NOTE(review): the message below embeds the raw API key; confirm
            # this is acceptable wherever the error is surfaced or logged.
            raise RoboflowAPINotAuthorizedError(
                f"API key {api_key} does not have access to model {model_id}"
            )

    logger.debug(
        f"ModelManager - Adding model with model_id={model_id}, model_id_alias={model_id_alias}"
    )
    # The alias, when present, is the key used for all manager bookkeeping.
    resolved_identifier = model_id if model_id_alias is None else model_id_alias
    self.record_request_metadata(
        model_id=resolved_identifier,
        original_model_id=model_id,
        model_id_alias=model_id_alias,
    )
    # Report the identifier to the collector held in ``request_model_ids``,
    # when one is set.
    ids_collector = request_model_ids.get(None)
    if ids_collector is not None:
        ids_collector.add(resolved_identifier)
    model_lock = self._get_lock_for_a_model(model_id=resolved_identifier)
    with acquire_with_timeout(lock=model_lock) as acquired:
        if not acquired:
            # if failed to acquire - then in use, no need to purge lock
            raise ModelManagerLockAcquisitionError(
                f"Could not acquire lock for model with id={resolved_identifier}."
            )
        if resolved_identifier in self._models:
            logger.debug(
                f"ModelManager - model with model_id={resolved_identifier} is already loaded."
            )
            return
        try:
            with start_span("model.load", {"model.id": resolved_identifier}):
                logger.debug("ModelManager - model initialisation...")
                t_load_start = time.perf_counter()
                # Memory reading taken before construction so the difference
                # can be attributed to this model.
                vram_before = _get_cuda_memory_allocated()
                # The registry is queried with the resolved identifier, while
                # the model itself is constructed with the original model_id.
                model_class = self.model_registry.get_model(
                    resolved_identifier,
                    api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                )

                extra_init_kwargs = {}
                if USE_INFERENCE_MODELS:
                    extra_init_kwargs["torchscript_state_global_lock"] = (
                        self.torchscript_state_global_lock
                    )
                model = model_class(
                    model_id=model_id,
                    api_key=api_key,
                    countinference=countinference,
                    service_secret=service_secret,
                    **extra_init_kwargs,
                )
                vram_after = _get_cuda_memory_allocated()
                # The delta is stored only when both readings are available.
                if vram_before is not None and vram_after is not None:
                    model._vram_bytes = vram_after - vram_before

                # Pass countinference and service_secret to download_model_artifacts_from_roboflow_api if available
                if (
                    hasattr(model, "download_model_artifacts_from_roboflow_api")
                    and INTERNAL_WEIGHTS_URL_SUFFIX == "serverless"
                ):
                    # Only pass these parameters if INTERNAL_WEIGHTS_URL_SUFFIX is "serverless"
                    if (
                        hasattr(model, "cache_model_artefacts")
                        and not model.has_model_metadata
                    ):
                        # Override the download_model_artifacts_from_roboflow_api method with parameters
                        original_method = (
                            model.download_model_artifacts_from_roboflow_api
                        )
                        model.download_model_artifacts_from_roboflow_api = (
                            lambda: original_method(
                                countinference=countinference,
                                service_secret=service_secret,
                            )
                        )

                load_time = time.perf_counter() - t_load_start
                vram_delta = getattr(model, "_vram_bytes", None)
                set_span_attribute("model.load_time_seconds", load_time)
                record_model_loaded(resolved_identifier, load_time)
                logger.info(
                    "Model loaded: model_id=%s, load_time=%.2fs, task_type=%s, vram_bytes=%s",
                    resolved_identifier,
                    load_time,
                    getattr(model, "task_type", "unknown"),
                    vram_delta,
                )
                # Only now does the model become visible to the manager.
                self._models[resolved_identifier] = model
                collector = model_load_info.get(None)
                if collector is not None:
                    collector.record(
                        model_id=resolved_identifier, load_time=load_time
                    )
        except Exception as error:
            record_error(error)
            # A failed load drops the per-model lock entry created above.
            self._dispose_model_lock(model_id=resolved_identifier)
            raise error
check_for_model
check_for_model(model_id)

Checks whether the model with the given ID is in the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Raises:

Type Description
InferenceModelNotFound

If the model is not found in the manager.

Source code in inference/core/managers/base.py
223
224
225
226
227
228
229
230
231
232
233
def check_for_model(self, model_id: str) -> None:
    """Ensure a model with the given identifier is present in the manager.

    Args:
        model_id (str): The identifier of the model.

    Raises:
        InferenceModelNotFound: If ``model_id`` is not contained in the manager.
    """
    if model_id in self:
        return
    raise InferenceModelNotFound(f"Model with id {model_id} not loaded.")
clear
clear()

Removes all models from the manager.

Source code in inference/core/managers/base.py
572
573
574
575
576
def clear(self) -> None:
    """Remove every model from the manager, one ``remove`` call per identifier."""
    # Iterate over a snapshot of the identifiers so that removals performed
    # during the loop cannot disturb the iteration.
    for model_id in tuple(self.keys()):
        self.remove(model_id)
get_class_names
get_class_names(model_id)

Retrieves the class names for a given model.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Type Description

List[str]: The class names of the model.

Source code in inference/core/managers/base.py
510
511
512
513
514
515
516
517
518
519
520
def get_class_names(self, model_id):
    """Return the ``class_names`` attribute of the requested model.

    Args:
        model_id (str): The identifier of the model.

    Returns:
        List[str]: The class names of the model.
    """
    return self._get_model_reference(model_id=model_id).class_names
get_task_type
get_task_type(model_id, api_key=None)

Retrieves the task type for a given model.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Name Type Description
str str

The task type of the model.

Source code in inference/core/managers/base.py
522
523
524
525
526
527
528
529
530
531
532
def get_task_type(self, model_id: str, api_key: str = None) -> str:
    """Return the ``task_type`` attribute of the requested model.

    Args:
        model_id (str): The identifier of the model.
        api_key (str, optional): Accepted but not used by this implementation.

    Returns:
        str: The task type of the model.
    """
    return self._get_model_reference(model_id=model_id).task_type
infer_from_request async
infer_from_request(model_id, request, **kwargs)

Runs inference on the specified model with the given request.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to process.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The response from the inference.

Source code in inference/core/managers/base.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
async def infer_from_request(
    self, model_id: str, request: InferenceRequest, **kwargs
) -> InferenceResponse:
    """Runs inference on the specified model with the given request.

    After a successful inference, and after a failed one, an entry is written
    to the model monitoring cache unless caching is disabled globally
    (``DISABLE_INFERENCE_CACHE``) or for this request
    (``request.disable_model_monitoring``). Failures while writing to the
    cache are logged as warnings and never propagate.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to process.
        **kwargs: Forwarded unchanged to ``model_infer``.

    Returns:
        InferenceResponse: The response from the inference.

    Raises:
        Exception: Any error raised by ``model_infer`` is recorded and
            re-raised.
    """
    logger.debug(
        f"ModelManager - inference from request started for model_id={model_id}."
    )
    # Monitoring is on unless the request explicitly opts out.
    enable_model_monitoring = not getattr(
        request, "disable_model_monitoring", False
    )
    if METRICS_ENABLED and self.pingback and enable_model_monitoring:
        logger.debug("ModelManager - setting pingback fallback api key...")
        self.pingback.fallback_api_key = request.api_key
    with start_span(
        "model.infer",
        {"model.id": model_id, "model.infer.caller": "infer_from_request"},
    ):
        try:
            t_infer_start = time.perf_counter()
            rtn_val = await self.model_infer(
                model_id=model_id, request=request, **kwargs
            )
            record_inference(model_id, time.perf_counter() - t_infer_start)
            logger.debug(
                f"ModelManager - inference from request finished for model_id={model_id}."
            )
            # Wall-clock timestamp used as the score of the cache entries.
            finish_time = time.time()
            if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                with start_span("model.infer.cache"):
                    try:
                        logger.debug(
                            f"ModelManager - caching inference request started for model_id={model_id}"
                        )
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"models",
                            value=f"{GLOBAL_INFERENCE_SERVER_ID}:{request.api_key}:{model_id}",
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                        # A numpy image payload is replaced by its string form
                        # before caching; this mutates the request in place.
                        if (
                            hasattr(request, "image")
                            and hasattr(request.image, "type")
                            and request.image.type == "numpy"
                        ):
                            request.image.value = str(request.image.value)
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"inference:{GLOBAL_INFERENCE_SERVER_ID}:{model_id}",
                            value=to_cachable_inference_item(request, rtn_val),
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                        logger.debug(
                            f"ModelManager - caching inference request finished for model_id={model_id}"
                        )
                    except Exception as cache_error:
                        # Caching is best-effort: the inference result is
                        # still returned to the caller.
                        logger.warning(
                            f"Failed to cache inference data for model {model_id}: {cache_error}"
                        )
            return rtn_val
        except Exception as e:
            record_error(e)
            finish_time = time.time()
            if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                with start_span("model.infer.cache_error"):
                    try:
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"models",
                            value=f"{GLOBAL_INFERENCE_SERVER_ID}:{request.api_key}:{model_id}",
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                        # The cached error entry holds the request without its
                        # "image", "subject" and "prompt" fields.
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"error:{GLOBAL_INFERENCE_SERVER_ID}:{model_id}",
                            value={
                                "request": jsonable_encoder(
                                    request.dict(
                                        exclude={"image", "subject", "prompt"}
                                    )
                                ),
                                "error": str(e),
                            },
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                    except Exception as cache_error:
                        # A cache failure must not mask the original error,
                        # which is re-raised below.
                        logger.warning(
                            f"Failed to cache error data for model {model_id}: {cache_error}"
                        )
            raise
infer_from_request_sync
infer_from_request_sync(model_id, request, **kwargs)

Runs inference on the specified model with the given request.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to process.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The response from the inference.

Source code in inference/core/managers/base.py
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
def infer_from_request_sync(
    self, model_id: str, request: InferenceRequest, **kwargs
) -> InferenceResponse:
    """Runs inference on the specified model with the given request.

    Synchronous variant that delegates to ``model_infer_sync``. After a
    successful inference, and after a failed one, an entry is written to the
    model monitoring cache unless caching is disabled globally
    (``DISABLE_INFERENCE_CACHE``) or for this request
    (``request.disable_model_monitoring``). Failures while writing to the
    cache are logged as warnings and never propagate.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to process.
        **kwargs: Forwarded unchanged to ``model_infer_sync``.

    Returns:
        InferenceResponse: The response from the inference.

    Raises:
        Exception: Any error raised by ``model_infer_sync`` is recorded and
            re-raised.
    """
    logger.debug(
        f"ModelManager - inference from request started for model_id={model_id}."
    )
    # Monitoring is on unless the request explicitly opts out.
    enable_model_monitoring = not getattr(
        request, "disable_model_monitoring", False
    )
    if METRICS_ENABLED and self.pingback and enable_model_monitoring:
        logger.debug("ModelManager - setting pingback fallback api key...")
        self.pingback.fallback_api_key = request.api_key
    with start_span(
        "model.infer",
        {"model.id": model_id, "model.infer.caller": "infer_from_request_sync"},
    ):
        try:
            t_infer_start = time.perf_counter()
            rtn_val = self.model_infer_sync(
                model_id=model_id, request=request, **kwargs
            )
            record_inference(model_id, time.perf_counter() - t_infer_start)
            logger.debug(
                f"ModelManager - inference from request finished for model_id={model_id}."
            )
            # Wall-clock timestamp used as the score of the cache entries.
            finish_time = time.time()
            if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                with start_span("model.infer.cache"):
                    try:
                        logger.debug(
                            f"ModelManager - caching inference request started for model_id={model_id}"
                        )
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"models",
                            value=f"{GLOBAL_INFERENCE_SERVER_ID}:{request.api_key}:{model_id}",
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                        # A numpy image payload is replaced by its string form
                        # before caching; this mutates the request in place.
                        if (
                            hasattr(request, "image")
                            and hasattr(request.image, "type")
                            and request.image.type == "numpy"
                        ):
                            request.image.value = str(request.image.value)
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"inference:{GLOBAL_INFERENCE_SERVER_ID}:{model_id}",
                            value=to_cachable_inference_item(request, rtn_val),
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                        logger.debug(
                            f"ModelManager - caching inference request finished for model_id={model_id}"
                        )
                    except Exception as cache_error:
                        # Caching is best-effort: the inference result is
                        # still returned to the caller.
                        logger.warning(
                            f"Failed to cache inference data for model {model_id}: {cache_error}"
                        )
            return rtn_val
        except Exception as e:
            record_error(e)
            finish_time = time.time()
            if not DISABLE_INFERENCE_CACHE and enable_model_monitoring:
                with start_span("model.infer.cache_error"):
                    try:
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"models",
                            value=f"{GLOBAL_INFERENCE_SERVER_ID}:{request.api_key}:{model_id}",
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                        # The cached error entry holds the request without its
                        # "image", "subject" and "prompt" fields.
                        model_monitoring_cache_module.model_monitoring_cache.zadd(
                            f"error:{GLOBAL_INFERENCE_SERVER_ID}:{model_id}",
                            value={
                                "request": jsonable_encoder(
                                    request.dict(
                                        exclude={"image", "subject", "prompt"}
                                    )
                                ),
                                "error": str(e),
                            },
                            score=finish_time,
                            expire=METRICS_INTERVAL * 2,
                        )
                    except Exception as cache_error:
                        # A cache failure must not mask the original error,
                        # which is re-raised below.
                        logger.warning(
                            f"Failed to cache error data for model {model_id}: {cache_error}"
                        )
            raise
init_pingback
init_pingback()

Initializes pingback mechanism.

Source code in inference/core/managers/base.py
68
69
70
71
72
73
74
def init_pingback(self):
    """Reset the error counter, store the server UUID and start pingback.

    A ``PingbackInfo`` instance is created and started only when
    ``METRICS_ENABLED`` is truthy; otherwise ``self.pingback`` is left
    untouched by this method.
    """
    self.num_errors = 0  # in the device
    self.uuid = ROBOFLOW_SERVER_UUID
    if not METRICS_ENABLED:
        return
    self.pingback = PingbackInfo(self)
    self.pingback.start()
keys
keys()

Retrieve the keys (model identifiers) from the manager.

Returns:

Type Description

List[str]: The keys of the models in the manager.

Source code in inference/core/managers/base.py
616
617
618
619
620
621
622
def keys(self):
    """Retrieve the keys (model identifiers) from the manager.

    Returns:
        The result of calling ``keys()`` on the internal models mapping,
        i.e. an iterable of model identifiers rather than a copied list.
    """
    loaded_models = self._models
    return loaded_models.keys()
make_response
make_response(model_id, predictions, *args, **kwargs)

Creates a response object from the model's predictions.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
predictions List[List[float]]

The model's predictions.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The created response object.

Source code in inference/core/managers/base.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
def make_response(
    self, model_id: str, predictions: List[List[float]], *args, **kwargs
) -> InferenceResponse:
    """Build a response object by delegating to the model's ``make_response``.

    Args:
        model_id (str): The identifier of the model.
        predictions (List[List[float]]): The model's predictions.
        *args: Extra positional arguments forwarded to the model.
        **kwargs: Extra keyword arguments forwarded to the model.

    Returns:
        InferenceResponse: The created response object.
    """
    target_model = self._get_model_reference(model_id=model_id)
    response = target_model.make_response(predictions, *args, **kwargs)
    return response
models
models()

Retrieve the models dictionary from the manager.

Returns:

Type Description
Dict[str, Model]

Dict[str, Model]: The mapping of model identifiers to the models held by the manager.

Source code in inference/core/managers/base.py
624
625
626
627
628
629
630
def models(self) -> Dict[str, Model]:
    """Expose the manager's internal models mapping.

    Returns:
        Dict[str, Model]: The mapping from model identifier to model. The
            very same object stored on the manager is returned, not a copy.
    """
    loaded_models = self._models
    return loaded_models
postprocess
postprocess(
    model_id,
    predictions,
    preprocess_return_metadata,
    *args,
    **kwargs
)

Processes the model's predictions after inference.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
predictions ndarray

The model's predictions.

required

Returns:

Type Description
List[List[float]]

List[List[float]]: The post-processed predictions.

Source code in inference/core/managers/base.py
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
def postprocess(
    self,
    model_id: str,
    predictions: Tuple[np.ndarray, ...],
    preprocess_return_metadata: PreprocessReturnMetadata,
    *args,
    **kwargs,
) -> List[List[float]]:
    """Post-process raw predictions by delegating to the model's ``postprocess``.

    Args:
        model_id (str): The identifier of the model.
        predictions (Tuple[np.ndarray, ...]): The model's predictions.
        preprocess_return_metadata (PreprocessReturnMetadata): Metadata passed
            through to the model together with the predictions.
        *args: Extra positional arguments forwarded to the model.
        **kwargs: Extra keyword arguments forwarded to the model.

    Returns:
        List[List[float]]: The post-processed predictions.
    """
    target_model = self._get_model_reference(model_id=model_id)
    processed = target_model.postprocess(
        predictions, preprocess_return_metadata, *args, **kwargs
    )
    return processed
predict
predict(model_id, *args, **kwargs)

Runs prediction on the specified model.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Type Description
Tuple[ndarray, ...]

np.ndarray: The predictions from the model.

Source code in inference/core/managers/base.py
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
def predict(self, model_id: str, *args, **kwargs) -> Tuple[np.ndarray, ...]:
    """Run the model's ``predict`` and update its usage metrics.

    The ``"num_inferences"`` metric is incremented before the prediction
    runs. The elapsed time of the call is then added to the
    ``"avg_inference_time"`` metric (the value is accumulated, not averaged,
    by this method).

    Args:
        model_id (str): The identifier of the model.
        *args: Positional arguments forwarded to the model.
        **kwargs: Keyword arguments forwarded to the model.

    Returns:
        Tuple[np.ndarray, ...]: The predictions from the model.
    """
    target_model = self._get_model_reference(model_id=model_id)
    target_model.metrics["num_inferences"] += 1
    started_at = time.perf_counter()
    predictions = target_model.predict(*args, **kwargs)
    elapsed = time.perf_counter() - started_at
    target_model.metrics["avg_inference_time"] += elapsed
    return predictions
preprocess
preprocess(model_id, request)

Preprocesses the request before inference.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to preprocess.

required

Returns:

Type Description
Tuple[ndarray, PreprocessReturnMetadata]

Tuple[np.ndarray, List[Tuple[int, int]]]: The preprocessed data.

Source code in inference/core/managers/base.py
495
496
497
498
499
500
501
502
503
504
505
506
507
508
def preprocess(
    self, model_id: str, request: InferenceRequest
) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
    """Pre-process a request by delegating to the model's ``preprocess``.

    The request is converted with ``request.dict()`` and its fields are
    passed to the model as keyword arguments.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to preprocess.

    Returns:
        Tuple[np.ndarray, PreprocessReturnMetadata]: The preprocessed data.
    """
    target_model = self._get_model_reference(model_id=model_id)
    request_fields = request.dict()
    return target_model.preprocess(**request_fields)
record_request_metadata
record_request_metadata(
    model_id, original_model_id=None, model_id_alias=None
)

Record request path and aliases for an already-loaded model.

Decorators call this when they short-circuit add_model() for warm models so registry metadata stays in sync with the base manager path.

Source code in inference/core/managers/base.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def record_request_metadata(
    self,
    model_id: str,
    original_model_id: Optional[str] = None,
    model_id_alias: Optional[str] = None,
) -> None:
    """Track the aliases and request path under which a model was requested.

    Decorators call this when they short-circuit ``add_model()`` for warm
    models so registry metadata stays in sync with the base manager path.

    Args:
        model_id (str): The identifier the metadata is stored under.
        original_model_id (Optional[str]): Recorded as an alias when it is
            given and differs from ``model_id``.
        model_id_alias (Optional[str]): Recorded as an alias when it is given
            and differs from ``model_id``.
    """
    # An alias entry always exists after this call, even when it stays empty.
    aliases = self._model_request_aliases.setdefault(model_id, set())
    for candidate in (original_model_id, model_id_alias):
        if candidate is not None and candidate != model_id:
            aliases.add(candidate)
    # A path entry is created only when a request path is available.
    req_path = current_request_path.get(None)
    if req_path:
        self._model_request_paths.setdefault(model_id, set()).add(req_path)
remove
remove(model_id, delete_from_disk=True)

Removes a model from the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
Source code in inference/core/managers/base.py
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
def remove(self, model_id: str, delete_from_disk: bool = True) -> None:
    """Removes a model from the manager.

    Unknown identifiers are ignored. For a loaded model, its cache is
    cleared, the model and its request metadata are dropped from the manager,
    the per-model lock entry is disposed and a CUDA memory release is
    attempted.

    Args:
        model_id (str): The identifier of the model.
        delete_from_disk (bool): Forwarded to the model's ``clear_cache``.
            Defaults to True.

    Raises:
        ModelManagerLockAcquisitionError: If the per-model lock could not be
            acquired.
    """
    try:
        logger.debug(f"Removing model {model_id} from base model manager")
        model_lock = self._get_lock_for_a_model(model_id=model_id)
        with acquire_with_timeout(lock=model_lock) as acquired:
            if not acquired:
                raise ModelManagerLockAcquisitionError(
                    f"Could not acquire lock for model with id={model_id}."
                )
            # NOTE(review): on this early return the lock entry obtained
            # above is not disposed - confirm that is intended.
            if model_id not in self._models:
                return None
            model = self._models[model_id]
            # Read the values used in the log message before the model is
            # dropped from the manager.
            vram_bytes = getattr(model, "_vram_bytes", None)
            task_type = getattr(model, "task_type", "unknown")
            model.clear_cache(delete_from_disk=delete_from_disk)
            del self._models[model_id]
            logger.info(
                "Model unloaded: model_id=%s, task_type=%s, vram_bytes=%s, remaining_models=%d",
                model_id,
                task_type,
                vram_bytes,
                len(self._models),
            )
            record_model_unloaded(model_id)
            self._model_request_aliases.pop(model_id, None)
            self._model_request_paths.pop(model_id, None)
            self._dispose_model_lock(model_id=model_id)
            try_releasing_cuda_memory()
    except InferenceModelNotFound:
        # NOTE(review): nothing in the visible body raises this directly;
        # presumably it can come from one of the helpers called above -
        # confirm.
        logger.warning(
            f"Attempted to remove model with id {model_id}, but it is not loaded. Skipping..."
        )

Functions:

inference.core.managers.metrics

Functions:

get_container_stats

get_container_stats(docker_socket_path)

Gets the container stats. Returns: dict: A dictionary containing the container stats.

Source code in inference/core/managers/metrics.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def get_container_stats(docker_socket_path: str) -> dict:
    """
    Gets the container stats.

    Sends ``GET /containers/<id>/stats?stream=false`` over the Unix socket at
    ``docker_socket_path``, using the host name as the container identifier.

    Args:
        docker_socket_path (str): Path of the Unix socket to connect to.

    Returns:
        dict: A dictionary containing the container stats under the
            ``"stats"`` key.

    Raises:
        Exception: If the request fails for any reason (connection error,
            non-200 status, invalid JSON). The original error is logged and
            attached as the cause.
    """

    try:
        container_id = socket.gethostname()
        connection = http.client.HTTPConnection("localhost")
        try:
            connection.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            connection.sock.connect(docker_socket_path)
            connection.request(
                "GET",
                f"/containers/{container_id}/stats?stream=false",
                headers={"Host": "localhost"},
            )
            response = connection.getresponse()
            data = response.read()
        finally:
            # Close the socket on every path, so a failed connect/request/read
            # does not leak the file descriptor.
            connection.close()
        if response.status != 200:
            raise Exception(data.decode())
        stats = json.loads(data.decode())
        return {"stats": stats}
    except Exception as e:
        logger.exception(e)
        raise Exception("An error occurred while fetching container stats.") from e

get_model_metrics

get_model_metrics(
    inference_server_id, model_id, min=-1, max=float("inf")
)

Gets the metrics for a given model between a specified time range.

Parameters:

Name Type Description Default
inference_server_id str

The identifier of the inference server.

required
model_id str

The identifier of the model.

required
min float

The starting timestamp of the time range. Defaults to -1.

-1
max float

The ending timestamp of the time range. Defaults to float("inf").

float("inf")

Returns:

Name Type Description
dict dict

A dictionary containing the metrics of the model: - num_inferences (int): The number of inferences made. - avg_inference_time (float): The average inference time. - num_errors (int): The number of errors occurred.

Source code in inference/core/managers/metrics.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def get_model_metrics(
    inference_server_id: str, model_id: str, min: float = -1, max: float = float("inf")
) -> dict:
    """
    Gets the metrics for a given model between a specified time range.

    Reads the ``inference:<server>:<model>`` and ``error:<server>:<model>`` entries
    from the model monitoring cache whose score lies between ``min`` and ``max``.

    Args:
        inference_server_id (str): The identifier of the inference server.
        model_id (str): The identifier of the model.
        min (float, optional): The starting timestamp of the time range. Defaults to -1.
        max (float, optional): The ending timestamp of the time range. Defaults to float("inf").

    Returns:
        dict: A dictionary containing the metrics of the model:
              - num_inferences (int): The number of inferences made.
              - avg_inference_time (float): The average inference time, or 0 when no
                cached response carries a "time" value.
              - num_errors (int): The number of errors that occurred.
    """
    cache = model_monitoring_cache_module.model_monitoring_cache
    inferences_with_times = cache.zrangebyscore(
        f"inference:{inference_server_id}:{model_id}",
        min=min,
        max=max,
        withscores=True,
    )
    num_inferences = len(inferences_with_times)
    inference_times = []
    for inference, _score in inferences_with_times:
        response = inference["response"]
        if isinstance(response, list):
            # Batched request: one response entry per item, each may carry its own time.
            inference_times.extend(r["time"] for r in response if "time" in r)
        elif "time" in response:
            inference_times.append(response["time"])
    avg_inference_time = (
        sum(inference_times) / len(inference_times) if inference_times else 0
    )
    errors_with_times = cache.zrangebyscore(
        f"error:{inference_server_id}:{model_id}",
        min=min,
        max=max,
        withscores=True,
    )
    num_errors = len(errors_with_times)
    return {
        "num_inferences": num_inferences,
        "avg_inference_time": avg_inference_time,
        "num_errors": num_errors,
    }

get_system_info

get_system_info()

Collects system information such as platform, architecture, hostname, IP address, MAC address, and processor details.

Returns:

Name Type Description
dict dict

A dictionary containing detailed system information.

Source code in inference/core/managers/metrics.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def get_system_info() -> dict:
    """Collects system information such as platform, architecture, hostname, IP address, MAC address, and processor details.

    Collection is best-effort: if any lookup raises, the error is logged and the
    entries gathered up to that point are returned.

    Returns:
        dict: A dictionary containing detailed system information.
    """
    info = {}
    try:
        info["platform"] = platform.system()
        info["platform_release"] = platform.release()
        info["platform_version"] = platform.version()
        info["architecture"] = platform.machine()
        info["hostname"] = socket.gethostname()
        info["ip_address"] = socket.gethostbyname(socket.gethostname())
        # Format the 48-bit node id as colon-separated hex pairs.
        info["mac_address"] = ":".join(re.findall("..", "%012x" % uuid.getnode()))
        info["processor"] = platform.processor()
    except Exception as e:
        logger.exception(e)
    # Returning after the try (not from a ``finally``) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently discarded.
    return info

inference.core.managers.model_load_collector

Classes

ModelLoadCollector

Thread-safe collector for model cold start events during a request.

A single instance is shared across all threads handling a single request. Each entry stores a model_id alongside the load time.

Mirrors the design of RemoteProcessingTimeCollector from inference_sdk.

Source code in inference/core/managers/model_load_collector.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
class ModelLoadCollector:
    """Lock-guarded accumulator of model cold start timings for one request.

    One instance is meant to be shared by every thread serving the same request.
    Each stored entry pairs a model_id with the time it took to load.

    Mirrors the design of RemoteProcessingTimeCollector from inference_sdk.
    """

    def __init__(self):
        self._lock = threading.Lock()
        # Holds (model_id, load_time) tuples in insertion order.
        self._entries: list = []

    def record(self, model_id: str, load_time: float) -> None:
        """Store one (model_id, load_time) entry."""
        entry = (model_id, load_time)
        with self._lock:
            self._entries.append(entry)

    def has_data(self) -> bool:
        """Return True once at least one entry has been recorded."""
        with self._lock:
            return bool(self._entries)

    def snapshot_entries(self) -> list:
        """Return a copy of the recorded entries taken under the lock."""
        with self._lock:
            return self._entries.copy()

    def summarize(self, max_detail_bytes: int = 4096) -> Tuple[float, Optional[str]]:
        """Return (total_load_time, entries_json_or_none).

        The first element is the sum of all recorded load times. The second is a
        JSON array of ``{"m": model_id, "t": load_time}`` objects, or None when
        that string is longer than *max_detail_bytes*.
        """
        recorded = self.snapshot_entries()
        total = sum(load_time for _, load_time in recorded)
        payload = [
            {"m": model_id, "t": load_time} for model_id, load_time in recorded
        ]
        detail = json.dumps(payload)
        if len(detail) > max_detail_bytes:
            return total, None
        return total, detail
Methods:
summarize
summarize(max_detail_bytes=4096)

Return (total_load_time, entries_json_or_none).

Returns the total model load time and a JSON string of individual entries. If the JSON exceeds max_detail_bytes, the detail string is omitted (None).

Source code in inference/core/managers/model_load_collector.py
32
33
34
35
36
37
38
39
40
41
42
43
44
def summarize(self, max_detail_bytes: int = 4096) -> Tuple[float, Optional[str]]:
    """Return (total_load_time, entries_json_or_none).

    The first element is the sum of all recorded load times. The second is a
    JSON array of ``{"m": model_id, "t": load_time}`` objects, or None when
    that string is longer than *max_detail_bytes*.
    """
    recorded = self.snapshot_entries()
    total = sum(load_time for _, load_time in recorded)
    payload = [{"m": model_id, "t": load_time} for model_id, load_time in recorded]
    detail = json.dumps(payload)
    if len(detail) > max_detail_bytes:
        return total, None
    return total, detail

RequestModelIds

Thread-safe set of model IDs used during a request.

Source code in inference/core/managers/model_load_collector.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
class RequestModelIds:
    """Lock-guarded set of the model IDs used during a request."""

    def __init__(self):
        self._lock = threading.Lock()
        self._ids: set = set()

    def add(self, model_id: str) -> None:
        """Remember ``model_id``; adding an ID that is already present changes nothing."""
        with self._lock:
            self._ids.add(model_id)

    def get_ids(self) -> set:
        """Return a copy of the IDs recorded so far."""
        with self._lock:
            return self._ids.copy()

inference.core.managers.pingback

Classes

PingbackInfo

Class responsible for managing pingback information for Roboflow.

This class initializes a scheduler to periodically post data to Roboflow, containing information about the models, container, and device.

Attributes:

Name Type Description
scheduler BackgroundScheduler

A scheduler for running jobs in the background.

model_manager ModelManager

Reference to the model manager object.

process_startup_time str

Unix timestamp indicating when the process started.

METRICS_URL str

URL to send the pingback data to.

system_info dict

Information about the system.

window_start_timestamp str

Unix timestamp indicating the start of the current window.

Source code in inference/core/managers/pingback.py
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
class PingbackInfo:
    """Class responsible for managing pingback information for Roboflow.

    This class initializes a scheduler to periodically post data to Roboflow, containing information about the models,
    container, and device.

    Attributes:
        scheduler (BackgroundScheduler): A scheduler for running jobs in the background.
        model_manager (ModelManager): Reference to the model manager object.
        process_startup_time (str): Unix timestamp indicating when the process started.
        window_start_timestamp (str): Unix timestamp indicating the start of the current window.
        environment_info (dict): Static request context (API key, device and server IDs, version, tags)
            merged with the output of `get_system_info()`.
        fallback_api_key (Optional[str]): API key used by `post_data` when `environment_info` has none.
    """

    def __init__(self, manager):
        """Initializes PingbackInfo with the given manager.

        Any exception raised during initialization is logged at debug level and
        suppressed, so attributes assigned after the failing statement stay unset.

        Args:
            manager (ModelManager): Reference to the model manager object.
        """
        try:
            self.scheduler = BackgroundScheduler(
                job_defaults={"coalesce": True, "max_instances": 1}
            )
            self.model_manager = manager
            self.process_startup_time = str(int(time.time()))
            logger.debug(
                "UUID: " + self.model_manager.uuid
            )  # To correlate with UI container view
            self.window_start_timestamp = str(int(time.time()))
            context = {
                "api_key": API_KEY,
                "timestamp": str(int(time.time())),
                "device_id": GLOBAL_DEVICE_ID,
                "inference_server_id": GLOBAL_INFERENCE_SERVER_ID,
                "inference_server_version": __version__,
                "tags": TAGS,
            }
            self.environment_info = context | get_system_info()

            # we will set this from model manager when a new api key is used
            # to use in case there is no global ENV api key configured
            self.fallback_api_key = None

        except Exception as e:
            logger.debug(
                "Error sending pingback to Roboflow, if you want to disable this feature unset the ROBOFLOW_ENABLED environment variable. "
                + str(e)
            )

    def start(self):
        """Starts the scheduler to periodically post data to Roboflow.

        If METRICS_ENABLED is False, a warning is logged, and the method returns without starting the scheduler.
        Errors raised while scheduling are logged at debug level and suppressed.
        """
        if METRICS_ENABLED == False:
            logger.warning(
                "Metrics reporting to Roboflow is disabled; not sending back stats to Roboflow."
            )
            return
        try:
            self.scheduler.add_job(
                self.post_data,
                "interval",
                seconds=METRICS_INTERVAL,
                args=[self.model_manager],
                replace_existing=True,
            )
            self.scheduler.start()
        except Exception as e:
            logger.debug(e)

    def stop(self):
        """Stops the scheduler if it is running.

        Does nothing when the scheduler was never created or never started (for
        example because metrics are disabled), since shutting down a scheduler
        that is not running raises.
        """
        scheduler = getattr(self, "scheduler", None)
        if scheduler is not None and scheduler.running:
            scheduler.shutdown()

    def post_data(self, model_manager):
        """Posts data to Roboflow about the models, container, device, and other relevant metrics.

        Inference results from the last METRICS_INTERVAL seconds are gathered for every model
        reported by ``model_manager.models()``, added to a copy of ``environment_info`` and sent
        in a POST request to the pingback URL. Failures inside the collection/POST step are
        logged rather than raised. The API key is masked in every log message.

        Args:
            model_manager (ModelManager): Reference to the model manager object.
        """
        all_data = self.environment_info.copy()
        all_data["inference_results"] = []

        # use fallback api key if env didn't have one
        if self.fallback_api_key and not all_data.get("api_key"):
            all_data["api_key"] = self.fallback_api_key

        def loggable_data() -> dict:
            # The payload carries the API key; mask it so it never reaches the logs.
            return {
                key: "***" if key == "api_key" and value else value
                for key, value in all_data.items()
            }

        try:
            now = time.time()
            start = now - METRICS_INTERVAL
            for model_id in model_manager.models():
                results = get_inference_results_for_model(
                    GLOBAL_INFERENCE_SERVER_ID, model_id, min=start, max=now
                )
                # Grow the list in place instead of rebuilding it for every model.
                all_data["inference_results"].extend(results)
            res = requests.post(
                wrap_url(METRICS_URL),
                json=all_data,
                timeout=10,
                verify=ROBOFLOW_API_VERIFY_SSL,
            )
            try:
                api_key_safe_raise_for_status(response=res)
                logger.debug(
                    "Sent metrics to Roboflow {} at {}.".format(
                        METRICS_URL, str(loggable_data())
                    )
                )
            except Exception:
                logger.debug(
                    "Error sending metrics to Roboflow, if you want to disable this feature unset the METRICS_ENABLED environment variable."
                )

        except Exception as e:
            try:
                logger.exception(
                    f"Error sending metrics to Roboflow, if you want to disable this feature unset the METRICS_ENABLED environment variable. Error was: {e}. Data was: {loggable_data()}"
                )

            except Exception:
                # Formatting the payload itself failed; log without it.
                logger.debug(
                    f"Error sending metrics to Roboflow, if you want to disable this feature unset the METRICS_ENABLED environment variable. Error was: {e}."
                )
Methods:
__init__
__init__(manager)

Initializes PingbackInfo with the given manager.

Parameters:

Name Type Description Default
manager ModelManager

Reference to the model manager object.

required
Source code in inference/core/managers/pingback.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def __init__(self, manager):
    """Set up the scheduler and the static pingback context for ``manager``.

    Any exception raised while doing so is logged at debug level and suppressed.

    Args:
        manager (ModelManager): Reference to the model manager object.
    """
    try:
        scheduler_defaults = {"coalesce": True, "max_instances": 1}
        self.scheduler = BackgroundScheduler(job_defaults=scheduler_defaults)
        self.model_manager = manager
        self.process_startup_time = str(int(time.time()))
        # To correlate with UI container view
        uuid_message = "UUID: " + self.model_manager.uuid
        logger.debug(uuid_message)
        self.window_start_timestamp = str(int(time.time()))
        context = dict(
            api_key=API_KEY,
            timestamp=str(int(time.time())),
            device_id=GLOBAL_DEVICE_ID,
            inference_server_id=GLOBAL_INFERENCE_SERVER_ID,
            inference_server_version=__version__,
            tags=TAGS,
        )
        # System info entries win over context entries on key collisions.
        self.environment_info = {**context, **get_system_info()}

        # Set later from the model manager when a new api key is used, for the
        # case where no global ENV api key is configured.
        self.fallback_api_key = None

    except Exception as init_error:
        failure_message = (
            "Error sending pingback to Roboflow, if you want to disable this feature unset the ROBOFLOW_ENABLED environment variable. "
            + str(init_error)
        )
        logger.debug(failure_message)
post_data
post_data(model_manager)

Posts data to Roboflow about the models, container, device, and other relevant metrics.

Parameters:

Name Type Description Default
model_manager ModelManager

Reference to the model manager object.

required

The data is collected and reset for the next window, and a POST request is made to the pingback URL.

Source code in inference/core/managers/pingback.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def post_data(self, model_manager):
    """Posts data to Roboflow about the models, container, device, and other relevant metrics.

    Inference results from the last METRICS_INTERVAL seconds are gathered for every model
    reported by ``model_manager.models()``, added to a copy of ``environment_info`` and sent
    in a POST request to the pingback URL. Failures inside the collection/POST step are
    logged rather than raised. The API key is masked in every log message.

    Args:
        model_manager (ModelManager): Reference to the model manager object.
    """
    all_data = self.environment_info.copy()
    all_data["inference_results"] = []

    # use fallback api key if env didn't have one
    if self.fallback_api_key and not all_data.get("api_key"):
        all_data["api_key"] = self.fallback_api_key

    def loggable_data() -> dict:
        # The payload carries the API key; mask it so it never reaches the logs.
        return {
            key: "***" if key == "api_key" and value else value
            for key, value in all_data.items()
        }

    try:
        now = time.time()
        start = now - METRICS_INTERVAL
        for model_id in model_manager.models():
            results = get_inference_results_for_model(
                GLOBAL_INFERENCE_SERVER_ID, model_id, min=start, max=now
            )
            # Grow the list in place instead of rebuilding it for every model.
            all_data["inference_results"].extend(results)
        res = requests.post(
            wrap_url(METRICS_URL),
            json=all_data,
            timeout=10,
            verify=ROBOFLOW_API_VERIFY_SSL,
        )
        try:
            api_key_safe_raise_for_status(response=res)
            logger.debug(
                "Sent metrics to Roboflow {} at {}.".format(
                    METRICS_URL, str(loggable_data())
                )
            )
        except Exception:
            logger.debug(
                "Error sending metrics to Roboflow, if you want to disable this feature unset the METRICS_ENABLED environment variable."
            )

    except Exception as e:
        try:
            logger.exception(
                f"Error sending metrics to Roboflow, if you want to disable this feature unset the METRICS_ENABLED environment variable. Error was: {e}. Data was: {loggable_data()}"
            )

        except Exception:
            # Formatting the payload itself failed; log without it.
            logger.debug(
                f"Error sending metrics to Roboflow, if you want to disable this feature unset the METRICS_ENABLED environment variable. Error was: {e}."
            )
start
start()

Starts the scheduler to periodically post data to Roboflow.

If METRICS_ENABLED is False, a warning is logged, and the method returns without starting the scheduler.

Source code in inference/core/managers/pingback.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def start(self):
    """Schedule `post_data` to run every METRICS_INTERVAL seconds and start the scheduler.

    When METRICS_ENABLED is False only a warning is logged and nothing is scheduled.
    Errors raised while scheduling are logged at debug level and suppressed.
    """
    metrics_disabled = METRICS_ENABLED == False
    if not metrics_disabled:
        try:
            self.scheduler.add_job(
                self.post_data,
                "interval",
                seconds=METRICS_INTERVAL,
                args=[self.model_manager],
                replace_existing=True,
            )
            self.scheduler.start()
        except Exception as scheduling_error:
            logger.debug(scheduling_error)
        return
    logger.warning(
        "Metrics reporting to Roboflow is disabled; not sending back stats to Roboflow."
    )
stop
stop()

Stops the scheduler.

Source code in inference/core/managers/pingback.py
 99
100
101
def stop(self):
    """Stops the scheduler if it is running.

    Does nothing when the scheduler was never created or never started (for
    example because metrics are disabled), since shutting down a scheduler
    that is not running raises.
    """
    scheduler = getattr(self, "scheduler", None)
    if scheduler is not None and scheduler.running:
        scheduler.shutdown()

Functions:

inference.core.managers.prometheus

Classes

CustomCollector

Bases: Collector

Source code in inference/core/managers/prometheus.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
class CustomCollector(Collector):
    """Prometheus collector that reports per-model and per-pipeline gauges.

    Model metrics come from `get_model_metrics` for the last ``time_window``
    seconds. Pipeline metrics are fetched through ``stream_manager_client``
    when one has been assigned.
    """

    def __init__(self, model_manager, time_window: int = 10):
        """Store the model manager and the look-back window (in seconds)."""
        super().__init__()
        self.model_manager = model_manager
        self.time_window = time_window
        # Assigned later by the owner; stream metrics are skipped while this is None.
        self.stream_manager_client = None

    def get_metrics(self, maxModels: int = 25):
        """Return ``{model_id: metrics}`` for at most ``maxModels`` loaded models.

        Models whose metrics cannot be fetched are left out of the result but
        still count towards ``maxModels``.
        """
        now = time.time()
        start = now - self.time_window
        count = 0
        results = {}
        if self.model_manager is None:
            logger.warning(
                "This inference server type does not support custom Prometheus metrics, skipping."
            )
            return results
        for model_id in self.model_manager.models():
            if count >= maxModels:
                break
            try:
                results[model_id] = get_model_metrics(
                    GLOBAL_INFERENCE_SERVER_ID, model_id, min=start, max=now
                )
            except Exception as e:
                logger.debug(
                    "Error getting metrics for model " + model_id + ": " + str(e)
                )
            count += 1
        return results

    async def _fetch_stream_metrics(self) -> Dict[str, dict]:
        """Query the stream manager for every pipeline and summarize its status report."""
        # Pipeline status is fetched via TCP IPC to the stream manager process.
        # Pipelines run in separate subprocesses, so socket-based IPC is required.
        pipelines_response = await self.stream_manager_client.list_pipelines()
        pipeline_ids = pipelines_response.pipelines
        metrics = {}
        for pipeline_id in pipeline_ids:
            status_response = await self.stream_manager_client.get_status(pipeline_id)
            report = status_response.report
            latency_reports = report.get("latency_reports", [])
            sources_metadata = report.get("sources_metadata", [])
            camera_fps = self._average_source_fps(sources_metadata)
            source_label = self._extract_source_label(sources_metadata)
            metrics[pipeline_id] = {
                "inference_throughput": report.get("inference_throughput", 0.0),
                "camera_fps": camera_fps,
                "frame_decoding_latency": self._average_latency_field(
                    latency_reports, "frame_decoding_latency"
                ),
                "inference_latency": self._average_latency_field(
                    latency_reports, "inference_latency"
                ),
                "e2e_latency": self._average_latency_field(
                    latency_reports, "e2e_latency"
                ),
                "source": source_label,
            }
        return metrics

    def get_stream_metrics(self) -> Dict[str, dict]:
        """Return per-pipeline metrics, or ``{}`` when unavailable.

        Returns an empty dict when no stream manager client is set or when
        fetching fails (the failure is logged at debug level).
        """
        if self.stream_manager_client is None:
            return {}
        try:
            # asyncio.run() cannot be used from a thread that already runs an
            # event loop. Detect that up front so the coroutine is created exactly
            # once and a RuntimeError raised by the fetch itself is not retried.
            try:
                asyncio.get_running_loop()
                loop_running = True
            except RuntimeError:
                loop_running = False
            if not loop_running:
                return asyncio.run(self._fetch_stream_metrics())
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                return pool.submit(
                    asyncio.run, self._fetch_stream_metrics()
                ).result()
        except Exception:
            logger.debug("Failed to fetch stream metrics", exc_info=True)
            return {}

    @staticmethod
    def _average_latency_field(latency_reports: List[dict], field: str) -> float:
        """Average ``field`` over the reports that have a non-None value; 0.0 if none do."""
        values = [r[field] for r in latency_reports if r.get(field) is not None]
        if not values:
            return 0.0
        return sum(values) / len(values)

    @staticmethod
    def _average_source_fps(sources_metadata: List[dict]) -> float:
        """Average the positive ``source_properties.fps`` values; 0.0 if there are none."""
        values = []
        for src in sources_metadata:
            props = src.get("source_properties") or {}
            fps = props.get("fps")
            if fps is not None and fps > 0:
                values.append(fps)
        if not values:
            return 0.0
        return sum(values) / len(values)

    @staticmethod
    def _sanitize_source_reference(ref: str) -> str:
        """Strip credentials and query parameters from URLs to avoid leaking
        secrets in metrics.

        References without both a scheme and a hostname are returned unchanged.
        """
        parsed = urlparse(ref)
        host = parsed.hostname
        if parsed.scheme and host:
            if ":" in host:
                # ``hostname`` drops the brackets of IPv6 literals; restore them
                # so the rebuilt netloc stays a valid URL authority.
                host = f"[{host}]"
            try:
                port = parsed.port
            except ValueError:
                # Malformed or out-of-range port: omit it rather than fail.
                port = None
            netloc = host + (f":{port}" if port else "")
            sanitized = parsed._replace(netloc=netloc, query="", fragment="")
            return urlunparse(sanitized)
        return ref

    @staticmethod
    def _extract_source_label(sources_metadata: List[dict]) -> str:
        """Join the sanitized source references with commas.

        Returns "" when METRICS_INCLUDE_SOURCE_LABELS is falsy or no source has
        a reference.
        """
        if not METRICS_INCLUDE_SOURCE_LABELS:
            return ""
        refs = []
        for src in sources_metadata:
            ref = src.get("source_reference")
            if ref is not None:
                refs.append(CustomCollector._sanitize_source_reference(str(ref)))
        return ",".join(refs) if refs else ""

    def sanitize_string(self, input_string):
        """Replace every character other than ASCII letters, digits and "_" with "_"."""
        sanitized_string = re.sub(r"[^a-zA-Z0-9_]", "_", input_string)
        return sanitized_string

    def collect(self):
        """Yield the gauge families for models and inference pipelines."""
        results = self.get_metrics()
        num_inferences_total = 0
        num_errors_total = 0
        # NOTE(review): this accumulates the per-model averages, so the exported
        # "avg_inference_time_total" is their sum, not a mean - confirm intent.
        avg_inference_time_total = 0
        for model_id, metrics in results.items():
            sane_model_id = self.sanitize_string(model_id)
            yield GaugeMetricFamily(
                f"num_inferences_{sane_model_id}",
                f"Number of inferences made in {self.time_window}s",
                value=metrics["num_inferences"],
            )
            yield GaugeMetricFamily(
                f"avg_inference_time_{sane_model_id}",
                f"Average inference time (over inferences completed in {self.time_window}s) to infer this model",
                value=metrics["avg_inference_time"],
            )
            yield GaugeMetricFamily(
                f"num_errors_{sane_model_id}",
                f"Number of errors in {self.time_window}s",
                value=metrics["num_errors"],
            )
            num_inferences_total += metrics["num_inferences"]
            num_errors_total += metrics["num_errors"]
            avg_inference_time_total += metrics["avg_inference_time"]
        yield GaugeMetricFamily(
            "num_inferences_total",
            f"Total number of inferences made in {self.time_window}s",
            value=num_inferences_total,
        )
        yield GaugeMetricFamily(
            "avg_inference_time_total",
            f"Average inference time (over inferences completed in {self.time_window}s) to infer all models.",
            value=avg_inference_time_total,
        )
        yield GaugeMetricFamily(
            "num_errors_total",
            f"Total number of errors in {self.time_window}s",
            value=num_errors_total,
        )

        # Pipeline gauges: one labelled sample per pipeline in each family.
        stream_metrics = self.get_stream_metrics()
        pipeline_labels = ["pipeline_id", "source"]
        inference_fps = GaugeMetricFamily(
            "inference_pipeline_inference_fps",
            "Inference throughput FPS",
            labels=pipeline_labels,
        )
        camera_fps = GaugeMetricFamily(
            "inference_pipeline_camera_fps",
            "Camera source FPS",
            labels=pipeline_labels,
        )
        frame_decoding_latency = GaugeMetricFamily(
            "inference_pipeline_frame_decoding_latency",
            "Average frame decoding latency (seconds)",
            labels=pipeline_labels,
        )
        inference_latency = GaugeMetricFamily(
            "inference_pipeline_inference_latency",
            "Average inference latency (seconds)",
            labels=pipeline_labels,
        )
        e2e_latency = GaugeMetricFamily(
            "inference_pipeline_e2e_latency",
            "Average end-to-end latency (seconds)",
            labels=pipeline_labels,
        )
        for pipeline_id, pm in stream_metrics.items():
            label_values = [pipeline_id, pm["source"]]
            inference_fps.add_metric(label_values, pm["inference_throughput"])
            camera_fps.add_metric(label_values, pm["camera_fps"])
            frame_decoding_latency.add_metric(
                label_values, pm["frame_decoding_latency"]
            )
            inference_latency.add_metric(label_values, pm["inference_latency"])
            e2e_latency.add_metric(label_values, pm["e2e_latency"])
        yield inference_fps
        yield camera_fps
        yield frame_decoding_latency
        yield inference_latency
        yield e2e_latency
        yield GaugeMetricFamily(
            "inference_pipeline_active_streams",
            "Number of active inference pipelines",
            value=len(stream_metrics),
        )

InferenceInstrumentator

Class responsible for managing the Prometheus metrics for the inference server.

This class initializes the Prometheus Instrumentator and exposes the metrics endpoint.

Source code in inference/core/managers/prometheus.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
class InferenceInstrumentator:
    """
    Wires Prometheus metrics into the inference server.

    On construction it instruments the given app, exposes the metrics endpoint
    and registers a `CustomCollector` with `REGISTRY`.

    """

    def __init__(self, app, model_manager, endpoint: str = "/metrics"):
        self.instrumentator = Instrumentator()
        instrumented = self.instrumentator.instrument(app)
        instrumented.expose(app, endpoint)
        self.collector = CustomCollector(model_manager)
        REGISTRY.register(self.collector)

    def set_stream_manager_client(self, stream_manager_client) -> None:
        """Hand the stream manager client to the collector so it can fetch stream metrics."""
        collector = self.collector
        collector.stream_manager_client = stream_manager_client

Functions:

core/managers/decorators

inference.core.managers.decorators.base

Classes

ModelManagerDecorator

Bases: ModelManager

Basic decorator, it acts like a ModelManager and contains a ModelManager.

Parameters:

Name Type Description Default
model_manager ModelManager

Instance of a ModelManager.

required

Methods:

Name Description
add_model

Adds a model to the manager.

infer

Processes a complete inference request.

infer_only

Performs only the inference part of a request.

preprocess

Processes the preprocessing part of a request.

get_task_type

Gets the task type associated with a model.

get_class_names

Gets the class names for a given model.

remove

Removes a model from the manager.

__len__

Returns the number of models in the manager.

__getitem__

Retrieves a model by its ID.

__contains__

Checks if a model exists in the manager.

keys

Returns the keys (model IDs) from the manager.

Source code in inference/core/managers/decorators/base.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
class ModelManagerDecorator(ModelManager):
    """Basic decorator, it acts like a `ModelManager` and contains a `ModelManager`.

    Every operation defined here is forwarded to the wrapped manager stored in
    `self.model_manager`. Subclasses override individual methods to add
    behaviour around the delegated call.

    Args:
        model_manager (ModelManager): Instance of a ModelManager.

    Methods:
        add_model: Adds a model to the manager.
        infer_from_request: Processes a complete inference request (async).
        infer_from_request_sync: Processes a complete inference request.
        infer_only: Performs only the inference part of a request.
        preprocess: Processes the preprocessing part of a request.
        get_task_type: Gets the task type associated with a model.
        get_class_names: Gets the class names for a given model.
        remove: Removes a model from the manager.
        __len__: Returns the number of models in the manager.
        __getitem__: Retrieves a model by its ID.
        __contains__: Checks if a model exists in the manager.
        keys: Returns the keys (model IDs) from the manager.
    """

    # The decorator holds no models of its own: reading `_models` directly on
    # a decorator always raises, pointing the caller to the wrapped manager.
    @property
    def _models(self):
        raise ValueError("Should only be accessing self.model_manager._models")

    # Same guard as `_models`, for the model registry.
    @property
    def model_registry(self):
        raise ValueError("Should only be accessing self.model_manager.model_registry")

    def __init__(self, model_manager: ModelManager):
        """Initializes the decorator with an instance of a ModelManager."""
        self.model_manager = model_manager

    def init_pingback(self):
        """Forwards pingback initialisation to the wrapped manager."""
        self.model_manager.init_pingback()

    @property
    def pingback(self):
        """The `pingback` attribute of the wrapped manager."""
        return self.model_manager.pingback

    def add_model(
        self,
        model_id: str,
        api_key: str,
        model_id_alias: Optional[str] = None,
        endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
    ):
        """Adds a model to the manager.

        When `model_id` is already present (per `__contains__`), nothing is
        loaded: the request metadata is recorded on the wrapped manager and the
        id is added to the collector obtained from `request_model_ids`, if one
        is set. Otherwise the call is forwarded to the wrapped manager.

        Args:
            model_id (str): The identifier of the model.
            api_key (str): API key forwarded to the wrapped manager.
            model_id_alias (Optional[str]): Alias forwarded to the wrapped
                manager and recorded with the request metadata.
            endpoint_type (ModelEndpointType, optional): The endpoint type to use for the model.
            countinference (Optional[bool]): Forwarded unchanged to the wrapped manager.
            service_secret (Optional[str]): Forwarded unchanged to the wrapped manager.
        """
        if model_id in self:
            self.model_manager.record_request_metadata(
                model_id=model_id,
                original_model_id=model_id,
                model_id_alias=model_id_alias,
            )
            # NOTE(review): `request_model_ids` looks like a context variable
            # holding a per-request set of model ids; `None` is the fallback
            # when nothing is set -- confirm against its definition.
            ids_collector = request_model_ids.get(None)
            if ids_collector is not None:
                ids_collector.add(model_id)
            return
        self.model_manager.add_model(
            model_id,
            api_key,
            model_id_alias=model_id_alias,
            endpoint_type=endpoint_type,
            countinference=countinference,
            service_secret=service_secret,
        )

    def record_request_metadata(
        self,
        model_id: str,
        original_model_id: Optional[str] = None,
        model_id_alias: Optional[str] = None,
    ) -> None:
        """Forwards request metadata recording to the wrapped manager.

        Args:
            model_id (str): The identifier of the model.
            original_model_id (Optional[str]): Forwarded unchanged.
            model_id_alias (Optional[str]): Forwarded unchanged.
        """
        self.model_manager.record_request_metadata(
            model_id=model_id,
            original_model_id=original_model_id,
            model_id_alias=model_id_alias,
        )

    async def infer_from_request(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> InferenceResponse:
        """Processes a complete inference request.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to process.
            **kwargs: Forwarded unchanged to the wrapped manager.

        Returns:
            InferenceResponse: The response from the inference.
        """
        return await self.model_manager.infer_from_request(model_id, request, **kwargs)

    def infer_from_request_sync(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> InferenceResponse:
        """Processes a complete inference request (synchronous variant).

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to process.
            **kwargs: Forwarded unchanged to the wrapped manager.

        Returns:
            InferenceResponse: The response from the inference.
        """
        return self.model_manager.infer_from_request_sync(model_id, request, **kwargs)

    def infer_only(self, model_id: str, request, img_in, img_dims, batch_size=None):
        """Performs only the inference part of a request.

        Args:
            model_id (str): The identifier of the model.
            request: The request to process.
            img_in: Input image.
            img_dims: Image dimensions.
            batch_size (int, optional): Batch size.

        Returns:
            Response from the inference-only operation.
        """
        return self.model_manager.infer_only(
            model_id, request, img_in, img_dims, batch_size
        )

    def preprocess(self, model_id: str, request: InferenceRequest):
        """Processes the preprocessing part of a request.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to preprocess.

        Returns:
            Whatever the wrapped manager's `preprocess` returns.
        """
        return self.model_manager.preprocess(model_id, request)

    def get_task_type(self, model_id: str, api_key: Optional[str] = None) -> str:
        """Gets the task type associated with a model.

        Args:
            model_id (str): The identifier of the model.
            api_key (Optional[str]): API key to use; when None the
                module-level `API_KEY` is used instead.

        Returns:
            str: The task type.
        """
        if api_key is None:
            api_key = API_KEY
        return self.model_manager.get_task_type(model_id, api_key=api_key)

    def get_class_names(self, model_id):
        """Gets the class names for a given model.

        Args:
            model_id: The identifier of the model.

        Returns:
            List of class names.
        """
        return self.model_manager.get_class_names(model_id)

    def remove(self, model_id: str, delete_from_disk: bool = True) -> Model:
        """Removes a model from the manager.

        Args:
            model_id (str): The identifier of the model.
            delete_from_disk (bool): Forwarded unchanged to the wrapped
                manager. Defaults to True.

        Returns:
            Model: The removed model.
        """
        return self.model_manager.remove(model_id, delete_from_disk=delete_from_disk)

    def __len__(self) -> int:
        """Returns the number of models in the manager.

        Returns:
            int: Number of models.
        """
        return len(self.model_manager)

    def __getitem__(self, key: str) -> Model:
        """Retrieves a model by its ID.

        Args:
            key (str): The identifier of the model.

        Returns:
            Model: The model instance.
        """
        return self.model_manager[key]

    def __contains__(self, model_id: str) -> bool:
        """Checks if a model exists in the manager.

        Args:
            model_id (str): The identifier of the model.

        Returns:
            bool: True if the model exists, False otherwise.
        """
        return model_id in self.model_manager

    def keys(self):
        """Returns the keys (model IDs) from the manager.

        Returns:
            Whatever the wrapped manager's `keys()` returns.
        """
        return self.model_manager.keys()

    def models(self):
        """Returns the result of the wrapped manager's `models()`."""
        return self.model_manager.models()

    def predict(self, model_id: str, *args, **kwargs) -> Tuple[np.ndarray, ...]:
        """Forwards the predict step for `model_id` to the wrapped manager."""
        return self.model_manager.predict(model_id, *args, **kwargs)

    def postprocess(
        self,
        model_id: str,
        predictions: Tuple[np.ndarray, ...],
        preprocess_return_metadata: PreprocessReturnMetadata,
        *args,
        **kwargs
    ) -> List[List[float]]:
        """Forwards the postprocess step for `model_id` to the wrapped manager."""
        return self.model_manager.postprocess(
            model_id, predictions, preprocess_return_metadata, *args, **kwargs
        )

    def make_response(
        self, model_id: str, predictions: List[List[float]], *args, **kwargs
    ) -> InferenceResponse:
        """Forwards response construction for `model_id` to the wrapped manager."""
        return self.model_manager.make_response(model_id, predictions, *args, **kwargs)

    # `num_errors` is read from and written to the wrapped manager, so all
    # decorators stacked on one manager share a single value.
    @property
    def num_errors(self):
        return self.model_manager.num_errors

    @num_errors.setter
    def num_errors(self, value):
        self.model_manager.num_errors = value
Methods:
__contains__
__contains__(model_id)

Checks if a model exists in the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Name Type Description
bool

True if the model exists, False otherwise.

Source code in inference/core/managers/decorators/base.py
209
210
211
212
213
214
215
216
217
218
def __contains__(self, model_id: str):
    """Report whether the wrapped manager holds `model_id`.

    Args:
        model_id (str): Identifier of the model to look up.

    Returns:
        bool: Result of the membership test on the wrapped manager.
    """
    wrapped = self.model_manager
    return model_id in wrapped
__getitem__
__getitem__(key)

Retrieves a model by its ID.

Parameters:

Name Type Description Default
key str

The identifier of the model.

required

Returns:

Name Type Description
Model Model

The model instance.

Source code in inference/core/managers/decorators/base.py
198
199
200
201
202
203
204
205
206
207
def __getitem__(self, key: str) -> Model:
    """Look up a model in the wrapped manager by its ID.

    Args:
        key (str): Identifier of the model.

    Returns:
        Model: Whatever the wrapped manager stores under `key`.
    """
    wrapped = self.model_manager
    return wrapped[key]
__init__
__init__(model_manager)

Initializes the decorator with an instance of a ModelManager.

Source code in inference/core/managers/decorators/base.py
43
44
45
def __init__(self, model_manager: ModelManager):
    """Store the manager that this decorator wraps.

    Args:
        model_manager (ModelManager): Manager kept as `self.model_manager`.
    """
    self.model_manager = model_manager
__len__
__len__()

Returns the number of models in the manager.

Returns:

Name Type Description
int int

Number of models.

Source code in inference/core/managers/decorators/base.py
190
191
192
193
194
195
196
def __len__(self) -> int:
    """Count the models held by the wrapped manager.

    Returns:
        int: `len()` of the wrapped manager.
    """
    wrapped = self.model_manager
    return len(wrapped)
add_model
add_model(
    model_id,
    api_key,
    model_id_alias=None,
    endpoint_type=ModelEndpointType.ORT,
    countinference=None,
    service_secret=None,
)

Adds a model to the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
api_key str

The API key forwarded to the wrapped manager.

required
endpoint_type ModelEndpointType

The endpoint type to use for the model.

ORT
Source code in inference/core/managers/decorators/base.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def add_model(
    self,
    model_id: str,
    api_key: str,
    model_id_alias: Optional[str] = None,
    endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
    countinference: Optional[bool] = None,
    service_secret: Optional[str] = None,
):
    """Add a model to the manager, or just record the request if it is present.

    Args:
        model_id (str): The identifier of the model.
        api_key (str): API key forwarded to the wrapped manager.
        model_id_alias (Optional[str]): Alias forwarded to the wrapped manager
            and recorded with the request metadata.
        endpoint_type (ModelEndpointType, optional): The endpoint type to use for the model.
        countinference (Optional[bool]): Forwarded unchanged to the wrapped manager.
        service_secret (Optional[str]): Forwarded unchanged to the wrapped manager.
    """
    wrapped = self.model_manager

    # Unknown model: hand the whole job to the wrapped manager.
    if model_id not in self:
        wrapped.add_model(
            model_id,
            api_key,
            model_id_alias=model_id_alias,
            endpoint_type=endpoint_type,
            countinference=countinference,
            service_secret=service_secret,
        )
        return

    # Known model: only bookkeeping for the current request.
    wrapped.record_request_metadata(
        model_id=model_id,
        original_model_id=model_id,
        model_id_alias=model_id_alias,
    )
    collector = request_model_ids.get(None)
    if collector is not None:
        collector.add(model_id)
get_class_names
get_class_names(model_id)

Gets the class names for a given model.

Parameters:

Name Type Description Default
model_id

The identifier of the model.

required

Returns:

Type Description

List of class names.

Source code in inference/core/managers/decorators/base.py
168
169
170
171
172
173
174
175
176
177
def get_class_names(self, model_id):
    """Fetch the class names of a model from the wrapped manager.

    Args:
        model_id: The identifier of the model.

    Returns:
        Whatever the wrapped manager's `get_class_names` returns.
    """
    wrapped = self.model_manager
    class_names = wrapped.get_class_names(model_id)
    return class_names
get_task_type
get_task_type(model_id, api_key=None)

Gets the task type associated with a model.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Name Type Description
str str

The task type.

Source code in inference/core/managers/decorators/base.py
155
156
157
158
159
160
161
162
163
164
165
166
def get_task_type(self, model_id: str, api_key: str = None) -> str:
    """Ask the wrapped manager for the task type of a model.

    Args:
        model_id (str): The identifier of the model.
        api_key (str, optional): API key to use; when None the module-level
            `API_KEY` is used instead.

    Returns:
        str: The task type.
    """
    effective_key = API_KEY if api_key is None else api_key
    wrapped = self.model_manager
    return wrapped.get_task_type(model_id, api_key=effective_key)
infer_from_request async
infer_from_request(model_id, request, **kwargs)

Processes a complete inference request.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to process.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The response from the inference.

Source code in inference/core/managers/decorators/base.py
101
102
103
104
105
106
107
108
109
110
111
112
113
async def infer_from_request(
    self, model_id: str, request: InferenceRequest, **kwargs
) -> InferenceResponse:
    """Await the wrapped manager's handling of a complete inference request.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to process.
        **kwargs: Forwarded unchanged to the wrapped manager.

    Returns:
        InferenceResponse: The response from the inference.
    """
    wrapped = self.model_manager
    response = await wrapped.infer_from_request(model_id, request, **kwargs)
    return response
infer_from_request_sync
infer_from_request_sync(model_id, request, **kwargs)

Processes a complete inference request.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to process.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The response from the inference.

Source code in inference/core/managers/decorators/base.py
115
116
117
118
119
120
121
122
123
124
125
126
127
def infer_from_request_sync(
    self, model_id: str, request: InferenceRequest, **kwargs
) -> InferenceResponse:
    """Run a complete inference request through the wrapped manager.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to process.
        **kwargs: Forwarded unchanged to the wrapped manager.

    Returns:
        InferenceResponse: The response from the inference.
    """
    wrapped = self.model_manager
    response = wrapped.infer_from_request_sync(model_id, request, **kwargs)
    return response
infer_only
infer_only(
    model_id, request, img_in, img_dims, batch_size=None
)

Performs only the inference part of a request.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request

The request to process.

required
img_in

Input image.

required
img_dims

Image dimensions.

required
batch_size int

Batch size.

None

Returns:

Type Description

Response from the inference-only operation.

Source code in inference/core/managers/decorators/base.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def infer_only(self, model_id: str, request, img_in, img_dims, batch_size=None):
    """Run just the inference step of a request on the wrapped manager.

    Args:
        model_id (str): The identifier of the model.
        request: The request to process.
        img_in: Input image.
        img_dims: Image dimensions.
        batch_size (int, optional): Batch size.

    Returns:
        Response from the inference-only operation.
    """
    wrapped = self.model_manager
    # All five values are passed positionally, in this order.
    forwarded = (model_id, request, img_in, img_dims, batch_size)
    return wrapped.infer_only(*forwarded)
keys
keys()

Returns the keys (model IDs) from the manager.

Returns:

Type Description

List of keys (model IDs).

Source code in inference/core/managers/decorators/base.py
220
221
222
223
224
225
226
def keys(self):
    """Expose the keys (model IDs) of the wrapped manager.

    Returns:
        Whatever the wrapped manager's `keys()` returns.
    """
    wrapped = self.model_manager
    return wrapped.keys()
preprocess
preprocess(model_id, request)

Processes the preprocessing part of a request.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to preprocess.

required
Source code in inference/core/managers/decorators/base.py
146
147
148
149
150
151
152
153
def preprocess(self, model_id: str, request: InferenceRequest):
    """Run the preprocessing step of a request on the wrapped manager.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to preprocess.

    Returns:
        Whatever the wrapped manager's `preprocess` returns.
    """
    wrapped = self.model_manager
    preprocessed = wrapped.preprocess(model_id, request)
    return preprocessed
remove
remove(model_id, delete_from_disk=True)

Removes a model from the manager.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required

Returns:

Name Type Description
Model Model

The removed model.

Source code in inference/core/managers/decorators/base.py
179
180
181
182
183
184
185
186
187
188
def remove(self, model_id: str, delete_from_disk: bool = True) -> Model:
    """Remove a model through the wrapped manager.

    Args:
        model_id (str): The identifier of the model.
        delete_from_disk (bool): Forwarded unchanged to the wrapped manager.
            Defaults to True.

    Returns:
        Model: The removed model.
    """
    wrapped = self.model_manager
    removed = wrapped.remove(model_id, delete_from_disk=delete_from_disk)
    return removed

inference.core.managers.decorators.locked_load

Classes

LockedLoadModelManagerDecorator

Bases: ModelManagerDecorator

Must acquire lock to load model

Source code in inference/core/managers/decorators/locked_load.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
class LockedLoadModelManagerDecorator(ModelManagerDecorator):
    """Decorator that takes a per-model lock before adding (loading) a model."""

    def add_model(
        self,
        model_id: str,
        api_key: str,
        model_id_alias=None,
        endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
    ):
        """Add a model while holding the lock keyed by `lock_str(model_id)`.

        The lock is requested from `cache` with `expire=180.0`. All arguments
        are forwarded unchanged to the parent `add_model`.

        Returns:
            The result of the parent `add_model`.
        """
        load_lock = cache.lock(lock_str(model_id), expire=180.0)
        with load_lock:
            outcome = super().add_model(
                model_id,
                api_key,
                model_id_alias=model_id_alias,
                endpoint_type=endpoint_type,
                countinference=countinference,
                service_secret=service_secret,
            )
        return outcome

inference.core.managers.decorators.logger

Classes

WithLogger

Bases: ModelManagerDecorator

Logger Decorator, it logs what's going on inside the manager.

Source code in inference/core/managers/decorators/logger.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
class WithLogger(ModelManagerDecorator):
    """Logger Decorator, it logs what's going on inside the manager."""

    def add_model(
        self,
        model_id: str,
        api_key: str,
        model_id_alias: Optional[str] = None,
        endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
        countinference: Optional[bool] = None,
        service_secret: Optional[str] = None,
    ):
        """Adds a model to the manager and logs the action.

        The log line is emitted before the delegated call is made.

        Args:
            model_id (str): The identifier of the model.
            api_key (str): API key forwarded to the superclass.
            model_id_alias (Optional[str]): Forwarded unchanged.
            endpoint_type (ModelEndpointType, optional): Forwarded unchanged.
            countinference (Optional[bool]): Forwarded unchanged.
            service_secret (Optional[str]): Forwarded unchanged.

        Returns:
            The result of the add_model method from the superclass.
        """
        logger.info(f"🤖 {model_id} added.")
        return super().add_model(
            model_id,
            api_key,
            model_id_alias=model_id_alias,
            endpoint_type=endpoint_type,
            countinference=countinference,
            service_secret=service_secret,
        )

    async def infer_from_request(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> InferenceResponse:
        """Processes a complete inference request and logs both the request and response.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to process.
            **kwargs: Forwarded unchanged to the superclass.

        Returns:
            InferenceResponse: The response from the inference.
        """
        logger.info(f"📥 [{model_id}] request={request}.")
        res = await super().infer_from_request(model_id, request, **kwargs)
        logger.info(f"📥 [{model_id}] res={res}.")
        return res

    def infer_from_request_sync(
        self, model_id: str, request: InferenceRequest, **kwargs
    ) -> InferenceResponse:
        """Processes a complete inference request and logs both the request and response.

        Args:
            model_id (str): The identifier of the model.
            request (InferenceRequest): The request to process.
            **kwargs: Forwarded unchanged to the superclass.

        Returns:
            InferenceResponse: The response from the inference.
        """
        logger.info(f"📥 [{model_id}] request={request}.")
        res = super().infer_from_request_sync(model_id, request, **kwargs)
        logger.info(f"📥 [{model_id}] res={res}.")
        return res

    def remove(self, model_id: str, delete_from_disk: bool = True) -> Model:
        """Removes a model from the manager and logs the action.

        Args:
            model_id (str): The identifier of the model to remove.
            delete_from_disk (bool): Forwarded to the superclass and reported
                in the log line. Defaults to True.

        Returns:
            Model: The removed model.
        """
        # Forward the flag explicitly: without it the superclass default
        # (True) would be used no matter what the caller asked for, while the
        # log line below would still report the caller's value.
        res = super().remove(model_id, delete_from_disk=delete_from_disk)
        logger.info(f"❌ removed {model_id}, delete_from_disk={delete_from_disk}")
        return res
Methods:
add_model
add_model(
    model_id,
    api_key,
    model_id_alias=None,
    endpoint_type=ModelEndpointType.ORT,
    countinference=None,
    service_secret=None,
)

Adds a model to the manager and logs the action.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
api_key str

The API key forwarded to the superclass.

required

Returns:

Type Description

The result of the add_model method from the superclass.

Source code in inference/core/managers/decorators/logger.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def add_model(
    self,
    model_id: str,
    api_key: str,
    model_id_alias: Optional[str] = None,
    endpoint_type: ModelEndpointType = ModelEndpointType.ORT,
    countinference: Optional[bool] = None,
    service_secret: Optional[str] = None,
):
    """Log the addition of a model, then delegate to the superclass.

    The log line is emitted before the delegated call is made.

    Args:
        model_id (str): The identifier of the model.
        api_key (str): API key forwarded to the superclass.
        model_id_alias (Optional[str]): Forwarded unchanged.
        endpoint_type (ModelEndpointType, optional): Forwarded unchanged.
        countinference (Optional[bool]): Forwarded unchanged.
        service_secret (Optional[str]): Forwarded unchanged.

    Returns:
        The result of the add_model method from the superclass.
    """
    logger.info(f"🤖 {model_id} added.")
    outcome = super().add_model(
        model_id,
        api_key,
        model_id_alias=model_id_alias,
        endpoint_type=endpoint_type,
        countinference=countinference,
        service_secret=service_secret,
    )
    return outcome
infer_from_request async
infer_from_request(model_id, request, **kwargs)

Processes a complete inference request and logs both the request and response.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to process.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The response from the inference.

Source code in inference/core/managers/decorators/logger.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
async def infer_from_request(
    self, model_id: str, request: InferenceRequest, **kwargs
) -> InferenceResponse:
    """Log the request, await the superclass inference, then log the response.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to process.
        **kwargs: Forwarded unchanged to the superclass.

    Returns:
        InferenceResponse: The response from the inference.
    """
    logger.info(f"📥 [{model_id}] request={request}.")
    response = await super().infer_from_request(model_id, request, **kwargs)
    logger.info(f"📥 [{model_id}] res={response}.")
    return response
infer_from_request_sync
infer_from_request_sync(model_id, request, **kwargs)

Processes a complete inference request and logs both the request and response.

Parameters:

Name Type Description Default
model_id str

The identifier of the model.

required
request InferenceRequest

The request to process.

required

Returns:

Name Type Description
InferenceResponse InferenceResponse

The response from the inference.

Source code in inference/core/managers/decorators/logger.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def infer_from_request_sync(
    self, model_id: str, request: InferenceRequest, **kwargs
) -> InferenceResponse:
    """Log the request, run the superclass inference, then log the response.

    Args:
        model_id (str): The identifier of the model.
        request (InferenceRequest): The request to process.
        **kwargs: Forwarded unchanged to the superclass.

    Returns:
        InferenceResponse: The response from the inference.
    """
    logger.info(f"📥 [{model_id}] request={request}.")
    response = super().infer_from_request_sync(model_id, request, **kwargs)
    logger.info(f"📥 [{model_id}] res={response}.")
    return response
remove
remove(model_id, delete_from_disk=True)

Removes a model from the manager and logs the action.

Parameters:

Name Type Description Default
model_id str

The identifier of the model to remove.

required

Returns:

Name Type Description
Model Model

The removed model.

Source code in inference/core/managers/decorators/logger.py
76
77
78
79
80
81
82
83
84
85
86
87
def remove(self, model_id: str, delete_from_disk: bool = True) -> Model:
    """Removes a model from the manager and logs the action.

    Args:
        model_id (str): The identifier of the model to remove.
        delete_from_disk (bool): Forwarded to the superclass and reported in
            the log line. Defaults to True.

    Returns:
        Model: The removed model.
    """
    # Forward the flag explicitly: without it the superclass default (True)
    # would be used no matter what the caller asked for, while the log line
    # below would still report the caller's value.
    res = super().remove(model_id, delete_from_disk=delete_from_disk)
    logger.info(f"❌ removed {model_id}, delete_from_disk={delete_from_disk}")
    return res

core/models

Base model classes and common prediction logic shared across model types.

inference.core.models.base

Classes

BaseInference

General inference class.

This class provides a basic interface for inference tasks.

Source code in inference/core/models/base.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class BaseInference:
    """Common interface for inference pipelines.

    `infer` chains `preprocess`, `predict` and `postprocess`. The base class
    leaves those three steps, `infer_from_request` and `make_response`
    unimplemented: each raises `NotImplementedError`.
    """

    @usage_collector("model")
    def infer(self, image: Any, **kwargs) -> Any:
        """Run the preprocess -> predict -> postprocess pipeline on `image`.

        Args:
            image: Input handed to `preprocess`; can be a BGR numpy array,
                filepath, InferenceRequestImage, PIL Image, byte-string, etc.
            **kwargs: Passed unchanged to each of the three stages.

        Returns:
            Whatever `postprocess` returns.
        """
        with start_span("model.preprocess"):
            model_input, preprocess_metadata = self.preprocess(image, **kwargs)
            logger.debug(
                f"Preprocessed input shape: {getattr(model_input, 'shape', None)}"
            )
            # Only inputs exposing `.shape` get the span attribute.
            if hasattr(model_input, "shape"):
                set_span_attribute("model.input_shape", str(model_input.shape))

        with start_span("model.predict"):
            raw_predictions = self.predict(model_input, **kwargs)

        with start_span("model.postprocess"):
            return self.postprocess(raw_predictions, preprocess_metadata, **kwargs)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Turn `image` into model input plus metadata; not implemented here."""
        raise NotImplementedError()

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray, ...]:
        """Run the model on preprocessed input; not implemented here."""
        raise NotImplementedError()

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, ...],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        """Convert raw predictions into the final result; not implemented here."""
        raise NotImplementedError()

    def infer_from_request(
        self, request: InferenceRequest
    ) -> Union[InferenceResponse, List[InferenceResponse]]:
        """Run inference described by a request object.

        Args:
            request (InferenceRequest): The request object.

        Returns:
            Union[InferenceResponse, List[InferenceResponse]]: The response object(s).

        Raises:
            NotImplementedError: This method must be implemented by a subclass.
        """
        raise NotImplementedError()

    def make_response(
        self, *args, **kwargs
    ) -> Union[InferenceResponse, List[InferenceResponse]]:
        """Build the response object(s).

        Raises:
            NotImplementedError: This method must be implemented by a subclass.
        """
        raise NotImplementedError()
Methods:
infer
infer(image, **kwargs)

Runs inference on given data. `image` can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.

Source code in inference/core/models/base.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
@usage_collector("model")
def infer(self, image: Any, **kwargs) -> Any:
    """Run the preprocess -> predict -> postprocess pipeline on `image`.

    Args:
        image: Input handed to `preprocess`; can be a BGR numpy array,
            filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        **kwargs: Passed unchanged to each of the three stages.

    Returns:
        Whatever `postprocess` returns.
    """
    with start_span("model.preprocess"):
        model_input, preprocess_metadata = self.preprocess(image, **kwargs)
        logger.debug(
            f"Preprocessed input shape: {getattr(model_input, 'shape', None)}"
        )
        # Only inputs exposing `.shape` get the span attribute.
        if hasattr(model_input, "shape"):
            set_span_attribute("model.input_shape", str(model_input.shape))

    with start_span("model.predict"):
        raw_predictions = self.predict(model_input, **kwargs)

    with start_span("model.postprocess"):
        return self.postprocess(raw_predictions, preprocess_metadata, **kwargs)
infer_from_request
infer_from_request(request)

Runs inference on a request

Parameters:

Name Type Description Default
request InferenceRequest

The request object.

required

Returns:

Type Description
Union[InferenceResponse, List[InferenceResponse]]

Union[CVInferenceResponse, List[CVInferenceResponse]]: The response object(s).

Raises:

Type Description
NotImplementedError

This method must be implemented by a subclass.

Source code in inference/core/models/base.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def infer_from_request(
    self, request: InferenceRequest
) -> Union[InferenceResponse, List[InferenceResponse]]:
    """Run inference described by a request object.

    Args:
        request (InferenceRequest): The request object.

    Returns:
        Union[InferenceResponse, List[InferenceResponse]]: The response object(s).

    Raises:
        NotImplementedError: This method must be implemented by a subclass.
    """
    raise NotImplementedError()
make_response
make_response(*args, **kwargs)

Constructs an object detection response.

Raises:

Type Description
NotImplementedError

This method must be implemented by a subclass.

Source code in inference/core/models/base.py
75
76
77
78
79
80
81
82
83
def make_response(
    self, *args, **kwargs
) -> Union[InferenceResponse, List[InferenceResponse]]:
    """Build the response object(s).

    Raises:
        NotImplementedError: This method must be implemented by a subclass.
    """
    raise NotImplementedError()

Model

Bases: BaseInference

Base Inference Model (Inherits from BaseInference to define the needed methods)

This class provides the foundational methods for inference and logging, and can be extended by specific models.

Methods:

Name Description
log

Print the given message.

clear_cache

Clears any cache if necessary.

Source code in inference/core/models/base.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
class Model(BaseInference):
    """Base Inference Model (Inherits from BaseInference to define the needed methods)

    This class provides the foundational methods for inference and logging, and can be extended by specific models.

    Methods:
        log(m): Print the given message.
        clear_cache(delete_from_disk): Clears any cache if necessary.
        infer_from_request(request): Run inference for a request and return the response(s).
        make_response(*args, **kwargs): Build a response; must be implemented by a subclass.
    """

    def log(self, m):
        """Prints the given message.

        Args:
            m (str): The message to print.
        """
        print(m)

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clears any cache if necessary. This method should be implemented in derived classes as needed.

        The base implementation is a no-op.

        Args:
            delete_from_disk (bool, optional): Whether to delete cached files from disk. Defaults to True.
        """
        pass

    def infer_from_request(
        self,
        request: InferenceRequest,
    ) -> Union[List[InferenceResponse], InferenceResponse]:
        """
        Perform inference based on the details provided in the request, and return the associated responses.
        The function can handle both single and multiple image inference requests. Optionally, it also provides
        a visualization of the predictions if requested.

        Args:
            request (InferenceRequest): The request object containing details for inference, such as the image or
                images to process, any classes to filter by, and whether or not to visualize the predictions.
                Its `dict()` form is passed to `self.infer` as keyword arguments.

        Returns:
            Union[List[InferenceResponse], InferenceResponse]: The list of responses when `request.image` is a
            list. Otherwise the first response, or the (empty) list unchanged when no responses were produced.

        Examples:
            >>> request = InferenceRequest(image=my_image, visualize_predictions=True)
            >>> response = model.infer_from_request(request)
            >>> print(response.time)  # Prints the time taken for inference
            0.125
            >>> print(response.visualization)  # Accesses the visualization of the prediction if available

        Notes:
            - The processing time for each response is included within the response itself.
            - If `visualize_predictions` is set to True in the request, a visualization of the prediction
              is also included in the response.
            - A string `confidence` is dropped (with a warning) when `USE_INFERENCE_MODELS` is falsy, so that
              `self.infer` is called without an explicit confidence.
        """
        t1 = perf_counter()
        kwargs = request.dict()
        confidence = kwargs.get("confidence")
        if isinstance(confidence, str) and not USE_INFERENCE_MODELS:
            logger.warning(
                "Legacy inference does not support confidence=%r, "
                "using model default",
                confidence,
            )
            kwargs.pop("confidence")
        responses = self.infer(**kwargs, return_image_dims=False)
        for response in responses:
            # Elapsed time is measured from the start of this call, per response.
            response.time = perf_counter() - t1
            # Lazy %-style arguments: the message is only formatted when DEBUG is enabled.
            logger.debug("model infer time: %s ms", response.time * 1000.0)
            if request.id:
                response.inference_id = request.id

        # Not every request type declares `visualize_predictions`; treat a missing attribute as False.
        if getattr(request, "visualize_predictions", False):
            for response in responses:
                response.visualization = self.draw_predictions(request, response)

        # A single-image request gets a single response back instead of a one-element list.
        if not isinstance(request.image, list) and len(responses) > 0:
            responses = responses[0]

        return responses

    def make_response(
        self, *args, **kwargs
    ) -> Union[InferenceResponse, List[InferenceResponse]]:
        """Makes an inference response from the given arguments.

        The base implementation performs no work; subclasses override it and
        return the response object(s).

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Raises:
            NotImplementedError: Always; the message names the concrete class and this method.
        """
        raise NotImplementedError(self.__class__.__name__ + ".make_response")
Methods:
clear_cache
clear_cache(delete_from_disk=True)

Clears any cache if necessary. This method should be implemented in derived classes as needed.

Parameters:

Name Type Description Default
delete_from_disk bool

Whether to delete cached files from disk. Defaults to True.

True
Source code in inference/core/models/base.py
104
105
106
107
108
109
110
def clear_cache(self, delete_from_disk: bool = True) -> None:
    """Hook for dropping cached state; this implementation does nothing.

    Derived classes override it when they actually hold a cache.

    Args:
        delete_from_disk (bool, optional): Whether cached files on disk should be removed as well. Defaults to True.
    """
    return None
infer_from_request
infer_from_request(request)

Perform inference based on the details provided in the request, and return the associated responses. The function can handle both single and multiple image inference requests. Optionally, it also provides a visualization of the predictions if requested.

Parameters:

Name Type Description Default
request InferenceRequest

The request object containing details for inference, such as the image or images to process, any classes to filter by, and whether or not to visualize the predictions.

required

Returns:

Type Description
Union[List[InferenceResponse], InferenceResponse]

A list of response objects if the request contains multiple images, or a single response object if the request contains one image. Each response object contains details about the segmented instances, the time taken for inference, and optionally, a visualization.

Examples:

>>> request = InferenceRequest(image=my_image, visualize_predictions=True)
>>> response = infer_from_request(request)
>>> print(response.time)  # Prints the time taken for inference
0.125
>>> print(response.visualization)  # Accesses the visualization of the prediction if available
Notes
  • The processing time for each response is included within the response itself.
  • If visualize_predictions is set to True in the request, a visualization of the prediction is also included in the response.
Source code in inference/core/models/base.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def infer_from_request(
    self,
    request: InferenceRequest,
) -> Union[List[InferenceResponse], InferenceResponse]:
    """
    Perform inference based on the details provided in the request, and return the associated responses.
    The function can handle both single and multiple image inference requests. Optionally, it also provides
    a visualization of the predictions if requested.

    Args:
        request (InferenceRequest): The request object containing details for inference, such as the image or
            images to process, any classes to filter by, and whether or not to visualize the predictions.
            Its `dict()` form is passed to `self.infer` as keyword arguments.

    Returns:
        Union[List[InferenceResponse], InferenceResponse]: The list of responses when `request.image` is a
        list. Otherwise the first response, or the (empty) list unchanged when no responses were produced.

    Examples:
        >>> request = InferenceRequest(image=my_image, visualize_predictions=True)
        >>> response = model.infer_from_request(request)
        >>> print(response.time)  # Prints the time taken for inference
        0.125
        >>> print(response.visualization)  # Accesses the visualization of the prediction if available

    Notes:
        - The processing time for each response is included within the response itself.
        - If `visualize_predictions` is set to True in the request, a visualization of the prediction
          is also included in the response.
        - A string `confidence` is dropped (with a warning) when `USE_INFERENCE_MODELS` is falsy, so that
          `self.infer` is called without an explicit confidence.
    """
    t1 = perf_counter()
    kwargs = request.dict()
    confidence = kwargs.get("confidence")
    if isinstance(confidence, str) and not USE_INFERENCE_MODELS:
        logger.warning(
            "Legacy inference does not support confidence=%r, "
            "using model default",
            confidence,
        )
        kwargs.pop("confidence")
    responses = self.infer(**kwargs, return_image_dims=False)
    for response in responses:
        # Elapsed time is measured from the start of this call, per response.
        response.time = perf_counter() - t1
        # Lazy %-style arguments: the message is only formatted when DEBUG is enabled.
        logger.debug("model infer time: %s ms", response.time * 1000.0)
        if request.id:
            response.inference_id = request.id

    # Not every request type declares `visualize_predictions`; treat a missing attribute as False.
    if getattr(request, "visualize_predictions", False):
        for response in responses:
            response.visualization = self.draw_predictions(request, response)

    # A single-image request gets a single response back instead of a one-element list.
    if not isinstance(request.image, list) and len(responses) > 0:
        responses = responses[0]

    return responses
log
log(m)

Prints the given message.

Parameters:

Name Type Description Default
m str

The message to print.

required
Source code in inference/core/models/base.py
 96
 97
 98
 99
100
101
102
def log(self, m):
    """Write a message to standard output.

    The message is handed to the built-in `print` unchanged, so it is
    followed by a newline.

    Args:
        m (str): The message to print.
    """
    print(m)
make_response
make_response(*args, **kwargs)

Makes an inference response from the given arguments.

Parameters:

Name Type Description Default
*args

Variable length argument list.

()
**kwargs

Arbitrary keyword arguments.

{}

Returns:

Name Type Description
InferenceResponse Union[InferenceResponse, List[InferenceResponse]]

The inference response.

Source code in inference/core/models/base.py
168
169
170
171
172
173
174
175
176
177
178
179
180
def make_response(
    self, *args, **kwargs
) -> Union[InferenceResponse, List[InferenceResponse]]:
    """Placeholder for building an inference response from the given arguments.

    This implementation never returns; a subclass is expected to override
    it and return the response object(s).

    Args:
        *args: Variable length argument list (unused here).
        **kwargs: Arbitrary keyword arguments (unused here).

    Raises:
        NotImplementedError: Always; the message is "<ClassName>.make_response".
    """
    owner = self.__class__.__name__
    raise NotImplementedError(f"{owner}.make_response")

Functions:

inference.core.models.classification_base

Classes

ClassificationBaseOnnxRoboflowInferenceModel

Bases: OnnxRoboflowInferenceModel

Base class for ONNX models for Roboflow classification inference.

Attributes:

Name Type Description
multiclass bool

Whether the classification is multi-class or not.

Methods:

Name Description
get_infer_bucket_file_list

Get the list of required files for inference.

softmax

Compute softmax values for a given set of scores.

infer

Perform inference on a given request (ClassificationInferenceRequest) and return the response: a single ClassificationInferenceResponse or MultiLabelClassificationInferenceResponse, or a list of them.

draw_predictions

Draw prediction visuals on an image.

Source code in inference/core/models/classification_base.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
class ClassificationBaseOnnxRoboflowInferenceModel(OnnxRoboflowInferenceModel):
    """Base class for ONNX models for Roboflow classification inference.

    Attributes:
        multiclass (bool): Whether the classification is multi-class or not.

    Methods:
        get_infer_bucket_file_list(): Get the list of required files for inference.
        softmax(x): Compute softmax values for a given set of scores.
        infer(image, ...): Perform inference on the given image(s).
        infer_from_request(request): Perform inference on a given request and return the response(s).
        draw_predictions(inference_request, inference_response): Draw prediction visuals on an image.
    """

    task_type = "classification"

    # Per-channel normalization constants applied in `preprocess` after scaling to [0, 1].
    preprocess_means = [0.5, 0.5, 0.5]
    preprocess_stds = [0.5, 0.5, 0.5]

    def __init__(self, *args, **kwargs):
        """Initialize the model, setting whether it is multiclass or not."""
        super().__init__(*args, **kwargs)
        self.multiclass = self.environment.get("MULTICLASS", False)

    @staticmethod
    def _paste_label_button(image, font, text, color, top):
        """Paste a filled label showing `text` onto `image` at (0, top) and return the label height."""
        text_size = font.getbbox(text)
        # Button size is the text extent plus a 10px margin on every side.
        button_size = (text_size[2] + 20, text_size[3] + 20)
        button_img = Image.new("RGBA", button_size, color)
        button_draw = ImageDraw.Draw(button_img)
        button_draw.text((10, 10), text, font=font, fill=(255, 255, 255, 255))
        image.paste(button_img, (0, top))
        return button_size[1]

    def draw_predictions(self, inference_request, inference_response):
        """Draw prediction visuals on an image.

        This method overlays the predictions on the input image, including drawing rectangles and text to visualize the predicted classes.

        For a list of predictions only the first entry is drawn. For a mapping
        of class name to prediction, one label per class is stacked from the
        top of the image in descending confidence order. When there are no
        predictions the image is returned without any overlay.

        Args:
            inference_request: The request object containing the image and parameters.
            inference_response: The response object containing the predictions and other details.

        Returns:
            bytes: The bytes of the visualized image in JPEG format.
        """
        image = Image.fromarray(load_image_rgb(inference_request.image))
        draw = ImageDraw.Draw(image)
        font = ImageFont.load_default()
        # PIL reports size as (width, height); the frame must use that order to
        # span the whole picture on non-square images.
        width, height = image.size
        frame = [0, 0, width, height]
        predictions = inference_response.predictions
        if isinstance(predictions, list):
            # The list is empty when every class scored below the confidence
            # threshold (see make_response); there is nothing to draw then.
            if predictions:
                prediction = predictions[0]
                color = self.colors.get(prediction.class_name, "#4892EA")
                draw.rectangle(
                    frame,
                    outline=color,
                    width=inference_request.visualization_stroke_width,
                )
                text = f"{prediction.class_id} - {prediction.class_name} {prediction.confidence:.2f}"
                # Single label in the top-left corner.
                self._paste_label_button(image, font, text, color, 0)
        else:
            if len(predictions) > 0:
                draw.rectangle(
                    frame,
                    outline="#4892EA",
                    width=inference_request.visualization_stroke_width,
                )
            ranked = sorted(
                predictions.items(), key=lambda item: item[1].confidence, reverse=True
            )
            row = 0
            for cls_name, pred in ranked:
                color = self.colors.get(cls_name, "#4892EA")
                text = f"{cls_name} {pred.confidence:.2f}"
                # Stack each label directly below the previous one.
                row += self._paste_label_button(image, font, text, color, row)

        buffered = BytesIO()
        # JPEG has no alpha channel, so force RGB before encoding.
        image = image.convert("RGB")
        image.save(buffered, format="JPEG")
        return buffered.getvalue()

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, e.g., ["environment.json"].
        """
        return ["environment.json"]

    def infer(
        self,
        image: Any,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        return_image_dims: bool = False,
        **kwargs,
    ):
        """
        Perform inference on the provided image(s) and return the predictions.

        This override only forwards its arguments to the parent class's `infer`.

        Args:
            image (Any): The image or list of images to be processed.
                - can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
            disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
            return_image_dims (bool, optional): Forwarded to the parent implementation. Defaults to False.
            **kwargs: Additional parameters to customize the inference process.

        Returns:
            Whatever the parent class's `infer` returns. `infer_from_request` in this class
            treats the result as a sequence of response objects.
        """
        return super().infer(
            image,
            disable_preproc_auto_orient=disable_preproc_auto_orient,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
            return_image_dims=return_image_dims,
            **kwargs,
        )

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        return_image_dims=False,
        **kwargs,
    ) -> Union[ClassificationInferenceResponse, List[ClassificationInferenceResponse]]:
        """Turn raw model output into response objects.

        Args:
            predictions (Tuple[np.ndarray]): Output of `predict`; only the first element is used.
            preprocess_return_metadata (PreprocessReturnMetadata): Metadata from `preprocess`;
                its "img_dims" entry is passed on to `make_response`.
            return_image_dims: Accepted for interface compatibility; not used here.
            **kwargs: Forwarded to `make_response` (e.g. `confidence`).

        Returns:
            The result of `make_response`.
        """
        predictions = predictions[0]
        return self.make_response(
            predictions, preprocess_return_metadata["img_dims"], **kwargs
        )

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        """Run the ONNX session on a preprocessed batch.

        Args:
            img_in (np.ndarray): Preprocessed input, as produced by `preprocess`.
            **kwargs: Ignored.

        Returns:
            Tuple[np.ndarray]: A one-element tuple holding the session output.
        """
        # The session is guarded by a lock for the duration of the run.
        with self._session_lock:
            predictions = run_session_via_iobinding(
                self.onnx_session, self.input_name, img_in
            )
        return (predictions,)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        """Prepare one image or a list of images for the model.

        Each image goes through `preproc_image`; a list is concatenated along
        axis 0. The batch is then divided by 255, cast to float32 and
        normalized per channel with `preprocess_means` / `preprocess_stds`.

        Args:
            image (Any): A single image or a list of images.
            **kwargs: May carry the `disable_preproc_*` flags, each defaulting to False.

        Returns:
            Tuple[np.ndarray, PreprocessReturnMetadata]: The normalized batch and metadata
            whose "img_dims" entry holds one dimensions entry per image.

        Raises:
            ValueError: If the preprocessed data is neither a numpy array nor
                (with `USE_PYTORCH_FOR_PREPROCESSING`) a torch tensor.
        """
        # Collect the per-call preprocessing switches once; absent flags mean "enabled".
        preproc_flags = {
            flag: kwargs.get(flag, False)
            for flag in (
                "disable_preproc_auto_orient",
                "disable_preproc_contrast",
                "disable_preproc_grayscale",
                "disable_preproc_static_crop",
            )
        }
        if isinstance(image, list):
            imgs_with_dims = [self.preproc_image(i, **preproc_flags) for i in image]
            imgs, img_dims = zip(*imgs_with_dims)
            if isinstance(imgs[0], np.ndarray):
                img_in = np.concatenate(imgs, axis=0)
            elif USE_PYTORCH_FOR_PREPROCESSING:
                img_in = torch.cat(imgs, dim=0)
            else:
                raise ValueError(
                    f"Received a list of images of unknown type, {type(imgs[0])}; "
                    "This is most likely a bug. Contact Roboflow team through github issues "
                    "(https://github.com/roboflow/inference/issues) providing full context of the problem"
                )
        else:
            img_in, img_dims = self.preproc_image(image, **preproc_flags)
            # Keep "img_dims" a sequence even for a single image.
            img_dims = [img_dims]

        # NOTE(review): in-place division assumes preproc_image already returns a
        # floating-point array/tensor — confirm.
        img_in /= 255.0

        mean = self.preprocess_means
        std = self.preprocess_stds
        if isinstance(img_in, np.ndarray):
            img_in = img_in.astype(np.float32)
        elif USE_PYTORCH_FOR_PREPROCESSING:
            img_in = img_in.float()
        else:
            raise ValueError(
                f"Received an image of unknown type, {type(img_in)}; "
                "This is most likely a bug. Contact Roboflow team through github issues "
                "(https://github.com/roboflow/inference/issues) providing full context of the problem"
            )

        # Normalize the three channels on axis 1 independently.
        for channel in range(3):
            img_in[:, channel, :, :] = (
                img_in[:, channel, :, :] - mean[channel]
            ) / std[channel]
        return img_in, PreprocessReturnMetadata({"img_dims": img_dims})

    def infer_from_request(
        self,
        request: ClassificationInferenceRequest,
    ) -> Union[List[InferenceResponse], InferenceResponse]:
        """
        Handle an inference request to produce an appropriate response.

        Args:
            request (ClassificationInferenceRequest): The request object encapsulating the image(s) and relevant parameters.

        Returns:
            Union[List[InferenceResponse], InferenceResponse]: The response object(s) containing the predictions, visualization, and other pertinent details. If a list of images was provided, a list of responses is returned. Otherwise, a single response is returned.

        Notes:
            - Starts a timer at the beginning to calculate inference time.
            - Processes the image(s) through the `infer` method.
            - Calculates and sets the time taken for inference.
            - If visualization is requested, the predictions are drawn on the image.
            - A string `confidence` is dropped when `USE_INFERENCE_MODELS` is falsy, so `infer` runs
              without an explicit confidence.
        """
        t1 = perf_counter()
        kwargs = request.dict()
        confidence = kwargs.get("confidence")
        if isinstance(confidence, str) and not USE_INFERENCE_MODELS:
            kwargs.pop("confidence")
        responses = self.infer(**kwargs, return_image_dims=True)
        for response in responses:
            response.time = perf_counter() - t1
            response.inference_id = getattr(request, "id", None)

        if request.visualize_predictions:
            for response in responses:
                response.visualization = self.draw_predictions(request, response)

        # A single-image request gets a single response back instead of a list.
        if not isinstance(request.image, list):
            responses = responses[0]

        return responses

    def make_response(
        self,
        predictions,
        img_dims,
        confidence: float = 0.5,
        **kwargs,
    ) -> Union[ClassificationInferenceResponse, List[ClassificationInferenceResponse]]:
        """
        Create response objects for the given predictions and image dimensions.

        Args:
            predictions (list): List of prediction arrays from the inference process.
            img_dims (list): Per-image dimensions, indexed in step with `predictions`.
            confidence (float, optional): Confidence threshold for filtering predictions. Defaults to 0.5.
            **kwargs: Additional parameters; ignored here.

        Returns:
            List of response objects, one per entry in `predictions`.

        Notes:
            - If the model is multiclass, a `MultiLabelClassificationInferenceResponse` is generated for each image.
              Every class is reported in `predictions`; only `predicted_classes` is limited to scores
              strictly above the threshold.
            - If the model is not multiclass, a `ClassificationInferenceResponse` is generated for each image.
              Scores go through softmax and classes scoring below the threshold are left out, so
              `predictions` may be empty (then `top` is "" and `confidence` is 0.0).
        """
        responses = []
        confidence_threshold = float(confidence)
        for ind, prediction in enumerate(predictions):
            preds = prediction[0]
            if self.multiclass:
                results = {}
                predicted_classes = []
                for class_id, raw_score in enumerate(preds):
                    cls_name = self.class_names[class_id]
                    score = float(raw_score)
                    results[cls_name] = {"confidence": score, "class_id": class_id}
                    if score > confidence_threshold:
                        predicted_classes.append(cls_name)
                # NOTE(review): width/height are read from img_dims in the opposite
                # order to the single-label branch below — one of the two is
                # presumably swapped; confirm against preproc_image's return value.
                response = MultiLabelClassificationInferenceResponse(
                    image=InferenceResponseImage(
                        width=img_dims[ind][0], height=img_dims[ind][1]
                    ),
                    predicted_classes=predicted_classes,
                    predictions=results,
                )
            else:
                preds = self.softmax(preds)
                results = []
                for class_id, cls_name in enumerate(self.class_names):
                    score = float(preds[class_id])
                    if score < confidence_threshold:
                        continue
                    results.append(
                        {
                            "class_id": class_id,
                            "class": cls_name,
                            "confidence": round(score, 4),
                        }
                    )
                # Highest confidence first, so results[0] is the top class.
                results = sorted(results, key=lambda x: x["confidence"], reverse=True)

                response = ClassificationInferenceResponse(
                    image=InferenceResponseImage(
                        width=img_dims[ind][1], height=img_dims[ind][0]
                    ),
                    predictions=results,
                    top=results[0]["class"] if results else "",
                    confidence=results[0]["confidence"] if results else 0.0,
                )
            responses.append(response)

        return responses

    @staticmethod
    def softmax(x):
        """Compute softmax values for each set of scores in x.

        Args:
            x (np.array): The input array containing the scores.

        Returns:
            np.array: The softmax values for each set of scores.
        """
        # Subtracting the maximum keeps exp() from overflowing without changing the result.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def get_model_output_shape(self) -> Tuple[int, ...]:
        """Return the shape of the model output for a random 1024x1024 RGB test image.

        The image is run through `preprocess` and `predict`; the shape is that
        of `predict`'s result wrapped in a numpy array.
        """
        test_image = (np.random.rand(1024, 1024, 3) * 255).astype(np.uint8)
        test_image, _ = self.preprocess(test_image)
        output = np.array(self.predict(test_image))
        return output.shape

    def validate_model_classes(self) -> None:
        """Check that the model output size matches the configured number of classes.

        Raises:
            ValueError: If index 3 of the model output shape differs from `self.num_classes`.
        """
        output_shape = self.get_model_output_shape()
        num_classes = output_shape[3]
        # Explicit comparison rather than `assert`, which is stripped under `python -O`.
        if num_classes != self.num_classes:
            raise ValueError(
                f"Number of classes in model ({num_classes}) does not match the number of classes in the environment ({self.num_classes})"
            )
Methods:
__init__
__init__(*args, **kwargs)

Initialize the model, setting whether it is multiclass or not.

Source code in inference/core/models/classification_base.py
45
46
47
48
def __init__(self, *args, **kwargs):
    """Initialize the model, setting whether it is multiclass or not.

    Args:
        *args: Positional arguments forwarded unchanged to the parent initializer.
        **kwargs: Keyword arguments forwarded unchanged to the parent initializer.
    """
    super().__init__(*args, **kwargs)
    # The flag comes from the "MULTICLASS" key of self.environment; a missing key means False.
    # presumably self.environment is populated by the parent initializer — TODO confirm
    self.multiclass = self.environment.get("MULTICLASS", False)
draw_predictions
draw_predictions(inference_request, inference_response)

Draw prediction visuals on an image.

This method overlays the predictions on the input image, including drawing rectangles and text to visualize the predicted classes.

Parameters:

Name Type Description Default
inference_request

The request object containing the image and parameters.

required
inference_response

The response object containing the predictions and other details.

required

Returns:

Name Type Description
bytes

The bytes of the visualized image in JPEG format.

Source code in inference/core/models/classification_base.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def draw_predictions(self, inference_request, inference_response):
    """Draw prediction visuals on an image.

    This method overlays the predictions on the input image: an outline
    around the whole image plus one colored text label per prediction.

    When ``inference_response.predictions`` is a list, only its first entry
    is drawn, labelled ``"<class_id> - <class_name> <confidence>"``. An empty
    list leaves the image unannotated. Otherwise ``predictions`` is treated
    as a mapping of class name to prediction, and one label per class is
    stacked from the top of the image in descending confidence order.

    Args:
        inference_request: The request object containing the image and parameters.
        inference_response: The response object containing the predictions and other details.

    Returns:
        bytes: The bytes of the visualized image in JPEG format.
    """
    default_color = "#4892EA"
    image = Image.fromarray(load_image_rgb(inference_request.image))
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()

    # PIL reports size as (width, height) and rectangle() expects
    # [x0, y0, x1, y1], so the frame's far corner is (width, height).
    width, height = image.size
    frame = [0, 0, width, height]

    def paste_label(text, color, top):
        """Paste a colored text label at (0, top) and return its height."""
        text_size = font.getbbox(text)
        # label size = text extent + 10px margins on every side
        button_size = (text_size[2] + 20, text_size[3] + 20)
        button_img = Image.new("RGBA", button_size, color)
        ImageDraw.Draw(button_img).text(
            (10, 10), text, font=font, fill=(255, 255, 255, 255)
        )
        image.paste(button_img, (0, top))
        return button_size[1]

    predictions = inference_response.predictions
    if isinstance(predictions, list):
        # Guard against an empty list so there is no IndexError on [0].
        if predictions:
            prediction = predictions[0]
            color = self.colors.get(prediction.class_name, default_color)
            draw.rectangle(
                frame,
                outline=color,
                width=inference_request.visualization_stroke_width,
            )
            text = f"{prediction.class_id} - {prediction.class_name} {prediction.confidence:.2f}"
            paste_label(text, color, 0)
    else:
        if len(predictions) > 0:
            draw.rectangle(
                frame,
                outline=default_color,
                width=inference_request.visualization_stroke_width,
            )
        row = 0
        ranked = sorted(
            predictions.items(), key=lambda item: item[1].confidence, reverse=True
        )
        for cls_name, pred in ranked:
            color = self.colors.get(cls_name, default_color)
            text = f"{cls_name} {pred.confidence:.2f}"
            # stack each label directly below the previous one
            row += paste_label(text, color, row)

    buffered = BytesIO()
    # JPEG has no alpha channel; normalize to RGB before encoding
    image = image.convert("RGB")
    image.save(buffered, format="JPEG")
    return buffered.getvalue()
get_infer_bucket_file_list
get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name Type Description
list list

A list of required files for inference, e.g., ["environment.json"].

Source code in inference/core/models/classification_base.py
123
124
125
126
127
128
129
def get_infer_bucket_file_list(self) -> list:
    """Get the list of required files for inference.

    Returns: