
clip_model

Clip

Bases: OnnxRoboflowCoreModel

Roboflow ONNX ClipModel model.

This class is responsible for handling the ONNX ClipModel model, including loading the model, preprocessing the input, and performing inference.

Attributes:

    visual_onnx_session (InferenceSession): ONNX Runtime session for visual inference.
    textual_onnx_session (InferenceSession): ONNX Runtime session for textual inference.
    resolution (int): The resolution of the input image.
    clip_preprocess (function): Function to preprocess the image.

Source code in inference/models/clip/clip_model.py
class Clip(OnnxRoboflowCoreModel):
    """Roboflow ONNX ClipModel model.

    This class is responsible for handling the ONNX ClipModel model, including
    loading the model, preprocessing the input, and performing inference.

    Attributes:
        visual_onnx_session (onnxruntime.InferenceSession): ONNX Runtime session for visual inference.
        textual_onnx_session (onnxruntime.InferenceSession): ONNX Runtime session for textual inference.
        resolution (int): The resolution of the input image.
        clip_preprocess (function): Function to preprocess the image.
    """

    def __init__(
        self,
        *args,
        model_id: str = CLIP_MODEL_ID,
        onnxruntime_execution_providers: List[
            str
        ] = get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS),
        **kwargs,
    ):
        """Initializes the Clip with the given arguments and keyword arguments."""
        self.onnxruntime_execution_providers = onnxruntime_execution_providers
        t1 = perf_counter()
        super().__init__(*args, model_id=model_id, **kwargs)
        # Create ONNX Runtime sessions with a list of execution providers in priority order. ORT attempts to load providers until one is successful. This keeps the code identical across devices.
        self.log("Creating inference sessions")
        self.visual_onnx_session = onnxruntime.InferenceSession(
            self.cache_file("visual.onnx"),
            providers=self.onnxruntime_execution_providers,
        )

        self.textual_onnx_session = onnxruntime.InferenceSession(
            self.cache_file("textual.onnx"),
            providers=self.onnxruntime_execution_providers,
        )

        if REQUIRED_ONNX_PROVIDERS:
            available_providers = onnxruntime.get_available_providers()
            for provider in REQUIRED_ONNX_PROVIDERS:
                if provider not in available_providers:
                    raise OnnxProviderNotAvailable(
                        f"Required ONNX Execution Provider {provider} is not availble. Check that you are using the correct docker image on a supported device."
                    )

        self.resolution = self.visual_onnx_session.get_inputs()[0].shape[2]

        self.clip_preprocess = clip.clip._transform(self.resolution)
        self.log(f"CLIP model loaded in {perf_counter() - t1:.2f} seconds")
        self.task_type = "embedding"

    def compare(
        self,
        subject: Any,
        prompt: Any,
        subject_type: str = "image",
        prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
        **kwargs,
    ) -> Union[List[float], Dict[str, float]]:
        """
        Compares the subject with the prompt to calculate similarity scores.

        Args:
            subject (Any): The subject data to be compared. Can be either an image or text.
            prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.
            subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
            prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Can be "image", "text", list of these types, or a dictionary containing these types. Defaults to "text".
            **kwargs: Additional keyword arguments.

        Returns:
            Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

        Raises:
            ValueError: If subject_type or prompt_type is neither "image" nor "text".
            ValueError: If the number of prompts exceeds the maximum batch size.
        """

        if subject_type == "image":
            subject_embeddings = self.embed_image(subject)
        elif subject_type == "text":
            subject_embeddings = self.embed_text(subject)
        else:
            raise ValueError(
                f"subject_type must be either 'image' or 'text', but got {subject_type}"
            )

        if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
            prompt_keys = prompt.keys()
            prompt = [prompt[k] for k in prompt_keys]
            prompt_obj = "dict"
        else:
            if not isinstance(prompt, list):
                prompt = [prompt]
            prompt_obj = "list"

        if len(prompt) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
            )

        if prompt_type == "image":
            prompt_embeddings = self.embed_image(prompt)
        elif prompt_type == "text":
            prompt_embeddings = self.embed_text(prompt)
        else:
            raise ValueError(
                f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
            )

        similarities = [
            cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
        ]

        if prompt_obj == "dict":
            similarities = dict(zip(prompt_keys, similarities))

        return similarities

    def make_compare_response(
        self, similarities: Union[List[float], Dict[str, float]]
    ) -> ClipCompareResponse:
        """
        Creates a ClipCompareResponse object from the provided similarity data.

        Args:
            similarities (Union[List[float], Dict[str, float]]): A list or dictionary containing similarity scores.

        Returns:
            ClipCompareResponse: An instance of the ClipCompareResponse with the given similarity scores.

        Example:
            Assuming `ClipCompareResponse` expects a dictionary of string-float pairs:

            >>> make_compare_response({"image1": 0.98, "image2": 0.76})
            ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})
        """
        response = ClipCompareResponse(similarity=similarities)
        return response

    def embed_image(
        self,
        image: Any,
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds an image or a list of images using the Clip model.

        Args:
            image (Any): The image or list of images to be embedded. Each image can be in any format accepted by the preproc_image method.
            **kwargs: Additional keyword arguments.

        Returns:
            np.ndarray: The embeddings of the image(s) as a numpy array.

        Raises:
            ValueError: If the number of images in the list exceeds the maximum batch size.

        Notes:
            The function uses the visual ONNX session to compute the embeddings.
        """
        t1 = perf_counter()

        if isinstance(image, list):
            if len(image) > CLIP_MAX_BATCH_SIZE:
                raise ValueError(
                    f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
                )
            imgs = [self.preproc_image(i) for i in image]
            img_in = np.concatenate(imgs, axis=0)
        else:
            img_in = self.preproc_image(image)

        onnx_input_image = {self.visual_onnx_session.get_inputs()[0].name: img_in}
        embeddings = self.visual_onnx_session.run(None, onnx_input_image)[0]

        return embeddings

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray]:
        onnx_input_image = {self.visual_onnx_session.get_inputs()[0].name: img_in}
        embeddings = self.visual_onnx_session.run(None, onnx_input_image)[0]
        return (embeddings,)

    def make_embed_image_response(
        self, embeddings: np.ndarray
    ) -> ClipEmbeddingResponse:
        """
        Converts the given embeddings into a ClipEmbeddingResponse object.

        Args:
            embeddings (np.ndarray): A numpy array containing the embeddings for an image or images.

        Returns:
            ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

        Example:
            >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
            >>> make_embed_image_response(embeddings_array)
            ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
        """
        response = ClipEmbeddingResponse(embeddings=embeddings.tolist())

        return response

    def embed_text(
        self,
        text: Union[str, List[str]],
        **kwargs,
    ) -> np.ndarray:
        """
        Embeds a text or a list of texts using the Clip model.

        Args:
            text (Union[str, List[str]]): The text string or list of text strings to be embedded.
            **kwargs: Additional keyword arguments.

        Returns:
            np.ndarray: The embeddings of the text or texts as a numpy array.

        Raises:
            ValueError: If the number of text strings in the list exceeds the maximum batch size.

        Notes:
            The function uses the textual ONNX session to compute embeddings, processing the texts in batches of up to CLIP_MAX_BATCH_SIZE.
        """
        if isinstance(text, list):
            texts = text
        else:
            texts = [text]
        results = []
        for texts_batch in create_batches(
            sequence=texts, batch_size=CLIP_MAX_BATCH_SIZE
        ):
            tokenized_batch = clip.tokenize(texts_batch).numpy().astype(np.int32)
            onnx_input_text = {
                self.textual_onnx_session.get_inputs()[0].name: tokenized_batch
            }
            embeddings = self.textual_onnx_session.run(None, onnx_input_text)[0]
            results.append(embeddings)
        return np.concatenate(results, axis=0)

    def make_embed_text_response(self, embeddings: np.ndarray) -> ClipEmbeddingResponse:
        """
        Converts the given text embeddings into a ClipEmbeddingResponse object.

        Args:
            embeddings (np.ndarray): A numpy array containing the embeddings for a text or texts.

        Returns:
            ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

        Example:
            >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
            >>> make_embed_text_response(embeddings_array)
            ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
        """
        response = ClipEmbeddingResponse(embeddings=embeddings.tolist())
        return response

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: The list of file names.
        """
        return ["textual.onnx", "visual.onnx"]

    def infer_from_request(
        self, request: ClipInferenceRequest
    ) -> ClipEmbeddingResponse:
        """Routes the request to the appropriate inference function.

        Args:
            request (ClipInferenceRequest): The request object containing the inference details.

        Returns:
            ClipEmbeddingResponse: The response object containing the embeddings.
        """
        t1 = perf_counter()
        if isinstance(request, ClipImageEmbeddingRequest):
            infer_func = self.embed_image
            make_response_func = self.make_embed_image_response
        elif isinstance(request, ClipTextEmbeddingRequest):
            infer_func = self.embed_text
            make_response_func = self.make_embed_text_response
        elif isinstance(request, ClipCompareRequest):
            infer_func = self.compare
            make_response_func = self.make_compare_response
        else:
            raise ValueError(
                f"Request type {type(request)} is not a valid ClipInferenceRequest"
            )
        data = infer_func(**request.dict())
        response = make_response_func(data)
        response.time = perf_counter() - t1
        return response

    def make_response(self, embeddings, *args, **kwargs) -> InferenceResponse:
        return [self.make_embed_image_response(embeddings)]

    def postprocess(
        self,
        predictions: Tuple[np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Any:
        return [self.make_embed_image_response(predictions[0])]

    def infer(self, image: Any, **kwargs) -> Any:
        """Embeds an image
        - image:
            can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        """
        return super().infer(image, **kwargs)

    def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
        """Preprocesses an inference request image.

        Args:
            image (InferenceRequestImage): The object containing information necessary to load the image for inference.

        Returns:
            np.ndarray: A numpy array of the preprocessed image pixel data.
        """
        pil_image = Image.fromarray(load_image_rgb(image))
        preprocessed_image = self.clip_preprocess(pil_image)

        img_in = np.expand_dims(preprocessed_image, axis=0)

        return img_in.astype(np.float32)

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        return self.preproc_image(image), PreprocessReturnMetadata({})
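
Example usage (a minimal sketch, not taken from the source: it assumes the package is installed, weights for the default CLIP_MODEL_ID can be downloaded and cached, and that a raw BGR numpy array is an acceptable image format, as listed for infer() below):

import numpy as np

from inference.models.clip.clip_model import Clip

clip_model = Clip()  # downloads and caches visual.onnx / textual.onnx on first use

# A stand-in BGR frame; any format accepted by preproc_image works here.
image = np.zeros((480, 640, 3), dtype=np.uint8)

image_embedding = clip_model.embed_image(image)              # shape (1, embedding_dim)
text_embeddings = clip_model.embed_text(["a dog", "a cat"])  # shape (2, embedding_dim)

# Cosine similarities between the image and each prompt, in prompt order.
scores = clip_model.compare(subject=image, prompt=["a dog", "a cat"])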

__init__(*args, model_id=CLIP_MODEL_ID, onnxruntime_execution_providers=get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS), **kwargs)

Initializes the Clip model with the given arguments and keyword arguments.

Source code in inference/models/clip/clip_model.py
def __init__(
    self,
    *args,
    model_id: str = CLIP_MODEL_ID,
    onnxruntime_execution_providers: List[
        str
    ] = get_onnxruntime_execution_providers(ONNXRUNTIME_EXECUTION_PROVIDERS),
    **kwargs,
):
    """Initializes the Clip with the given arguments and keyword arguments."""
    self.onnxruntime_execution_providers = onnxruntime_execution_providers
    t1 = perf_counter()
    super().__init__(*args, model_id=model_id, **kwargs)
    # Create ONNX Runtime sessions with a list of execution providers in priority order. ORT attempts to load providers until one is successful. This keeps the code identical across devices.
    self.log("Creating inference sessions")
    self.visual_onnx_session = onnxruntime.InferenceSession(
        self.cache_file("visual.onnx"),
        providers=self.onnxruntime_execution_providers,
    )

    self.textual_onnx_session = onnxruntime.InferenceSession(
        self.cache_file("textual.onnx"),
        providers=self.onnxruntime_execution_providers,
    )

    if REQUIRED_ONNX_PROVIDERS:
        available_providers = onnxruntime.get_available_providers()
        for provider in REQUIRED_ONNX_PROVIDERS:
            if provider not in available_providers:
                raise OnnxProviderNotAvailable(
                    f"Required ONNX Execution Provider {provider} is not availble. Check that you are using the correct docker image on a supported device."
                )

    self.resolution = self.visual_onnx_session.get_inputs()[0].shape[2]

    self.clip_preprocess = clip.clip._transform(self.resolution)
    self.log(f"CLIP model loaded in {perf_counter() - t1:.2f} seconds")
    self.task_type = "embedding"

compare(subject, prompt, subject_type='image', prompt_type='text', **kwargs)

Compares the subject with the prompt to calculate similarity scores.

Parameters:

    subject (Any): The subject data to be compared. Can be either an image or text. Required.
    prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), a list of values, or a dictionary of values. Required.
    subject_type (str): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
    prompt_type (Union[str, List[str], Dict[str, Any]]): Specifies the type of the prompt data. Can be "image", "text", a list of these types, or a dictionary containing these types. Defaults to "text".
    **kwargs: Additional keyword arguments.

Returns:

    Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

Raises:

    ValueError: If subject_type or prompt_type is neither "image" nor "text".
    ValueError: If the number of prompts exceeds the maximum batch size.

Source code in inference/models/clip/clip_model.py
def compare(
    self,
    subject: Any,
    prompt: Any,
    subject_type: str = "image",
    prompt_type: Union[str, List[str], Dict[str, Any]] = "text",
    **kwargs,
) -> Union[List[float], Dict[str, float]]:
    """
    Compares the subject with the prompt to calculate similarity scores.

    Args:
        subject (Any): The subject data to be compared. Can be either an image or text.
        prompt (Any): The prompt data to be compared against the subject. Can be a single value (image/text), list of values, or dictionary of values.
        subject_type (str, optional): Specifies the type of the subject data. Must be either "image" or "text". Defaults to "image".
        prompt_type (Union[str, List[str], Dict[str, Any]], optional): Specifies the type of the prompt data. Can be "image", "text", list of these types, or a dictionary containing these types. Defaults to "text".
        **kwargs: Additional keyword arguments.

    Returns:
        Union[List[float], Dict[str, float]]: A list or dictionary containing cosine similarity scores between the subject and prompt(s). If prompt is a dictionary, returns a dictionary with keys corresponding to the original prompt dictionary's keys.

    Raises:
        ValueError: If subject_type or prompt_type is neither "image" nor "text".
        ValueError: If the number of prompts exceeds the maximum batch size.
    """

    if subject_type == "image":
        subject_embeddings = self.embed_image(subject)
    elif subject_type == "text":
        subject_embeddings = self.embed_text(subject)
    else:
        raise ValueError(
            f"subject_type must be either 'image' or 'text', but got {subject_type}"
        )

    if isinstance(prompt, dict) and not ("type" in prompt and "value" in prompt):
        prompt_keys = prompt.keys()
        prompt = [prompt[k] for k in prompt_keys]
        prompt_obj = "dict"
    else:
        if not isinstance(prompt, list):
            prompt = [prompt]
        prompt_obj = "list"

    if len(prompt) > CLIP_MAX_BATCH_SIZE:
        raise ValueError(
            f"The maximum number of prompts that can be compared at once is {CLIP_MAX_BATCH_SIZE}"
        )

    if prompt_type == "image":
        prompt_embeddings = self.embed_image(prompt)
    elif prompt_type == "text":
        prompt_embeddings = self.embed_text(prompt)
    else:
        raise ValueError(
            f"prompt_type must be either 'image' or 'text', but got {prompt_type}"
        )

    similarities = [
        cosine_similarity(subject_embeddings, p) for p in prompt_embeddings
    ]

    if prompt_obj == "dict":
        similarities = dict(zip(prompt_keys, similarities))

    return similarities
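
Example (an illustrative sketch, not from the source; the similarity values shown are made up, and the stand-in image assumes a BGR numpy array is acceptable input):

import numpy as np

from inference.models.clip.clip_model import Clip

clip_model = Clip()
image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in BGR image

# A dict prompt returns a dict of cosine similarities keyed like the input dict.
scores = clip_model.compare(
    subject=image,
    prompt={"dog": "a photo of a dog", "cat": "a photo of a cat"},
    subject_type="image",
    prompt_type="text",
)
# e.g. {"dog": 0.27, "cat": 0.19}  (illustrative values)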

embed_image(image, **kwargs)

Embeds an image or a list of images using the Clip model.

Parameters:

    image (Any): The image or list of images to be embedded. Each image can be in any format accepted by the preproc_image method. Required.
    **kwargs: Additional keyword arguments.

Returns:

    np.ndarray: The embeddings of the image(s) as a numpy array.

Raises:

    ValueError: If the number of images in the list exceeds the maximum batch size.

Notes:

    The function uses the visual ONNX session to compute the embeddings.

Source code in inference/models/clip/clip_model.py
def embed_image(
    self,
    image: Any,
    **kwargs,
) -> np.ndarray:
    """
    Embeds an image or a list of images using the Clip model.

    Args:
        image (Any): The image or list of images to be embedded. Each image can be in any format accepted by the preproc_image method.
        **kwargs: Additional keyword arguments.

    Returns:
        np.ndarray: The embeddings of the image(s) as a numpy array.

    Raises:
        ValueError: If the number of images in the list exceeds the maximum batch size.

    Notes:
        The function uses the visual ONNX session to compute the embeddings.
    """
    t1 = perf_counter()

    if isinstance(image, list):
        if len(image) > CLIP_MAX_BATCH_SIZE:
            raise ValueError(
                f"The maximum number of images that can be embedded at once is {CLIP_MAX_BATCH_SIZE}"
            )
        imgs = [self.preproc_image(i) for i in image]
        img_in = np.concatenate(imgs, axis=0)
    else:
        img_in = self.preproc_image(image)

    onnx_input_image = {self.visual_onnx_session.get_inputs()[0].name: img_in}
    embeddings = self.visual_onnx_session.run(None, onnx_input_image)[0]

    return embeddings
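
Example (a minimal sketch, assuming stand-in BGR numpy frames are acceptable input; the list must not exceed CLIP_MAX_BATCH_SIZE):

import numpy as np

from inference.models.clip.clip_model import Clip

clip_model = Clip()

# Two stand-in frames; each item may be any format accepted by preproc_image.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(2)]

embeddings = clip_model.embed_image(frames)
print(embeddings.shape)  # (2, embedding_dim), one row per input image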

embed_text(text, **kwargs)

Embeds a text or a list of texts using the Clip model.

Parameters:

    text (Union[str, List[str]]): The text string or list of text strings to be embedded. Required.
    **kwargs: Additional keyword arguments.

Returns:

    np.ndarray: The embeddings of the text or texts as a numpy array.

Raises:

    ValueError: If the number of text strings in the list exceeds the maximum batch size.

Notes:

    The function uses the textual ONNX session to compute embeddings, processing the texts in batches of up to CLIP_MAX_BATCH_SIZE.

Source code in inference/models/clip/clip_model.py
def embed_text(
    self,
    text: Union[str, List[str]],
    **kwargs,
) -> np.ndarray:
    """
    Embeds a text or a list of texts using the Clip model.

    Args:
        text (Union[str, List[str]]): The text string or list of text strings to be embedded.
        **kwargs: Additional keyword arguments.

    Returns:
        np.ndarray: The embeddings of the text or texts as a numpy array.

    Raises:
        ValueError: If the number of text strings in the list exceeds the maximum batch size.

    Notes:
        The function uses the textual ONNX session to compute embeddings, processing the texts in batches of up to CLIP_MAX_BATCH_SIZE.
    """
    if isinstance(text, list):
        texts = text
    else:
        texts = [text]
    results = []
    for texts_batch in create_batches(
        sequence=texts, batch_size=CLIP_MAX_BATCH_SIZE
    ):
        tokenized_batch = clip.tokenize(texts_batch).numpy().astype(np.int32)
        onnx_input_text = {
            self.textual_onnx_session.get_inputs()[0].name: tokenized_batch
        }
        embeddings = self.textual_onnx_session.run(None, onnx_input_text)[0]
        results.append(embeddings)
    return np.concatenate(results, axis=0)
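
Example (a minimal sketch; the texts are tokenized with clip.tokenize and run through the textual ONNX session in batches of up to CLIP_MAX_BATCH_SIZE):

from inference.models.clip.clip_model import Clip

clip_model = Clip()

# One embedding row per input string, concatenated across batches.
embeddings = clip_model.embed_text(["a photo of a dog", "a photo of a cat", "a truck"])
print(embeddings.shape)  # (3, embedding_dim)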

get_infer_bucket_file_list()

Gets the list of files required for inference.

Returns:

    List[str]: The list of file names.

Source code in inference/models/clip/clip_model.py
def get_infer_bucket_file_list(self) -> List[str]:
    """Gets the list of files required for inference.

    Returns:
        List[str]: The list of file names.
    """
    return ["textual.onnx", "visual.onnx"]

infer(image, **kwargs)

Embeds an image. The image can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte string, etc.

Source code in inference/models/clip/clip_model.py
def infer(self, image: Any, **kwargs) -> Any:
    """Embeds an image
    - image:
        can be a BGR numpy array, filepath, InferenceRequestImage, PIL Image, byte-string, etc.
    """
    return super().infer(image, **kwargs)
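
Example (a sketch under the assumption that the base class's infer() pipeline runs preprocess, predict, and postprocess in order, so the result is the list produced by postprocess):

import numpy as np

from inference.models.clip.clip_model import Clip

clip_model = Clip()
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in BGR image

responses = clip_model.infer(frame)
# postprocess wraps the embeddings in a ClipEmbeddingResponse, returned inside a list.
embedding = responses[0].embeddings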

infer_from_request(request)

Routes the request to the appropriate inference function.

Parameters:

    request (ClipInferenceRequest): The request object containing the inference details. Required.

Returns:

    ClipEmbeddingResponse: The response object containing the embeddings.

Source code in inference/models/clip/clip_model.py
def infer_from_request(
    self, request: ClipInferenceRequest
) -> ClipEmbeddingResponse:
    """Routes the request to the appropriate inference function.

    Args:
        request (ClipInferenceRequest): The request object containing the inference details.

    Returns:
        ClipEmbeddingResponse: The response object containing the embeddings.
    """
    t1 = perf_counter()
    if isinstance(request, ClipImageEmbeddingRequest):
        infer_func = self.embed_image
        make_response_func = self.make_embed_image_response
    elif isinstance(request, ClipTextEmbeddingRequest):
        infer_func = self.embed_text
        make_response_func = self.make_embed_text_response
    elif isinstance(request, ClipCompareRequest):
        infer_func = self.compare
        make_response_func = self.make_compare_response
    else:
        raise ValueError(
            f"Request type {type(request)} is not a valid ClipInferenceRequest"
        )
    data = infer_func(**request.dict())
    response = make_response_func(data)
    response.time = perf_counter() - t1
    return response
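
Example (a hypothetical sketch: the import path for the request models and the `text` field of ClipTextEmbeddingRequest are assumptions, not confirmed by this page):

from inference.core.entities.requests.clip import ClipTextEmbeddingRequest  # assumed path
from inference.models.clip.clip_model import Clip

clip_model = Clip()

# The request is routed to embed_text and wrapped by make_embed_text_response;
# response.time is populated with the elapsed wall-clock time.
request = ClipTextEmbeddingRequest(text=["a photo of a dog"])  # `text` field assumed
response = clip_model.infer_from_request(request)
print(response.embeddings, response.time)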

make_compare_response(similarities)

Creates a ClipCompareResponse object from the provided similarity data.

Parameters:

    similarities (Union[List[float], Dict[str, float]]): A list or dictionary containing similarity scores. Required.

Returns:

    ClipCompareResponse: An instance of the ClipCompareResponse with the given similarity scores.

Example:

    Assuming ClipCompareResponse expects a dictionary of string-float pairs:

    >>> make_compare_response({"image1": 0.98, "image2": 0.76})
    ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})

Source code in inference/models/clip/clip_model.py
def make_compare_response(
    self, similarities: Union[List[float], Dict[str, float]]
) -> ClipCompareResponse:
    """
    Creates a ClipCompareResponse object from the provided similarity data.

    Args:
        similarities (Union[List[float], Dict[str, float]]): A list or dictionary containing similarity scores.

    Returns:
        ClipCompareResponse: An instance of the ClipCompareResponse with the given similarity scores.

    Example:
        Assuming `ClipCompareResponse` expects a dictionary of string-float pairs:

        >>> make_compare_response({"image1": 0.98, "image2": 0.76})
        ClipCompareResponse(similarity={"image1": 0.98, "image2": 0.76})
    """
    response = ClipCompareResponse(similarity=similarities)
    return response

make_embed_image_response(embeddings)

Converts the given embeddings into a ClipEmbeddingResponse object.

Parameters:

    embeddings (ndarray): A numpy array containing the embeddings for an image or images. Required.

Returns:

    ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

Example:

    >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
    >>> make_embed_image_response(embeddings_array)
    ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])

Source code in inference/models/clip/clip_model.py
def make_embed_image_response(
    self, embeddings: np.ndarray
) -> ClipEmbeddingResponse:
    """
    Converts the given embeddings into a ClipEmbeddingResponse object.

    Args:
        embeddings (np.ndarray): A numpy array containing the embeddings for an image or images.

    Returns:
        ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

    Example:
        >>> embeddings_array = np.array([[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
        >>> make_embed_image_response(embeddings_array)
        ClipEmbeddingResponse(embeddings=[[0.5, 0.3, 0.2], [0.1, 0.9, 0.0]])
    """
    response = ClipEmbeddingResponse(embeddings=embeddings.tolist())

    return response

make_embed_text_response(embeddings)

Converts the given text embeddings into a ClipEmbeddingResponse object.

Parameters:

    embeddings (ndarray): A numpy array containing the embeddings for a text or texts. Required.

Returns:

    ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

Example:

    >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
    >>> make_embed_text_response(embeddings_array)
    ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])

Source code in inference/models/clip/clip_model.py
def make_embed_text_response(self, embeddings: np.ndarray) -> ClipEmbeddingResponse:
    """
    Converts the given text embeddings into a ClipEmbeddingResponse object.

    Args:
        embeddings (np.ndarray): A numpy array containing the embeddings for a text or texts.

    Returns:
        ClipEmbeddingResponse: An instance of the ClipEmbeddingResponse with the provided embeddings converted to a list.

    Example:
        >>> embeddings_array = np.array([[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
        >>> make_embed_text_response(embeddings_array)
        ClipEmbeddingResponse(embeddings=[[0.8, 0.1, 0.1], [0.4, 0.5, 0.1]])
    """
    response = ClipEmbeddingResponse(embeddings=embeddings.tolist())
    return response

preproc_image(image)

Preprocesses an inference request image.

Parameters:

    image (InferenceRequestImage): The object containing information necessary to load the image for inference. Required.

Returns:

    np.ndarray: A numpy array of the preprocessed image pixel data.

Source code in inference/models/clip/clip_model.py
def preproc_image(self, image: InferenceRequestImage) -> np.ndarray:
    """Preprocesses an inference request image.

    Args:
        image (InferenceRequestImage): The object containing information necessary to load the image for inference.

    Returns:
        np.ndarray: A numpy array of the preprocessed image pixel data.
    """
    pil_image = Image.fromarray(load_image_rgb(image))
    preprocessed_image = self.clip_preprocess(pil_image)

    img_in = np.expand_dims(preprocessed_image, axis=0)

    return img_in.astype(np.float32)
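
Example (a minimal sketch, assuming a raw BGR numpy frame is accepted by load_image_rgb, per the formats listed for infer()):

import numpy as np

from inference.models.clip.clip_model import Clip

clip_model = Clip()

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in BGR image

# load_image_rgb converts to RGB, then the CLIP transform resizes, crops, and normalizes.
img_in = clip_model.preproc_image(frame)
print(img_in.shape, img_in.dtype)  # (1, 3, resolution, resolution) float32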