Skip to content

Doctr model

DocTR

Bases: RoboflowCoreModel

Source code in inference/models/doctr/doctr_model.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
class DocTR(RoboflowCoreModel):
    """OCR model that chains a DocTR text detector and a text recognizer.

    The constructor builds one ``DocTRDet`` and one ``DocTRRec`` sub-model,
    loads their ``model.pt`` weights and combines them into a single
    ``ocr_predictor`` stored on ``self.model``.
    """

    def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn", **kwargs):
        """Initializes the DocTR model.

        Note that this constructor does not call ``super().__init__``; it sets
        the identifying attributes itself.

        Args:
            *args: Variable length argument list (not read by this constructor).
            model_id: Identifier stored unchanged in ``self.endpoint``.
            **kwargs: Arbitrary keyword arguments; only ``api_key`` is read.
        """
        api_key = kwargs.get("api_key")
        self.api_key = api_key
        self.dataset_id = "doctr"
        self.version_id = "default"
        self.endpoint = model_id
        # NOTE(review): the lower-cased value is never read afterwards, so
        # ``self.endpoint`` keeps the caller's casing - confirm that is intended.
        model_id = model_id.lower()

        self.det_model = DocTRDet(api_key=api_key)
        self.rec_model = DocTRRec(api_key=api_key)

        os.makedirs(f"{MODEL_CACHE_DIR}/doctr/models/", exist_ok=True)

        # Copy each sub-model's weights into the shared doctr/models directory,
        # named after the sub-model's version id.
        detector_weights_path = (
            f"{MODEL_CACHE_DIR}/doctr/models/{self.det_model.version_id}.pt"
        )
        shutil.copyfile(
            f"{MODEL_CACHE_DIR}/doctr_det/{self.det_model.version_id}/model.pt",
            detector_weights_path,
        )
        recognizer_weights_path = (
            f"{MODEL_CACHE_DIR}/doctr/models/{self.rec_model.version_id}.pt"
        )
        shutil.copyfile(
            f"{MODEL_CACHE_DIR}/doctr_rec/{self.rec_model.version_id}/model.pt",
            recognizer_weights_path,
        )

        # Architectures are created without pretrained weights; the state
        # dicts come exclusively from the files copied above.
        det_model = db_resnet50(pretrained=False, pretrained_backbone=False)
        det_model.load_state_dict(
            torch.load(detector_weights_path, map_location=DEVICE, weights_only=True)
        )

        reco_model = crnn_vgg16_bn(pretrained=False, pretrained_backbone=False)
        reco_model.load_state_dict(
            torch.load(recognizer_weights_path, map_location=DEVICE, weights_only=True)
        )

        self.model = ocr_predictor(
            det_arch=det_model,
            reco_arch=reco_model,
            pretrained=False,
        )
        self.task_type = "ocr"

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clear the cache of both the detection and the recognition sub-model.

        Args:
            delete_from_disk: Forwarded unchanged to each sub-model's
                ``clear_cache``.
        """
        self.det_model.clear_cache(delete_from_disk=delete_from_disk)
        self.rec_model.clear_cache(delete_from_disk=delete_from_disk)

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """
        DocTR pre-processes images as part of its inference pipeline.

        Thus, no preprocessing is required here: the method does nothing and
        returns ``None`` regardless of the annotated return type.
        """
        pass

    def infer_from_request(
        self, request: DoctrOCRInferenceRequest
    ) -> Union[OCRInferenceResponse, List]:
        """Run OCR for a request that carries one image or a list of images.

        Args:
            request: The inference request. When ``request.image`` is a list,
                every element is processed separately.

        Returns:
            A list with one response per image when ``request.image`` is a
            list, otherwise a single response.
        """
        if isinstance(request.image, list):
            response = []
            # One shallow copy is reused; its ``image`` field is swapped for
            # each element before the (synchronous) call.
            request_copy = copy.copy(request)
            for image in request.image:
                request_copy.image = image
                response.append(self.single_request(request=request_copy))
            return response
        return self.single_request(request)

    def single_request(self, request: DoctrOCRInferenceRequest) -> OCRInferenceResponse:
        """Run OCR for a single-image request and time the call.

        Args:
            request: The inference request; its fields are passed to ``infer``
                as keyword arguments.

        Returns:
            OCRInferenceResponse: Always carries ``result`` and ``time``;
            ``image`` and ``predictions`` are included only when
            ``request.generate_bounding_boxes`` is truthy.
        """
        t1 = perf_counter()
        result = self.infer(**request.dict())
        # ``infer`` returns a bare string when no boxes are requested;
        # normalise to a 3-tuple so both cases index the same way.
        if not isinstance(result, tuple):
            result = (result, None, None)
        # maintaining backwards compatibility with previous implementation
        if request.generate_bounding_boxes:
            return OCRInferenceResponse(
                result=result[0],
                image=result[1],
                predictions=result[2],
                time=perf_counter() - t1,
            )
        return OCRInferenceResponse(
            result=result[0],
            time=perf_counter() - t1,
        )

    def infer(
        self, image: Any, **kwargs
    ) -> Union[
        str, Tuple[str, InferenceResponseImage, List[ObjectDetectionPrediction]]
    ]:
        """Run OCR on a single image.

        Args:
            image: The input handed to ``load_image``; can be a BGR numpy
                array, filepath, InferenceRequestImage, PIL Image,
                byte-string, etc.
            **kwargs: Remaining request fields. Only
                ``generate_bounding_boxes`` is read (default ``False``).

        Returns:
            The recognised words joined by single spaces when
            ``generate_bounding_boxes`` is falsy; otherwise a tuple
            ``(text, InferenceResponseImage, predictions)`` holding one
            ``ObjectDetectionPrediction`` per recognised word.
        """
        img = load_image(image)

        # Save into a private temporary directory instead of re-opening a
        # still-open NamedTemporaryFile by name, which is not possible on
        # Windows. The directory and the file are removed on exit.
        with tempfile.TemporaryDirectory() as tmp_dir:
            image_path = os.path.join(tmp_dir, "image.jpg")
            Image.fromarray(img[0]).save(image_path)
            doc = DocumentFile.from_images([image_path])
            exported = self.model(doc).export()

        page = exported["pages"][0]
        blocks = page["blocks"]
        page_dimensions = page["dimensions"]

        words = [
            word
            for block in blocks
            for line in block["lines"]
            for word in line["words"]
        ]

        text = " ".join([word["value"] for word in words])
        # maintaining backwards compatibility with previous implementation
        if not kwargs.get("generate_bounding_boxes", False):
            return text

        bounding_boxes = [
            _geometry_to_bbox(page_dimensions, word["geometry"]) for word in words
        ]
        # Each bbox is indexed as (x_min, y_min, x_max, y_max); predictions
        # are expressed as centre point plus width/height.
        objects = [
            ObjectDetectionPrediction(
                **{
                    "x": bbox[0] + (bbox[2] - bbox[0]) // 2,
                    "y": bbox[1] + (bbox[3] - bbox[1]) // 2,
                    "width": bbox[2] - bbox[0],
                    "height": bbox[3] - bbox[1],
                    "confidence": float(word["objectness_score"]),
                    "class": word["value"],
                    "class_id": 0,
                    "detection_id": str(uuid.uuid4()),
                }
            )
            for word, bbox in zip(words, bounding_boxes)
        ]
        image_height, image_width = img[0].shape[:2]
        return (
            text,
            InferenceResponseImage(width=image_width, height=image_height),
            objects,
        )

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, e.g., ["model.pt"].
        """
        return ["model.pt"]

__init__(*args, model_id='doctr_rec/crnn_vgg16_bn', **kwargs)

Initializes the DocTR model.

Parameters:

Name Type Description Default
*args

Variable length argument list.

()
**kwargs

Arbitrary keyword arguments.

{}
Source code in inference/models/doctr/doctr_model.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn", **kwargs):
    """Set up the combined DocTR OCR predictor.

    Creates the detection and recognition sub-models, copies their
    ``model.pt`` weights into the shared ``doctr/models`` cache directory,
    loads both state dicts and stores the resulting predictor on
    ``self.model``.

    Args:
        *args: Variable length argument list (not read here).
        model_id: Identifier stored unchanged in ``self.endpoint``.
        **kwargs: Arbitrary keyword arguments; only ``api_key`` is read.
    """
    api_key = kwargs.get("api_key")
    self.api_key = api_key
    self.dataset_id = "doctr"
    self.version_id = "default"
    self.endpoint = model_id
    # NOTE(review): the lower-cased value is not used after this line.
    model_id = model_id.lower()

    self.det_model = DocTRDet(api_key=api_key)
    self.rec_model = DocTRRec(api_key=api_key)

    os.makedirs(f"{MODEL_CACHE_DIR}/doctr/models/", exist_ok=True)

    # Detector weights: cache location of the sub-model -> shared directory.
    det_version = self.det_model.version_id
    detector_weights_path = f"{MODEL_CACHE_DIR}/doctr/models/{det_version}.pt"
    detector_source = f"{MODEL_CACHE_DIR}/doctr_det/{det_version}/model.pt"
    shutil.copyfile(detector_source, detector_weights_path)

    # Recognizer weights: same scheme.
    rec_version = self.rec_model.version_id
    recognizer_weights_path = f"{MODEL_CACHE_DIR}/doctr/models/{rec_version}.pt"
    recognizer_source = f"{MODEL_CACHE_DIR}/doctr_rec/{rec_version}/model.pt"
    shutil.copyfile(recognizer_source, recognizer_weights_path)

    # Build each architecture without pretrained weights, then load the
    # copied state dict into it.
    detector = db_resnet50(pretrained=False, pretrained_backbone=False)
    detector_state = torch.load(
        detector_weights_path, map_location=DEVICE, weights_only=True
    )
    detector.load_state_dict(detector_state)

    recognizer = crnn_vgg16_bn(pretrained=False, pretrained_backbone=False)
    recognizer_state = torch.load(
        recognizer_weights_path, map_location=DEVICE, weights_only=True
    )
    recognizer.load_state_dict(recognizer_state)

    self.model = ocr_predictor(
        det_arch=detector,
        reco_arch=recognizer,
        pretrained=False,
    )
    self.task_type = "ocr"

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name Type Description
list list

A list of required files for inference, e.g., ["model.pt"].

Source code in inference/models/doctr/doctr_model.py
210
211
212
213
214
215
216
def get_infer_bucket_file_list(self) -> list:
    """Report which files must be available for inference.

    Returns:
        list: A freshly built list containing the single entry "model.pt".
    """
    required_files = ["model.pt"]
    return required_files

infer(image, **kwargs)

Run inference on a provided image. The `image` argument can be a BGR numpy array, file path, InferenceRequestImage, PIL Image, byte string, etc.

Parameters:

Name Type Description Default
image Any

The image to run OCR on. Additional options such as generate_bounding_boxes are passed as keyword arguments.

required

Returns:

Name Type Description
result Union[str, Tuple[str, InferenceResponseImage, List[ObjectDetectionPrediction]]]

The recognised text on its own, or a (text, image metadata, per-word predictions) tuple when generate_bounding_boxes is set.

Source code in inference/models/doctr/doctr_model.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def infer(
    self, image: Any, **kwargs
) -> Union[
    str, Tuple[str, InferenceResponseImage, List[ObjectDetectionPrediction]]
]:
    """Run OCR on a single image.

    Args:
        image: The input handed to ``load_image``; can be a BGR numpy array,
            filepath, InferenceRequestImage, PIL Image, byte-string, etc.
        **kwargs: Remaining request fields. Only ``generate_bounding_boxes``
            is read (default ``False``).

    Returns:
        The recognised words joined by single spaces when
        ``generate_bounding_boxes`` is falsy; otherwise a tuple
        ``(text, InferenceResponseImage, predictions)`` holding one
        ``ObjectDetectionPrediction`` per recognised word.
    """
    # Function-scope import keeps this block self-contained.
    import os

    img = load_image(image)

    # Save into a private temporary directory instead of re-opening a
    # still-open NamedTemporaryFile by name, which is not possible on
    # Windows. The directory and the file are removed on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "image.jpg")
        Image.fromarray(img[0]).save(image_path)
        doc = DocumentFile.from_images([image_path])
        exported = self.model(doc).export()

    page = exported["pages"][0]
    blocks = page["blocks"]
    page_dimensions = page["dimensions"]

    words = [
        word
        for block in blocks
        for line in block["lines"]
        for word in line["words"]
    ]

    text = " ".join([word["value"] for word in words])
    # maintaining backwards compatibility with previous implementation
    if not kwargs.get("generate_bounding_boxes", False):
        return text

    bounding_boxes = [
        _geometry_to_bbox(page_dimensions, word["geometry"]) for word in words
    ]
    # Each bbox is indexed as (x_min, y_min, x_max, y_max); predictions are
    # expressed as centre point plus width/height.
    objects = [
        ObjectDetectionPrediction(
            **{
                "x": bbox[0] + (bbox[2] - bbox[0]) // 2,
                "y": bbox[1] + (bbox[3] - bbox[1]) // 2,
                "width": bbox[2] - bbox[0],
                "height": bbox[3] - bbox[1],
                "confidence": float(word["objectness_score"]),
                "class": word["value"],
                "class_id": 0,
                "detection_id": str(uuid.uuid4()),
            }
        )
        for word, bbox in zip(words, bounding_boxes)
    ]
    image_height, image_width = img[0].shape[:2]
    return (
        text,
        InferenceResponseImage(width=image_width, height=image_height),
        objects,
    )

preprocess_image(image)

DocTR pre-processes images as part of its inference pipeline.

Thus, no preprocessing is required here.

Source code in inference/models/doctr/doctr_model.py
104
105
106
107
108
109
110
def preprocess_image(self, image: Image.Image) -> Image.Image:
    """
    Intentional no-op: DocTR handles image pre-processing inside its own
    inference pipeline, so nothing is done to ``image`` here.

    The method returns ``None`` regardless of the annotated return type.
    """
    return None

DocTRDet

Bases: RoboflowCoreModel

DocTR class for document Optical Character Recognition (OCR).

Attributes:

Name Type Description
doctr

The DocTR model.

ort_session

ONNX runtime inference session.

Source code in inference/models/doctr/doctr_model.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
class DocTRDet(RoboflowCoreModel):
    """Core-model wrapper for the DocTR text-detection weights.

    The class adds no state of its own: it only supplies the default model id
    ``"doctr_det/db_resnet50"`` and declares ``model.pt`` as the single file
    required for inference. Everything else is delegated to
    ``RoboflowCoreModel``.
    """

    def __init__(self, *args, model_id: str = "doctr_det/db_resnet50", **kwargs):
        """Initializes the DocTR detection model.

        Args:
            *args: Variable length argument list, forwarded to the base class.
            model_id: Model identifier forwarded to the base class.
            **kwargs: Arbitrary keyword arguments, forwarded to the base class.
        """
        # The former bare call to ``get_infer_bucket_file_list()`` discarded
        # its result and had no effect, so it has been dropped.
        super().__init__(*args, model_id=model_id, **kwargs)

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clear this model's cache via the base-class implementation.

        Args:
            delete_from_disk: Forwarded unchanged to the base class.
        """
        super().clear_cache(delete_from_disk=delete_from_disk)

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, e.g., ["model.pt"].
        """
        return ["model.pt"]

__init__(*args, model_id='doctr_det/db_resnet50', **kwargs)

Initializes the DocTR model.

Parameters:

Name Type Description Default
*args

Variable length argument list.

()
**kwargs

Arbitrary keyword arguments.

{}
Source code in inference/models/doctr/doctr_model.py
251
252
253
254
255
256
257
258
259
260
261
def __init__(self, *args, model_id: str = "doctr_det/db_resnet50", **kwargs):
    """Initializes the DocTR detection model.

    Args:
        *args: Variable length argument list, forwarded to the base class.
        model_id: Model identifier forwarded to the base class.
        **kwargs: Arbitrary keyword arguments, forwarded to the base class.
    """
    # The former bare call to ``get_infer_bucket_file_list()`` discarded its
    # result and had no effect, so it has been dropped.
    super().__init__(*args, model_id=model_id, **kwargs)

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name Type Description
list list

A list of required files for inference, e.g., ["model.pt"].

Source code in inference/models/doctr/doctr_model.py
266
267
268
269
270
271
272
def get_infer_bucket_file_list(self) -> list:
    """Name the files the detection model needs for inference.

    Returns:
        list: A freshly built list containing the single entry "model.pt".
    """
    weight_files = ["model.pt"]
    return weight_files

DocTRRec

Bases: RoboflowCoreModel

Source code in inference/models/doctr/doctr_model.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
class DocTRRec(RoboflowCoreModel):
    """Core-model wrapper for the DocTR text-recognition weights.

    The class adds no state of its own: it only supplies the default model id
    ``"doctr_rec/crnn_vgg16_bn"`` and declares ``model.pt`` as the single file
    required for inference. Everything else is delegated to
    ``RoboflowCoreModel``.
    """

    def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn", **kwargs):
        """Initializes the DocTR recognition model.

        Args:
            *args: Variable length argument list, forwarded to the base class.
            model_id: Model identifier forwarded to the base class.
            **kwargs: Arbitrary keyword arguments, forwarded to the base class.
        """
        # The former bare call to ``get_infer_bucket_file_list()`` discarded
        # its result and had no effect, so it has been dropped.
        super().__init__(*args, model_id=model_id, **kwargs)

    def clear_cache(self, delete_from_disk: bool = True) -> None:
        """Clear this model's cache via the base-class implementation.

        Args:
            delete_from_disk: Forwarded unchanged to the base class.
        """
        super().clear_cache(delete_from_disk=delete_from_disk)

    def get_infer_bucket_file_list(self) -> list:
        """Get the list of required files for inference.

        Returns:
            list: A list of required files for inference, e.g., ["model.pt"].
        """
        return ["model.pt"]

__init__(*args, model_id='doctr_rec/crnn_vgg16_bn', **kwargs)

Initializes the DocTR model.

Parameters:

Name Type Description Default
*args

Variable length argument list.

()
**kwargs

Arbitrary keyword arguments.

{}
Source code in inference/models/doctr/doctr_model.py
220
221
222
223
224
225
226
227
228
229
def __init__(self, *args, model_id: str = "doctr_rec/crnn_vgg16_bn", **kwargs):
    """Initializes the DocTR recognition model.

    Args:
        *args: Variable length argument list, forwarded to the base class.
        model_id: Model identifier forwarded to the base class.
        **kwargs: Arbitrary keyword arguments, forwarded to the base class.
    """
    # The former bare call to ``get_infer_bucket_file_list()`` discarded its
    # result and had no effect, so it has been dropped.
    super().__init__(*args, model_id=model_id, **kwargs)

get_infer_bucket_file_list()

Get the list of required files for inference.

Returns:

Name Type Description
list list

A list of required files for inference, e.g., ["model.pt"].

Source code in inference/models/doctr/doctr_model.py
234
235
236
237
238
239
240
def get_infer_bucket_file_list(self) -> list:
    """Name the files the recognition model needs for inference.

    Returns:
        list: A freshly built list containing the single entry "model.pt".
    """
    needed = ["model.pt"]
    return needed