
Example Workflows - Workflows with Visual Language Models

Below you can find example workflows to use as inspiration when building your own apps.

Prompting Anthropic Claude with arbitrary prompt

In this example, the Anthropic Claude model is prompted with arbitrary text supplied by the user.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "unconstrained",
            "prompt": "Give me dominant color of the image",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.claude.output"
        }
    ]
}
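
The definitions on this page can be executed against a running inference server. Below is a minimal sketch of how that might look with the inference_sdk Python client - the server URL, file name, and the exact run_workflow signature are assumptions here, so adjust them to your setup.

import json

from inference_sdk import InferenceHTTPClient

# Load the workflow definition shown above (assumed to be saved locally).
with open("claude_unconstrained_workflow.json") as f:
    workflow_definition = json.load(f)

# Assumes an inference server is running locally on port 9001.
client = InferenceHTTPClient(api_url="http://localhost:9001")

result = client.run_workflow(
    specification=workflow_definition,
    images={"image": "path/to/image.jpg"},
    parameters={"api_key": "<YOUR-ANTHROPIC-API-KEY>"},
)

# One entry per input image; the key matches the output name declared above.
print(result[0]["result"])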

Using Anthropic Claude as OCR model

In this example, the Anthropic Claude model is used as an OCR system. The user only selects the task type and does not need to provide a prompt.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "ocr",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.claude.output"
        }
    ]
}

Using Anthropic Claude as Visual Question Answering system

In this example, the Anthropic Claude model is used as a Visual Question Answering (VQA) system. The user provides the question via the prompt parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "prompt"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "visual-question-answering",
            "prompt": "$inputs.prompt",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.claude.output"
        }
    ]
}

Using Anthropic Claude as Image Captioning system

In this example, the Anthropic Claude model is used as an image captioning system.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "caption",
            "api_key": "$inputs.api_key",
            "temperature": 1.0
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.claude.output"
        }
    ]
}

Using Anthropic Claude as multi-class classifier

In this example, the Anthropic Claude model is used as a multi-class classifier. The model output is parsed by the dedicated roboflow_core/vlm_as_classifier@v1 block, which turns the output text into a fully-fledged classification prediction that other blocks compatible with classification predictions can consume - in this case, we extract the top_class property.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$inputs.image",
            "vlm_output": "$steps.claude.output",
            "classes": "$steps.claude.classes"
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "top_class",
            "operations": [
                {
                    "type": "ClassificationPropertyExtract",
                    "property_name": "top_class"
                }
            ],
            "data": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "claude_result",
            "selector": "$steps.claude.output"
        },
        {
            "type": "JsonField",
            "name": "top_class",
            "selector": "$steps.top_class.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.*"
        }
    ]
}
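
Since classes is declared as a workflow parameter, the label set can be changed per request without touching the definition. A short, hedged sketch of running this workflow with the inference_sdk client (file names, labels, and the run_workflow signature are assumptions):

import json

from inference_sdk import InferenceHTTPClient

with open("claude_classifier_workflow.json") as f:  # the definition shown above
    workflow_definition = json.load(f)

client = InferenceHTTPClient(api_url="http://localhost:9001")  # assumed local server

result = client.run_workflow(
    specification=workflow_definition,
    images={"image": "pet.jpg"},
    parameters={
        "api_key": "<YOUR-ANTHROPIC-API-KEY>",
        "classes": ["cat", "dog"],
    },
)

print(result[0]["claude_result"])      # raw text produced by Claude
print(result[0]["top_class"])          # e.g. "dog"
print(result[0]["parsed_prediction"])  # full parsed classification prediction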

Using Anthropic Claude as multi-label classifier

In this example, the Anthropic Claude model is used as a multi-label classifier. The model output is parsed by the dedicated roboflow_core/vlm_as_classifier@v1 block, which turns the output text into a fully-fledged classification prediction that other blocks compatible with classification predictions can consume - in this case, we extract the top_class property.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "multi-label-classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$inputs.image",
            "vlm_output": "$steps.claude.output",
            "classes": "$steps.claude.classes"
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "top_class",
            "operations": [
                {
                    "type": "ClassificationPropertyExtract",
                    "property_name": "top_class"
                }
            ],
            "data": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.top_class.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.*"
        }
    ]
}

Using Anthropic Claude to provide structured JSON

In this example, the Anthropic Claude model is expected to provide structured JSON output, which is then parsed by the dedicated roboflow_core/json_parser@v1 block. That block transforms the string into a dictionary and exposes its keys to other blocks for further processing. In this case, the parsed output is transformed using the roboflow_core/property_definition@v1 block.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "structured-answering",
            "output_structure": {
                "dogs_count": "count of dogs instances in the image",
                "cats_count": "count of cats instances in the image"
            },
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/json_parser@v1",
            "name": "parser",
            "raw_json": "$steps.claude.output",
            "expected_fields": [
                "dogs_count",
                "cats_count"
            ]
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "property_definition",
            "operations": [
                {
                    "type": "ToString"
                }
            ],
            "data": "$steps.parser.dogs_count"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.property_definition.output"
        }
    ]
}
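
Conceptually, the roboflow_core/json_parser@v1 step does the equivalent of loading the model's text answer as JSON and verifying the declared fields. The snippet below is only a rough illustration of that idea in plain Python, with a made-up model answer - it is not the block's actual implementation.

import json

# Hypothetical raw answer returned by the structured-answering task.
raw_output = '{"dogs_count": 2, "cats_count": 0}'

parsed = json.loads(raw_output)
missing = {"dogs_count", "cats_count"} - parsed.keys()
if missing:
    raise ValueError(f"Model response lacks expected fields: {missing}")

# The property_definition step then applies the ToString operation.
result = str(parsed["dogs_count"])  # -> "2"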

Using Anthropic Claude as object-detection model

In this example, the Anthropic Claude model is expected to provide detection output, which is then parsed by the dedicated roboflow_core/vlm_as_detector@v1 block. That block transforms the string into sv.Detections, which can later be used by other blocks processing object-detection predictions.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$inputs.image",
            "task_type": "object-detection",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_detector@v1",
            "name": "parser",
            "vlm_output": "$steps.claude.output",
            "image": "$inputs.image",
            "classes": "$steps.claude.classes",
            "model_type": "anthropic-claude",
            "task_type": "object-detection"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "claude_result",
            "selector": "$steps.claude.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.predictions"
        }
    ]
}
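
Within the Execution Engine, parsed_prediction is an sv.Detections object, so any downstream block (or custom Python code) that understands supervision detections can consume it. As a rough illustration of what such an object carries - the values below are made up, not real model output:

import numpy as np
import supervision as sv

# Illustrative only: one bounding box in xyxy format, with confidence,
# class id and class name attached, mimicking the parser's output shape.
detections = sv.Detections(
    xyxy=np.array([[10.0, 20.0, 110.0, 220.0]]),
    confidence=np.array([0.9]),
    class_id=np.array([0]),
    data={"class_name": np.array(["dog"])},
)
print(len(detections), detections.xyxy)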

Using Anthropic Claude as secondary classifier

In this example, the Anthropic Claude model is used as a secondary classifier - first, a YOLO model detects dogs, then each detected dog is cropped and classified with the VLM, and finally the detection classes are replaced so that the bounding boxes carry dog-breed labels.

Breeds that we classify: russell-terrier, wirehaired-pointing-griffon, beagle

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes",
            "default_value": [
                "russell-terrier",
                "wirehaired-pointing-griffon",
                "beagle"
            ]
        }
    ],
    "steps": [
        {
            "type": "ObjectDetectionModel",
            "name": "general_detection",
            "image": "$inputs.image",
            "model_id": "yolov8n-640",
            "class_filter": [
                "dog"
            ]
        },
        {
            "type": "Crop",
            "name": "cropping",
            "image": "$inputs.image",
            "predictions": "$steps.general_detection.predictions"
        },
        {
            "type": "roboflow_core/anthropic_claude@v1",
            "name": "claude",
            "images": "$steps.cropping.crops",
            "task_type": "classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$steps.cropping.crops",
            "vlm_output": "$steps.claude.output",
            "classes": "$steps.claude.classes"
        },
        {
            "type": "roboflow_core/detections_classes_replacement@v1",
            "name": "classes_replacement",
            "object_detection_predictions": "$steps.general_detection.predictions",
            "classification_predictions": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "predictions",
            "selector": "$steps.classes_replacement.predictions"
        }
    ]
}

Florence 2 - grounded classification

THIS EXAMPLE CAN ONLY BE RUN LOCALLY OR USING DEDICATED DEPLOYMENT

In this example, we use an object detection model to find regions of interest in the input image, which are then classified by the Florence 2 model. With Workflows, it is possible to pass grounding_detection as an input to all of the tasks named detection-grounded-*.

The grounding detection can be either an input parameter or the output of a detection model. In the latter case, you should choose a grounding_selection_mode - since Florence 2 only supports a single bounding box as grounding, when multiple detections are provided the block selects one of them based on this parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "InferenceImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "confidence",
            "default_value": 0.4
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/roboflow_object_detection_model@v1",
            "name": "model_1",
            "images": "$inputs.image",
            "model_id": "yolov8n-640",
            "confidence": "$inputs.confidence"
        },
        {
            "type": "roboflow_core/florence_2@v1",
            "name": "model",
            "images": "$inputs.image",
            "task_type": "detection-grounded-classification",
            "grounding_detection": "$steps.model_1.predictions",
            "grounding_selection_mode": "most-confident"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "model_predictions",
            "coordinates_system": "own",
            "selector": "$steps.model.*"
        }
    ]
}
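
As a rough mental model of the "most-confident" selection mode (not the block's actual code): among all detections produced by model_1, only the single box with the highest confidence score is handed to Florence 2.

import numpy as np
import supervision as sv

def select_most_confident(detections: sv.Detections) -> sv.Detections:
    # Keep only the single bounding box with the highest confidence score.
    if len(detections) == 0 or detections.confidence is None:
        return detections
    return detections[[int(np.argmax(detections.confidence))]]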

Florence 2 - grounded segmentation

THIS EXAMPLE CAN ONLY BE RUN LOCALLY OR USING DEDICATED DEPLOYMENT

In this example, we use an object detection model to find regions of interest in the input image and run segmentation of the selected region with Florence 2. With Workflows, it is possible to pass grounding_detection as an input to all of the tasks named detection-grounded-*.

The grounding detection can be either an input parameter or the output of a detection model. In the latter case, you should choose a grounding_selection_mode - since Florence 2 only supports a single bounding box as grounding, when multiple detections are provided the block selects one of them based on this parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "InferenceImage",
            "name": "image"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/roboflow_object_detection_model@v1",
            "name": "model_1",
            "images": "$inputs.image",
            "model_id": "yolov8n-640"
        },
        {
            "type": "roboflow_core/florence_2@v1",
            "name": "model",
            "images": "$inputs.image",
            "task_type": "detection-grounded-instance-segmentation",
            "grounding_detection": "$steps.model_1.predictions",
            "grounding_selection_mode": "most-confident"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "model_predictions",
            "coordinates_system": "own",
            "selector": "$steps.model.*"
        }
    ]
}

Florence 2 - grounded captioning

THIS EXAMPLE CAN ONLY BE RUN LOCALLY OR USING DEDICATED DEPLOYMENT

In this example, we use an object detection model to find regions of interest in the input image and run captioning of the selected region with Florence 2. With Workflows, it is possible to pass grounding_detection as an input to all of the tasks named detection-grounded-*.

The grounding detection can be either an input parameter or the output of a detection model. In the latter case, you should choose a grounding_selection_mode - since Florence 2 only supports a single bounding box as grounding, when multiple detections are provided the block selects one of them based on this parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "InferenceImage",
            "name": "image"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/roboflow_object_detection_model@v1",
            "name": "model_1",
            "images": "$inputs.image",
            "model_id": "yolov8n-640"
        },
        {
            "type": "roboflow_core/florence_2@v1",
            "name": "model",
            "images": "$inputs.image",
            "task_type": "detection-grounded-instance-segmentation",
            "grounding_detection": "$steps.model_1.predictions",
            "grounding_selection_mode": "most-confident"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "model_predictions",
            "coordinates_system": "own",
            "selector": "$steps.model.*"
        }
    ]
}

Florence 2 - object detection

THIS EXAMPLE CAN ONLY BE RUN LOCALLY OR USING DEDICATED DEPLOYMENT

In this example, we use Florence 2 as a zero-shot object detection model, specifically performing open-vocabulary detection. The input parameter classes provides the list of objects the model should find. Beware that Florence 2 tends to look for every class in your list - so if you select a class that is not visible in the image, you can expect either a large bounding box covering the whole image, or multiple bounding boxes over a single detected instance, with auxiliary boxes carrying meaningless labels for the objects you specified in the class list.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "InferenceImage",
            "name": "image"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/roboflow_object_detection_model@v1",
            "name": "model_1",
            "images": "$inputs.image",
            "model_id": "yolov8n-640"
        },
        {
            "type": "roboflow_core/florence_2@v1",
            "name": "model",
            "images": "$inputs.image",
            "task_type": "detection-grounded-instance-segmentation",
            "grounding_detection": "$steps.model_1.predictions",
            "grounding_selection_mode": "most-confident"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "model_predictions",
            "coordinates_system": "own",
            "selector": "$steps.model.*"
        }
    ]
}

Prompting Google's Gemini with arbitrary prompt

In this example, Google's Gemini model is prompted with arbitrary text supplied by the user.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "unconstrained",
            "prompt": "Give me dominant color of the image",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gemini.output"
        }
    ]
}

Using Google's Gemini as OCR model

In this example, Google's Gemini model is used as an OCR system. The user only selects the task type and does not need to provide a prompt.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "ocr",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gemini.output"
        }
    ]
}

Using Google's Gemini as Visual Question Answering system

In this example, Google's Gemini model is used as a Visual Question Answering (VQA) system. The user provides the question via the prompt parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "prompt"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "visual-question-answering",
            "prompt": "$inputs.prompt",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gemini.output"
        }
    ]
}

Using Google's Gemini as Image Captioning system

In this example, Google's Gemini model is used as an image captioning system.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "caption",
            "api_key": "$inputs.api_key",
            "temperature": 1.0
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gemini.output"
        }
    ]
}

Using Google's Gemini as multi-class classifier

In this example, Google's Gemini model is used as a multi-class classifier. The model output is parsed by the dedicated roboflow_core/vlm_as_classifier@v1 block, which turns the output text into a fully-fledged classification prediction that other blocks compatible with classification predictions can consume - in this case, we extract the top_class property.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$inputs.image",
            "vlm_output": "$steps.gemini.output",
            "classes": "$steps.gemini.classes"
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "top_class",
            "operations": [
                {
                    "type": "ClassificationPropertyExtract",
                    "property_name": "top_class"
                }
            ],
            "data": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "gemini_result",
            "selector": "$steps.gemini.output"
        },
        {
            "type": "JsonField",
            "name": "top_class",
            "selector": "$steps.top_class.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.*"
        }
    ]
}

Using Google's Gemini as multi-label classifier

In this example, Google's Gemini model is used as a multi-label classifier. The model output is parsed by the dedicated roboflow_core/vlm_as_classifier@v1 block, which turns the output text into a fully-fledged classification prediction that other blocks compatible with classification predictions can consume - in this case, we extract the top_class property.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "multi-label-classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$inputs.image",
            "vlm_output": "$steps.gemini.output",
            "classes": "$steps.gemini.classes"
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "top_class",
            "operations": [
                {
                    "type": "ClassificationPropertyExtract",
                    "property_name": "top_class"
                }
            ],
            "data": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.top_class.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.*"
        }
    ]
}

Using Google's Gemini to provide structured JSON

In this example, Google's Gemini model is expected to provide structured JSON output, which is then parsed by the dedicated roboflow_core/json_parser@v1 block. That block transforms the string into a dictionary and exposes its keys to other blocks for further processing. In this case, the parsed output is transformed using the roboflow_core/property_definition@v1 block.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "structured-answering",
            "output_structure": {
                "dogs_count": "count of dogs instances in the image",
                "cats_count": "count of cats instances in the image"
            },
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/json_parser@v1",
            "name": "parser",
            "raw_json": "$steps.gemini.output",
            "expected_fields": [
                "dogs_count",
                "cats_count"
            ]
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "property_definition",
            "operations": [
                {
                    "type": "ToString"
                }
            ],
            "data": "$steps.parser.dogs_count"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.property_definition.output"
        }
    ]
}

Using Google's Gemini as object-detection model

In this example, Google's Gemini model is expected to provide detection output, which is then parsed by the dedicated roboflow_core/vlm_as_detector@v1 block. That block transforms the string into sv.Detections, which can later be used by other blocks processing object-detection predictions.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$inputs.image",
            "task_type": "object-detection",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_detector@v1",
            "name": "parser",
            "vlm_output": "$steps.gemini.output",
            "image": "$inputs.image",
            "classes": "$steps.gemini.classes",
            "model_type": "google-gemini",
            "task_type": "object-detection"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "gemini_result",
            "selector": "$steps.gemini.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.predictions"
        }
    ]
}

Using Google's Gemini as secondary classifier

In this example, Google's Gemini model is used as a secondary classifier - first, a YOLO model detects dogs, then each detected dog is cropped and classified with the VLM, and finally the detection classes are replaced so that the bounding boxes carry dog-breed labels.

Breeds that we classify: russell-terrier, wirehaired-pointing-griffon, beagle

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes",
            "default_value": [
                "russell-terrier",
                "wirehaired-pointing-griffon",
                "beagle"
            ]
        }
    ],
    "steps": [
        {
            "type": "ObjectDetectionModel",
            "name": "general_detection",
            "image": "$inputs.image",
            "model_id": "yolov8n-640",
            "class_filter": [
                "dog"
            ]
        },
        {
            "type": "Crop",
            "name": "cropping",
            "image": "$inputs.image",
            "predictions": "$steps.general_detection.predictions"
        },
        {
            "type": "roboflow_core/google_gemini@v1",
            "name": "gemini",
            "images": "$steps.cropping.crops",
            "task_type": "classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$steps.cropping.crops",
            "vlm_output": "$steps.gemini.output",
            "classes": "$steps.gemini.classes"
        },
        {
            "type": "roboflow_core/detections_classes_replacement@v1",
            "name": "classes_replacement",
            "object_detection_predictions": "$steps.general_detection.predictions",
            "classification_predictions": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "predictions",
            "selector": "$steps.classes_replacement.predictions"
        }
    ]
}

Prompting GPT with arbitrary prompt

In this example, the GPT model is prompted with arbitrary text supplied by the user via the prompt parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "prompt"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "unconstrained",
            "prompt": "$inputs.prompt",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gpt.output"
        }
    ]
}
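
Here both the prompt and the api_key are workflow parameters, so a different question can be asked on every request. A brief, hedged sketch of running this definition with the inference_sdk client (file name, server URL, and the run_workflow signature are assumptions):

import json

from inference_sdk import InferenceHTTPClient

with open("gpt_unconstrained_workflow.json") as f:  # the definition shown above
    workflow_definition = json.load(f)

client = InferenceHTTPClient(api_url="http://localhost:9001")  # assumed local server

result = client.run_workflow(
    specification=workflow_definition,
    images={"image": "scene.jpg"},
    parameters={
        "api_key": "<YOUR-OPENAI-API-KEY>",
        "prompt": "Describe the weather conditions visible in the image",
    },
)
print(result[0]["result"])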

Using GPT as OCR model

In this example, the GPT model is used as an OCR system. The user only selects the task type and does not need to provide a prompt. This definition additionally pins model_version to gpt-4o-mini.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "ocr",
            "api_key": "$inputs.api_key",
            "model_version": "gpt-4o-mini"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gpt.output"
        }
    ]
}

Using GPT as Visual Question Answering system

In this example, the GPT model is used as a Visual Question Answering (VQA) system. The user provides the question via the prompt parameter.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "prompt"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "visual-question-answering",
            "prompt": "$inputs.prompt",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gpt.output"
        }
    ]
}

Using GPT as Image Captioning system

In this example, the GPT model is used as an image captioning system.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "caption",
            "api_key": "$inputs.api_key"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.gpt.output"
        }
    ]
}

Using GPT as multi-class classifier

In this example, the GPT model is used as a multi-class classifier. The model output is parsed by the dedicated roboflow_core/vlm_as_classifier@v1 block, which turns GPT's output text into a fully-fledged classification prediction that other blocks compatible with classification predictions can consume - in this case, we extract the top_class property.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$inputs.image",
            "vlm_output": "$steps.gpt.output",
            "classes": "$steps.gpt.classes"
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "top_class",
            "operations": [
                {
                    "type": "ClassificationPropertyExtract",
                    "property_name": "top_class"
                }
            ],
            "data": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "gpt_result",
            "selector": "$steps.gpt.output"
        },
        {
            "type": "JsonField",
            "name": "top_class",
            "selector": "$steps.top_class.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.*"
        }
    ]
}

Using GPT as multi-label classifier

In this example, the GPT model is used as a multi-label classifier. The model output is parsed by the dedicated roboflow_core/vlm_as_classifier@v1 block, which turns GPT's output text into a fully-fledged classification prediction that other blocks compatible with classification predictions can consume - in this case, we extract the top_class property.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "multi-label-classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$inputs.image",
            "vlm_output": "$steps.gpt.output",
            "classes": "$steps.gpt.classes"
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "top_class",
            "operations": [
                {
                    "type": "ClassificationPropertyExtract",
                    "property_name": "top_class"
                }
            ],
            "data": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.top_class.output"
        },
        {
            "type": "JsonField",
            "name": "parsed_prediction",
            "selector": "$steps.parser.*"
        }
    ]
}

Using GPT to provide structured JSON

In this example, the GPT model is expected to provide structured JSON output, which is then parsed by the dedicated roboflow_core/json_parser@v1 block. That block transforms the string into a dictionary and exposes its keys to other blocks for further processing. In this case, the parsed output is transformed using the roboflow_core/property_definition@v1 block.

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        }
    ],
    "steps": [
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$inputs.image",
            "task_type": "structured-answering",
            "output_structure": {
                "dogs_count": "count of dogs instances in the image",
                "cats_count": "count of cats instances in the image"
            },
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/json_parser@v1",
            "name": "parser",
            "raw_json": "$steps.gpt.output",
            "expected_fields": [
                "dogs_count",
                "cats_count"
            ]
        },
        {
            "type": "roboflow_core/property_definition@v1",
            "name": "property_definition",
            "operations": [
                {
                    "type": "ToString"
                }
            ],
            "data": "$steps.parser.dogs_count"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "result",
            "selector": "$steps.property_definition.output"
        }
    ]
}

Using GPT as secondary classifier

In this example, the GPT model is used as a secondary classifier - first, a YOLO model detects dogs, then each detected dog is cropped and classified with the VLM, and finally the detection classes are replaced so that the bounding boxes carry dog-breed labels.

Breeds that we classify: russell-terrier, wirehaired-pointing-griffon, beagle

Workflow definition
{
    "version": "1.0",
    "inputs": [
        {
            "type": "WorkflowImage",
            "name": "image"
        },
        {
            "type": "WorkflowParameter",
            "name": "api_key"
        },
        {
            "type": "WorkflowParameter",
            "name": "classes",
            "default_value": [
                "russell-terrier",
                "wirehaired-pointing-griffon",
                "beagle"
            ]
        }
    ],
    "steps": [
        {
            "type": "ObjectDetectionModel",
            "name": "general_detection",
            "image": "$inputs.image",
            "model_id": "yolov8n-640",
            "class_filter": [
                "dog"
            ]
        },
        {
            "type": "Crop",
            "name": "cropping",
            "image": "$inputs.image",
            "predictions": "$steps.general_detection.predictions"
        },
        {
            "type": "roboflow_core/open_ai@v2",
            "name": "gpt",
            "images": "$steps.cropping.crops",
            "task_type": "classification",
            "classes": "$inputs.classes",
            "api_key": "$inputs.api_key"
        },
        {
            "type": "roboflow_core/vlm_as_classifier@v1",
            "name": "parser",
            "image": "$steps.cropping.crops",
            "vlm_output": "$steps.gpt.output",
            "classes": "$steps.gpt.classes"
        },
        {
            "type": "roboflow_core/detections_classes_replacement@v1",
            "name": "classes_replacement",
            "object_detection_predictions": "$steps.general_detection.predictions",
            "classification_predictions": "$steps.parser.predictions"
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "predictions",
            "selector": "$steps.classes_replacement.predictions"
        }
    ]
}