Classify Video Content with CLIP
CLIP is a powerful foundation model for zero-shot classification. In this cookbook, we use CLIP to classify the topics that appear in a YouTube video. Plug in your own video and set of prompts!
Click the Open in Colab button to run the cookbook on Google Colab.
Let's begin!
Install required packages
In this cookbook, we'll leverage two Python packages: opencv-python and supervision.
In [ ]:
!pip install supervision opencv-python
Imports & Roboflow Inference Server Configuration
In [25]:
import base64
import os
from io import BytesIO

import requests
import supervision as sv
from PIL import Image

INFERENCE_ENDPOINT = "https://infer.roboflow.com"
API_KEY = "YOUR_API_KEY"
VIDEO = "VIDEO_PATH"
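Optionally, sanity-check your video path and inspect the video's metadata before running CLIP over it. The cell below is an illustrative addition, not part of the original cookbook; it assumes VIDEO points at a valid local file and uses supervision's VideoInfo helper.
In [ ]:
# Optional: inspect the source video (resolution, FPS, frame count) before processing
video_info = sv.VideoInfo.from_video_path(video_path=VIDEO)
print(video_info)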
Prompt List for the CLIP Similarity Function
In [ ]:
# Prompt list used to score similarity between each frame and each prompt.
# "something else" is a catch-all: if it is the most similar prompt, the frame is ignored.
# Change this to your desired prompt list.
prompt_list = [['action video game shooting xbox', 'Drake rapper music', 'soccer game ball',
                'marvel comic book', 'beyonce', 'Church pope praying',
                'Mcdonalds French Fries', "something else"]]
Compare Frame & Prompt Similarity with the CLIP Endpoint
In [26]:
def classify_image(image, prompt: list) -> tuple:
    # Encode the frame (a numpy array) as a base64 JPEG for the API request
    image_data = Image.fromarray(image)
    buffer = BytesIO()
    image_data.save(buffer, format="JPEG")
    image_data = base64.b64encode(buffer.getvalue()).decode("utf-8")

    payload = {
        "api_key": API_KEY,
        "subject": {
            "type": "base64",
            "value": image_data
        },
        "prompt": prompt,
    }

    data = requests.post(INFERENCE_ENDPOINT + "/clip/compare?api_key=" + API_KEY, json=payload)
    response = data.json()

    # Find the prompt with the highest similarity score
    sim = response["similarity"]
    highest_prediction = 0
    highest_prediction_index = 0
    for i, prediction in enumerate(sim):
        if prediction > highest_prediction:
            highest_prediction = prediction
            highest_prediction_index = i

    # Return the most similar prompt and its similarity score
    return prompt[highest_prediction_index], sim[highest_prediction_index]
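Before processing an entire video, it can help to sanity-check the endpoint on a single frame. The cell below is a minimal sketch added for illustration (not from the original cookbook); it assumes VIDEO is a readable file, reads the first frame with opencv, and converts it from OpenCV's BGR channel order to RGB before classification.
In [ ]:
import cv2

# Read the first frame of the video for a quick endpoint test
cap = cv2.VideoCapture(VIDEO)
ret, frame = cap.read()
cap.release()

if ret:
    # OpenCV frames are BGR; convert to RGB before handing the frame to CLIP
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    label, similarity = classify_image(rgb_frame, prompt_list[0])
    print(f"Most similar prompt: {label} ({similarity:.3f})")
else:
    print("Could not read a frame from the video.")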
Process the Video & Return the Most Similar Prompt per Frame
In [ ]:
def process_video_frames(video_path, prompt_list, total_frames=160, total_seconds=80, stride_length=30, max_retries=4):
    if not os.path.exists(video_path):
        print(f"The specified video file '{video_path}' does not exist.")
        return

    # Approximate timestamp per sampled frame, assuming total_frames spread over total_seconds
    frames_per_second = total_frames / total_seconds
    frame_dict = {}

    for frame_index, frame in enumerate(sv.get_video_frames_generator(source_path=video_path, stride=stride_length, start=0)):
        frame_second = frame_index * (1 / frames_per_second)
        frame_key = f"Frame {frame_index}: {frame_second:.2f} seconds"
        frame_dict[frame_key] = []
        print(frame_key)

        retries = 0
        for prompt in prompt_list:
            try:
                label, similarity = classify_image(frame, prompt)
                if label != "something else":
                    print('label found')
                    frame_dict[frame_key].append({label: similarity})
                print('\n')
            except Exception as e:
                retries += 1
                print(f"Error: {e}")
                print(f"Retrying... (Attempt {retries}/{max_retries})")
                if retries >= max_retries:
                    print("Max retries exceeded. Skipping frame.")
                    break

    return frame_dict


# Example usage:
max_retries = 4
clip_results = process_video_frames(VIDEO, prompt_list, max_retries=max_retries)
Create a JSON File and Filter Out Low-Similarity Classes
In [ ]:
data = clip_results

# Define the threshold for the similarity score returned for the most similar prompt
threshold = 0.22

# For each frame, keep only the prompt/similarity pairs above the threshold
filtered_data = [
    {
        frame: [
            {key: value}
            for item in items
            for key, value in item.items()
            if value > threshold
        ]
    }
    for frame, items in data.items()
]

print(filtered_data)
In [44]:
import json

# Name the output file after the threshold used
filename = f"{threshold}.json"

# Write the filtered results to a JSON file
with open(filename, 'w') as json_file:
    json.dump(filtered_data, json_file, indent=4)  # indent is optional, for pretty-printing

print(f"Data has been written to {filename}")
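As an optional final step (an illustrative addition, not part of the original cookbook), you can aggregate the filtered per-frame labels into overall topic counts, giving a rough summary of what appears in the video. This sketch assumes filtered_data has the structure produced above.
In [ ]:
from collections import Counter

# Count how often each prompt was the best above-threshold match across frames
topic_counts = Counter(
    label
    for frame_entry in filtered_data   # each entry: {frame_key: [{label: similarity}, ...]}
    for matches in frame_entry.values()
    for match in matches
    for label in match.keys()
)

print(topic_counts.most_common())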