Supervision simplifies working with vision models. It offers connectors to popular model libraries, a wide range of visualizers (annotators), powerful post-processing features, and a gentle learning curve.
pip install supervision inference -q
wget https://media.roboflow.com/notebooks/examples/dog.jpeg
import cv2
import supervision as sv
from inference import get_model
image = cv2.imread("dog.jpeg")
model = get_model(model_id="yolov8n-640")
results = model.infer(image)[0]
detections = sv.Detections.from_inference(results)
annotated_image = sv.BoxAnnotator().annotate(
    scene=image.copy(), detections=detections
)
annotated_image = sv.LabelAnnotator().annotate(
    scene=annotated_image, detections=detections
)
sv.plot_image(annotated_image)
Load a single image
import cv2
image = cv2.imread("dog.jpeg")
Iterate over video frames
for frame in sv.get_video_frames_generator(source_path=<VIDEO_PATH>):
    print(frame.shape)
Run a function over every frame, save output
import numpy as np
def callback(scene: np.ndarray, index: int) -> np.ndarray:
    print(f"Processing frame {index}")
    return scene

sv.process_video(
    source_path=<SOURCE_VIDEO_PATH>,
    target_path="out.mp4",
    callback=callback)
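In practice the callback usually runs a model and returns an annotated frame. A minimal sketch reusing the inference model and BoxAnnotator from the quickstart above (model id and output path are illustrative):
from inference import get_model
model = get_model(model_id="yolov8n-640")
box_annotator = sv.BoxAnnotator()
def annotate_frame(scene: np.ndarray, index: int) -> np.ndarray:
    # detect objects in the frame and draw boxes on a copy
    results = model.infer(scene)[0]
    detections = sv.Detections.from_inference(results)
    return box_annotator.annotate(scene=scene.copy(), detections=detections)
sv.process_video(
    source_path=<SOURCE_VIDEO_PATH>,
    target_path="annotated.mp4",
    callback=annotate_frame)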
import cv2
import supervision as sv
image = cv2.imread("dog.jpeg")
With inference:
from inference import get_model
model = get_model(model_id="yolov8n-640")
results = model.infer(image)[0]
detections = sv.Detections.from_inference(results)
With ultralytics:
from ultralytics import YOLO
model = YOLO("yolov8n.pt")
results = model(image)[0]
detections = sv.Detections.from_ultralytics(results)
With transformers:
import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
image = Image.open("dog.jpeg")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
target_size = torch.tensor([[image.size[1], image.size[0]]])
results = processor.post_process_object_detection(
    outputs=outputs, target_sizes=target_size)[0]
detections = sv.Detections.from_transformers(
    transformers_results=results,
    id2label=model.config.id2label)
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()
annotated_image = box_annotator.annotate(
    scene=image.copy(), detections=detections)
annotated_image = label_annotator.annotate(
    scene=annotated_image, detections=detections)
sv.plot_image(annotated_image)
Segmentation works the same way; with inference and ultralytics, you only need to change the model ID:
With inference:
from inference import get_model
model = get_model(model_id="yolov8n-seg-640")
results = model.infer(image)[0]
detections = sv.Detections.from_inference(results)
With ultralytics:
from ultralytics import YOLO
model = YOLO("yolov8n-seg.pt")
results = model(image)[0]
detections = sv.Detections.from_ultralytics(results)
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()
annotated_image = mask_annotator.annotate(
    scene=image.copy(), detections=detections)
annotated_image = label_annotator.annotate(
    scene=annotated_image, detections=detections)
sv.plot_image(annotated_image)
import cv2
import supervision as sv
image = cv2.imread("dog.jpeg")
With inference:
from inference import get_model
model = get_model(model_id="yolov8s-pose-640")
results = model.infer(image)[0]
key_points = sv.KeyPoints.from_inference(results)
With ultralytics:
from ultralytics import YOLO
model = YOLO("yolov8s-pose.pt")
results = model(image)[0]
key_points = sv.KeyPoints.from_ultralytics(results)
With yolo-nas:
import torch
import super_gradients
device = "cuda" if torch.cuda.is_available() else "cpu"
model = super_gradients.training.models.get(
    "yolo_nas_pose_s", pretrained_weights="coco_pose").to(device)
results = model.predict(image, conf=0.1)
key_points = sv.KeyPoints.from_yolo_nas(results)
With mediapipe (⚠️ available in pre-release: pip install git+https://github.com/roboflow/supervision.git@develop):
import mediapipe as mp
image_height, image_width, _ = image.shape
mediapipe_image = mp.Image(
    image_format=mp.ImageFormat.SRGB,
    data=cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
options = mp.tasks.vision.PoseLandmarkerOptions(
    base_options=mp.tasks.BaseOptions(
        model_asset_path="pose_landmarker_heavy.task"
    ),
    running_mode=mp.tasks.vision.RunningMode.IMAGE,
    num_poses=2)
PoseLandmarker = mp.tasks.vision.PoseLandmarker
with PoseLandmarker.create_from_options(options) as landmarker:
    pose_landmarker_result = landmarker.detect(mediapipe_image)
key_points = sv.KeyPoints.from_mediapipe(
    pose_landmarker_result, (image_width, image_height))
vertex_annotator = sv.VertexAnnotator(radius=10)
edge_annotator = sv.EdgeAnnotator(thickness=5)
annotated_frame = edge_annotator.annotate(
    scene=image.copy(),
    key_points=key_points
)
annotated_frame = vertex_annotator.annotate(
    scene=annotated_frame,
    key_points=key_points)
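The annotated keypoints can be displayed with the same helper used in the sections above:
sv.plot_image(annotated_frame)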
import cv2
import supervision as sv
from inference import get_model
video_info = sv.VideoInfo.from_video_path(video_path=<VIDEO_PATH>)
frames_generator = sv.get_video_frames_generator(source_path=<VIDEO_PATH>)
model = get_model("yolov8s-640")
tracker = sv.ByteTrack()
smoother = sv.DetectionsSmoother()
trace_annotator = sv.TraceAnnotator()
with sv.VideoSink(target_path="out.mp4", video_info=video_info) as sink:
    for frame in frames_generator:
        results = model.infer(frame)[0]
        detections = sv.Detections.from_inference(results)
        detections = tracker.update_with_detections(detections)
        detections = smoother.update_with_detections(detections)
        annotated_frame = trace_annotator.annotate(
            frame.copy(), detections)
        sink.write_frame(frame=annotated_frame)
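Tracker IDs can also be rendered as labels; a short sketch of the extra lines that would sit inside the frame loop above, using LabelAnnotator's labels argument:
label_annotator = sv.LabelAnnotator()
# inside the loop, after the tracker has assigned IDs:
labels = [f"#{tracker_id}" for tracker_id in detections.tracker_id]
annotated_frame = label_annotator.annotate(
    annotated_frame, detections=detections, labels=labels)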
frames_generator = sv.get_video_frames_generator(source_path=<VIDEO_PATH>)
model = get_model("yolov8s-640")
tracker = sv.ByteTrack()
start, end = sv.Point(x=0, y=500), sv.Point(x=200, y=1000)
line_zone = sv.LineZone(start=start, end=end)
for frame in frames_generator:
    results = model.infer(frame)[0]
    detections = sv.Detections.from_inference(results)
    detections = tracker.update_with_detections(detections)
    crossed_in, crossed_out = line_zone.trigger(detections)
    print(line_zone.in_count, line_zone.out_count)
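To draw the counting line and its in/out totals on each frame, add a LineZoneAnnotator; a minimal sketch (styling values are illustrative):
line_zone_annotator = sv.LineZoneAnnotator(thickness=2, text_scale=0.5)
# inside the loop, after line_zone.trigger(detections):
annotated_frame = line_zone_annotator.annotate(frame.copy(), line_counter=line_zone)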
InferenceSlicer breaks the image into small parts and runs the model on each one
import cv2
import numpy as np
import supervision as sv
from inference import get_model
image = cv2.imread("dog.jpeg")
model = get_model("yolov8s-640")
def callback(image_slice: np.ndarray) -> sv.Detections:
    results = model.infer(image_slice)[0]
    return sv.Detections.from_inference(results)

slicer = sv.InferenceSlicer(
    callback=callback,
    overlap_filter_strategy=sv.OverlapFilter.NON_MAX_SUPPRESSION,
)
detections = slicer(image)
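The tile size is configurable via slice_wh; a sketch with an illustrative 320x320 tile:
slicer = sv.InferenceSlicer(
    callback=callback,
    slice_wh=(320, 320),
    overlap_filter_strategy=sv.OverlapFilter.NON_MAX_SUPPRESSION,
)
detections = slicer(image)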
frames_generator = sv.get_video_frames_generator(source_path=<VIDEO_PATH>)
model = get_model("yolov8s-640")
tracker = sv.ByteTrack()
polygon = np.array([[100, 200], [200, 100], [300, 200], [200, 300]])
polygon_zone = sv.PolygonZone(polygon=polygon)
for frame in frames_generator:
    results = model.infer(frame)[0]
    detections = sv.Detections.from_inference(results)
    detections = tracker.update_with_detections(detections)
    is_detections_in_zone = polygon_zone.trigger(detections)
    print(polygon_zone.current_count)
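The zone itself can be drawn with PolygonZoneAnnotator; a minimal sketch (the color is illustrative):
polygon_zone_annotator = sv.PolygonZoneAnnotator(zone=polygon_zone, color=sv.Color.RED)
# inside the loop, after polygon_zone.trigger(detections):
annotated_frame = polygon_zone_annotator.annotate(scene=frame.copy())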
frames_generator = sv.get_video_frames_generator(<VIDEO_PATH>)
model = get_model("yolov8s-640")
csv_sink = sv.CSVSink("out.csv")
with csv_sink as sink:
    for frame in frames_generator:
        results = model.infer(frame)[0]
        detections = sv.Detections.from_inference(results)
        sink.append(
            detections, custom_data={"<YOUR_LABEL>": "<YOUR_DATA>"})
frames_generator = sv.get_video_frames_generator(<VIDEO_PATH>)
model = get_model("yolov8s-640")
json_sink = sv.JSONSink("out.json")
with json_sink as sink:
    for frame in frames_generator:
        results = model.infer(frame)[0]
        detections = sv.Detections.from_inference(results)
        sink.append(
            detections, custom_data={"<YOUR_LABEL>": "<YOUR_DATA>"})
pip install peft -q
from inference.models.paligemma.paligemma import PaliGemma
from PIL import Image
import supervision as sv
image = Image.open("dog.jpeg")
prompt = "Detect the dog."
pg = PaliGemma(model_id="<MODEL_ID>", api_key="<ROBOFLOW_API_KEY>")
results = pg.predict(image, prompt)
detections = sv.Detections.from_lmm(
    sv.LMM.PALIGEMMA,
    results,
    resolution_wh=(1000, 1000),
    classes=["cat", "dog"]
)
import supervision as sv
from supervision.metrics import F1Score
predictions = sv.Detections(...)
targets = sv.Detections(...)
f1_metric = F1Score()
f1_result = f1_metric.update(predictions, targets).compute()
print(f1_result)
print(f1_result.f1_50)
print(f1_result.small_objects.f1_50)
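Other metrics in supervision.metrics follow the same update/compute pattern; for example, mean average precision (a sketch assuming the MeanAveragePrecision class):
from supervision.metrics import MeanAveragePrecision
map_metric = MeanAveragePrecision()
map_result = map_metric.update(predictions, targets).compute()
print(map_result.map50_95)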
Empty detections are returned by every model when nothing is detected.
empty_detections = sv.Detections.empty()
if empty_detections.is_empty():
    print("Nothing was detected!")
Count detected objects
len(detections)
Loop over detection results
for xyxy, mask, confidence, class_id, tracker_id, data in detections:
    print(xyxy, mask, confidence, class_id, tracker_id, data)
Filter detections by class
detections = sv.Detections.from_inference(results)
detections = detections[detections.class_id == 0]
Filter by class name
detections = sv.Detections.from_inference(results)
detections = detections[detections.data["class_name"] == "cat"]
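Filtering accepts any boolean mask over the detection fields, e.g. a confidence threshold (the 0.5 value is illustrative):
detections = detections[detections.confidence > 0.5]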
Merge multiple sv.Detections
detections1 = sv.Detections.from_inference(results1)
detections2 = sv.Detections.from_inference(results2)
merged_detections = sv.Detections.merge([detections1, detections2])
supervision provides a handful of videos for testing
pip install "supervision[assets]" -q
from supervision.assets import download_assets, VideoAssets
download_assets(VideoAssets.VEHICLES)
print(VideoAssets.VEHICLES.value)
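The downloaded file can be used anywhere a <VIDEO_PATH> is expected:
frames_generator = sv.get_video_frames_generator(
    source_path=VideoAssets.VEHICLES.value)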
Crop image
cropped_image = sv.crop_image(image=image, xyxy=[200, 400, 600, 800])
Scale image
scaled_image = sv.scale_image(image=image, scale_factor=0.5)
Resize image
resized_image = sv.resize_image(
    image=image, resolution_wh=(1000, 1000), keep_aspect_ratio=True)
Letterbox image (resize + pad)
letterboxed_image = sv.letterbox_image(
    image=image, resolution_wh=(1000, 1000))
Overlay image
import numpy as np
overlay = np.zeros((400, 400, 3), dtype=np.uint8)
resulting_image = sv.overlay_image(
    image=image, overlay=overlay, anchor=(200, 400))
Install custom branch of supervision
pip install git+https://github.com/YourName/supervision.git@your-branch
Display image in Colab by converting to PIL
sv.cv2_to_pillow(frame)
Display image in Colab by plotting with matplotlib
%matplotlib inline
sv.plot_image(frame)