From 310ec0786d53c634bae150fab98dd8a9fc1033ff Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Mon, 16 Dec 2024 15:56:55 -0500
Subject: [PATCH] Add inference script for ultralytics yolo object models

---
 .../object_detection/yolov11_detect.py | 409 ++++++++++++++++++
 .../object_detection/yolov8_detect.py  |  17 +-
 pyproject.toml                         |   1 +
 3 files changed, 424 insertions(+), 3 deletions(-)
 create mode 100644 angel_system/object_detection/yolov11_detect.py

diff --git a/angel_system/object_detection/yolov11_detect.py b/angel_system/object_detection/yolov11_detect.py
new file mode 100644
index 000000000..168512a04
--- /dev/null
+++ b/angel_system/object_detection/yolov11_detect.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+
+from collections import defaultdict
+import logging
+from pathlib import Path
+import random
+from typing import Dict
+from typing import Optional
+from typing import Sequence
+import warnings
+
+import click
+import cv2
+import kwcoco
+import moviepy.video.io.ImageSequenceClip
+import numpy as np
+import torch
+import ubelt as ub
+from ultralytics import YOLO
+
+from angel_system.object_detection.yolov8_detect import predict_hands
+
+
+LOG = logging.getLogger(__name__)
+
+
+def plot_one_box(xywh, img, color=None, label=None, line_thickness=1) -> None:
+    """
+    Plot one detection box into the given image with CV2.
+
+    Based on the similar function from YOLO v7 plotting code.
+
+    :param xywh: Extent of the box to plot in xywh format, where the xy is the
+        upper-left coordinate of the box.
+    :param img: The image matrix to draw the box into.
+    :param color: Optional RGB value tuple of the color to draw.
+    :param label: Optional text label to draw for the box.
+    :param line_thickness: Thickness of the box lines to draw.
+    """
+    # Plots one bounding box on image img
+    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
+    color = color or [random.randint(0, 255) for _ in range(3)]
+    c1, c2 = (int(xywh[0]), int(xywh[1])), (int(xywh[0] + xywh[2]), int(xywh[1] + xywh[3]))
+    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+    if label:
+        tf = max(tl - 1, 1)  # font thickness
+        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
+        cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
+
+
+@click.command()
+@click.help_option("-h", "--help")
+@click.option(
+    "-i", "--input-coco-file",
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
+    help=(
+        "MS-COCO file specifying image files to perform object detection over. "
+        "Image and Video sections from this COCO file will be maintained in the "
+        "output COCO file."
+    ),
+    required=True,
+)
+@click.option(
+    "--img-root",
+    type=click.Path(exists=True, file_okay=False, path_type=Path),
+    default=None,
+    help=(
+        "Optional override for the input COCO dataset bundle root. This is "
+        "necessary when the input COCO file uses relative paths and the COCO "
+        "file itself is not located in the bundle root directory."
+    ),
+)
+@click.option(
+    "-o", "--output-coco-file",
+    type=click.Path(dir_okay=False, path_type=Path),
+    help="Output COCO file to write object detection results.",
+    required=True,
+)
+@click.option(
+    "--model-hands", "hand_model_ckpt",
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
+    help="Model checkpoint for the Yolo v8 hand detector.",
+    required=True,
+)
+@click.option(
+    "--model-objects", "objs_model_ckpt",
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
+    help="Model checkpoint for the Yolo v11 object detector.",
+    required=True,
+)
+@click.option(
+    "-e", "--exclude-obj-class",
+    "obj_exclude_classes",
+    multiple=True,
+    help=(
+        "Exclude these object classes from the class list provided by the "
+        "object model. This is for when the object model was trained with "
+        "some classes excluded, but YOLO provided the metadata for them "
+        "anyway."
+    )
+)
+@click.option(
+    "--model-device",
+    default="",
+    help="The CUDA device to use, e.g. '0' or '0,1,2,3' or 'cpu'."
+)
+@click.option(
+    "--obj-img-size",
+    type=int,
+    help=(
+        "Data input size for the detection model for objects. This should be "
+        "a multiple of the model's stride parameter."
+    )
+)
+@click.option(
+    "--hand-img-size",
+    type=int,
+    help=(
+        "Data input size for the detection model for hands. This should be a "
+        "multiple of the model's stride parameter."
+    )
+)
+@click.option(
+    "--conf-thresh",
+    type=float,
+    default=0.25,
+    help=(
+        "Object confidence threshold. Predicted objects with confidence less "
+        "than this will not be considered for output."
+    ),
+)
+@click.option(
+    "--iou-thresh",
+    type=float,
+    default=0.45,
+    help=(
+        "IoU threshold used during NMS to filter out overlapping bounding "
+        "boxes."
+    ),
+)
+@click.option(
+    "--save-img", "save_dir",
+    type=click.Path(file_okay=False, path_type=Path),
+    default=None,
+    help=(
+        "Optionally enable the plotting of detections back to the image and "
+        "saving them out to disk, rooted in this directory. Only detections "
+        "with confidence above the configured threshold will be considered "
+        "for plotting."
+    )
+)
+@click.option(
+    "--top-k", "save_top_k",
+    type=int,
+    default=None,
+    help=(
+        "Optionally specify that only the top N confidence detections should "
+        "be saved to the output images. If this is not provided, all "
+        "detections with confidence above the --conf-thresh value will be "
+        "plotted. This only applies to object detections, not to hands "
+        "detected by the hand model."
+    )
+)
+@click.option(
+    "--save-vid",
+    is_flag=True,
+    help=(
+        "Optionally enable the creation of an MP4 video from the images "
+        "rendered due to --save-img. This option only has an effect if the "
+        "--save-img option is provided. The video file will be saved next "
+        "to the directory into which component images are saved."
+    )
+)
+@torch.inference_mode()
+def yolo_v11_inference_objects(
+    input_coco_file: Path,
+    img_root: Optional[Path],
+    output_coco_file: Path,
+    hand_model_ckpt: Path,
+    objs_model_ckpt: Path,
+    obj_exclude_classes: Sequence[str],
+    model_device: str,
+    obj_img_size: Optional[int],
+    hand_img_size: Optional[int],
+    conf_thresh: float,
+    iou_thresh: float,
+    save_dir: Optional[Path],
+    save_top_k: Optional[int],
+    save_vid: bool,
+):
+    """
+    Script for use in generating object detection results based on an input
+    COCO file's video/image specifications.
+
+    Expected use-case: generate object detections for video frames (images)
+    for which we have activity classification truth.
+
+    \b
+    Example:
+        python3 yolo_v11_inference_objects \\
+    """
+    logging.basicConfig(
+        level=logging.INFO,
+    )
+
+    guiding_dset = kwcoco.CocoDataset(input_coco_file, bundle_dpath=img_root)
+
+    # Prevent overwriting an existing file. These are expensive to compute so
+    # we don't want to mess that up.
+    if output_coco_file.is_file():
+        raise ValueError(
+            f"Output COCO file already exists, refusing to overwrite: "
+            f"{output_coco_file}"
+        )
+    output_coco_file.parent.mkdir(parents=True, exist_ok=True)
+    dset = kwcoco.CocoDataset()
+    dset.fpath = output_coco_file.as_posix()
+
+    object_model = YOLO(objs_model_ckpt, task="detect")
+    LOG.info(
+        "Loaded object model with classes:\n"
+        + "\n".join(f'\t- [{n[0]}] "{n[1]}"' for n in object_model.names.items())
+    )
+    hand_model = YOLO(hand_model_ckpt, task="detect")
+    LOG.info(
+        "Loaded hand model with classes:\n"
+        + "\n".join(f'\t- [{n[0]}] "{n[1]}"' for n in hand_model.names.items())
+    )
+
+    # TODO: option to use tensor RT -- Convert to a known location and reload
+    # models.
+
+    cls_names = [p[1] for p in sorted(object_model.names.items())]
+    cls_colors = [[random.randint(0, 255) for _ in range(3)] for _ in cls_names]
+
+    # Port over the videos and images sections from the input dataset to the
+    # new one.
+    dset.dataset['videos'] = guiding_dset.dataset['videos']
+    dset.dataset['images'] = guiding_dset.dataset['images']
+    dset.index.build(dset)
+    # Equality can later be tested with:
+    #     guiding_dset.index.videos == dset.index.videos
+    #     guiding_dset.index.imgs == dset.index.imgs
+
+    # Add categories
+    for cls_name in obj_exclude_classes:
+        if cls_name not in cls_names:
+            warnings.warn(
+                f"Requested exclusion of object class named \"{cls_name}\", "
+                f"however this class is not present in the object model."
+            )
+    exclude_set = set(obj_exclude_classes)
+    for i, object_label in enumerate(cls_names):
+        if object_label not in exclude_set:
+            dset.ensure_category(name=object_label, id=i)
+        else:
+            LOG.info(f"Excluding object model class: \"{object_label}\"")
+    # Inject categories for the hand-model additions.
+    left_hand_cid = dset.ensure_category(name="hand (left)")
+    right_hand_cid = dset.ensure_category(name="hand (right)")
+    hands_cat_to_cid = {"hand (left)": left_hand_cid,
+                        "hand (right)": right_hand_cid}
+
+    # Model warm-up going into the prediction loop.
+    LOG.info("Warming up models...")
+    warmup_image = np.random.randint(0, 255, (16, 16, 3), dtype=np.uint8)
+    object_model(source=warmup_image, device=model_device, verbose=False)
+    hand_model(source=warmup_image, device=model_device, verbose=False)
+    LOG.info("Warming up models... Done")
+
+    # -------------------------------------------------------------------------
+    # Generate object/hand predictions
+
+    # Mapping of video ID to frame index to the filepath that the rendered
+    # frame has been written out to.
+    video_image_outputs: Dict[int, Dict[int, str]] = defaultdict(dict)
+    # Dictionary of image output directories per video ID
+    video_id_to_frame_dir: Dict[int, Path] = dict()
+
+    object_predict_kwargs = dict(
+        conf=conf_thresh,
+        device=model_device,
+        nms=True,
+        verbose=False,
+    )
+    if obj_img_size is not None:
+        object_predict_kwargs["imgsz"] = obj_img_size
+
+    hand_predict_kwargs = dict(
+        hand_model=hand_model,
+        device=model_device,
+    )
+    if hand_img_size is not None:
+        hand_predict_kwargs["imgsz"] = hand_img_size
+
+    for img_id in ub.ProgIter(
+        dset.images(),
+        desc="Processing Images",
+        verbose=3,
+    ):
+        img_path: Path = dset.get_image_fpath(img_id)
+        img0 = cv2.imread(img_path.as_posix())
+
+        # Returns a list of length=num images, which is always 1 here.
+        object_preds = object_model.predict(source=img0, **object_predict_kwargs)[0]
+
+        hand_boxes, hand_labels, hand_confs = predict_hands(
+            img0=img0,
+            **hand_predict_kwargs,
+        )
+
+        # YOLO xywh output defines the xy as the center point, not the
+        # upper-left as required by the COCO format, so take the xyxy output
+        # and subtract out the upper-left corner to get the width and height.
+        obj_box_xywh = object_preds.boxes.xyxy.cpu()
+        obj_box_xywh[:, 2:] -= obj_box_xywh[:, :2]
+        obj_box_areas = torch.multiply(obj_box_xywh[:, 2], obj_box_xywh[:, 3])
+        for box_xywh, box_cls, box_conf, box_area in zip(
+            obj_box_xywh.tolist(),
+            object_preds.boxes.cls.to(int).tolist(),
+            object_preds.boxes.conf.tolist(),
+            obj_box_areas.tolist(),
+        ):
+            dset.add_annotation(
+                image_id=img_id,
+                category_id=box_cls,
+                bbox=box_xywh,
+                score=box_conf,
+                area=box_area,
+            )
+            if save_dir is not None:
+                plot_one_box(
+                    box_xywh,
+                    img0,
+                    color=cls_colors[box_cls],
+                    label=f"{cls_names[box_cls]} {box_conf:.2f}",
+                )
+
+        # Convert hand box XYXY coordinates into XYWH where XY is the
+        # upper-left.
+        hand_boxes_xywh = np.asarray(hand_boxes).reshape(-1, 4)
+        hand_boxes_xywh[:, 2:] -= hand_boxes_xywh[:, :2]
+        hand_areas = np.multiply(hand_boxes_xywh[:, 2], hand_boxes_xywh[:, 3])
+        for box_xywh, box_lbl, box_conf, box_area in zip(
+            hand_boxes_xywh.tolist(),
+            hand_labels,
+            hand_confs,
+            hand_areas,
+        ):
+            box_cls = hands_cat_to_cid[box_lbl]
+            dset.add_annotation(
+                image_id=img_id,
+                category_id=box_cls,
+                bbox=box_xywh,
+                score=box_conf,
+                area=box_area,
+            )
+            if save_dir is not None:
+                plot_one_box(
+                    box_xywh,
+                    img0,
+                    color=[0, 0, 0],
+                    label=f"{box_lbl} {box_conf:.2f}",
+                )
+
+        # Optionally draw object detection results to an image.
+        # If we want to save as a video, also save the paths so we can create
+        # the video after detecting everything.
+        if save_dir is not None:
+            vid_id = dset.index.imgs[img_id]["video_id"]
+            if vid_id not in video_id_to_frame_dir:
+                vid_obj = dset.index.videos[vid_id]
+                save_imgs_dir = save_dir / Path(vid_obj["name"]).stem
+                save_imgs_dir.mkdir(parents=True, exist_ok=True)
+                video_id_to_frame_dir[vid_id] = save_imgs_dir
+            save_path = (video_id_to_frame_dir[vid_id] / img_path.name).as_posix()
+            if not cv2.imwrite(save_path, img0):
+                raise RuntimeError(f"Failed to write debug image: {save_path}")
+            img_obj = dset.index.imgs[img_id]
+            video_image_outputs[vid_id][img_obj["frame_index"]] = save_path
+
+    # If configured, create and save videos of debug images for each video
+    # effectively processed.
+    if save_dir and save_vid:
+        for vid_id, frame_set in ub.ProgIter(
+            video_image_outputs.items(),
+            desc="Creating Videos",
+            verbose=3,
+        ):
+            frame_set: Dict[int, str]
+            vid_obj = dset.index.videos[vid_id]
+            video_save_path = save_dir / f"{Path(vid_obj['name']).stem}-objects.mp4"
+            vid_frames = [p[1] for p in sorted(frame_set.items())]  # frame paths ordered by frame index
+            clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(
+                vid_frames,
+                fps=vid_obj["framerate"]
+            )
+            clip.write_videofile(video_save_path.as_posix())
+            LOG.info(f"Saved video to: {video_save_path}")
+
+    LOG.info(f"Saving output COCO file... ({output_coco_file})")
+    dset.dump(dset.fpath, newlines=True)
+    LOG.info(f"Saved output COCO file: {output_coco_file}")
+
+
+if __name__ == "__main__":
+    yolo_v11_inference_objects()
diff --git a/angel_system/object_detection/yolov8_detect.py b/angel_system/object_detection/yolov8_detect.py
index c2e676952..e062ee9b9 100644
--- a/angel_system/object_detection/yolov8_detect.py
+++ b/angel_system/object_detection/yolov8_detect.py
@@ -1,15 +1,26 @@
-import numpy as np
+from typing import Optional
 
+import numpy as np
 from ultralytics import YOLO as YOLOv8
 
 
-def predict_hands(hand_model: YOLOv8, img0: np.array, device: str, imgsz: int) -> tuple:
+def predict_hands(
+    hand_model: YOLOv8,
+    img0: np.array,
+    device: str,
+    imgsz: Optional[int] = None
+) -> tuple:
     """Predict hands using a YOLOv8 hand model and update the labels to be
     hand(left) and hand(right)
+
+    Boxes returned are in xyxy format (upper-left, lower-right).
     """
     width, height = img0.shape[:2]
+    pred_kwargs = {}
+    if imgsz:
+        pred_kwargs["imgsz"] = imgsz
     hands_preds = hand_model.predict(
-        source=img0, conf=0.1, imgsz=imgsz, device=device, verbose=False
+        source=img0, conf=0.1, device=device, verbose=False, **pred_kwargs
     )[
         0
     ]  # list of length=num images
diff --git a/pyproject.toml b/pyproject.toml
index ad397e6bc..cb2a5dee3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -126,6 +126,7 @@ pytest-cov = "*"
 # Evaluation
 ptg_eval_activity = "angel_system.ptg_eval.activity_classification.evaluate_activity:main"
 ptg_eval_step = "angel_system.ptg_eval.step_completion.evaluate_step:main"
+yolo_v11_inference_objects = "angel_system.object_detection.yolov11_detect:yolo_v11_inference_objects"
 
 [tool.poetry.plugins."smqtk_plugins"]
 # Activity detector
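For reference, a minimal sketch (not part of the patch) of how the COCO file written to --output-coco-file might be inspected afterwards, using the same kwcoco library the script depends on. The file path below is a placeholder, and only the annotation fields the script itself writes via dset.add_annotation(...) are assumed:

    import kwcoco

    # Placeholder path; substitute the value given to --output-coco-file.
    dset = kwcoco.CocoDataset("detections_output.kwcoco.json")

    # Category id -> name lookup, including the injected "hand (left)" /
    # "hand (right)" categories.
    cid_to_name = {cid: cat["name"] for cid, cat in dset.index.cats.items()}

    # Each annotation carries an xywh bbox (upper-left based), a confidence
    # score, and an area, as written by the script above.
    for ann in dset.dataset["annotations"][:5]:
        print(
            ann["image_id"],
            cid_to_name[ann["category_id"]],
            ann["bbox"],
            ann["score"],
        )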