Giskard-AI · rabah-khalek · Aug 8, 2024 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/examples/object_detection/test_face_detection.ipynb b/examples/object_detection/test_face_detection.ipynb
diff --git a/giskard_vision/object_detection/dataloaders/loaders.py b/giskard_vision/object_detection/dataloaders/loaders.py
@@ -1,6 +1,7 @@
+import json
 import os
 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import cv2
 import numpy as np
@@ -10,6 +11,13 @@
 from giskard_vision.core.dataloaders.base import DataIteratorBase, PerformanceIssueMeta
 from giskard_vision.core.dataloaders.hf import HFDataLoader
 from giskard_vision.core.dataloaders.meta import MetaData
+from giskard_vision.core.dataloaders.utils import flatten_dict
+from giskard_vision.landmark_detection.dataloaders.loaders import (
+    DataLoader300W,
+    DataLoaderFFHQ,
+    EthicalIssueMeta,
+    PerformanceIssueMeta,
+)
 
 from ..types import Types
 
@@ -187,6 +195,251 @@ def get_image(self, idx: int) -> np.ndarray:
         return self.load_image_from_file(self.get_image_path(idx))
 
 
+class DataLoader300WFaceDetection(DataLoader300W):
+    """Data loader for the 300W dataset for face detection. Ref: https://ibug.doc.ic.ac.uk/resources/300-W/"""
+
+    def get_labels(self, idx: int) -> Optional[Dict[str, Any]]:
+        """
+        Gets the face bounding box for a specific index, computed from the landmarks of the parent loader.
+        Args:
+            idx (int): Index of the data.
+        Returns:
+            Optional[Dict[str, Any]]: "boxes" (min/max of the landmark coordinates) and "labels", or None.
+        """
+        landmarks = super().get_labels(idx)
+
+        if landmarks is None:
+            return None
+
+        min_point = np.min(landmarks, axis=0)  # smallest value of each coordinate along axis 0
+        max_point = np.max(landmarks, axis=0)  # largest value of each coordinate along axis 0
+
+        return {
+            "boxes": np.array([min_point[0], min_point[1], max_point[0], max_point[1]]),
+            "labels": "face",
+        }
+
+
+class DataLoaderFFHQFaceDetection(DataLoaderFFHQ):
+    """Data loader for the FFHQ (Flickr-Faces-HQ) dataset for face detection."""
+
+    def __init__(
+        self,
+        dir_path: Union[str, Path],
+        batch_size: Optional[int] = 1,
+        shuffle: Optional[bool] = False,
+        rng_seed: Optional[int] = None,
+    ) -> None:
+        super().__init__(dir_path, batch_size, shuffle, rng_seed)
+
+        # Per image index: face_rect, then in-the-wild, thumbnail (entries 6-7) and image (entries 8-9) pixel sizes
+        with (Path(dir_path) / "ffhq-dataset-meta.json").open(encoding="utf-8") as fp:
+            self.bboxes: Dict[int, List[float]] = {
+                int(k): list(v["in_the_wild"]["face_rect"])
+                + v["in_the_wild"]["pixel_size"]
+                + v["thumbnail"]["pixel_size"]
+                + v["image"]["pixel_size"]
+                for k, v in json.load(fp).items()
+            }
+
+    def get_labels(self, idx: int) -> Optional[Dict[str, Any]]:
+        """
+        Gets the face bounding box for a specific index, rescaled from thumbnail size to image size.
+        Args:
+            idx (int): Index of the data.
+        Returns:
+            Optional[Dict[str, Any]]: "boxes" ([left, top, right, bottom]) and "labels", or None when unavailable.
+        """
+        original_bbox = self.bboxes.get(idx)
+        try:
+            with Path(self.images_dir_path / f"{idx:05d}.json").open(encoding="utf-8") as fp:
+                meta = json.load(fp)
+        except FileNotFoundError:
+            # Without the per-image annotation there is no rectangle to rescale, so no label is reported.
+            return None
+        # No size entry for this index, or an annotation file that lists no face.
+        if original_bbox is None or not meta:
+            return None
+
+        rect = meta[0]["faceRectangle"]
+        w, h = original_bbox[8], original_bbox[9]  # "image" pixel size
+        thumbnail_w, thumbnail_h = original_bbox[6], original_bbox[7]  # "thumbnail" pixel size
+        left = rect["left"] * w / thumbnail_w
+        top = rect["top"] * h / thumbnail_h
+        right = (rect["left"] + rect["width"]) * w / thumbnail_w
+        bottom = (rect["top"] + rect["height"]) * h / thumbnail_h
+        return {"boxes": np.array([left, top, right, bottom]), "labels": "face"}
+
+    def get_meta(self, idx: int) -> Optional[MetaData]:
+        """
+        Loads the per-image JSON annotation for a specific index, flattens it and wraps it in MetaData.
+        Args:
+            idx (int): Index of the image.
+        Returns:
+            Optional[MetaData]: Flattened metadata for the given index, or None if the annotation file is missing.
+        """
+        try:
+            with Path(self.images_dir_path / f"{idx:05d}.json").open(encoding="utf-8") as fp:
+                meta = json.load(fp)
+            flat_meta = self.process_hair_color_data(
+                flatten_dict(
+                    meta[0],
+                    excludes=[
+                        "faceRectangle_top",
+                        "faceRectangle_left",
+                        "faceRectangle_width",
+                        "faceRectangle_height",
+                    ],
+                )
+            )
+            flat_meta = self.process_emotions_data(flat_meta)
+            flat_meta_without_prefix = {key.replace("faceAttributes_", ""): value for key, value in flat_meta.items()}
+            flat_meta_without_prefix.pop("confidence", None)  # tolerate annotations without a confidence entry
+            return MetaData(
+                data=flat_meta_without_prefix,
+                categories=[
+                    "gender",
+                    "glasses",
+                    "exposure_exposureLevel",
+                    "noise_noiseLevel",
+                    "makeup_eyeMakeup",
+                    "makeup_lipMakeup",
+                    "occlusion_foreheadOccluded",
+                    "occlusion_eyeOccluded",
+                    "occlusion_mouthOccluded",
+                    "hair_invisible",
+                    "hairColor",
+                    "emotion",
+                ],
+                issue_groups={
+                    "smile": PerformanceIssueMeta,
+                    "headPose_pitch": PerformanceIssueMeta,
+                    "headPose_roll": PerformanceIssueMeta,
+                    "headPose_yaw": PerformanceIssueMeta,
+                    "gender": EthicalIssueMeta,
+                    "age": EthicalIssueMeta,
+                    "facialHair_moustache": EthicalIssueMeta,
+                    "facialHair_beard": EthicalIssueMeta,
+                    "facialHair_sideburns": EthicalIssueMeta,
+                    "glasses": EthicalIssueMeta,
+                    "emotion": PerformanceIssueMeta,
+                    "blur_blurLevel": PerformanceIssueMeta,
+                    "blur_value": PerformanceIssueMeta,
+                    "exposure_exposureLevel": PerformanceIssueMeta,
+                    "exposure_value": PerformanceIssueMeta,
+                    "noise_noiseLevel": PerformanceIssueMeta,
+                    "noise_value": PerformanceIssueMeta,
+                    "makeup_eyeMakeup": EthicalIssueMeta,
+                    "makeup_lipMakeup": EthicalIssueMeta,
+                    "occlusion_foreheadOccluded": PerformanceIssueMeta,
+                    "occlusion_eyeOccluded": PerformanceIssueMeta,
+                    "occlusion_mouthOccluded": PerformanceIssueMeta,
+                    "hair_bald": EthicalIssueMeta,
+                    "hair_invisible": PerformanceIssueMeta,
+                    "hairColor": EthicalIssueMeta,
+                },
+            )
+        except FileNotFoundError:
+            return None
+
+
+class DataLoaderFFHQFaceDetectionLandmark(DataLoaderFFHQ):
+    """Data loader for the FFHQ (Flickr-Faces-HQ) dataset for face detection, with boxes computed from landmarks."""
+
+    def get_labels(self, idx: int) -> Optional[Dict[str, Any]]:
+        """
+        Gets the face bounding box for a specific index, computed from the landmarks of the parent loader.
+        Args:
+            idx (int): Index of the data.
+        Returns:
+            Optional[Dict[str, Any]]: "boxes" (min/max of the landmark coordinates) and "labels", or None.
+        """
+        landmarks = super().get_labels(idx)
+
+        if landmarks is None:
+            return None
+
+        min_point = np.min(landmarks, axis=0)  # smallest value of each coordinate along axis 0
+        max_point = np.max(landmarks, axis=0)  # largest value of each coordinate along axis 0
+
+        return {
+            "boxes": np.array([min_point[0], min_point[1], max_point[0], max_point[1]]),
+            "labels": "face",
+        }
+
+    def get_meta(self, idx: int) -> Optional[MetaData]:
+        """
+        Loads the per-image JSON annotation for a specific index, flattens it and wraps it in MetaData.
+        Args:
+            idx (int): Index of the image.
+        Returns:
+            Optional[MetaData]: Flattened metadata for the given index, or None if the annotation file is missing.
+        """
+        try:
+            with Path(self.images_dir_path / f"{idx:05d}.json").open(encoding="utf-8") as fp:
+                meta = json.load(fp)
+            flat_meta = self.process_hair_color_data(
+                flatten_dict(
+                    meta[0],
+                    excludes=[
+                        "faceRectangle_top",
+                        "faceRectangle_left",
+                        "faceRectangle_width",
+                        "faceRectangle_height",
+                    ],
+                )
+            )
+            flat_meta = self.process_emotions_data(flat_meta)
+            flat_meta_without_prefix = {key.replace("faceAttributes_", ""): value for key, value in flat_meta.items()}
+            flat_meta_without_prefix.pop("confidence", None)  # tolerate annotations without a confidence entry
+            return MetaData(
+                data=flat_meta_without_prefix,
+                categories=[
+                    "gender",
+                    "glasses",
+                    "exposure_exposureLevel",
+                    "noise_noiseLevel",
+                    "makeup_eyeMakeup",
+                    "makeup_lipMakeup",
+                    "occlusion_foreheadOccluded",
+                    "occlusion_eyeOccluded",
+                    "occlusion_mouthOccluded",
+                    "hair_invisible",
+                    "hairColor",
+                    "emotion",
+                ],
+                issue_groups={
+                    "smile": PerformanceIssueMeta,
+                    "headPose_pitch": PerformanceIssueMeta,
+                    "headPose_roll": PerformanceIssueMeta,
+                    "headPose_yaw": PerformanceIssueMeta,
+                    "gender": EthicalIssueMeta,
+                    "age": EthicalIssueMeta,
+                    "facialHair_moustache": EthicalIssueMeta,
+                    "facialHair_beard": EthicalIssueMeta,
+                    "facialHair_sideburns": EthicalIssueMeta,
+                    "glasses": EthicalIssueMeta,
+                    "emotion": PerformanceIssueMeta,
+                    "blur_blurLevel": PerformanceIssueMeta,
+                    "blur_value": PerformanceIssueMeta,
+                    "exposure_exposureLevel": PerformanceIssueMeta,
+                    "exposure_value": PerformanceIssueMeta,
+                    "noise_noiseLevel": PerformanceIssueMeta,
+                    "noise_value": PerformanceIssueMeta,
+                    "makeup_eyeMakeup": EthicalIssueMeta,
+                    "makeup_lipMakeup": EthicalIssueMeta,
+                    "occlusion_foreheadOccluded": PerformanceIssueMeta,
+                    "occlusion_eyeOccluded": PerformanceIssueMeta,
+                    "occlusion_mouthOccluded": PerformanceIssueMeta,
+                    "hair_bald": EthicalIssueMeta,
+                    "hair_invisible": PerformanceIssueMeta,
+                    "hairColor": EthicalIssueMeta,
+                },
+            )
+        except FileNotFoundError:
+            return None
+
+
 class DataLoaderFurnitureHuggingFaceDataset(HFDataLoader):
     """
     A data loader for the `Nfiniteai/living-room-passes` dataset on HF, extending the HFDataLoader class.

diff --git a/giskard_vision/object_detection/models/base.py b/giskard_vision/object_detection/models/base.py
@@ -0,0 +1,41 @@
+from typing import Any, Optional
+
+import numpy as np
+from PIL import Image
+
+from giskard_vision.core.models.hf_pipeline import HFPipelineModelBase, HFPipelineTask
+from giskard_vision.object_detection.types import Types
+
+
+class ObjectDetectionHFModel(HFPipelineModelBase):
+    """Hugging Face pipeline wrapper class that serves as a template for object detection predictions
+    Args:
+        model_id (str): Hugging Face model ID
+        name (Optional[str]): name of the model
+        device (str): device to run the model on
+    """
+
+    model_type = "object_detection"
+    prediction_result_cls = Types.prediction_result
+
+    def __init__(self, model_id: str, name: Optional[str] = None, device: str = "cpu"):
+        """Forward the model id, name and device to the base class with the object detection pipeline task
+        Args:
+            model_id (str): Hugging Face model ID
+            name (Optional[str]): name of the model
+            device (str): device to run the model on
+        """
+        super().__init__(
+            pipeline_task=HFPipelineTask.OBJECT_DETECTION,
+            model_id=model_id,
+            device=device,
+            name=name,
+        )
+
+    def predict_raw(self, image: np.ndarray) -> Any:
+        """Run the pipeline on one image and return its output unchanged
+        Args:
+            image (np.ndarray): input image, converted to a PIL image in "RGB" mode
+        """
+        rgb_image = Image.fromarray(image, "RGB")
+        return self.pipeline(rgb_image)
diff --git a/giskard_vision/object_detection/models/wrappers.py b/giskard_vision/object_detection/models/wrappers.py
@@ -1,12 +1,13 @@
 import os
-from typing import Optional
+from typing import Any, Optional
 
 import cv2
 import numpy as np
 import pandas as pd
 from tqdm.notebook import tqdm
 
 from giskard_vision.core.models.base import ModelBase
+from giskard_vision.object_detection.models.base import ObjectDetectionHFModel
 from giskard_vision.utils.errors import GiskardImportError
 
 
@@ -242,3 +243,36 @@ def train(self, data_path):
         reduce_lr = ReduceLROnPlateau(monitor="IoU", factor=0.2, patience=PATIENCE, min_lr=1e-7, verbose=1, mode="max")
 
         self.model.fit(batch_images, gt, epochs=100, callbacks=[stop, reduce_lr], verbose=2)
+
+
+class DetrFinetunedFaceDetectionHuggingFaceModel(ObjectDetectionHFModel):
+    """Wrapper class for goshiv's detr finetuned face detection model on Hugging Face.
+    Args:
+        name (Optional[str]): The name of the model.
+        device (str): The device to run the model on.
+    """
+
+    def __init__(self, name: Optional[str] = None, device: str = "cpu"):
+        super().__init__(
+            model_id="goshiv/detr-finetuned-face",
+            name=name,
+            device=device,
+        )
+
+    def predict_image(self, image: np.ndarray) -> Any:
+        """Predict a single face box for one image, keeping only the highest-scoring detection.
+        Args:
+            image (np.ndarray): input image
+        Raises:
+            ValueError: if the pipeline returns no detection for the image
+        """
+        raw_predictions = super().predict_raw(image)
+        if not raw_predictions:
+            # max() on an empty sequence raises the same exception type, but with a message that hides the cause
+            raise ValueError("No face detected: the detection pipeline returned no prediction")
+        # Keep only the prediction with the highest score
+        box = max(raw_predictions, key=lambda prediction: prediction["score"])["box"]
+        return {
+            "boxes": np.array([box["xmin"], box["ymin"], box["xmax"], box["ymax"]]),
+            "labels": "face",
+        }