From 415ee8c494fed696428a74544218ff99ebca1d61 Mon Sep 17 00:00:00 2001 From: Jacob Sela Date: Wed, 15 Jan 2025 17:27:24 -0800 Subject: [PATCH 1/3] open clip fix --- fiftyone/utils/open_clip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fiftyone/utils/open_clip.py b/fiftyone/utils/open_clip.py index d51c350144..40230cc6c6 100644 --- a/fiftyone/utils/open_clip.py +++ b/fiftyone/utils/open_clip.py @@ -88,7 +88,7 @@ def _load_model(self, config): ( self._model, _, - self.preprocess, + self._preprocess, ) = open_clip.create_model_and_transforms( config.clip_model, pretrained=config.pretrained, @@ -134,7 +134,7 @@ def _get_class_logits(self, text_features, image_features): def _predict_all(self, imgs): if self._preprocess: - imgs = [self._preprocess(img).unsqueeze(0) for img in imgs] + imgs = [self._preprocess(img) for img in imgs] if isinstance(imgs, (list, tuple)): imgs = torch.stack(imgs) From 23d6a6d01532f053cba5ba9b0af23d6ad757b508 Mon Sep 17 00:00:00 2001 From: Jacob Sela Date: Sun, 19 Jan 2025 14:39:19 -0800 Subject: [PATCH 2/3] readded preprocessor fix --- fiftyone/utils/open_clip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fiftyone/utils/open_clip.py b/fiftyone/utils/open_clip.py index 40230cc6c6..ed2527fddf 100644 --- a/fiftyone/utils/open_clip.py +++ b/fiftyone/utils/open_clip.py @@ -57,6 +57,7 @@ class TorchOpenClipModel(fout.TorchImageModel, fom.PromptMixin): def __init__(self, config): super().__init__(config) self._text_features = None + self.preprocess = self._preprocess_aux @property def can_embed_prompts(self): @@ -88,7 +89,7 @@ def _load_model(self, config): ( self._model, _, - self._preprocess, + self._preprocess_aux, ) = open_clip.create_model_and_transforms( config.clip_model, pretrained=config.pretrained, From cb284818031ce2b080873652fafcdedff2776b0d Mon Sep 17 00:00:00 2001 From: brimoor Date: Mon, 20 Jan 2025 16:22:48 -0500 Subject: [PATCH 3/3] only use autocast when using GPU --- fiftyone/utils/open_clip.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fiftyone/utils/open_clip.py b/fiftyone/utils/open_clip.py index ed2527fddf..45f8897651 100644 --- a/fiftyone/utils/open_clip.py +++ b/fiftyone/utils/open_clip.py @@ -5,6 +5,7 @@ | `voxel51.com `_ | """ +import contextlib import logging import fiftyone.core.models as fom @@ -143,12 +144,15 @@ def _predict_all(self, imgs): height, width = imgs.size()[-2:] frame_size = (width, height) - if self._using_gpu: - imgs = imgs.to(self.device) + with torch.no_grad(), contextlib.ExitStack() as ctx: + if self._using_gpu: + imgs = imgs.to(self.device) + + # https://github.com/voxel51/fiftyone/pull/5395#issuecomment-2601055784 + ctx.enter_context( + torch.amp.autocast(device_type=self.device.type) + ) - with torch.no_grad(), torch.amp.autocast( - device_type=self.device.type if self._using_gpu else "cpu" - ): image_features = self._model.encode_image(imgs) text_features = self._get_text_features()