voxel51 · jacobsela · Jan 21, 2025 · Jan 16, 2025 · Jan 19, 2025 · Jan 20, 2025
diff --git a/fiftyone/utils/open_clip.py b/fiftyone/utils/open_clip.py
@@ -5,6 +5,7 @@
 | `voxel51.com <https://voxel51.com/>`_
 |
 """
+import contextlib
 import logging
 
 import fiftyone.core.models as fom
@@ -57,6 +58,7 @@ class TorchOpenClipModel(fout.TorchImageModel, fom.PromptMixin):
     def __init__(self, config):
         super().__init__(config)
         self._text_features = None
+        self.preprocess = self._preprocess_aux  # set in _load_model()
 
     @property
     def can_embed_prompts(self):
@@ -88,7 +90,7 @@ def _load_model(self, config):
         (
             self._model,
             _,
-            self.preprocess,
+            self._preprocess_aux,
         ) = open_clip.create_model_and_transforms(
             config.clip_model,
             pretrained=config.pretrained,
@@ -134,20 +136,23 @@ def _get_class_logits(self, text_features, image_features):
 
     def _predict_all(self, imgs):
         if self._preprocess:
-            imgs = [self._preprocess(img).unsqueeze(0) for img in imgs]
+            imgs = [self._preprocess(img) for img in imgs]
 
         if isinstance(imgs, (list, tuple)):
             imgs = torch.stack(imgs)
 
         height, width = imgs.size()[-2:]
         frame_size = (width, height)
 
-        if self._using_gpu:
-            imgs = imgs.to(self.device)
+        with torch.no_grad(), contextlib.ExitStack() as ctx:
+            if self._using_gpu:
+                imgs = imgs.to(self.device)
+
+                # https://github.com/voxel51/fiftyone/pull/5395#issuecomment-2601055784
+                ctx.enter_context(
+                    torch.amp.autocast(device_type=self.device.type)
+                )
 
-        with torch.no_grad(), torch.amp.autocast(
-            device_type=self.device.type if self._using_gpu else "cpu"
-        ):
             image_features = self._model.encode_image(imgs)
             text_features = self._get_text_features()