From 997e98949c8ae3df9856c1a17c1f295aed73bf1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 26 Oct 2023 21:04:50 +0200 Subject: [PATCH] fix: mark images as deleted in DB when deleted on Product Opener --- robotoff/app/api.py | 1 + robotoff/images.py | 69 +++++++++++++++++- robotoff/workers/tasks/product_updated.py | 13 +++- tests/integration/test_images.py | 87 +++++++++++++++++++++++ 4 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 tests/integration/test_images.py diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 1f1c395ad9..305e4c3b5d 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -1217,6 +1217,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): settings.UPDATED_PRODUCT_WAIT, job_kwargs={"result_ttl": 0}, product_id=product_id, + diffs=diffs, ) elif action == "deleted": enqueue_job( diff --git a/robotoff/images.py b/robotoff/images.py index f00369d88a..6cb45f28a9 100644 --- a/robotoff/images.py +++ b/robotoff/images.py @@ -6,7 +6,7 @@ import numpy as np from PIL import Image -from robotoff.models import ImageModel +from robotoff.models import ImageModel, Prediction, ProductInsight from robotoff.off import generate_image_path, generate_image_url from robotoff.types import JSONType, ProductIdentifier from robotoff.utils import get_image_from_url, get_logger, http_session @@ -167,3 +167,70 @@ def generate_image_fingerprint(image: Image.Image) -> int: # convert the 64-bit array to a 64 bits integer fingerprint = int_array.dot(2 ** np.arange(int_array.size)[::-1]) return fingerprint + + +def delete_images(product_id: ProductIdentifier, image_ids: list[str]): + """Delete images and related items in DB. + + This function must be called when Robotoff gets notified of an image + deletion. It proceeds as follow: + + - mark the image as `deleted` in the `image` table + - delete all predictions associated with the image (`prediction` table) + - delete all non-annotated insights associated with the image + (`product_insight` table). Annotated insights are kept for reference. + + :param product_id: identifier of the product + :param image_ids: a list of image IDs to delete. + Each image ID must be a digit. + """ + server_type = product_id.server_type.name + # Perform batching as we don't know the number of images to delete + updated_models = [] + source_images = [] + for image_id in image_ids: + source_image = generate_image_path(product_id, image_id) + image_model = ImageModel.get_or_none( + source_image=source_image, server_type=server_type + ) + + if image_model is None: + logger.info( + "image to delete %s for product %s not found in DB, skipping", + image_id, + product_id, + ) + continue + + # set the `deleted` flag to True: image models are always kept in DB + image_model.deleted = True + updated_models.append(image_model) + source_images.append(source_image) + + updated_image_models: int = ImageModel.bulk_update( + updated_models, fields=["deleted"] + ) + deleted_predictions: int = ( + Prediction.delete() + .where( + Prediction.source_image.in_(source_images), + Prediction.server_type == server_type, + ) + .execute() + ) + deleted_insights: int = ( + ProductInsight.delete() + .where( + ProductInsight.source_image.in_(source_images), + ProductInsight.server_type == server_type, + ProductInsight.annotation.is_null(), + ) + .execute() + ) + + logger.info( + "deleted %s image in DB, %s deleted predictions, %s deleted insights", + updated_image_models, + deleted_predictions, + deleted_insights, + ) diff --git a/robotoff/workers/tasks/product_updated.py b/robotoff/workers/tasks/product_updated.py index ba6f3f3f97..2b85751979 100644 --- a/robotoff/workers/tasks/product_updated.py +++ b/robotoff/workers/tasks/product_updated.py @@ -1,5 +1,6 @@ import requests +from robotoff.images import delete_images from robotoff.insights.extraction import get_predictions_from_product_name from robotoff.insights.importer import import_insights, refresh_insights from robotoff.models import with_db @@ -14,7 +15,7 @@ @with_db -def update_insights_job(product_id: ProductIdentifier): +def update_insights_job(product_id: ProductIdentifier, diffs: JSONType) -> None: """This job is triggered by the webhook API, when product information has been updated. @@ -22,6 +23,10 @@ def update_insights_job(product_id: ProductIdentifier): 1. Generate new predictions related to the product's category and name. 2. Regenerate all insights from the product associated predictions. + + :param product_id: identifier of the product + :param diffs: a dict containing a diff of the update, the format is + defined by Product Opener """ logger.info("Running `update_insights` for %s", product_id) @@ -36,6 +41,12 @@ def update_insights_job(product_id: ProductIdentifier): # reprocessing with another task arriving concurrently. # The expire is there only in case the lock is not released # (process killed) + deleted_images = diffs.get("uploaded_images", {}).get("delete") + if deleted_images: + # deleted_images is a list of image IDs that have been deleted + logger.info("images deleted: %s, launching DB update", deleted_images) + delete_images(product_id, deleted_images) + product_dict = get_product(product_id) if product_dict is None: diff --git a/tests/integration/test_images.py b/tests/integration/test_images.py new file mode 100644 index 0000000000..870650f79c --- /dev/null +++ b/tests/integration/test_images.py @@ -0,0 +1,87 @@ +import pytest + +from robotoff.images import delete_images +from robotoff.models import ImageModel, Prediction, ProductInsight +from robotoff.off import generate_image_path +from robotoff.types import ProductIdentifier, ServerType + +from .models_utils import ( + ImageModelFactory, + PredictionFactory, + ProductInsightFactory, + clean_db, +) + +DEFAULT_SERVER_TYPE = ServerType.off + + +@pytest.fixture(autouse=True) +def _set_up_and_tear_down(peewee_db): + with peewee_db: + clean_db() + # Run the test case. + yield + + with peewee_db: + clean_db() + + +def test_delete_images(peewee_db, mocker): + with peewee_db: + barcode = "1" + image_id = "1" + product_id = ProductIdentifier(barcode, DEFAULT_SERVER_TYPE) + source_image = generate_image_path(product_id, image_id) + # to be deleted + image_1 = ImageModelFactory(barcode=barcode, image_id=image_id) + # to be kept (different image ID) + image_2 = ImageModelFactory(barcode=barcode, image_id="2") + # to be kept (different barcode) + image_3 = ImageModelFactory(barcode="2", image_id=image_id) + # to be kept (same barcode/image ID, but different server type) + image_4 = ImageModelFactory( + barcode=barcode, image_id=image_id, server_type="obf" + ) + # to be deleted + prediction_1 = PredictionFactory(barcode=barcode, source_image=source_image) + # to be kept (different barcode) + prediction_2 = PredictionFactory(barcode="2") + # to be kept (different server type) + prediction_3 = PredictionFactory( + barcode=barcode, server_type="obf", source_image=source_image + ) + # to be deleted + insight_1 = ProductInsightFactory( + barcode=barcode, source_image=source_image, annotation=None + ) + # to be kept + insight_2 = ProductInsightFactory( + barcode=barcode, source_image=source_image, annotation=1 + ) + # to be kept + insight_3 = ProductInsightFactory(barcode="2", annotation=None) + # to be kept (different server type) + insight_4 = ProductInsightFactory( + barcode="1", server_type="obf", source_image=source_image, annotation=None + ) + delete_images(product_id, [image_id]) + + assert ImageModel.get_or_none(id=image_1.id).deleted + assert not ImageModel.get_or_none(id=image_2.id).deleted + assert not ImageModel.get_or_none(id=image_3.id).deleted + assert not ImageModel.get_or_none(id=image_4.id).deleted + + # the prediction associated with the image should be deleted + assert Prediction.get_or_none(id=prediction_1.id) is None + # but not the unrelated ones + assert Prediction.get_or_none(id=prediction_2.id) is not None + assert Prediction.get_or_none(id=prediction_3.id) is not None + + # the unannotated product insight associated with the image should be + # deleted + assert ProductInsight.get_or_none(id=insight_1.id) is None + # but not the annotated one + assert ProductInsight.get_or_none(id=insight_2.id) is not None + # nor the ones unrelated to the deleted image + assert ProductInsight.get_or_none(id=insight_3.id) is not None + assert ProductInsight.get_or_none(id=insight_4.id) is not None