Skip to content

Commit

Permalink
feat: add a disk cache mechanism to cache images
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Oct 27, 2023
1 parent 5d8c007 commit 0330216
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ site/
gh_pages/
doc/README.md
doc/references/cli.md
data/diskcache
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ toml = "~0.10.2"
openfoodfacts = "0.1.10"
imagehash = "~4.3.1"
peewee-migrate = "~1.12.2"
diskcache = "~5.6.3"

[tool.poetry.dependencies.sentry-sdk]
version = "~1.14.0"
Expand Down
10 changes: 8 additions & 2 deletions robotoff/app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,11 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
x_min = req.get_param_as_float("x_min", required=True)
y_max = req.get_param_as_float("y_max", required=True)
x_max = req.get_param_as_float("x_max", required=True)
image = get_image_from_url(image_url, session=http_session, error_raise=False)
# Get image from cache, as Hunger Games can request many crops
# from the same image
image = get_image_from_url(
image_url, session=http_session, error_raise=False, use_cache=True
)

if image is None:
raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
Expand Down Expand Up @@ -799,7 +803,9 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
"when `output_image` is True",
)

image = get_image_from_url(image_url, session=http_session, error_raise=False)
image = get_image_from_url(
image_url, session=http_session, error_raise=False, use_cache=True
)

if image is None:
raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
Expand Down
11 changes: 8 additions & 3 deletions robotoff/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def save_image(
source_image: str,
image_url: str,
images: Optional[JSONType],
use_cache: bool = False,
) -> Optional[ImageModel]:
"""Save imported image details in DB.
Expand Down Expand Up @@ -83,7 +84,9 @@ def save_image(
# MongoDB (in the `images` field), we download the image to know the
# image size
logger.info("DB Product check disabled, downloading image to get image size")
image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=use_cache
)

if image is None:
logger.info("Could not import image %s in DB", image_url)
Expand Down Expand Up @@ -131,7 +134,7 @@ def refresh_images_in_db(product_id: ProductIdentifier, images: JSONType):
source_image = generate_image_path(product_id, missing_image_id)
image_url = generate_image_url(product_id, missing_image_id)
logger.debug("Creating missing image %s in DB", source_image)
save_image(product_id, source_image, image_url, images)
save_image(product_id, source_image, image_url, images, use_cache=True)


def add_image_fingerprint(image_model: ImageModel):
Expand All @@ -140,7 +143,9 @@ def add_image_fingerprint(image_model: ImageModel):
:param image_model: the image model to update
"""
image_url = image_model.get_image_url()
image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def generate_image_embeddings(
generate_image_url(product_id, f"{image_id}.400"),
error_raise=False,
session=http_session,
use_cache=True,
)
for image_id in missing_embedding_ids
}
Expand Down
4 changes: 4 additions & 0 deletions robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,7 @@ def get_package_version() -> str:
# (/~https://github.com/klen/peewee_migrate)
# Migrations are automatically applied when the API service is launched
MIGRATE_DIR = PROJECT_DIR / "migrations"


# Path of the local disk cache, see robotoff.cache for more information
DISKCACHE_DIR = DATA_DIR / "diskcache"
47 changes: 45 additions & 2 deletions robotoff/utils/cache.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,46 @@
from robotoff.utils import get_logger
from typing import Callable

logger = get_logger(__name__)
from diskcache import Cache

from robotoff import settings

# Disk-cache to store any kind of content (but currently mostly images).
# It avoids having to download multiple times the same image from the server,
# with a reasonable disk usage (default to 1GB).
# diskcache Cache is thread-safe and process-safe, and every transaction is
# atomic. We can therefore define a single cache here and use it across the
# project.
disk_cache = Cache(settings.DISKCACHE_DIR)


def cache_http_request(
    key: str,
    func: Callable,
    cache_expire: int | None = None,
    tag: str | None = None,
    *args,
    **kwargs,
) -> bytes | None:
    """Cache the raw response (bytes) of an HTTP request.

    :param key: the cache key
    :param func: the function to call on cache miss; it receives `*args` and
        `**kwargs` and must return a Response-like object exposing a
        `content` attribute (e.g. `requests.Response`), or None on error
    :param cache_expire: expiration time of the item in the cache (in
        seconds), defaults to None (no expiration)
    :param tag: a tag of the item in the cache (optional), defaults to None
    :return: the response bytes or None if an error occurred while calling
        `func`
    """
    # Check if the item is already cached, and use it instead of sending
    # the HTTP request if it is
    content_bytes = disk_cache.get(key)
    if content_bytes is None:
        r = func(*args, **kwargs)
        if r is None:
            # Don't save in cache if an error (or HTTP 404) occurred
            return None
        content_bytes = r.content
        # We store the raw byte content of the response in the cache
        disk_cache.set(key, content_bytes, expire=cache_expire, tag=tag)

    return content_bytes
67 changes: 49 additions & 18 deletions robotoff/utils/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from robotoff import settings

from .cache import cache_http_request
from .logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -44,17 +45,60 @@ class ImageLoadingException(Exception):
def get_image_from_url(
    image_url: str,
    error_raise: bool = True,
    session: requests.Session | None = None,
    use_cache: bool = False,
    cache_expire: int = 86400,
) -> Image.Image | None:
    """Fetch an image from `image_url` and load it.

    :param image_url: URL of the image to load
    :param error_raise: if True, raises a `ImageLoadingException` if an error
        occurred, defaults to True. If False, None is returned if an error
        occurred.
    :param session: requests Session to use, by default no session is used.
    :param use_cache: if True, we use the local file cache (and save the
        result in the cache in case of cache miss)
    :param cache_expire: the expiration value of the item in the cache (in
        seconds), default to 86400 (24h).
    :raises ImageLoadingException: if `error_raise` is True and the image
        could not be fetched or decoded
    :return: the Pillow Image or None.
    """
    if use_cache:
        content_bytes = cache_http_request(
            key=f"image:{image_url}",
            cache_expire=cache_expire,
            tag="image",
            func=_get_image_from_url,
            # kwargs passed to func
            image_url=image_url,
            error_raise=error_raise,
            session=session,
        )
        # cache_http_request returns None when the HTTP request failed
        # (and error_raise is False): propagate the failure instead of
        # crashing on BytesIO(None) below
        if content_bytes is None:
            return None
    else:
        r = _get_image_from_url(image_url, error_raise, session)
        if r is None:
            return None
        content_bytes = r.content

    try:
        return Image.open(BytesIO(content_bytes))
    except PIL.UnidentifiedImageError:
        error_message = f"Cannot identify image {image_url}"
        if error_raise:
            raise ImageLoadingException(error_message)
        logger.info(error_message)
    except PIL.Image.DecompressionBombError:
        error_message = f"Decompression bomb error for image {image_url}"
        if error_raise:
            raise ImageLoadingException(error_message)
        logger.info(error_message)

    return None


def _get_image_from_url(
image_url: str,
error_raise: bool = True,
session: Optional[requests.Session] = None,
) -> requests.Request | None:
auth = (
settings._off_net_auth
if urlparse(image_url).netloc.endswith("openfoodfacts.net")
Expand Down Expand Up @@ -84,17 +128,4 @@ def get_image_from_url(
)
return None

try:
return Image.open(BytesIO(r.content))
except PIL.UnidentifiedImageError:
error_message = f"Cannot identify image {image_url}"
if error_raise:
raise ImageLoadingException(error_message)
logger.info(error_message)
except PIL.Image.DecompressionBombError:
error_message = f"Decompression bomb error for image {image_url}"
if error_raise:
raise ImageLoadingException(error_message)
logger.info(error_message)

return None
return r
25 changes: 19 additions & 6 deletions robotoff/workers/tasks/import_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
2. Extracts the nutriscore prediction based on the nutriscore ML model.
3. Triggers the 'object_detection' task
4. Stores the imported image metadata in the Robotoff DB.
5. Computes the image fingerprint, for duplicate image detection.
"""
logger.info("Running `import_image` for %s, image %s", product_id, image_url)
source_image = get_source_from_url(image_url)
Expand All @@ -75,7 +76,9 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:

product_images: Optional[JSONType] = getattr(product, "images", None)
with db:
image_model = save_image(product_id, source_image, image_url, product_images)
image_model = save_image(
product_id, source_image, image_url, product_images, use_cache=True
)

if image_model is None:
# The image is invalid, no need to perform image extraction jobs
Expand Down Expand Up @@ -143,7 +146,9 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
def import_insights_from_image(
product_id: ProductIdentifier, image_url: str, ocr_url: str
):
image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -196,6 +201,8 @@ def save_image_job(batch: list[tuple[ProductIdentifier, str]], server_type: Serv
source_image,
image_url,
getattr(product, "images", None),
# set use_cache=False, as we process many images only once
use_cache=False,
)


Expand All @@ -206,7 +213,9 @@ def run_nutrition_table_object_detection(product_id: ProductIdentifier, image_ur
image_url,
)

image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -269,7 +278,7 @@ def run_upc_detection(product_id: ProductIdentifier, image_url: str) -> None:
# run upc detection
if (
image := get_image_from_url(
image_url, error_raise=False, session=http_session
image_url, error_raise=False, session=http_session, use_cache=True
)
) is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -319,7 +328,9 @@ def run_nutriscore_object_detection(product_id: ProductIdentifier, image_url: st
"Running nutriscore object detection for %s, image %s", product_id, image_url
)

image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -385,7 +396,9 @@ def run_logo_object_detection(product_id: ProductIdentifier, image_url: str):
"""
logger.info("Running logo object detection for %s, image %s", product_id, image_url)

image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down
Loading

0 comments on commit 0330216

Please sign in to comment.