Skip to content

Commit

Permalink
feat: add a disk cache mechanism to cache images
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Oct 27, 2023
1 parent 5d8c007 commit 0330216
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ site/
gh_pages/
doc/README.md
doc/references/cli.md
data/diskcache
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ toml = "~0.10.2"
openfoodfacts = "0.1.10"
imagehash = "~4.3.1"
peewee-migrate = "~1.12.2"
diskcache = "~5.6.3"

[tool.poetry.dependencies.sentry-sdk]
version = "~1.14.0"
Expand Down
10 changes: 8 additions & 2 deletions robotoff/app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,11 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
x_min = req.get_param_as_float("x_min", required=True)
y_max = req.get_param_as_float("y_max", required=True)
x_max = req.get_param_as_float("x_max", required=True)
image = get_image_from_url(image_url, session=http_session, error_raise=False)
# Get image from cache, as Hunger Games can request many crops
# from the same image
image = get_image_from_url(
image_url, session=http_session, error_raise=False, use_cache=True
)

if image is None:
raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
Expand Down Expand Up @@ -799,7 +803,9 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
"when `output_image` is True",
)

image = get_image_from_url(image_url, session=http_session, error_raise=False)
image = get_image_from_url(
image_url, session=http_session, error_raise=False, use_cache=True
)

if image is None:
raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
Expand Down
11 changes: 8 additions & 3 deletions robotoff/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def save_image(
source_image: str,
image_url: str,
images: Optional[JSONType],
use_cache: bool = False,
) -> Optional[ImageModel]:
"""Save imported image details in DB.
Expand Down Expand Up @@ -83,7 +84,9 @@ def save_image(
# MongoDB (in the `images` field), we download the image to know the
# image size
logger.info("DB Product check disabled, downloading image to get image size")
image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=use_cache
)

if image is None:
logger.info("Could not import image %s in DB", image_url)
Expand Down Expand Up @@ -131,7 +134,7 @@ def refresh_images_in_db(product_id: ProductIdentifier, images: JSONType):
source_image = generate_image_path(product_id, missing_image_id)
image_url = generate_image_url(product_id, missing_image_id)
logger.debug("Creating missing image %s in DB", source_image)
save_image(product_id, source_image, image_url, images)
save_image(product_id, source_image, image_url, images, use_cache=True)


def add_image_fingerprint(image_model: ImageModel):
Expand All @@ -140,7 +143,9 @@ def add_image_fingerprint(image_model: ImageModel):
:param image_model: the image model to update
"""
image_url = image_model.get_image_url()
image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def generate_image_embeddings(
generate_image_url(product_id, f"{image_id}.400"),
error_raise=False,
session=http_session,
use_cache=True,
)
for image_id in missing_embedding_ids
}
Expand Down
4 changes: 4 additions & 0 deletions robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,7 @@ def get_package_version() -> str:
# (/~https://github.com/klen/peewee_migrate)
# Migrations are automatically applied when the API service is launched
MIGRATE_DIR = PROJECT_DIR / "migrations"


# Path of the local disk cache, see robotoff.cache for more information
DISKCACHE_DIR = DATA_DIR / "diskcache"
47 changes: 45 additions & 2 deletions robotoff/utils/cache.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,46 @@
from robotoff.utils import get_logger
from typing import Callable

logger = get_logger(__name__)
from diskcache import Cache

from robotoff import settings

# Disk-cache to store any kind of content (but currently mostly images).
# It avoids having to download multiple times the same image from the server,
# with a reasonable disk usage (default to 1GB).
# diskcache Cache is thread-safe and process-safe, and every transaction is
# atomic. We can therefore define a single cache here and use it across the
# project.
disk_cache = Cache(settings.DISKCACHE_DIR)


def cache_http_request(
    key: str,
    func: Callable,
    cache_expire: int | None = None,
    tag: str | None = None,
    *args,
    **kwargs,
) -> bytes | None:
    """Cache the raw response (bytes) of an HTTP request.

    :param key: the cache key
    :param func: the function to call on cache miss; it receives `*args` and
        `**kwargs` and must return a Response-like object exposing a
        `content` attribute (e.g. `requests.Response`), or None on error
    :param cache_expire: expiration time of the item in the cache (in
        seconds), defaults to None (no expiration)
    :param tag: a tag of the item in the cache (optional), defaults to None
    :return: the response bytes or None if an error occurred while calling
        `func`
    """
    # Check if the item is already cached, and use it instead of sending
    # the HTTP request if it is
    content_bytes = disk_cache.get(key)
    if content_bytes is None:
        r = func(*args, **kwargs)
        if r is None:
            # Don't save in cache if an error (or HTTP 404) occurred
            return None
        content_bytes = r.content
        # We store the raw byte content of the response in the cache
        disk_cache.set(key, content_bytes, expire=cache_expire, tag=tag)

    return content_bytes
67 changes: 49 additions & 18 deletions robotoff/utils/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from robotoff import settings

from .cache import cache_http_request
from .logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -44,17 +45,60 @@ class ImageLoadingException(Exception):
def get_image_from_url(
    image_url: str,
    error_raise: bool = True,
    session: requests.Session | None = None,
    use_cache: bool = False,
    cache_expire: int = 86400,
) -> Image.Image | None:
    """Fetch an image from `image_url` and load it.

    :param image_url: URL of the image to load
    :param error_raise: if True, raises a `ImageLoadingException` if an error
        occurred, defaults to True. If False, None is returned if an error
        occurred.
    :param session: requests Session to use, by default no session is used.
    :param use_cache: if True, we use the local file cache (and save the
        result in the cache in case of cache miss)
    :param cache_expire: the expiration value of the item in the cache (in
        seconds), default to 86400 (24h).
    :raises ImageLoadingException: if `error_raise` is True and the image
        could not be fetched or decoded
    :return: the Pillow Image or None.
    """
    if use_cache:
        content_bytes = cache_http_request(
            key=f"image:{image_url}",
            cache_expire=cache_expire,
            tag="image",
            func=_get_image_from_url,
            # kwargs passed to func
            image_url=image_url,
            error_raise=error_raise,
            session=session,
        )
        # cache_http_request returns None when the HTTP request failed
        # (and error_raise is False): propagate the failure instead of
        # crashing on BytesIO(None) below
        if content_bytes is None:
            return None
    else:
        r = _get_image_from_url(image_url, error_raise, session)
        if r is None:
            return None
        content_bytes = r.content

    try:
        return Image.open(BytesIO(content_bytes))
    except PIL.UnidentifiedImageError:
        error_message = f"Cannot identify image {image_url}"
        if error_raise:
            raise ImageLoadingException(error_message)
        logger.info(error_message)
    except PIL.Image.DecompressionBombError:
        error_message = f"Decompression bomb error for image {image_url}"
        if error_raise:
            raise ImageLoadingException(error_message)
        logger.info(error_message)

    return None


def _get_image_from_url(
image_url: str,
error_raise: bool = True,
session: Optional[requests.Session] = None,
) -> requests.Request | None:
auth = (
settings._off_net_auth
if urlparse(image_url).netloc.endswith("openfoodfacts.net")
Expand Down Expand Up @@ -84,17 +128,4 @@ def get_image_from_url(
)
return None

try:
return Image.open(BytesIO(r.content))
except PIL.UnidentifiedImageError:
error_message = f"Cannot identify image {image_url}"
if error_raise:
raise ImageLoadingException(error_message)
logger.info(error_message)
except PIL.Image.DecompressionBombError:
error_message = f"Decompression bomb error for image {image_url}"
if error_raise:
raise ImageLoadingException(error_message)
logger.info(error_message)

return None
return r
25 changes: 19 additions & 6 deletions robotoff/workers/tasks/import_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
2. Extracts the nutriscore prediction based on the nutriscore ML model.
3. Triggers the 'object_detection' task
4. Stores the imported image metadata in the Robotoff DB.
5. Computes the image fingerprint, for duplicate image detection.
"""
logger.info("Running `import_image` for %s, image %s", product_id, image_url)
source_image = get_source_from_url(image_url)
Expand All @@ -75,7 +76,9 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:

product_images: Optional[JSONType] = getattr(product, "images", None)
with db:
image_model = save_image(product_id, source_image, image_url, product_images)
image_model = save_image(
product_id, source_image, image_url, product_images, use_cache=True
)

if image_model is None:
# The image is invalid, no need to perform image extraction jobs
Expand Down Expand Up @@ -143,7 +146,9 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
def import_insights_from_image(
product_id: ProductIdentifier, image_url: str, ocr_url: str
):
image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -196,6 +201,8 @@ def save_image_job(batch: list[tuple[ProductIdentifier, str]], server_type: Serv
source_image,
image_url,
getattr(product, "images", None),
# set use_cache=False, as we process many images only once
use_cache=False,
)


Expand All @@ -206,7 +213,9 @@ def run_nutrition_table_object_detection(product_id: ProductIdentifier, image_ur
image_url,
)

image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -269,7 +278,7 @@ def run_upc_detection(product_id: ProductIdentifier, image_url: str) -> None:
# run upc detection
if (
image := get_image_from_url(
image_url, error_raise=False, session=http_session
image_url, error_raise=False, session=http_session, use_cache=True
)
) is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -319,7 +328,9 @@ def run_nutriscore_object_detection(product_id: ProductIdentifier, image_url: st
"Running nutriscore object detection for %s, image %s", product_id, image_url
)

image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down Expand Up @@ -385,7 +396,9 @@ def run_logo_object_detection(product_id: ProductIdentifier, image_url: str):
"""
logger.info("Running logo object detection for %s, image %s", product_id, image_url)

image = get_image_from_url(image_url, error_raise=False, session=http_session)
image = get_image_from_url(
image_url, error_raise=False, session=http_session, use_cache=True
)

if image is None:
logger.info("Error while downloading image %s", image_url)
Expand Down
Loading

0 comments on commit 0330216

Please sign in to comment.