From 07ada5bd0bde125f3e3cc3b718c5bcf97b8b05e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 29 Aug 2023 12:16:45 +0200 Subject: [PATCH] feat: disable matcher predictor for category --- doc/explanations/category-prediction.md | 20 +---------- doc/introduction/architecture.md | 8 +---- robotoff/app/api.py | 2 +- robotoff/cli/main.py | 15 -------- robotoff/prediction/category/matcher.py | 7 ++++ robotoff/scheduler/__init__.py | 34 +------------------ robotoff/workers/main.py | 2 -- robotoff/workers/tasks/product_updated.py | 10 ++---- .../workers/tasks/test_product_updated.py | 8 ----- 9 files changed, 14 insertions(+), 92 deletions(-) diff --git a/doc/explanations/category-prediction.md b/doc/explanations/category-prediction.md index 1107e0f5a0..38062a7853 100644 --- a/doc/explanations/category-prediction.md +++ b/doc/explanations/category-prediction.md @@ -4,24 +4,6 @@ Knowing the category of each product is critically important at Open Food Facts, In Open Food Facts, more 12,500 categories exist in the [category taxonomy](https://static.openfoodfacts.org/data/taxonomies/categories.full.json) (as of March 2023). Category prediction using product meta-data was one the first project developed as part of Robotoff in 2018. -Two complementary approaches currently exist in production to predict categories: a matching-based approach and a machine learning one. - -## Matcher - -A simple "matcher" algorithm is used to predict categories from product names. This used to be done using Elasticsearch but it's directly included in Robotoff codebase [^matcher]. It currently works for the following languages: `fr`, `en`, `de`, `es`, `it`, `nl`. 
-The product name and all category names in target languages are preprocessed with the following pipeline: - -- lowercasing -- language-specific stop word removal -- language-specific lookup-based lemmatization: fast and independent of part of speech for speed and simplicity -- text normalization and accent stripping - -Then a category is predicted if the category name is a substring of the product name. - -Many false positive came from the fact some category names were also ingredients: category *fraise* matched product name *jus de fraise*. To prevent this, we only allow non-full matches (full match=the two preprocessed string are the same) to occur for an ingredient category if the match starts at the beginning of the product name. There are still false positive in English as adjectives come before nouns (ex: *strawberry juice*), so partial matching for ingredient categories is disabled for English. - -## ML prediction - A neural network model is used to predict categories [^neural]. Details about the model training, results and model assets are available on the [model robotoff-models release page](/~https://github.com/openfoodfacts/robotoff-models/releases/tag/keras-category-classifier-image-embeddings-3.0). This model takes as inputs (all inputs are optional): @@ -53,6 +35,6 @@ Here is a summary on the milestones in category detection: - 2022-10 | Remove Elasticsearch-based category predictor, switch to custom model in Robotoff codebase - 2023-03 | Deployment of the [v3 model](/~https://github.com/openfoodfacts/robotoff-models/releases/tag/keras-category-classifier-image-embeddings-3.0) +- 2023-08 | Disabling of the `matcher` predictor: an analysis through Hunger Games showed that most errors were due to the `matcher` predictor, and that the `neural` predictor gave accurate predictions most of the time for products for which the `matcher` predictor failed. 
-[^matcher]: see `robotoff.prediction.category.matcher` [^neural]: see `robotoff.prediction.category.neural` \ No newline at end of file diff --git a/doc/introduction/architecture.md b/doc/introduction/architecture.md index a20da7c2c6..b0273528e3 100644 --- a/doc/introduction/architecture.md +++ b/doc/introduction/architecture.md @@ -75,15 +75,9 @@ Robotoff is also notified by Product Opener every time a product is updated or d Robotoff also depends on the following services: -- a single node Elasticsearch instance, used to: - - infer the product category from the product name, using an improved string matching algorithm. [^predict_category] (used in conjunction with ML detection) - - index all logos to run ANN search for automatic logo classification [^logos] +- a single node Elasticsearch instance, used to index all logos to run ANN search for automatic logo classification [^logos] - a Triton instance, used to serve object detection models (nutriscore, nutrition-table, universal-logo-detector) [^robotoff_ml]. -- a Tensorflow Serving instance, used to serve the category detection model. We're going to get rid of Tensorflow Serving once a new categorizer is trained. [^robotoff_ml] -- [robotoff-ann](/~https://github.com/openfoodfacts/robotoff-ann/) which uses an approximate KNN approach to predict logo label - MongoDB, to fetch the product latest version without querying Product Opener API. 
-[^predict_category]: see `robotoff.prediction.category.matcher` - [^robotoff_ml]: see `docker/ml.yml` diff --git a/robotoff/app/api.py b/robotoff/app/api.py index fd5b2b97a8..5d0eed15e7 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -565,7 +565,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): f"category predictor is only available for 'off' server type (here: '{server_type.name}')" ) - predictors: list[str] = req.media.get("predictors") or ["neural", "matcher"] + predictors: list[str] = req.media.get("predictors") or ["neural"] neural_model_name = None if (neural_model_name_str := req.media.get("neural_model_name")) is not None: neural_model_name = NeuralCategoryClassifierModel[neural_model_name_str] diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index df10808600..8b72690067 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -113,21 +113,6 @@ def generate_ocr_predictions( ) -@app.command() -def predict_category(output: str) -> None: - """Predict categories from the product JSONL dataset stored in `datasets` - directory.""" - from robotoff import settings - from robotoff.prediction.category.matcher import predict_from_dataset - from robotoff.products import ProductDataset - from robotoff.utils import dump_jsonl - - dataset = ProductDataset(settings.JSONL_DATASET_PATH) - insights = predict_from_dataset(dataset) - dict_insights = (i.to_dict() for i in insights) - dump_jsonl(output, dict_insights) - - @app.command() def download_dataset(minify: bool = False) -> None: """Download Open Food Facts dataset and save it in `datasets` directory.""" diff --git a/robotoff/prediction/category/matcher.py b/robotoff/prediction/category/matcher.py index b7f8322a35..cab7911ad4 100644 --- a/robotoff/prediction/category/matcher.py +++ b/robotoff/prediction/category/matcher.py @@ -1,3 +1,10 @@ +"""Simple "matcher" algorithm is used to predict categories from product names. 
+ +It's currently disabled, as categorization errors mostly come from the matcher +predictor on Hunger Games, and as the neural categorizer almost always returns +more accurate predictions for products for which the matcher predictor fails. +""" + import datetime import functools import itertools diff --git a/robotoff/scheduler/__init__.py b/robotoff/scheduler/__init__.py index 2222508634..3b40e08227 100644 --- a/robotoff/scheduler/__init__.py +++ b/robotoff/scheduler/__init__.py @@ -13,21 +13,15 @@ from robotoff import settings, slack from robotoff.insights.annotate import UPDATED_ANNOTATION_RESULT, annotate -from robotoff.insights.importer import ( - BrandInsightImporter, - import_insights, - is_valid_insight_image, -) +from robotoff.insights.importer import BrandInsightImporter, is_valid_insight_image from robotoff.metrics import ( ensure_influx_database, save_facet_metrics, save_insight_metrics, ) from robotoff.models import Prediction, ProductInsight, db -from robotoff.prediction.category.matcher import predict_from_dataset from robotoff.products import ( Product, - ProductDataset, fetch_dataset, get_min_product_store, has_dataset_changed, @@ -294,26 +288,6 @@ def _update_data(): logger.exception("Exception during product dataset refresh") -def generate_insights() -> None: - """Generate and import category insights from the latest dataset dump, for - products added at day-1.""" - logger.info("Generating new category insights") - - datetime_threshold = datetime.datetime.utcnow().replace( - hour=0, minute=0, second=0, microsecond=0 - ) - datetime.timedelta(days=1) - dataset = ProductDataset(settings.JSONL_DATASET_PATH) - product_predictions_iter = predict_from_dataset(dataset, datetime_threshold) - - with db: - import_result = import_insights( - product_predictions_iter, - # Currently the JSONL dataset is OFF-only - server_type=ServerType.off, - ) - logger.info(import_result) - - def transform_insight_iter(insights_iter: Iterable[dict]): for insight in 
insights_iter: for field, value in insight.items(): @@ -366,12 +340,6 @@ def run(): max_instances=1, ) - # This job generates category insights using matcher algorithm from the - # last Product Opener data dump. - scheduler.add_job( - generate_insights, "cron", day="*", hour="10", minute=15, max_instances=1 - ) - scheduler.add_job( generate_quality_facets, "cron", diff --git a/robotoff/workers/main.py b/robotoff/workers/main.py index 41179b4cb6..5871a5e607 100644 --- a/robotoff/workers/main.py +++ b/robotoff/workers/main.py @@ -23,10 +23,8 @@ def load_resources(refresh: bool = False): logger.info("Loading resources in memory...") from robotoff import brands, logos, taxonomy - from robotoff.prediction.category import matcher from robotoff.prediction.object_detection import ObjectDetectionModelRegistry - matcher.load_resources() taxonomy.load_resources() logos.load_resources() brands.load_resources() diff --git a/robotoff/workers/tasks/product_updated.py b/robotoff/workers/tasks/product_updated.py index 009fae84ff..ba6f3f3f97 100644 --- a/robotoff/workers/tasks/product_updated.py +++ b/robotoff/workers/tasks/product_updated.py @@ -3,7 +3,6 @@ from robotoff.insights.extraction import get_predictions_from_product_name from robotoff.insights.importer import import_insights, refresh_insights from robotoff.models import with_db -from robotoff.prediction.category.matcher import predict as predict_category_matcher from robotoff.prediction.category.neural.category_classifier import CategoryClassifier from robotoff.products import get_product from robotoff.redis import Lock, LockedResourceException @@ -55,7 +54,7 @@ def update_insights_job(product_id: ProductIdentifier): ) -def add_category_insight(product_id: ProductIdentifier, product: JSONType): +def add_category_insight(product_id: ProductIdentifier, product: JSONType) -> None: """Predict categories for product and import predicted category insight. 
:param product_id: identifier of the product @@ -68,21 +67,18 @@ def add_category_insight(product_id: ProductIdentifier, product: JSONType): ) return - logger.info("Predicting product categories...") - # predict category using matching algorithm on product name - product_predictions = predict_category_matcher(product) - # predict category using neural model try: neural_predictions, _ = CategoryClassifier( get_taxonomy(TaxonomyType.category.name) ).predict(product, product_id) - product_predictions += neural_predictions + product_predictions = neural_predictions except requests.exceptions.HTTPError as e: resp = e.response logger.error( f"Category classifier returned an error: {resp.status_code}: %s", resp.text ) + return if len(product_predictions) < 1: return diff --git a/tests/unit/workers/tasks/test_product_updated.py b/tests/unit/workers/tasks/test_product_updated.py index ae5e0c4db2..573dab880f 100644 --- a/tests/unit/workers/tasks/test_product_updated.py +++ b/tests/unit/workers/tasks/test_product_updated.py @@ -16,10 +16,6 @@ def test_add_category_insight_no_insights(mocker): - mocker.patch( - "robotoff.workers.tasks.product_updated.predict_category_matcher", - return_value=[], - ) mocker.patch( "robotoff.workers.tasks.product_updated.CategoryClassifier.predict", return_value=([], {}), @@ -43,10 +39,6 @@ def test_add_category_insight_with_ml_insights(mocker): confidence=0.9, server_type=DEFAULT_PRODUCT_ID.server_type, ) - mocker.patch( - "robotoff.workers.tasks.product_updated.predict_category_matcher", - return_value=[], - ) mocker.patch( "robotoff.workers.tasks.product_updated.CategoryClassifier.predict", return_value=([expected_prediction], {}),