From b86795c3559d0076a05ac048b8c5c571efbae2ea Mon Sep 17 00:00:00 2001
From: Simon Jenkins
Date: Mon, 1 Aug 2022 15:57:10 -0700
Subject: [PATCH 1/2] Initial service commit

---
 .gitignore                                   |   4 +
 README.md                                    |  36 +++
 api.py                                       |  45 ++++
 docker-compose.yml                           | 144 +++++++++++
 models/__init__.py                           |   0
 models/product.py                            | 241 +++++++++++++++++++
 models/request.py                            |  46 ++++
 requirements.txt                             |  24 ++
 scripts/data-fields.txt                      | 179 ++++++++++++++
 scripts/es_query.py                          |  24 ++
 scripts/generate_product_from_data_fields.py |  76 ++++++
 scripts/http_query.py                        |  33 +++
 scripts/perform_import.py                    |  91 +++++++
 utils/__init__.py                            |   0
 utils/analyzers.py                           |   8 +
 utils/connection.py                          |   5 +
 utils/constants.py                           |  59 +++++
 utils/response.py                            |  28 +++
 18 files changed, 1043 insertions(+)
 create mode 100644 api.py
 create mode 100644 docker-compose.yml
 create mode 100644 models/__init__.py
 create mode 100644 models/product.py
 create mode 100644 models/request.py
 create mode 100644 requirements.txt
 create mode 100644 scripts/data-fields.txt
 create mode 100644 scripts/es_query.py
 create mode 100644 scripts/generate_product_from_data_fields.py
 create mode 100644 scripts/http_query.py
 create mode 100644 scripts/perform_import.py
 create mode 100644 utils/__init__.py
 create mode 100644 utils/analyzers.py
 create mode 100644 utils/connection.py
 create mode 100644 utils/constants.py
 create mode 100644 utils/response.py

diff --git a/.gitignore b/.gitignore
index b6e47617..58669e3d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# PyCharm and macOS
+.idea/
+.DS_Store
\ No newline at end of file
diff --git a/README.md b/README.md
index cca41f6d..799502cf 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,38 @@
 # openfoodfacts-search
 Open Food Facts Search API V3 using ElasticSearch - https://wiki.openfoodfacts.org/Search_API_V3
+
+This API is currently in development. It is not serving any production traffic. The [Work Plan](https://wiki.openfoodfacts.org/Search_API_V3#Work_Plan) will be updated as development continues.
+
+### Organization
+The main file is `api.py`, and the Product schema is in `models/product.py`.
+
+The `scripts/` directory contains various scripts for manual validation, constructing the product schema, importing, etc.
+
+### Running locally
+Docker spins up:
+- Two elasticsearch nodes
+- [Elasticvue](https://elasticvue.com/)
+
+You will then need to import from CSV (see instructions below).
+
+Make sure your environment is configured:
+```commandline
+export ELASTIC_PASSWORD=PASSWORD_HERE
+```
+
+
+### Helpful commands:
+
+To start docker:
+```console
+docker-compose up -d
+```
+
+To start server:
+```console
+uvicorn api:app --reload
+```
+
+To import data from the [CSV export](https://world.openfoodfacts.org/data):
+```console
+python scripts/perform_import.py --filename=/path/to/file.csv
\ No newline at end of file
diff --git a/api.py b/api.py
new file mode 100644
index 00000000..4ad2cb8b
--- /dev/null
+++ b/api.py
@@ -0,0 +1,45 @@
+from elasticsearch_dsl import Q
+from fastapi import FastAPI, HTTPException
+
+from models.product import Product
+from models.request import AutocompleteRequest, SearchRequest
+from utils import connection, constants, response
+
+app = FastAPI()
+connection.get_connection()
+
+
+# TODO: Remove this commented out code, so that it's not confusing about where the current GET API is served
+# (retaining temporarily as a proof of concept)
+# @app.get("/{barcode}")
+# def get_product(barcode: str):
+#     results = Product.search().query("match", code=barcode).execute()
+#     results_dict = [r.to_dict() for r in results]
+#
+#     if not results_dict:
+#         raise HTTPException(status_code=404, detail="Barcode not found")
+#
+#     product = results_dict[0]
+#     return product
+
+@app.post("/autocomplete")
+def autocomplete(request: AutocompleteRequest):
+    # TODO: This function needs unit testing
+    if not request.search_fields:
+        request.search_fields = constants.AUTOCOMPLETE_FIELDS
+    for field in request.search_fields:
+        if field not in constants.AUTOCOMPLETE_FIELDS:
+            raise HTTPException(status_code=400, detail="Invalid field: {}".format(field))
+
+    match_queries = []
+    for field in request.search_fields:
+        match_queries.append(Q('match', **{field: request.text}))
+
+    results = Product.search().query('bool', should=match_queries).extra(size=request.get_num_results()).execute()
+    resp = response.create_response(results, request)
+    return resp
+
+
+@app.post("/search")
+def search(request: SearchRequest):
+    raise NotImplementedError()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..b9aabc16
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,144 @@
+version: "2.2"
+
+services:
+  setup:
+    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
+    volumes:
+      - certs:/usr/share/elasticsearch/config/certs
+    user: "0"
+    command: >
+      bash -c '
+        if [ x${ELASTIC_PASSWORD} == x ]; then
+          echo "Set the ELASTIC_PASSWORD environment variable in the .env file";
+          exit 1;
+        fi;
+        if [ ! -f config/certs/ca.zip ]; then
+          echo "Creating CA";
+          bin/elasticsearch-certutil ca --silent --pem -out config/certs/ca.zip;
+          unzip config/certs/ca.zip -d config/certs;
+        fi;
+        if [ ! -f config/certs/certs.zip ]; then
+          echo "Creating certs";
+          echo -ne \
+          "instances:\n"\
+          "  - name: es01\n"\
+          "    dns:\n"\
+          "      - es01\n"\
+          "      - localhost\n"\
+          "    ip:\n"\
+          "      - 127.0.0.1\n"\
+          "  - name: es02\n"\
+          "    dns:\n"\
+          "      - es02\n"\
+          "      - localhost\n"\
+          "    ip:\n"\
+          "      - 127.0.0.1\n"\
+          > config/certs/instances.yml;
+          bin/elasticsearch-certutil cert --silent --pem -out config/certs/certs.zip --in config/certs/instances.yml --ca-cert config/certs/ca/ca.crt --ca-key config/certs/ca/ca.key;
+          unzip config/certs/certs.zip -d config/certs;
+        fi;
+        echo "Setting file permissions";
+        chown -R root:root config/certs;
+        find . -type d -exec chmod 750 \{\} \;;
+        find . -type f -exec chmod 640 \{\} \;;
+        echo "Waiting for Elasticsearch availability";
+        until curl -s --cacert config/certs/ca/ca.crt https://es01:9200 | grep -q "missing authentication credentials"; do sleep 30; done;
+        echo "All done!";
+      '
+    healthcheck:
+      test: ["CMD-SHELL", "[ -f config/certs/es01/es01.crt ]"]
+      interval: 1s
+      timeout: 5s
+      retries: 120
+
+  es01:
+    depends_on:
+      setup:
+        condition: service_healthy
+    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
+    volumes:
+      - certs:/usr/share/elasticsearch/config/certs
+      - esdata01:/usr/share/elasticsearch/data
+    ports:
+      - ${ES_PORT}:9200
+    environment:
+      - node.name=es01
+      - cluster.name=${CLUSTER_NAME}
+      - cluster.initial_master_nodes=es01,es02
+      - discovery.seed_hosts=es02
+      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
+      - bootstrap.memory_lock=true
+      - xpack.security.enabled=false
+      - xpack.license.self_generated.type=${LICENSE}
+      - http.cors.enabled=true
+      - http.cors.allow-origin=http://localhost:8080,http://127.0.0.1:8080
+      - http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization
+      - http.cors.allow-credentials=true
+    mem_limit: ${MEM_LIMIT}
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
+        ]
+      interval: 10s
+      timeout: 10s
+      retries: 120
+
+  es02:
+    depends_on:
+      - es01
+    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
+    volumes:
+      - certs:/usr/share/elasticsearch/config/certs
+      - esdata02:/usr/share/elasticsearch/data
+    environment:
+      - node.name=es02
+      - cluster.name=${CLUSTER_NAME}
+      - cluster.initial_master_nodes=es01,es02
+      - discovery.seed_hosts=es01
+      - bootstrap.memory_lock=true
+      - xpack.security.enabled=false
+      - xpack.license.self_generated.type=${LICENSE}
+      - http.cors.enabled=true
+      - http.cors.allow-origin=http://localhost:8080,http://127.0.0.1:8080
+      - http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization
+      - http.cors.allow-credentials=true
+    mem_limit: ${MEM_LIMIT}
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
+        ]
+      interval: 10s
+      timeout: 10s
+      retries: 120
+
+
+  # elasticsearch browser
+  elasticvue:
+    image: cars10/elasticvue
+    container_name: elasticvue
+    ports:
+      - '8080:8080'
+    links:
+      - es01
+
+volumes:
+  certs:
+    driver: local
+  esdata01:
+    driver: local
+  esdata02:
+    driver: local
+  esdata03:
+    driver: local
\ No newline at end of file
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/product.py b/models/product.py
new file mode 100644
index 00000000..04087639
--- /dev/null
+++ b/models/product.py
@@ -0,0 +1,241 @@
+from elasticsearch_dsl import Document, Date, Double, Keyword, Text, Integer
+
+from utils import constants
+from utils.analyzers import autocomplete
+
+
+class Product(Document):
+    """
+    This should mirror the fields here: /~https://github.com/openfoodfacts/openfoodfacts-server/blob/main/html/data/data-fields.txt
+    Use scripts/generate_product_from_data_fields.py to regenerate from data-fields.txt, but be careful to preserve
+    manual adjustments
+    """
+
+    class Index:
+        name = constants.INDEX_ALIAS
+        settings = {
+            "number_of_shards": 4,
+        }
+
+    # barcode of the product (can be EAN-13 or internal codes for some food stores), for products without a barcode, Open Food Facts assigns a number starting with the 200 reserved prefix
+    code = Keyword()
+    # url of the product page on Open Food Facts
+    url = Keyword()
+    # contributor who first added the product
+    creator = Keyword()
+    # date that the product was added (UNIX timestamp format)
+    created_t = Integer()
+    # date that the product was added (iso8601 format: yyyy-mm-ddThh:mn:ssZ)
+    created_datetime = Date()
+    # date that the product page was last modified
+    last_modified_t = Integer()
+    last_modified_datetime = Date()
+    # name of the product
+    product_name = Text(analyzer='snowball', fields={'autocomplete': Text(analyzer=autocomplete), 'raw': Keyword()})
+    generic_name = Keyword()
+    # quantity and unit
+    quantity = Keyword()
+    # shape, material
+    packaging = Keyword()
+    packaging_tags = Text(multi=True)
+    brands = Text(analyzer='snowball', fields={'autocomplete': Text(analyzer=autocomplete), 'raw': Keyword()})
+    brands_tags = Text(multi=True)
+    categories = Text(analyzer='snowball', fields={'autocomplete': Text(analyzer=autocomplete), 'raw': Keyword()})
+    categories_tags = Text(multi=True)
+    categories_fr = Keyword()
+    # origins of ingredients
+    origins = Keyword()
+    origins_tags = Text(multi=True)
+    # places where manufactured or transformed
+    manufacturing_places = Keyword()
+    manufacturing_places_tags = Text(multi=True)
+    labels = Keyword()
+    labels_tags = Text(multi=True)
+    labels_fr = Keyword()
+    emb_codes = Keyword()
+    emb_codes_tags = Text(multi=True)
+    # coordinates corresponding to the first packaging code indicated
+    first_packaging_code_geo = Keyword()
+    cities = Keyword()
+    cities_tags = Text(multi=True)
+    purchase_places = Keyword()
+    stores = Keyword()
+    # list of countries where the product is sold
+    countries = Keyword()
+    countries_tags = Text(multi=True)
+    countries_fr = Keyword()
+    ingredients_text = Keyword()
+    traces = Keyword()
+    traces_tags = Text(multi=True)
+    # serving size in g
+    serving_size = Keyword()
+    # indicates if the nutrition facts are indicated on the food label
+    no_nutriments = Keyword()
+    # number of food additives
+    additives_n = Keyword()
+    additives = Keyword()
+    additives_tags = Text(multi=True)
+    ingredients_from_palm_oil_n = Keyword()
+    ingredients_from_palm_oil = Keyword()
+    ingredients_from_palm_oil_tags = Text(multi=True)
+    ingredients_that_may_be_from_palm_oil_n = Keyword()
+    ingredients_that_may_be_from_palm_oil = Keyword()
+    ingredients_that_may_be_from_palm_oil_tags = Text(multi=True)
+    # nutrition grade ('a' to 'e'). see https://fr.openfoodfacts.org/nutriscore
+    nutrition_grade_fr = Keyword()
+    main_category = Keyword()
+    main_category_fr = Keyword()
+    image_url = Keyword()
+    image_small_url = Keyword()
+    energy_100g = Double()
+    energy_kj_100g = Double()
+    energy_kcal_100g = Double()
+    proteins_100g = Double()
+    casein_100g = Double()
+    serum_proteins_100g = Double()
+    nucleotides_100g = Double()
+    carbohydrates_100g = Double()
+    sugars_100g = Double()
+    sucrose_100g = Double()
+    glucose_100g = Double()
+    fructose_100g = Double()
+    lactose_100g = Double()
+    maltose_100g = Double()
+    maltodextrins_100g = Double()
+    starch_100g = Double()
+    polyols_100g = Double()
+    fat_100g = Double()
+    saturated_fat_100g = Double()
+    butyric_acid_100g = Double()
+    caproic_acid_100g = Double()
+    caprylic_acid_100g = Double()
+    capric_acid_100g = Double()
+    lauric_acid_100g = Double()
+    myristic_acid_100g = Double()
+    palmitic_acid_100g = Double()
+    stearic_acid_100g = Double()
+    arachidic_acid_100g = Double()
+    behenic_acid_100g = Double()
+    lignoceric_acid_100g = Double()
+    cerotic_acid_100g = Double()
+    montanic_acid_100g = Double()
+    melissic_acid_100g = Double()
+    monounsaturated_fat_100g = Double()
+    polyunsaturated_fat_100g = Double()
+    omega_3_fat_100g = Double()
+    alpha_linolenic_acid_100g = Double()
+    eicosapentaenoic_acid_100g = Double()
+    docosahexaenoic_acid_100g = Double()
+    omega_6_fat_100g = Double()
+    linoleic_acid_100g = Double()
+    arachidonic_acid_100g = Double()
+    gamma_linolenic_acid_100g = Double()
+    dihomo_gamma_linolenic_acid_100g = Double()
+    omega_9_fat_100g = Double()
+    oleic_acid_100g = Double()
+    elaidic_acid_100g = Double()
+    gondoic_acid_100g = Double()
+    mead_acid_100g = Double()
+    erucic_acid_100g = Double()
+    nervonic_acid_100g = Double()
+    trans_fat_100g = Double()
+    cholesterol_100g = Double()
+    fiber_100g = Double()
+    sodium_100g = Double()
+    # % vol of alcohol
+    alcohol_100g = Double()
+    vitamin_a_100g = Double()
+    vitamin_d_100g = Double()
+    vitamin_e_100g = Double()
+    vitamin_k_100g = Double()
+    vitamin_c_100g = Double()
+    vitamin_b1_100g = Double()
+    vitamin_b2_100g = Double()
+    vitamin_pp_100g = Double()
+    vitamin_b6_100g = Double()
+    vitamin_b9_100g = Double()
+    vitamin_b12_100g = Double()
+    # also known as Vitamin B8
+    biotin_100g = Double()
+    # also known as Vitamin B5
+    pantothenic_acid_100g = Double()
+    silica_100g = Double()
+    bicarbonate_100g = Double()
+    potassium_100g = Double()
+    chloride_100g = Double()
+    calcium_100g = Double()
+    phosphorus_100g = Double()
+    iron_100g = Double()
+    magnesium_100g = Double()
+    zinc_100g = Double()
+    copper_100g = Double()
+    manganese_100g = Double()
+    fluoride_100g = Double()
+    selenium_100g = Double()
+    chromium_100g = Double()
+    molybdenum_100g = Double()
+    iodine_100g = Double()
+    caffeine_100g = Double()
+    taurine_100g = Double()
+    # pH (no unit)
+    ph_100g = Double()
+    # % of fruits, vegetables and nuts (excluding potatoes, yams, manioc)
+    fruits_vegetables_nuts_100g = Double()
+    # carbon footprint (as indicated on the packaging of some products)
+    carbon_footprint_100g = Double()
+    # Nutri-Score - Nutrition score derived from the UK FSA score and adapted for the French market (formula defined by the team of Professor Hercberg)
+    nutrition_score_fr_100g = Double()
+    # nutrition score defined by the UK Food Standards Administration (FSA)
+    nutrition_score_uk_100g = Double()
+    countries_en = Keyword()
+    pnns_groups_1 = Keyword()
+    pnns_groups_2 = Keyword()
+    states = Keyword()
+    states_tags = Text(multi=True)
+    states_en = Keyword()
+    ecoscore_grade = Keyword()
+    image_nutrition_url = Keyword()
+    image_nutrition_small_url = Keyword()
+    origins_en = Keyword()
+    ingredients_tags = Text(multi=True)
+    image_ingredients_url = Keyword()
+    image_ingredients_small_url = Keyword()
+    salt_100g = Double()
+    fruits_vegetables_nuts_estimate_100g = Double()
+    fruits_vegetables_nuts_estimate_from_ingredients_100g = Double()
+    fruits_vegetables_nuts_dried_100g = Double()
+    categories_en = Keyword()
+    nutriscore_score = Keyword()
+    nutriscore_grade = Keyword()
+    food_groups = Keyword()
+    food_groups_tags = Text(multi=True)
+    food_groups_en = Keyword()
+    ecoscore_score = Keyword()
+    main_category_en = Keyword()
+    additives_en = Keyword()
+    nova_group = Keyword()
+    labels_en = Keyword()
+    allergens = Keyword()
+    packaging_en = Keyword()
+    packaging_text = Keyword()
+    serving_quantity = Keyword()
+    carbon_footprint_from_meat_or_fish_100g = Double()
+    energy_from_fat_100g = Double()
+    folates_100g = Double()
+    soluble_fiber_100g = Double()
+    insoluble_fiber_100g = Double()
+    phylloquinone_100g = Double()
+    cocoa_100g = Double()
+    choline_100g = Double()
+    inositol_100g = Double()
+    collagen_meat_protein_ratio_100g = Double()
+    beta_carotene_100g = Double()
+    chlorophyl_100g = Double()
+    glycemic_index_100g = Double()
+    water_hardness_100g = Double()
+    beta_glucan_100g = Double()
+    carnitine_100g = Double()
+    traces_en = Keyword()
+    brand_owner = Keyword()
+    abbreviated_product_name = Keyword()
+    allergens_en = Keyword()
diff --git a/models/request.py b/models/request.py
new file mode 100644
index 00000000..4c4593be
--- /dev/null
+++ b/models/request.py
@@ -0,0 +1,46 @@
+import datetime
+from typing import Optional, List, Set
+from pydantic import BaseModel
+
+from utils import constants
+
+
+class SearchBase(BaseModel):
+    response_fields: Optional[Set[str]]
+    num_results: int = 10
+
+    def get_num_results(self):
+        return min(self.num_results, constants.MAX_RESULTS)
+
+
+class AutocompleteRequest(SearchBase):
+    text: str
+    search_fields: List[str] = constants.AUTOCOMPLETE_FIELDS
+
+
+class StringFilter(BaseModel):
+    field: str
+    value: str
+    # One of eq, ne, like, without
+    operator: str = 'eq'
+
+
+class NumericFilter(BaseModel):
+    field: str
+    value: float
+    # One of eq, ne, lt, gt, without
+    operator: str = 'eq'
+
+
+class DateTimeFilter(BaseModel):
+    field: str
+    value: datetime.datetime
+    # One of eq, ne, lt, gt, without
+    operator: str = 'eq'
+
+
+class SearchRequest(SearchBase):
+    # Works as an intersection/AND query
+    string_filters: List[StringFilter]
+    numeric_filters: List[NumericFilter]
+    date_time_filters: List[DateTimeFilter]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..590564a6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+anyio==3.6.1
+certifi==2022.6.15
+charset-normalizer==2.1.0
+click==8.1.3
+elasticsearch==7.17.4
+elasticsearch-dsl==7.4.0
+fastapi==0.79.0
+h11==0.13.0
+httptools==0.4.0
+idna==3.3
+pydantic==1.9.1
+python-dateutil==2.8.2
+python-dotenv==0.20.0
+PyYAML==6.0
+requests==2.28.1
+six==1.16.0
+sniffio==1.2.0
+starlette==0.19.1
+typing_extensions==4.3.0
+urllib3==1.26.11
+uvicorn==0.18.2
+uvloop==0.16.0
+watchfiles==0.16.1
+websockets==10.3
\ No newline at end of file
diff --git a/scripts/data-fields.txt b/scripts/data-fields.txt
new file mode 100644
index 00000000..8bd1f707
--- /dev/null
+++ b/scripts/data-fields.txt
@@ -0,0 +1,179 @@
+This file describes the fields from the CSV export of the products in the Open Food Facts database.
+
+See https://world.openfoodfacts.org/data for more information.
+
+The file encoding is Unicode UTF-8. The character that separates fields is <tab> (tabulation).
+
+Generalities:
+
+- fields that end with _t are dates in the UNIX timestamp format (number of seconds since Jan 1st 1970)
+- fields that end with _datetime are dates in the iso8601 format: yyyy-mm-ddThh:mn:ssZ
+- fields that end with _tags are comma separated lists of tags (e.g. categories_tags is the set of normalized tags computed from the categories field)
+- fields that end with a language 2 letter code (e.g. fr for French) are the set of tags in that language
+- fields that end with _100g correspond to the amount of a nutriment (in g, or kJ for energy) for 100 g or 100 ml of product
+- fields that end with _serving correspond to the amount of a nutriment (in g, or kJ for energy) for 1 serving of the product
+
+List of fields:
+
+# general information:
+
+code : barcode of the product (can be EAN-13 or internal codes for some food stores), for products without a barcode, Open Food Facts assigns a number starting with the 200 reserved prefix
+url : url of the product page on Open Food Facts
+creator : contributor who first added the product
+created_t : date that the product was added (UNIX timestamp format)
+created_datetime : date that the product was added (iso8601 format: yyyy-mm-ddThh:mn:ssZ)
+last_modified_t : date that the product page was last modified
+last_modified_datetime
+product_name : name of the product
+generic_name
+quantity : quantity and unit
+
+# tags:
+
+packaging : shape, material
+packaging_tags
+brands
+brands_tags
+categories
+categories_tags
+categories_fr
+origins : origins of ingredients
+origins_tags
+manufacturing_places : places where manufactured or transformed
+manufacturing_places_tags
+labels
+labels_tags
+labels_fr
+emb_codes
+emb_codes_tags
+first_packaging_code_geo : coordinates corresponding to the first packaging code indicated
+cities
+cities_tags
+purchase_places
+stores
+countries : list of countries where the product is sold
+countries_tags
+countries_fr
+
+# ingredients:
+
+ingredients_text
+traces
+traces_tags
+
+# misc. data:
+
+serving_size : serving size in g
+no_nutriments : indicates if the nutrition facts are indicated on the food label
+additives_n : number of food additives
+additives
+additives_tags
+ingredients_from_palm_oil_n
+ingredients_from_palm_oil
+ingredients_from_palm_oil_tags
+ingredients_that_may_be_from_palm_oil_n
+ingredients_that_may_be_from_palm_oil
+ingredients_that_may_be_from_palm_oil_tags
+nutrition_grade_fr : nutrition grade ('a' to 'e'). see https://fr.openfoodfacts.org/nutriscore
+main_category
+main_category_fr
+image_url
+image_small_url
+
+# nutrition facts:
+# Please see https://wiki.openfoodfacts.org/Nutrients_handling_in_Open_Food_Facts for more information on nutrients
+
+energy_100g
+energy-kj_100g
+energy-kcal_100g
+proteins_100g
+casein_100g
+serum-proteins_100g
+nucleotides_100g
+carbohydrates_100g
+sugars_100g
+sucrose_100g
+glucose_100g
+fructose_100g
+lactose_100g
+maltose_100g
+maltodextrins_100g
+starch_100g
+polyols_100g
+fat_100g
+saturated-fat_100g
+butyric-acid_100g
+caproic-acid_100g
+caprylic-acid_100g
+capric-acid_100g
+lauric-acid_100g
+myristic-acid_100g
+palmitic-acid_100g
+stearic-acid_100g
+arachidic-acid_100g
+behenic-acid_100g
+lignoceric-acid_100g
+cerotic-acid_100g
+montanic-acid_100g
+melissic-acid_100g
+monounsaturated-fat_100g
+polyunsaturated-fat_100g
+omega-3-fat_100g
+alpha-linolenic-acid_100g
+eicosapentaenoic-acid_100g
+docosahexaenoic-acid_100g
+omega-6-fat_100g
+linoleic-acid_100g
+arachidonic-acid_100g
+gamma-linolenic-acid_100g
+dihomo-gamma-linolenic-acid_100g
+omega-9-fat_100g
+oleic-acid_100g
+elaidic-acid_100g
+gondoic-acid_100g
+mead-acid_100g
+erucic-acid_100g
+nervonic-acid_100g
+trans-fat_100g
+cholesterol_100g
+fiber_100g
+sodium_100g
+alcohol_100g : % vol of alcohol
+vitamin-a_100g
+vitamin-d_100g
+vitamin-e_100g
+vitamin-k_100g
+vitamin-c_100g
+vitamin-b1_100g
+vitamin-b2_100g
+vitamin-pp_100g
+vitamin-b6_100g
+vitamin-b9_100g
+vitamin-b12_100g
+biotin_100g : also known as Vitamin B8
+pantothenic-acid_100g : also known as Vitamin B5
+silica_100g
+bicarbonate_100g
+potassium_100g
+chloride_100g
+calcium_100g
+phosphorus_100g
+iron_100g
+magnesium_100g
+zinc_100g
+copper_100g
+manganese_100g
+fluoride_100g
+selenium_100g
+chromium_100g
+molybdenum_100g
+iodine_100g
+caffeine_100g
+taurine_100g
+ph_100g : pH (no unit)
+fruits-vegetables-nuts_100g : % of fruits, vegetables and nuts (excluding potatoes, yams, manioc)
+
+carbon-footprint_100g : carbon footprint (as indicated on the packaging of some products)
+
+nutrition-score-fr_100g : Nutri-Score - Nutrition score derived from the UK FSA score and adapted for the French market (formula defined by the team of Professor Hercberg)
+nutrition-score-uk_100g : nutrition score defined by the UK Food Standards Administration (FSA)
\ No newline at end of file
diff --git a/scripts/es_query.py b/scripts/es_query.py
new file mode 100644
index 00000000..9b535b4e
--- /dev/null
+++ b/scripts/es_query.py
@@ -0,0 +1,24 @@
+"""
+Script that allows manually querying ES
+"""
+import time
+
+from models.product import Product
+from utils import connection
+
+
+def manual_query():
+    connection.get_connection()
+
+    while True:
+        search_term = input("Please enter search term:\n")
+        start_time = time.perf_counter()
+        results = Product.search().query('match', product_name__autocomplete=search_term).execute()
+        for result in results[:10]:
+            print(result.meta.score, result.product_name)
+        end_time = time.perf_counter()
+        print("Time: {} seconds".format(end_time - start_time))
+
+
+if __name__ == "__main__":
+    manual_query()
\ No newline at end of file
diff --git a/scripts/generate_product_from_data_fields.py b/scripts/generate_product_from_data_fields.py
new file mode 100644
index 00000000..bd0ccdf0
--- /dev/null
+++ b/scripts/generate_product_from_data_fields.py
@@ -0,0 +1,76 @@
+"""
+This script takes the data-fields.txt and generates the updated product fields.
+Note: if field names are changed, this will have considerable implications for the index.
+"""
+from utils import constants
+
+
+def get_type_for_field(field):
+    """
+    The docs state:
+    - fields that end with _t are dates in the UNIX timestamp format (number of seconds since Jan 1st 1970)
+    - fields that end with _datetime are dates in the iso8601 format: yyyy-mm-ddThh:mn:ssZ
+    - fields that end with _tags are comma separated lists of tags (e.g. categories_tags is the set of normalized tags computed from the categories field)
+    - fields that end with a language 2 letter code (e.g. fr for French) are the set of tags in that language
+    - fields that end with _100g correspond to the amount of a nutriment (in g, or kJ for energy) for 100 g or 100 ml of product
+    - fields that end with _serving correspond to the amount of a nutriment (in g, or kJ for energy) for 1 serving of the product
+    """
+
+    suffix_to_type = {
+        't': 'Integer()',
+        'datetime': 'Date()',
+        'tags': 'Text(multi=True)',
+        '100g': 'Double()',
+        'serving': 'Double()',
+    }
+
+    suffix = field.split('_')[-1]
+
+    field_type = suffix_to_type.get(suffix)
+    if field_type:
+        return field_type
+
+    # Otherwise, default to Keyword
+    return 'Keyword()'
+
+
+def generate_product_from_data_fields():
+    with open('data-fields.txt', 'r') as f:
+        lines = f.readlines()
+
+    # Add undocumented fields
+    lines += constants.UNDOCUMENTED_FIELDS
+
+    for line in lines:
+        words = line.split()
+        # Lines with fields should follow the pattern of <field_name> or <field_name> : <description>
+        if len(words) != 1 and ':' not in words:
+            continue
+
+        # Remove any lines with a : but only one word (as these are headings)
+        if len(words) == 1 and ':' in words[0]:
+            continue
+
+        field_name = words[0]
+        description = ''
+        if len(words) > 2:
+            description = ' '.join(words[2:])
+
+        if description:
+            print("# {}".format(description))
+
+        # Some fields have dashes, let's replace them
+        field_name = field_name.replace('-', '_')
+
+        # Autocomplete cases
+        if field_name in constants.AUTOCOMPLETE_FIELDS:
+            # Do text with snowball (for direct searches), and autocomplete too
+            print(field_name + " = Text(analyzer='snowball', fields={'autocomplete': Text(analyzer=autocomplete), "
+                               "'raw': Keyword()})")
+        else:
+            field_type = get_type_for_field(field_name)
+            print("{} = {}".format(field_name, field_type))
+
+
+if __name__ == "__main__":
+    generate_product_from_data_fields()
diff --git a/scripts/http_query.py b/scripts/http_query.py
new file mode 100644
index 00000000..c252eedb
--- /dev/null
+++ b/scripts/http_query.py
@@ -0,0 +1,33 @@
+"""
+Script that allows manually querying the local search service
+"""
+import json
+import time
+import requests
+
+from models.product import Product
+from utils import connection
+
+
+def manual_query():
+
+    connection.get_connection()
+
+    while True:
+        search_term = input("Please enter search term:\n")
+        start_time = time.perf_counter()
+
+        payload = {
+            'text': search_term,
+            'num_results': 10,
+            'response_fields': ['product_name', 'pnns_groups_1'],
+        }
+        response = requests.post("http://127.0.0.1:8000/autocomplete", json=payload)
+        print(json.dumps(response.json(), indent=4, sort_keys=True))
+        print("Number of results: {}".format(len(response.json())))
+        end_time = time.perf_counter()
+        print("Time: {} seconds".format(end_time - start_time))
+
+
+if __name__ == "__main__":
+    manual_query()
\ No newline at end of file
diff --git a/scripts/perform_import.py b/scripts/perform_import.py
new file mode 100644
index 00000000..59e22889
--- /dev/null
+++ b/scripts/perform_import.py
@@ -0,0 +1,91 @@
+"""
+Performs an import from a CSV file. Note that this builds a new index and then repoints the alias to it, replacing the previous index
+
+Pass in the path of the file with the filename argument
+
+Example:
+python scripts/perform_import.py --filename=X
+"""
+import csv
+import argparse
+import time
+from datetime import datetime
+
+from elasticsearch.helpers import bulk
+
+from models.product import Product
+from utils import connection, constants
+
+
+def gen_documents(filename, next_index):
+    valid_field_names = set(field_name for field_name in Product._doc_type.mapping.properties.to_dict()['properties'].keys())
+    with open(filename) as f:
+        input_file = csv.DictReader(f, delimiter='\t')
+
+        for i, row in enumerate(input_file):
+            # Use underscores for consistency
+            row = {k.replace('-', '_'): v for k, v in row.items()}
+
+            # Some fields have a leading dash (now underscore), remove them
+            row = {k: v for k, v in row.items() if not k.startswith('_')}
+
+            # For the first row, check that we have every column name in our index
+            if i == 0:
+                column_names = row.keys()
+                missing_column_names = []
+                for column_name in column_names:
+                    if column_name not in valid_field_names and column_name:
+                        missing_column_names.append(column_name)
+
+                if missing_column_names:
+                    print("Missing: {}".format(missing_column_names))
+                    exit(-1)
+
+            # Remove all empty values, we don't want to waste space in the index
+            row = {k: v for k, v in row.items() if v != ''}
+
+            # Split tags
+            for k in row.keys():
+                if k.endswith('_tags'):
+                    row[k] = row[k].split(',')
+
+            product = Product(**row).to_dict(True)
+            # Override the index
+            product['_index'] = next_index
+            yield product
+
+            if i % 100000 == 0 and i:
+                # Roughly 2.5M lines as of July 2022
+                print("Processed: {} lines".format(i))
+
+
+def update_alias(es, next_index):
+    # repoint the alias to point to the newly created index
+    es.indices.update_aliases(
+        body={
+            "actions": [
+                {"remove": {"alias": constants.INDEX_ALIAS, "index": constants.INDEX_ALIAS_PATTERN}},
+                {"add": {"alias": constants.INDEX_ALIAS, "index": next_index}},
+            ]
+        }
+    )
+
+
+def perform_import(filename):
+    es = connection.get_connection()
+    next_index = constants.INDEX_ALIAS_PATTERN.replace("*", datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f"))
+
+    Product.init(index=next_index)
+    bulk(es, gen_documents(filename, next_index))
+    update_alias(es, next_index)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("perform_import")
+    parser.add_argument("--filename", help="Filename where CSV file is located", type=str)
+    args = parser.parse_args()
+
+    start_time = time.perf_counter()
+    perform_import(args.filename)
+    end_time = time.perf_counter()
+    print("Import time: {} seconds".format(end_time - start_time))
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/utils/analyzers.py b/utils/analyzers.py
new file mode 100644
index 00000000..36958879
--- /dev/null
+++ b/utils/analyzers.py
@@ -0,0 +1,8 @@
+from elasticsearch_dsl import analyzer
+from elasticsearch_dsl import tokenizer
+
+autocomplete = analyzer(
+    'autocomplete',
+    tokenizer=tokenizer('bigram', 'edge_ngram', min_gram=2, max_gram=25, token_chars=['letter', 'digit', 'punctuation']),
+    filter=['lowercase', 'asciifolding']
+)
\ No newline at end of file
diff --git a/utils/connection.py b/utils/connection.py
new file mode 100644
index 00000000..7c8659b0
--- /dev/null
+++ b/utils/connection.py
@@ -0,0 +1,5 @@
+from elasticsearch_dsl.connections import connections
+
+
+def get_connection():
+    return connections.create_connection(hosts=['localhost:9200'])
\ No newline at end of file
diff --git a/utils/constants.py b/utils/constants.py
new file mode 100644
index 00000000..a84db698
--- /dev/null
+++ b/utils/constants.py
@@ -0,0 +1,59 @@
+AUTOCOMPLETE_FIELDS = ['product_name', 'brands', 'categories']
+INDEX_ALIAS = 'openfoodfacts'
+INDEX_ALIAS_PATTERN = INDEX_ALIAS + "-*"
+MAX_RESULTS = 100
+
+UNDOCUMENTED_FIELDS = [
+    'countries_en',
+    'pnns_groups_1',
+    'pnns_groups_2',
+    'states',
+    'states_tags',
+    'states_en',
+    'ecoscore_grade',
+    'image_nutrition_url',
+    'image_nutrition_small_url',
+    'origins_en',
+    'ingredients_tags',
+    'image_ingredients_url',
+    'image_ingredients_small_url',
+    'salt_100g',
+    'fruits_vegetables_nuts_estimate_100g',
+    'fruits_vegetables_nuts_estimate_from_ingredients_100g',
+    'fruits_vegetables_nuts_dried_100g',
+    'categories_en',
+    'nutriscore_score',
+    'nutriscore_grade',
+    'food_groups',
+    'food_groups_tags',
+    'food_groups_en',
+    'ecoscore_score',
+    'main_category_en',
+    'additives_en',
+    'nova_group',
+    'labels_en',
+    'allergens',
+    'packaging_en',
+    'packaging_text',
+    'serving_quantity',
+    'carbon_footprint_from_meat_or_fish_100g',
+    'energy_from_fat_100g',
+    'folates_100g',
+    'soluble_fiber_100g',
+    'insoluble_fiber_100g',
+    'phylloquinone_100g',
+    'cocoa_100g',
+    'choline_100g',
+    'inositol_100g',
+    'collagen_meat_protein_ratio_100g',
+    'beta_carotene_100g',
+    'chlorophyl_100g',
+    'glycemic_index_100g',
+    'water_hardness_100g',
+    'beta_glucan_100g',
+    'carnitine_100g',
+    'traces_en',
+    'brand_owner',
+    'abbreviated_product_name',
+    'allergens_en',
+]
\ No newline at end of file
diff --git a/utils/response.py b/utils/response.py
new file mode 100644
index 00000000..5667d99e
--- /dev/null
+++ b/utils/response.py
@@ -0,0 +1,28 @@
+from models.product import Product
+from models.request import SearchBase
+
+
+def create_response(es_results, request: SearchBase):
+    resp = [convert_es_result(r, request) for r in es_results]
+    return resp
+
+
+def convert_es_result(es_result, request: SearchBase):
+    if not es_result:
+        return None
+
+    # Add missing fields to maintain backwards compatibility
+    field_names = list(Product._doc_type.mapping.properties.to_dict()['properties'].keys())
+    result_dict = {field_name: [] if field_name.endswith('_tags') else '' for field_name in field_names}
+    result_dict.update(es_result.to_dict())
+
+    # Trim fields as needed
+    if request.response_fields:
+        trimmed_result_dict = {}
+        for response_field in request.response_fields:
+            if response_field in result_dict:
+                trimmed_result_dict[response_field] = result_dict[response_field]
+
+        result_dict = trimmed_result_dict
+
+    return result_dict

From 56755341fd40000aa0ff42ea728dfcc9aac7fdde Mon Sep 17 00:00:00 2001
From: Simon Jenkins
Date: Tue, 2 Aug 2022 11:43:53 -0700
Subject: [PATCH 2/2] Docker for search service, rearchitecture

---
 .env                                          | 20 ++++++++++
 .gitignore                                    |  1 -
 Dockerfile                                    |  7 ++++
 README.md                                     | 37 ++++++++++++++-----
 {models => app}/__init__.py                   |  0
 api.py => app/api.py                          |  6 +--
 {utils => app/models}/__init__.py             |  0
 {models => app/models}/product.py             |  4 +-
 {models => app/models}/request.py             |  2 +-
 {scripts => app/scripts}/data-fields.txt      |  0
 {scripts => app/scripts}/es_query.py          |  4 +-
 .../generate_product_from_data_fields.py      |  2 +-
 {scripts => app/scripts}/http_query.py        |  3 +-
 {scripts => app/scripts}/perform_import.py    |  4 +-
 app/utils/__init__.py                         |  0
 {utils => app/utils}/analyzers.py             |  2 +-
 app/utils/connection.py                       |  6 +++
 {utils => app/utils}/constants.py             |  0
 {utils => app/utils}/response.py              |  4 +-
 docker-compose.yml                            |  9 ++++-
 utils/connection.py                           |  5 ---
 21 files changed, 83 insertions(+), 33 deletions(-)
 create mode 100644 .env
 create mode 100644 Dockerfile
 rename {models => app}/__init__.py (100%)
 rename api.py => app/api.py (90%)
 rename {utils => app/models}/__init__.py (100%)
 rename {models => app/models}/product.py (99%)
 rename {models => app/models}/request.py (96%)
 rename {scripts => app/scripts}/data-fields.txt (100%)
 rename {scripts => app/scripts}/es_query.py (88%)
 rename {scripts => app/scripts}/generate_product_from_data_fields.py (98%)
 rename {scripts => app/scripts}/http_query.py (92%)
 rename {scripts => app/scripts}/perform_import.py (97%)
 create mode 100644 app/utils/__init__.py
 rename {utils => app/utils}/analyzers.py (99%)
 create mode 100644 app/utils/connection.py
 rename {utils => app/utils}/constants.py (100%)
 rename {utils => app/utils}/response.py (91%)
 delete mode 100644 utils/connection.py

diff --git a/.env b/.env
new file mode 100644
index 00000000..cdc5150b
--- /dev/null
+++ b/.env
@@ -0,0 +1,20 @@
+# Password for the 'elastic' user (at least 6 characters)
+# This needs to be set in the environment variables
+# ELASTIC_PASSWORD=
+
+# Version of Elastic products
+STACK_VERSION=8.3.3
+
+# Set the cluster name
+CLUSTER_NAME=docker-cluster
+
+# Set to 'basic' or 'trial' to automatically start the 30-day trial
+LICENSE=basic
+
+# Port to expose Elasticsearch HTTP API to the host
+ES_PORT=9200
+#ES_PORT=127.0.0.1:9200
+
+# Increase or decrease based on the available host memory (in bytes)
+# 1GB works well, 2GB and above leads to lower latency
+MEM_LIMIT=2147483648
diff --git a/.gitignore b/.gitignore
index 58669e3d..a2a24871 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,7 +102,6 @@ celerybeat.pid
 *.sage.py
 
 # Environments
-.env
 .venv
 env/
 venv/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..83dc4c72
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,7 @@
+# Instructions from https://fastapi.tiangolo.com/deployment/docker/
+FROM python:3.9
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+COPY ./app /code/app
+CMD ["uvicorn", "app.api:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index 799502cf..563edc87 100644
--- a/README.md
+++ b/README.md
@@ -9,30 +9,47 @@ The main file is `api.py`, and the Product schema is in `models/product.py`.
 The `scripts/` directory contains various scripts for manual validation, constructing the product schema, importing, etc.
 
 ### Running locally
+Firstly, make sure your environment is configured:
+```commandline
+export ELASTIC_PASSWORD=PASSWORD_HERE
+```
+
+Then start docker:
+```console
+docker-compose up -d
+```
+
 Docker spins up:
 - Two elasticsearch nodes
 - [Elasticvue](https://elasticvue.com/)
+- The search service on port 8000
 
 You will then need to import from CSV (see instructions below).
 
-Make sure your environment is configured:
-```commandline
-export ELASTIC_PASSWORD=PASSWORD_HERE
-```
-
-
-### Helpful commands:
+### Development
+For development, you have two options for running the service:
+1. Docker
+2. Locally
 
-To start docker:
+To develop on docker, make the changes you need, then build the image and compose by running:
 ```console
+docker build -t off_search_image .
 docker-compose up -d
 ```
 
-To start server:
+However, this tends to be slower than developing locally.
+
+To develop locally, create a venv, install dependencies, then run the service:
 ```console
-uvicorn api:app --reload
+virtualenv venv
+source venv/bin/activate
+pip install -r requirements.txt
+uvicorn app.api:app --reload --port=8001
 ```
+Note that it's important to use port 8001, as port 8000 will be used by the docker version of the search service.
+
+### Helpful commands:
 
 To import data from the [CSV export](https://world.openfoodfacts.org/data):
 ```console
 python scripts/perform_import.py --filename=/path/to/file.csv
\ No newline at end of file
diff --git a/models/__init__.py b/app/__init__.py
similarity index 100%
rename from models/__init__.py
rename to app/__init__.py
diff --git a/api.py b/app/api.py
similarity index 90%
rename from api.py
rename to app/api.py
index 4ad2cb8b..e61ba988 100644
--- a/api.py
+++ b/app/api.py
@@ -1,9 +1,9 @@
 from elasticsearch_dsl import Q
 from fastapi import FastAPI, HTTPException
 
-from models.product import Product
-from models.request import AutocompleteRequest, SearchRequest
-from utils import connection, constants, response
+from app.models.product import Product
+from app.models.request import AutocompleteRequest, SearchRequest
+from app.utils import connection, constants, response
 
 app = FastAPI()
 connection.get_connection()
diff --git a/utils/__init__.py b/app/models/__init__.py
similarity index 100%
rename from utils/__init__.py
rename to app/models/__init__.py
diff --git a/models/product.py b/app/models/product.py
similarity index 99%
rename from models/product.py
rename to app/models/product.py
index 04087639..a0290abf 100644
--- a/models/product.py
+++ b/app/models/product.py
@@ -1,7 +1,7 @@
 from elasticsearch_dsl import Document, Date, Double, Keyword, Text, Integer
 
-from utils import constants
-from utils.analyzers import autocomplete
+from app.utils import constants
+from app.utils.analyzers import autocomplete
 
 
 class Product(Document):
diff --git a/models/request.py b/app/models/request.py
similarity index 96%
rename from models/request.py
rename to app/models/request.py
index 4c4593be..1948eba0 100644
--- a/models/request.py
+++ b/app/models/request.py
@@ -2,7 +2,7 @@
 from typing import Optional, List, Set
 from pydantic import BaseModel
 
-from utils import constants
+from app.utils import constants
 
 
 class SearchBase(BaseModel):
diff --git a/scripts/data-fields.txt b/app/scripts/data-fields.txt
similarity index 100%
rename from scripts/data-fields.txt
rename to app/scripts/data-fields.txt
diff --git a/scripts/es_query.py b/app/scripts/es_query.py
similarity index 88%
rename from scripts/es_query.py
rename to app/scripts/es_query.py
index 9b535b4e..bb7256b5 100644
--- a/scripts/es_query.py
+++ b/app/scripts/es_query.py
@@ -3,8 +3,8 @@
 """
 import time
 
-from models.product import Product
-from utils import connection
+from app.models.product import Product
+from app.utils import connection
 
 
 def manual_query():
diff --git a/scripts/generate_product_from_data_fields.py b/app/scripts/generate_product_from_data_fields.py
similarity index 98%
rename from scripts/generate_product_from_data_fields.py
rename to app/scripts/generate_product_from_data_fields.py
index bd0ccdf0..0c4a8f2d 100644
--- a/scripts/generate_product_from_data_fields.py
+++ b/app/scripts/generate_product_from_data_fields.py
@@ -2,7 +2,7 @@
 This script takes the data-fields.txt and generates the updated product fields.
 Note: if field names are changed, this will have considerable implications for the index.
""" -from utils import constants +from app.utils import constants def get_type_for_field(field): diff --git a/scripts/http_query.py b/app/scripts/http_query.py similarity index 92% rename from scripts/http_query.py rename to app/scripts/http_query.py index c252eedb..ee94ea00 100644 --- a/scripts/http_query.py +++ b/app/scripts/http_query.py @@ -5,8 +5,7 @@ import time import requests -from models.product import Product -from utils import connection +from app.utils import connection def manual_query(): diff --git a/scripts/perform_import.py b/app/scripts/perform_import.py similarity index 97% rename from scripts/perform_import.py rename to app/scripts/perform_import.py index 59e22889..161a982c 100644 --- a/scripts/perform_import.py +++ b/app/scripts/perform_import.py @@ -13,8 +13,8 @@ from elasticsearch.helpers import bulk -from models.product import Product -from utils import connection, constants +from app.models.product import Product +from app.utils import connection, constants def gen_documents(filename, next_index): diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/analyzers.py b/app/utils/analyzers.py similarity index 99% rename from utils/analyzers.py rename to app/utils/analyzers.py index 36958879..aad86965 100644 --- a/utils/analyzers.py +++ b/app/utils/analyzers.py @@ -5,4 +5,4 @@ 'autocomplete', tokenizer=tokenizer('bigram', 'edge_ngram', min_gram=2, max_gram=25, token_chars=['letter', 'digit', 'punctuation']), filter=['lowercase', 'asciifolding'] -) \ No newline at end of file +) diff --git a/app/utils/connection.py b/app/utils/connection.py new file mode 100644 index 00000000..27d296e4 --- /dev/null +++ b/app/utils/connection.py @@ -0,0 +1,6 @@ +import os +from elasticsearch_dsl.connections import connections + + +def get_connection(): + return connections.create_connection(hosts=[os.getenv('ELASTICSEARCH_URL', '127.0.0.1:9200')]) \ No newline at end of file diff --git a/utils/constants.py b/app/utils/constants.py similarity index 100% rename from utils/constants.py rename to app/utils/constants.py diff --git a/utils/response.py b/app/utils/response.py similarity index 91% rename from utils/response.py rename to app/utils/response.py index 5667d99e..1a6f66fa 100644 --- a/utils/response.py +++ b/app/utils/response.py @@ -1,5 +1,5 @@ -from models.product import Product -from models.request import SearchBase +from app.models.product import Product +from app.models.request import SearchBase def create_response(es_results, request: SearchBase): diff --git a/docker-compose.yml b/docker-compose.yml index b9aabc16..44e7ecc0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -123,7 +123,6 @@ services: timeout: 10s retries: 120 - # elasticsearch browser elasticvue: image: cars10/elasticvue @@ -133,6 +132,14 @@ services: links: - es01 + searchservice: + image: off_search_image + container_name: searchservice + environment: + - ELASTICSEARCH_URL=host.docker.internal:9200 + ports: + - '8000:8000' + volumes: certs: driver: local diff --git a/utils/connection.py b/utils/connection.py deleted file mode 100644 index 7c8659b0..00000000 --- a/utils/connection.py +++ /dev/null @@ -1,5 +0,0 @@ -from elasticsearch_dsl.connections import connections - - -def get_connection(): - return connections.create_connection(hosts=['localhost:9200']) \ No newline at end of file