diff --git a/robotoff/utils/text.py b/robotoff/utils/text.py index b01ecaa756..f14b3f7683 100644 --- a/robotoff/utils/text.py +++ b/robotoff/utils/text.py @@ -54,11 +54,23 @@ def get_lemmatizing_nlp(lang: str) -> spacy.Language: def get_tag(text: str) -> str: + """Return a tag from a text. + + In Open Food Facts, tags are obtained from free text by performing the + following: + - lowercasing + - accent removal + - replacement of punctuation by either a hyphen ("-") or nothing, depending + on the punctuation + """ text = strip_accents_ascii_v2(text) - return ( + text = ( text.lower() .replace(" & ", "-") .replace(" ", "-") .replace("'", "-") .replace(".", "-") + .replace("!", "") + .replace("?", "") ) + return strip_consecutive_spaces(text).strip("-") diff --git a/tests/unit/utils/test_text.py b/tests/unit/utils/test_text.py index 95a0108f11..ff4967c72e 100644 --- a/tests/unit/utils/test_text.py +++ b/tests/unit/utils/test_text.py @@ -14,6 +14,7 @@ ("monop'daily", "monop-daily"), ("épi d'or", "epi-d-or"), ("Health Star Rating 0.5", "health-star-rating-0-5"), + ("C'est qui le Patron ?!", "c-est-qui-le-patron"), ], ) def test_get_tag(value: str, output: str):