Skip to content

Commit

Permalink
feat: store raw stopwords and rewrite them at export (#358)
Browse files: browse the repository at this point in the history
* store raw stopwords and rewrite them at export

* Apply suggestions from code review

Co-authored-by: Alex Garel <alex@garel.org>

---------

Co-authored-by: Alex Garel <alex@garel.org>
  • Loading branch information
perierc and alexgarel authored Jan 24, 2024
1 parent b18ac03 commit 444f76d
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 15 deletions.
11 changes: 8 additions & 3 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,15 +243,20 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
id = "stopwords:" + str(index_stopwords)
data = self._set_data_id(data, id, line_number)
index_stopwords += 1
# remove "stopwords:" part
line = line[10:]
# compute raw values outside _get_lc_value, as it removes stop words!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line[10:])
lc, value = self._get_lc_value(line)
except ValueError:
self.parser_logger.error(
f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
)
else:
data.tags["tags_" + lc] = value
# add the list with its lc
data.tags["tags_" + lc] = tags
data.tags["tags_ids_" + lc] = value
# add the normalized list with its lc
self.stopwords[lc] = value
elif line.startswith("synonyms"):
# general synonyms definition for a language
Expand Down
11 changes: 3 additions & 8 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,9 @@ def get_all_nodes(self, project_label):
def list_tags_lc(self, node):
"""return an ordered list of the language codes (lc) used in a node"""
lc_list = []
if "stopwords" in node["id"]:
# stopwords nodes only have a tags_lc property
key = "tags_"
# number of dashes to split on to get language code
dash_before_lc = 1
else:
key = "tags_ids_"
dash_before_lc = 2
key = "tags_ids_"
# number of dashes to split on to get language code
dash_before_lc = 2

for property in node:
if property.startswith(key):
Expand Down
2 changes: 1 addition & 1 deletion parser/tests/data/test.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# test taxonomy

stopwords:fr: aux,au,de,le,du,la,a,et
stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation

synonyms:en:passion fruit, passionfruit

Expand Down
4 changes: 2 additions & 2 deletions parser/tests/integration/test_parse_unparse_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_round_trip(neo4j):
for line in original_lines:
# first tweak: spaces between stopwords
if line.startswith("stopwords:fr: aux"):
line = "stopwords:fr:aux, au, de, le, du, la, a, et"
line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
# second tweak: renaming parent
elif line.startswith("<fr:yaourts fruit de la passion"):
line = "<en:Passion fruit yogurts"
Expand Down Expand Up @@ -98,7 +98,7 @@ def test_two_branch_round_trip(neo4j):
for line in original_lines:
# first tweak: spaces between stopwords
if line.startswith("stopwords:fr: aux"):
line = "stopwords:fr:aux, au, de, le, du, la, a, et"
line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
# second tweak: renaming parent
elif line.startswith("<fr:yaourts fruit de la passion"):
line = "<en:Passion fruit yogurts"
Expand Down
3 changes: 2 additions & 1 deletion parser/tests/integration/test_parser_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def test_calling(neo4j):
results = session.run(query)
expected_stopwords = {
"id": "stopwords:0",
"tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et"],
"tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et", "test normalisation"],
"tags_ids_fr": ["aux", "au", "de", "le", "du", "la", "a", "et", "test-normalisation"],
"preceding_lines": [],
}
for result in results:
Expand Down

0 comments on commit 444f76d

Please sign in to comment.