Skip to content

Commit

Permalink
fix: save mapping between position of words and full annotation text
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Apr 3, 2023
1 parent 0ecef5e commit 674ad77
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 62 deletions.
4 changes: 2 additions & 2 deletions robotoff/insights/normalize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from robotoff.utils.text import strip_accents_ascii
from robotoff.utils.text import strip_accents_v1


def normalize_emb_code(emb_code: str):
Expand All @@ -10,7 +10,7 @@ def normalize_emb_code(emb_code: str):
emb_code.strip().lower().replace(" ", "").replace("-", "").replace(".", "")
)

emb_code = strip_accents_ascii(emb_code)
emb_code = strip_accents_v1(emb_code)

"""if the code ends with "ce" replace it with "ec"
here "fr40261001ce" becomes "fr40261001ec"
Expand Down
4 changes: 2 additions & 2 deletions robotoff/prediction/category/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from robotoff.utils import dump_json, get_logger, load_json
from robotoff.utils.text import (
get_lemmatizing_nlp,
strip_accents_ascii,
strip_accents_v1,
strip_consecutive_spaces,
)

Expand Down Expand Up @@ -169,7 +169,7 @@ def process(text: str, lang: str) -> str:
continue
lemmas.append(token.lemma_)

return strip_accents_ascii(" ".join(lemmas))
return strip_accents_v1(" ".join(lemmas))


def generate_match_maps(taxonomy_type: str) -> MatchMapType:
Expand Down
155 changes: 106 additions & 49 deletions robotoff/prediction/ocr/dataclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

from robotoff.types import JSONType
from robotoff.utils import get_logger
from robotoff.utils.text import strip_accents

# Some classes documentation were adapted from Google documentation on
# https://cloud.google.com/vision/docs/reference/rpc/google.cloud.vision.v1#google.cloud.vision.v1.Symbol

MULTIPLE_SPACES_REGEX = re.compile(r" {2,}")

logger = get_logger(__name__)

Expand Down Expand Up @@ -92,7 +92,7 @@ class OCRResult:
"label_annotations",
)

def __init__(self, data: JSONType, lazy: bool = True):
def __init__(self, data: JSONType):
self.text_annotations: list[OCRTextAnnotation] = []
self.full_text_annotation: Optional[OCRFullTextAnnotation] = None
self.logo_annotations: list[LogoAnnotation] = []
Expand All @@ -111,9 +111,7 @@ def __init__(self, data: JSONType, lazy: bool = True):
full_text_annotation_data = data.get("fullTextAnnotation")

if full_text_annotation_data:
self.full_text_annotation = OCRFullTextAnnotation(
full_text_annotation_data, lazy=lazy
)
self.full_text_annotation = OCRFullTextAnnotation(full_text_annotation_data)

for logo_annotation_data in data.get("logoAnnotations", []):
logo_annotation = LogoAnnotation(logo_annotation_data)
Expand All @@ -130,13 +128,13 @@ def __init__(self, data: JSONType, lazy: bool = True):

def get_full_text(self) -> str:
if self.full_text_annotation is not None:
return self.full_text_annotation.text
return self.full_text_annotation.api_text

return ""

def get_full_text_contiguous(self) -> str:
if self.full_text_annotation is not None:
return self.full_text_annotation.contiguous_text
return self.full_text_annotation.api_text

return ""

Expand Down Expand Up @@ -258,21 +256,33 @@ class OCRFullTextAnnotation:
properties. Properties describe detected languages, breaks etc.."""

__slots__ = (
"api_text",
"text",
"_pages",
"_pages_data",
"contiguous_text",
"continuous_text",
"unnaccented_text",
"pages",
)

def __init__(self, data: JSONType, lazy: bool = True):
self.text = MULTIPLE_SPACES_REGEX.sub(" ", data["text"])
self.contiguous_text = self.text.replace("\n", " ")
self.contiguous_text = MULTIPLE_SPACES_REGEX.sub(" ", self.contiguous_text)
self._pages_data = data["pages"]
self._pages: list[TextAnnotationPage] = []

if not lazy:
self.load_pages()
def __init__(self, data: JSONType):
    """Build the full-text annotation from the Vision API response.

    :param data: the ``fullTextAnnotation`` JSON object returned by the
        Google Cloud Vision OCR API
    """
    # Raw text exactly as returned by the API, kept untouched.
    self.api_text = data["text"]
    self.pages = []
    initial_offset = 0
    text_list: list[str] = []
    for page_data in data["pages"]:
        page = TextAnnotationPage(page_data, initial_offset=initial_offset)
        # +1 accounts for the '|' separator inserted between pages below.
        initial_offset += len(page.text) + 1
        text_list.append(page.text)
        self.pages.append(page)
    # Pages are joined with '|' so that a text match can never span two
    # pages.
    self.text = "|".join(text_list)
    # Replace line breaks with space characters so that a match can span
    # several lines. Note that, unlike an earlier implementation,
    # consecutive spaces are NOT collapsed into one: collapsing would
    # shift character positions, while keeping the length identical means
    # the word offsets (word.start_idx, word.end_idx) computed against
    # `self.text` remain valid for `continuous_text`, letting us easily
    # recover the position of matched words.
    self.continuous_text = self.text.replace("\n", " ")
    # Accent-stripped variant; keep_length=True preserves the character
    # count, so the same word offsets apply here as well.
    self.unnaccented_text = strip_accents(self.continuous_text, keep_length=True)

def get_languages(self) -> dict[str, int]:
counts: dict[str, int] = defaultdict(int)
Expand All @@ -284,17 +294,6 @@ def get_languages(self) -> dict[str, int]:

return dict(counts)

@property
def pages(self) -> list["TextAnnotationPage"]:
if self._pages_data is not None:
self.load_pages()

return self._pages

def load_pages(self):
self._pages = [TextAnnotationPage(page) for page in self._pages_data]
self._pages_data = None

def detect_orientation(self) -> OrientationResult:
word_orientations: list[ImageOrientation] = []

Expand Down Expand Up @@ -326,10 +325,26 @@ def match(
class TextAnnotationPage:
"""Detected page from OCR."""

def __init__(self, data: JSONType):
__slots__ = (
"width",
"height",
"blocks",
"text",
)

def __init__(self, data: JSONType, initial_offset: int = 0):
    """Detected page from OCR.

    :param data: the page JSON object from the OCR response
    :param initial_offset: character offset of this page within the full
        annotation text; blocks (and their words) record positions
        relative to it
    """
    self.width = data["width"]
    self.height = data["height"]
    offset = initial_offset
    blocks: list[Block] = []
    for raw_block in data["blocks"]:
        block = Block(raw_block, offset)
        blocks.append(block)
        # A '|' is inserted between blocks (see join below) so that a
        # match cannot span two blocks; the +1 accounts for it.
        offset += len(block.text) + 1
    self.blocks = blocks
    self.text = "|".join(block.text for block in blocks)

def get_languages(self) -> dict[str, int]:
counts: dict[str, int] = defaultdict(int)
Expand Down Expand Up @@ -371,16 +386,43 @@ def match(
class Block:
"""Logical element on the page."""

def __init__(self, data: JSONType):
__slots__ = (
"type",
"paragraphs",
"text",
"bounding_poly",
)

def __init__(self, data: JSONType, initial_offset: int = 0):
    """Logical element on the page.

    :param data: the block JSON object from the OCR response
    :param initial_offset: character offset of this block within the full
        annotation text; each word records start/end indices relative to
        it

    A single space is inserted between two paragraphs, but only when the
    previous paragraph is non-empty and does not already end with a space
    or a line break.
    """
    self.type = data["blockType"]
    self.paragraphs: list[Paragraph] = []
    text_list: list[str] = []
    add_space_prefix = False
    offset = initial_offset
    for paragraph_data in data["paragraphs"]:
        if add_space_prefix:
            # The separator is part of the block text, so the offset must
            # advance with it *before* the paragraph is built; otherwise
            # word.start_idx/end_idx drift away from the actual position
            # in the concatenated text. (Previously the offset was
            # unconditionally advanced by len(text) + 1 even when no
            # separator was emitted, producing an off-by-one per skipped
            # separator.)
            text_list.append(" ")
            offset += 1
        paragraph = Paragraph(paragraph_data, offset)
        offset += len(paragraph.text)
        self.paragraphs.append(paragraph)
        text_list.append(paragraph.text)
        # Only prepend a space before the next paragraph when this one
        # doesn't already end with whitespace (and is not empty).
        add_space_prefix = bool(paragraph.text) and paragraph.text[-1] not in (
            " ",
            "\n",
        )
    self.text: str = "".join(text_list)

    # Bounding box is optional in the API response.
    self.bounding_poly = None
    if "boundingBox" in data:
        self.bounding_poly = BoundingPoly(data["boundingBox"])

def get_words(self):
    """Return all words of the block as a flat list, in reading order."""
    words = []
    for paragraph in self.paragraphs:
        words.extend(paragraph.words)
    return words

def get_languages(self) -> dict[str, int]:
counts: dict[str, int] = defaultdict(int)
for paragraph in self.paragraphs:
Expand Down Expand Up @@ -428,9 +470,22 @@ class Paragraph:
"""Structural unit of text representing a number of words in certain
order."""

def __init__(self, data: JSONType):
self.words: list[Word] = [Word(word) for word in data["words"]]
__slots__ = (
"words",
"text",
"bounding_poly",
)

def __init__(self, data: JSONType, initial_offset: int = 0):
    """Structural unit of text: an ordered sequence of words.

    :param data: the paragraph JSON object from the OCR response
    :param initial_offset: character offset of this paragraph within the
        full annotation text; each word records its own start/end indices
        relative to it
    """
    cursor = initial_offset
    words: list[Word] = []
    parts: list[str] = []
    for raw_word in data["words"]:
        word = Word(raw_word, cursor)
        words.append(word)
        parts.append(word.text)
        # Words are concatenated without any separator, so the next word
        # starts right after this one.
        cursor += len(word.text)
    self.words = words
    self.text: str = "".join(parts)
    # Bounding box is optional in the API response.
    self.bounding_poly = (
        BoundingPoly(data["boundingBox"]) if "boundingBox" in data else None
    )
Expand Down Expand Up @@ -510,9 +565,16 @@ def match(
class Word:
"""A word representation."""

__slots__ = ("bounding_poly", "symbols", "languages", "_text")
__slots__ = (
"bounding_poly",
"symbols",
"languages",
"text",
"start_idx",
"end_idx",
)

def __init__(self, data: JSONType):
def __init__(self, data: JSONType, offset: int = 0):
self.bounding_poly = BoundingPoly(data["boundingBox"])
self.symbols: list[Symbol] = [Symbol(s) for s in data["symbols"]]

Expand All @@ -525,14 +587,9 @@ def __init__(self, data: JSONType):
]

# Attribute to store text generated from symbols
self._text = None

@property
def text(self):
if not self._text:
self._text = self._get_text()

return self._text
self.text = self._get_text()
self.start_idx = offset
self.end_idx = offset + len(self.text)

def _get_text(self) -> str:
text_list = []
Expand Down Expand Up @@ -601,7 +658,7 @@ def match(
)

def __repr__(self) -> str:
    # Use !r (idiomatic f-string conversion, same output as calling
    # __repr__ explicitly) so spaces/newlines in the word text are
    # escaped and unambiguous in logs.
    return f"<Word: {self.text!r}>"


class Symbol:
Expand Down
4 changes: 2 additions & 2 deletions robotoff/prediction/ocr/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from robotoff.taxonomy import Taxonomy, TaxonomyType, get_taxonomy
from robotoff.utils import dump_json, get_logger
from robotoff.utils.text import strip_accents_ascii_v2
from robotoff.utils.text import strip_accents

logger = get_logger(__name__)

Expand All @@ -14,7 +14,7 @@ def normalize_string(text: str, lowercase: bool, strip_accent: bool) -> str:
if lowercase:
text = text.lower()
if strip_accent:
text = strip_accents_ascii_v2(text)
text = strip_accents(text)
return text


Expand Down
4 changes: 2 additions & 2 deletions robotoff/prediction/ocr/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from robotoff.types import PredictionType
from robotoff.utils import get_logger
from robotoff.utils.cache import CachedStore
from robotoff.utils.text import strip_accents_ascii
from robotoff.utils.text import strip_accents_v1

from .dataclass import OCRResult

Expand Down Expand Up @@ -186,7 +186,7 @@ def get_text(ocr_result: OCRResult) -> str:
@staticmethod
def normalize_text(text: str) -> str:
text = text.lower()
text = strip_accents_ascii(text)
text = strip_accents_v1(text)
return text.replace("'", " ").replace("-", " ")

def find_city_names(self, text: str) -> list[tuple[City, int, int]]:
Expand Down
38 changes: 38 additions & 0 deletions robotoff/utils/fold_to_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,44 @@
translate_table = codepoint_to_self + codepoint_to_replacement


class TranslateTableWithoutReplacement:
    """Translation table for ``str.translate`` that keeps only the
    one-to-one mappings of ``translate_table``: entries whose replacement
    is not exactly one character are dropped, and unmapped code points
    translate to themselves. As a result, translating a string with this
    table never changes its length.
    """

    # Restrict the module-level table to single-character replacements.
    translate_table = {
        ordinal: replacement
        for ordinal, replacement in translate_table
        if len(replacement) == 1
    }

    def __getitem__(self, value):
        # Called by str.translate for every code point: return the 1-char
        # replacement when there is one, the code point itself otherwise.
        return self.translate_table.get(value, value)


translate_table_without_replacement = TranslateTableWithoutReplacement()


def fold_without_replacement(string: str) -> str:
    """Fold `string` to ASCII using only one-to-one character
    substitutions.

    Unlike ``fold``, characters without a single-character ASCII
    replacement are kept *unchanged* (not dropped or substituted), so the
    returned string always has the same length as the input.

    :param string: the string to fold; ``None`` is tolerated and yields
        the empty string
    :return: the folded string
    """
    if string is None:
        return ""

    try:
        # Fast path: a pure-ASCII string needs no translation at all.
        string.encode("ascii")
        return string
    except UnicodeEncodeError:
        pass

    return string.translate(translate_table_without_replacement)


def fold(string: str, replacement: str = "") -> str:
"""Fold string to ASCII.
Expand Down
Loading

0 comments on commit 674ad77

Please sign in to comment.