fix: add CLI command to pretty print OCR result

openfoodfacts · May 31, 2023 · 26f44a4 · 26f44a4
1 parent a61eef4
commit 26f44a4
Show file tree

Hide file tree

Showing 3 changed files with 152 additions and 6 deletions.
diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import pathlib
 from pathlib import Path
 from typing import Optional
@@ -48,6 +49,9 @@ def regenerate_ocr_insights(
     server_type: ServerType = typer.Option(
         ServerType.off, help="Server type of the product"
     ),
+    ocr_prediction_types: Optional[list[PredictionType]] = typer.Option(
+        None, help="Types of OCR prediction to use"
+    ),
 ) -> None:
     """Regenerate OCR predictions/insights for a specific product and import
     them."""
@@ -61,6 +65,9 @@ def regenerate_ocr_insights(
     from robotoff.products import get_product
     from robotoff.utils import get_logger
 
+    if ocr_prediction_types is None:
+        ocr_prediction_types = DEFAULT_OCR_PREDICTION_TYPES
+
     logger = get_logger()
 
     product_id = ProductIdentifier(barcode, server_type)
@@ -75,7 +82,7 @@ def regenerate_ocr_insights(
 
         ocr_url = generate_json_ocr_url(product_id, image_id)
         predictions += extract_ocr_predictions(
-            product_id, ocr_url, DEFAULT_OCR_PREDICTION_TYPES
+            product_id, ocr_url, ocr_prediction_types
         )
 
     with db:
@@ -794,5 +801,80 @@ def import_image_webhook(
         logger.info("Robotoff response: %s", r.json())
 
 
+@app.command()
+def pprint_ocr_result(
+    uri: str = typer.Argument(..., help="URI of the image or OCR"),
+) -> None:
+    """Pretty print OCR result.
+
+    A URI starting with `http` is fetched with `get_ocr_result`; any other
+    value is opened as a local JSON file. If the URI ends with `.jpg`, this
+    extension is swapped for `.json` first, so that the OCR file is targeted
+    instead of the image.
+
+    The process exits with status 1 if the OCR result could not be loaded
+    and with status 0 if it contains no full text annotation.
+    """
+    import sys
+
+    import orjson
+
+    from robotoff.prediction.ocr.core import get_ocr_result
+    from robotoff.prediction.ocr.dataclass import OCRResult
+    from robotoff.utils import get_logger, http_session
+
+    logger = get_logger()
+
+    if uri.endswith(".jpg"):
+        # Only swap the trailing extension: `str.replace` would also rewrite
+        # any ".jpg" occurring earlier in the URI (e.g. in a directory name).
+        uri = uri.removesuffix(".jpg") + ".json"
+
+    logger.info("displaying OCR result %s", uri)
+
+    if uri.startswith("http"):
+        ocr_result = get_ocr_result(uri, http_session)
+    else:
+        with open(uri, "rb") as f:
+            data = orjson.loads(f.read())
+            ocr_result = OCRResult.from_json(data)
+
+    if ocr_result is None:
+        logger.info("error while downloading %s", uri)
+        # This is a failure: report it with a non-zero exit status so that
+        # shell callers can detect it.
+        sys.exit(1)
+
+    if ocr_result.full_text_annotation is None:
+        logger.info("no full text annotation available")
+        sys.exit(0)
+    ocr_result.pprint()
+
+
+@app.command()
+def generate_ocr_result(
+    image_url: str = typer.Argument(..., help="URL of the image"),
+    output_dir: Path = typer.Argument(
+        ...,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory where the OCR JSON should be saved",
+    ),
+) -> None:
+    """Run OCR on an image, save the JSON response and pretty print it.
+
+    The image is downloaded from `image_url` and sent to `run_ocr_on_image`
+    with the key read from the `GOOGLE_CLOUD_VISION_API_KEY` environment
+    variable. The response is saved in `output_dir` (created if missing),
+    in a file named after the source path of the image.
+
+    The process exits with status 1 if no OCR response was obtained.
+    """
+    import sys
+
+    import orjson
+
+    from robotoff.cli.ocr import run_ocr_on_image
+    from robotoff.off import get_source_from_url
+    from robotoff.utils import get_logger, http_session
+
+    logger = get_logger()
+    api_key = os.environ["GOOGLE_CLOUD_VISION_API_KEY"]
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    source_image_path = Path(get_source_from_url(image_url))
+    # Flatten the source directory into a single file name. `[1:]` drops the
+    # first character of the flattened parent -- presumably the "_" coming
+    # from a leading "/" in the source path (TODO confirm against
+    # get_source_from_url).
+    output_file = output_dir / (
+        str(source_image_path.parent).replace("/", "_")[1:]
+        + f"_{source_image_path.stem}.json"
+    )
+    logger.info("Downloading image %s", image_url)
+    r = http_session.get(image_url)
+    r.raise_for_status()
+
+    logger.info("Generating OCR result")
+    response = run_ocr_on_image(r.content, api_key)
+
+    if response is None:
+        # run_ocr_on_image returns None when the HTTP response is not OK:
+        # stop here instead of saving a JSON `null` and trying to print it.
+        logger.info("OCR generation failed for image %s", image_url)
+        sys.exit(1)
+
+    output_file.write_bytes(orjson.dumps(response))
+
+    pprint_ocr_result(str(output_file))
+
+
 def main() -> None:
     app()
diff --git a/robotoff/cli/ocr.py b/robotoff/cli/ocr.py
@@ -0,0 +1,39 @@
+import base64
+from typing import List
+
+import orjson
+import requests
+
+from robotoff.utils import get_logger, http_session
+
+logger = get_logger(__name__)
+
+
+def run_ocr_on_image_batch(base64_images: List[str], api_key: str) -> requests.Response:
+    """Send a batch of base64-encoded images to the Google Cloud Vision API.
+
+    One `TEXT_DETECTION` request is built per image and all of them are
+    POSTed in a single call to the `images:annotate` endpoint.
+
+    :param base64_images: base64-encoded content of the images
+    :param api_key: API key, sent as the `key` query parameter
+    :return: the HTTP response, returned without any status check
+    """
+    annotate_requests = []
+    for encoded_image in base64_images:
+        annotate_requests.append(
+            {
+                "features": [{"type": "TEXT_DETECTION"}],
+                "image": {"content": encoded_image},
+            }
+        )
+
+    endpoint = f"https://vision.googleapis.com/v1/images:annotate?key={api_key}"
+    return http_session.post(endpoint, json={"requests": annotate_requests})
+
+
+def run_ocr_on_image(image_bytes: bytes, api_key: str):
+    """Run OCR on a single image.
+
+    :param image_bytes: raw content of the image
+    :param api_key: API key forwarded to `run_ocr_on_image_batch`
+    :raises ValueError: if `image_bytes` is empty
+    :return: the decoded JSON response, or None if the HTTP response was
+        not OK (the status code and the body are logged in that case)
+    """
+    if not image_bytes:
+        raise ValueError("empty image")
+
+    encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+    response = run_ocr_on_image_batch([encoded_image], api_key)
+
+    if response.ok:
+        return orjson.loads(response.content)
+
+    logger.info("HTTP %s received", response.status_code)
+    logger.info("Response: %s", response.text)
+    return None
diff --git a/robotoff/prediction/ocr/dataclass.py b/robotoff/prediction/ocr/dataclass.py
@@ -223,6 +223,7 @@ def get_match_bounding_box(
             annotation is not available
         """
         words = self.get_words_from_indices(start_idx, end_idx, raises)
+        logger.debug("get_match_bounding_box: words: %s", words)
 
         if words is not None:
             if words:
@@ -257,6 +258,13 @@ def get_words_from_indices(
             start_idx, end_idx, raises
         )
 
+    def pprint(self) -> None:
+        """Pretty print the full text annotation, if it is not null.
+
+        A fallback message is printed when there is no full text annotation.
+        """
+        if self.full_text_annotation:
+            # The pretty-print string is generated by the full text
+            # annotation (it walks its pages), so delegate to its own
+            # `pprint` instead of calling the helper on `self`.
+            self.full_text_annotation.pprint()
+        else:
+            print("No full text annotation available")
+
 
 def get_text(
     content: Union[OCRResult, str], ocr_regex: Optional[OCRRegex] = None
@@ -396,6 +404,27 @@ def get_words_from_indices(
 
         return selected
 
+    def pprint(self):
+        """Print the pretty-printed full text annotation on standard output."""
+        pretty_string = self._generate_pretty_print_string()
+        print(pretty_string)
+
+    def _generate_pretty_print_string(self) -> str:
+        """Build the text displayed when pretty printing the annotation.
+
+        Pages, blocks and paragraphs are walked in order. Each of them gets
+        a numbered header line (prefixed by `>`, `>>` and `>>>`
+        respectively), and each paragraph header is followed by the `repr`
+        of the paragraph text, indented by four spaces.
+
+        :return: the generated lines, joined with newlines
+        """
+
+        def iter_lines():
+            for page_idx, page in enumerate(self.pages):
+                yield f"> page #{page_idx}"
+                for block_idx, block in enumerate(page.blocks):
+                    yield f">> block #{block_idx}"
+                    for paragraph_idx, paragraph in enumerate(block.paragraphs):
+                        yield f">>> paragraph #{paragraph_idx}"
+                        yield f"    {paragraph.text!r}"
+
+        return "\n".join(iter_lines())
+
 
 class TextAnnotationPage:
     """Detected page from OCR."""
@@ -566,11 +595,7 @@ class Paragraph:
     """Structural unit of text representing a number of words in certain
     order."""
 
-    __slots__ = (
-        "words",
-        "text",
-        "bounding_poly",
-    )
+    __slots__ = ("words", "text", "bounding_poly")
 
     def __init__(self, data: JSONType, initial_offset: int = 0):
         """Initialize a Paragraph.