From 273a8b0d0a343fa559cb7afdb97a9afd4bb985a7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 19 Nov 2024 15:11:02 -0800 Subject: [PATCH] Logging fallback pages --- pdelfin/beakerpipeline.py | 3 ++- pdelfin/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pdelfin/beakerpipeline.py b/pdelfin/beakerpipeline.py index 071f413..d1a16d2 100644 --- a/pdelfin/beakerpipeline.py +++ b/pdelfin/beakerpipeline.py @@ -289,7 +289,8 @@ def build_dolma_document(pdf_s3_path, page_results): "Source-File": pdf_s3_path, "pdf-total-pages": len(page_results), "total-input-tokens": sum(page.input_tokens for page in page_results), - "total-output-tokens": sum(page.output_tokens for page in page_results) + "total-output-tokens": sum(page.output_tokens for page in page_results), + "total-fallback-pages": sum(page.is_fallback for page in page_results), } id_ = hashlib.sha1(document_text.encode()).hexdigest() diff --git a/pdelfin/version.py b/pdelfin/version.py index a60ca62..76a40ed 100644 --- a/pdelfin/version.py +++ b/pdelfin/version.py @@ -2,7 +2,7 @@ _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "35" +_PATCH = "36" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""