Skip to content

Commit

Permalink
Removing rotation-invalid results to see what happens
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 17, 2024
1 parent dd4f967 commit 1c8602c
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions pdelfin/birrpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI

last_error = data.get("completion_error", None)

if not model_response_json["is_rotation_valid"]:
last_error = "rotation_invalid"

index_entries.append(DatabaseManager.BatchInferenceRecord(
inference_s3_path=inference_s3_path,
pdf_s3_path=pdf_s3_path,
Expand Down Expand Up @@ -547,6 +550,9 @@ def get_current_round(s3_workspace: str) -> int:
parser = argparse.ArgumentParser(description='Manager for running millions of PDFs through a batch inference pipeline')
parser.add_argument('workspace', help='The S3 path where work will be done e.g., s3://bucket/prefix/)')
parser.add_argument('--add_pdfs', help='Path to add pdfs stored in s3 to the workspace, can be a glob path s3://bucket/prefix/*.pdf or path to file containing list of pdf paths', default=None)
parser.add_argument('--prefilter_lang', help='If set, tries to detect the language of the pdf and only accepts it if it matches (ex. ENGLISH)')
parser.add_argument('--prefilter_spam', help='If set, tries to detect spammy pdfs and not include them')

parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
parser.add_argument('--max_size_mb', type=int, default=250, help='Max file size in MB')
Expand Down

0 comments on commit 1c8602c

Please sign in to comment.