From 280580e43edf9ed4ce84a74fdf6ee85f95d8ef31 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 19 Feb 2025 17:17:05 -0500 Subject: [PATCH 1/2] Fix llm layout missing text --- marker/providers/pdf.py | 2 +- marker/schema/blocks/base.py | 1 + marker/schema/groups/page.py | 27 +++++++++++------ marker/scripts/run_streamlit_app.py | 2 +- pyproject.toml | 2 +- tests/builders/test_layout_replace.py | 42 +++++++++++++++++++++++++++ 6 files changed, 64 insertions(+), 12 deletions(-) create mode 100644 tests/builders/test_layout_replace.py diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 60a974b4..a63723ff 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -33,7 +33,7 @@ class PdfProvider(BaseProvider): pdftext_workers: Annotated[ int, "The number of workers to use for pdftext.", - ] = 1 + ] = 4 flatten_pdf: Annotated[ bool, "Whether to flatten the PDF structure.", diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py index 5ff40d4b..21fe0468 100644 --- a/marker/schema/blocks/base.py +++ b/marker/schema/blocks/base.py @@ -85,6 +85,7 @@ class Block(BaseModel): metadata: BlockMetadata | None = None lowres_image: Image.Image | None = None highres_image: Image.Image | None = None + removed: bool = False # Has block been replaced by new block? model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 681a07cd..6bda0386 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -4,6 +4,8 @@ from PIL import Image, ImageDraw from pdftext.schema import Reference +from pydantic import computed_field + from marker.providers import ProviderOutput from marker.schema import BlockTypes from marker.schema.blocks import Block, BlockId, Text @@ -53,6 +55,10 @@ def get_image(self, *args, highres: bool = False, remove_blocks: Sequence[BlockT return image + @computed_field + @property + def current_children(self) -> List[Block]: + return [child for child in self.children if not child.removed] def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): if ignored_block_types is None: @@ -102,13 +108,9 @@ def assemble_html(self, document, child_blocks, parent_structure=None): template += f"" return template - def compute_line_block_intersections(self, provider_outputs: List[ProviderOutput]): + def compute_line_block_intersections(self, blocks: List[Block], provider_outputs: List[ProviderOutput]): max_intersections = {} - blocks = [ - block for block in self.children - if block.block_type not in self.excluded_block_types - ] block_bboxes = [block.polygon.bbox for block in blocks] line_bboxes = [provider_output.line.polygon.bbox for provider_output in provider_outputs] @@ -137,6 +139,10 @@ def replace_block(self, block: Block, new_block: Block): for child in self.children: child.replace_block(block, new_block) + # Mark block as removed + block.removed = True + + def identify_missing_blocks( self, provider_line_idxs: List[int], @@ -224,7 +230,12 @@ def merge_blocks( text_extraction_method: str ): provider_line_idxs = list(range(len(provider_outputs))) - max_intersections = self.compute_line_block_intersections(provider_outputs) + valid_blocks = [ + block for block in self.current_children # ensure we only look at children that haven't been replaced + if block.block_type not in self.excluded_block_types + ] + + max_intersections = self.compute_line_block_intersections(valid_blocks, provider_outputs) # Try to assign lines by intersection assigned_line_idxs = set() @@ -241,9 +252,7 @@ def merge_blocks( min_dist_idx = None provider_output: ProviderOutput = provider_outputs[line_idx] line = provider_output.line - for block in self.children: - if block.block_type in self.excluded_block_types: - continue + for block in valid_blocks: # We want to assign to blocks closer in y than x dist = line.polygon.center_distance(block.polygon, x_weight=5) if min_dist_idx is None or dist < min_dist: diff --git a/marker/scripts/run_streamlit_app.py b/marker/scripts/run_streamlit_app.py index e364f4f1..597d7213 100644 --- a/marker/scripts/run_streamlit_app.py +++ b/marker/scripts/run_streamlit_app.py @@ -5,5 +5,5 @@ def streamlit_app_cli(): cur_dir = os.path.dirname(os.path.abspath(__file__)) app_path = os.path.join(cur_dir, "streamlit_app.py") - cmd = ["streamlit", "run", app_path, "--server.fileWatcherType", "none"] + cmd = ["streamlit", "run", app_path, "--server.fileWatcherType", "none", "--server.headless", "true"] subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"}) diff --git a/pyproject.toml b/pyproject.toml index d3d3e33c..bb47df3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.5.4" +version = "1.5.5" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" diff --git a/tests/builders/test_layout_replace.py b/tests/builders/test_layout_replace.py new file mode 100644 index 00000000..be92cc62 --- /dev/null +++ b/tests/builders/test_layout_replace.py @@ -0,0 +1,42 @@ +import pytest + +from marker.builders.document import DocumentBuilder +from marker.builders.layout import LayoutBuilder +from marker.builders.line import LineBuilder +from marker.renderers.markdown import MarkdownRenderer +from marker.schema import BlockTypes +from marker.schema.registry import get_block_class + + +@pytest.mark.config({"page_range": [0]}) +def test_layout_replace(request, config, pdf_provider, layout_model, ocr_error_model, detection_model, inline_detection_model): + # The llm layout builder replaces blocks - this makes sure text is still merged properly + layout_builder = LayoutBuilder(layout_model, config) + line_builder = LineBuilder(detection_model, inline_detection_model, ocr_error_model, config) + builder = DocumentBuilder(config) + document = builder.build_document(pdf_provider) + layout_builder(document, pdf_provider) + page = document.pages[0] + new_blocks = [] + for block in page.contained_blocks(document, (BlockTypes.Text,)): + generated_block_class = get_block_class(BlockTypes.TextInlineMath) + generated_block = generated_block_class( + polygon=block.polygon, + page_id=block.page_id, + structure=block.structure, + ) + page.replace_block(block, generated_block) + new_blocks.append(generated_block) + line_builder(document, pdf_provider) + + for block in new_blocks: + assert block.raw_text(document).strip() + + renderer = MarkdownRenderer(config) + rendered = renderer(document) + + assert "worst-case perturbations" in rendered.markdown + assert "projected gradient descent" in rendered.markdown + + + From f0c9f22eef62c70465c1d37cbe16076d67d53778 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 19 Feb 2025 17:52:21 -0500 Subject: [PATCH 2/2] Update children calls --- marker/schema/groups/page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 6bda0386..9f033051 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -47,7 +47,7 @@ def get_image(self, *args, highres: bool = False, remove_blocks: Sequence[BlockT if remove_blocks: image = image.copy() draw = ImageDraw.Draw(image) - bad_blocks = [block for block in self.children if block.block_type in remove_blocks] + bad_blocks = [block for block in self.current_children if block.block_type in remove_blocks] for bad_block in bad_blocks: poly = bad_block.polygon.rescale(self.polygon.size, image.size).polygon poly = [(int(p[0]), int(p[1])) for p in poly] @@ -274,7 +274,7 @@ def aggregate_block_metadata(self) -> BlockMetadata: if self.metadata is None: self.metadata = BlockMetadata() - for block in self.children: + for block in self.current_children: if block.metadata is not None: self.metadata = self.metadata.merge(block.metadata) return self.metadata