Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Blockquote Processor #404

Merged
merged 4 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters import BaseConverter
from marker.processors.blockquote import BlockquoteProcessor
from marker.processors.code import CodeProcessor
from marker.processors.debug import DebugProcessor
from marker.processors.document_toc import DocumentTOCProcessor
Expand Down Expand Up @@ -53,17 +54,18 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
processor_list = strings_to_classes(processor_list)
else:
processor_list = [
FootnoteProcessor,
PageHeaderProcessor,
EquationProcessor,
TableProcessor,
SectionHeaderProcessor,
TextProcessor,
ListProcessor,
BlockquoteProcessor,
CodeProcessor,
DocumentTOCProcessor,
EquationProcessor,
FootnoteProcessor,
IgnoreTextProcessor,
LineNumbersProcessor,
ListProcessor,
PageHeaderProcessor,
SectionHeaderProcessor,
TableProcessor,
TextProcessor,
DebugProcessor,
]

Expand Down
48 changes: 48 additions & 0 deletions marker/processors/blockquote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document

class BlockquoteProcessor(BaseProcessor):
    """
    A processor that flags text blocks as blockquotes.

    Every candidate block is compared against the block that follows it on
    the same page; the follower's `blockquote` and `blockquote_level`
    attributes are then set from its position relative to the candidate.
    """
    # Block types that may start or continue a blockquote.
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    # Minimum rightward shift of the follower's left edge, as a fraction of
    # the current block's width, for the follower to count as indented.
    min_x_indent = 0.05
    # Maximum left-edge difference (fraction of the current block's width)
    # for the two left edges to count as aligned.
    x_start_tolerance = 0.01
    # Maximum right-edge difference (fraction of the current block's width)
    # for the two right edges to count as aligned.
    x_end_tolerance = 0.01

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """
        Walk every page of `document` and tag followers of text blocks.

        Blocks are updated in place; nothing is returned.
        """
        for page in document.pages:
            for current in page.contained_blocks(document, self.block_types):
                # The current block needs a structure with at least two entries.
                if current.structure is None or len(current.structure) < 2:
                    continue

                follower = page.get_next_block(current)
                if follower is None or follower.block_type not in self.block_types:
                    continue
                if follower.structure is None or follower.ignore_for_output:
                    continue

                # Both tolerances and the indent threshold scale with the
                # width of the current block, not the follower.
                ref_width = current.polygon.width
                same_right = abs(follower.polygon.x_end - current.polygon.x_end) < self.x_end_tolerance * ref_width
                same_left = abs(follower.polygon.x_start - current.polygon.x_start) < self.x_start_tolerance * ref_width
                shifted_right = follower.polygon.x_start > current.polygon.x_start + (self.min_x_indent * ref_width)
                starts_below = follower.polygon.y_start > current.polygon.y_end
                nested = shifted_right and starts_below

                if current.block_type in self.block_types and current.blockquote:
                    # Already inside a quote: the follower continues it when
                    # both edges line up, or nests one level deeper when it is
                    # indented and starts below the current block.
                    follower.blockquote = (same_right and same_left) or nested
                    follower.blockquote_level = current.blockquote_level + (1 if nested else 0)
                else:
                    # Not inside a quote: an indented follower that starts
                    # below and has at least two structure entries opens one.
                    follower.blockquote = len(follower.structure) >= 2 and nested
                    # The level is written as 1 here whether or not the flag
                    # above came out true.
                    follower.blockquote_level = 1
2 changes: 0 additions & 2 deletions marker/processors/list.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import math

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
Expand Down
11 changes: 10 additions & 1 deletion marker/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
class InlineMath(Block):
block_type: BlockTypes = BlockTypes.TextInlineMath
has_continuation: bool = False
blockquote: bool = False
blockquote_level: int = 0

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
Expand All @@ -16,4 +18,11 @@ def assemble_html(self, child_blocks, parent_structure):
class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}>{template}</p>"

if self.blockquote:
# Add indentation for blockquote levels
blockquote_prefix = "<blockquote>" * self.blockquote_level
blockquote_suffix = "</blockquote>" * self.blockquote_level
return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}"
else:
return f"<p{class_attr}>{template}</p>"
10 changes: 9 additions & 1 deletion marker/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
class Text(Block):
block_type: BlockTypes = BlockTypes.Text
has_continuation: bool = False
blockquote: bool = False
blockquote_level: int = 0

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
Expand All @@ -16,4 +18,10 @@ def assemble_html(self, child_blocks, parent_structure):
class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}>{template}</p>"

if self.blockquote:
blockquote_prefix = "<blockquote>" * self.blockquote_level
blockquote_suffix = "</blockquote>" * self.blockquote_level
return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}"
else:
return f"<p{class_attr}>{template}</p>"
Loading