diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 5bb7f1cc..b77eb1e8 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -10,6 +10,7 @@ from marker.builders.ocr import OcrBuilder from marker.builders.structure import StructureBuilder from marker.converters import BaseConverter +from marker.processors.blockquote import BlockquoteProcessor from marker.processors.code import CodeProcessor from marker.processors.debug import DebugProcessor from marker.processors.document_toc import DocumentTOCProcessor @@ -53,17 +54,18 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No processor_list = strings_to_classes(processor_list) else: processor_list = [ - FootnoteProcessor, - PageHeaderProcessor, - EquationProcessor, - TableProcessor, - SectionHeaderProcessor, - TextProcessor, - ListProcessor, + BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, + EquationProcessor, + FootnoteProcessor, IgnoreTextProcessor, LineNumbersProcessor, + ListProcessor, + PageHeaderProcessor, + SectionHeaderProcessor, + TableProcessor, + TextProcessor, DebugProcessor, ] diff --git a/marker/processors/blockquote.py b/marker/processors/blockquote.py new file mode 100644 index 00000000..a5fb641c --- /dev/null +++ b/marker/processors/blockquote.py @@ -0,0 +1,48 @@ +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document + +class BlockquoteProcessor(BaseProcessor): + """ + A processor for tagging blockquotes + """ + block_types = (BlockTypes.Text, BlockTypes.TextInlineMath) + min_x_indent = 0.05 # % of block width + x_start_tolerance = 0.01 # % of block width + x_end_tolerance = 0.01 # % of block width + + def __init__(self, config): + super().__init__(config) + + def __call__(self, document: Document): + for page in document.pages: + for block in page.contained_blocks(document, self.block_types): + if block.structure is None: + continue + + if not 
len(block.structure) >= 2: + continue + + next_block = page.get_next_block(block) + if next_block is None: + continue + if next_block.block_type not in self.block_types: + continue + if next_block.structure is None: + continue + if next_block.ignore_for_output: + continue + + matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width + matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width + x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width) + y_indent = next_block.polygon.y_start > block.polygon.y_end + + if block.block_type in self.block_types and block.blockquote: + next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent) + next_block.blockquote_level = block.blockquote_level + if (x_indent and y_indent): + next_block.blockquote_level += 1 + else: + next_block.blockquote = len(next_block.structure) >= 2 and (x_indent and y_indent) + next_block.blockquote_level = 1 diff --git a/marker/processors/list.py b/marker/processors/list.py index 56baa6cf..4517983a 100644 --- a/marker/processors/list.py +++ b/marker/processors/list.py @@ -1,5 +1,3 @@ -import math - from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py index b0adbc6d..15c65360 100644 --- a/marker/schema/blocks/inlinemath.py +++ b/marker/schema/blocks/inlinemath.py @@ -5,6 +5,8 @@ class InlineMath(Block): block_type: BlockTypes = BlockTypes.TextInlineMath has_continuation: bool = False + blockquote: bool = False + blockquote_level: int = 0 def assemble_html(self, child_blocks, parent_structure): if self.ignore_for_output: @@ -16,4 +18,11 @@ def assemble_html(self, child_blocks, parent_structure): class_attr = f" block-type='{self.block_type}'" if 
self.has_continuation: class_attr += " class='has-continuation'" - return f"<p{class_attr}>{template}</p>" + + if self.blockquote: + # Add indentation for blockquote levels + blockquote_prefix = "<blockquote>" * self.blockquote_level + blockquote_suffix = "</blockquote>" * self.blockquote_level + return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}" + else: + return f"<p{class_attr}>{template}</p>" \ No newline at end of file diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py index 6a40407a..c5ed59d7 100644 --- a/marker/schema/blocks/text.py +++ b/marker/schema/blocks/text.py @@ -5,6 +5,8 @@ class Text(Block): block_type: BlockTypes = BlockTypes.Text has_continuation: bool = False + blockquote: bool = False + blockquote_level: int = 0 def assemble_html(self, child_blocks, parent_structure): if self.ignore_for_output: @@ -16,4 +18,10 @@ def assemble_html(self, child_blocks, parent_structure): class_attr = f" block-type='{self.block_type}'" if self.has_continuation: class_attr += " class='has-continuation'" - return f"<p{class_attr}>{template}</p>" + + if self.blockquote: + blockquote_prefix = "<blockquote>" * self.blockquote_level + blockquote_suffix = "</blockquote>" * self.blockquote_level + return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}" + else: + return f"<p{class_attr}>{template}</p>" \ No newline at end of file