VikParuchuri · VikParuchuri · Feb 28, 2025 · Jan 21, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
 # Marker
 
-Marker converts PDFs and images to markdown, JSON, and HTML quickly and accurately.
+Marker converts documents to markdown, JSON, and HTML quickly and accurately.
 
-- Supports a range of documents in all languages
+- Converts PDF, image, PPTX, DOCX, XLSX, HTML, and EPUB files in all languages
 - Formats tables, forms, equations, inline math, links, references, and code blocks
 - Extracts and saves images
 - Removes headers/footers/other artifacts
@@ -95,6 +95,7 @@ Options:
 - `--output_format [markdown|json|html]`: Specify the format for the output results.
 - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n` 
 - `--use_llm`: Uses an LLM to improve accuracy.  You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
+- `--redo_inline_math`: If you want the highest quality inline math conversion, use this along with `--use_llm`.
 - `--disable_image_extraction`: Don't extract images from the PDF.  If you also specify `--use_llm`, then images will be replaced with a description.
 - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
 - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
@@ -320,6 +321,7 @@ When running with the `--use_llm` flag, you have a choice of services you can us
 - `Gemini` - this will use the Gemini developer API by default.  You'll need to pass `--gemini_api_key` to configuration.
 - `Google Vertex` - this will use vertex, which can be more reliable.  You'll need to pass `--vertex_project_id`.  To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
 - `Ollama` - this will use local models.  You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
+- `Claude` - this will use the Anthropic API.  You can configure `--claude_api_key` and `--claude_model_name`.  To use it, set `--llm_service=marker.services.claude.ClaudeService`.
 
 These services may have additional optional configuration as well - you can see it by viewing the classes.
 

diff --git a/convert.py b/convert.py
@@ -1,4 +1,4 @@
 from marker.scripts.convert import convert_cli
 
 if __name__ == "__main__":
-    convert_cli()
+    convert_cli()
diff --git a/data/examples/markdown/multicolcnn/multicolcnn.md b/data/examples/markdown/multicolcnn/multicolcnn.md
diff --git a/data/examples/markdown/switch_transformers/switch_trans.md b/data/examples/markdown/switch_transformers/switch_trans.md
diff --git a/marker/builders/line.py b/marker/builders/line.py
@@ -1,4 +1,6 @@
+from collections import defaultdict
 from copy import deepcopy
+from itertools import chain
 from typing import Annotated, List, Optional, Tuple
 
 import numpy as np
@@ -71,14 +73,14 @@ class LineBuilder(BaseBuilder):
         float,
         "The minimum overlap of a line with an inline math box to consider as a match"
     ] = 0.
+    line_inline_min_overlap_pct: Annotated[
+        float,
+        "The percentage of a provider line that has to be covered by a math line."
+    ] = .3
     line_text_overlap_threshold: Annotated[
         float,
         "The minimum overlap of an equation with a text line to consider as a match"
     ] = .5
-    inline_math_minimum_area: Annotated[
-        float,
-        "The minimum area for an inline math block, in pixels."
-    ] = 20
     inline_math_line_vertical_merge_threshold: Annotated[
         int,
         "The maximum pixel distance between y1s for two lines to be merged"
@@ -109,9 +111,13 @@ def __init__(self, detection_model: DetectionPredictor, inline_detection_model:
         self.ocr_error_model = ocr_error_model
 
     def __call__(self, document: Document, provider: PdfProvider):
-        # Disable Inline Detection for documents where layout model doesn't detect any equations
+        # Disable inline detection for documents where layout model doesn't detect any equations
         # Also disable if we won't use the inline detections (if we aren't using the LLM or texify)
-        do_inline_math_detection = document.contained_blocks([BlockTypes.Equation]) and (self.texify_inline_spans or self.use_llm)
+        do_inline_math_detection = all([
+            len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
+            (self.texify_inline_spans or self.use_llm)
+        ])
+
         provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)
         self.merge_blocks(document, provider_lines, ocr_lines)
 
@@ -144,6 +150,7 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L
                 batch_size=self.get_detection_batch_size()
             )
 
+        assert len(page_detection_results) == len(inline_detection_results) == sum(run_detection)
         detection_results = []
         inline_results = []
         idx = 0
@@ -183,7 +190,7 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat
         if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
             layout_good = [True] * len(document.pages)
 
-        run_detection = [not good or do_inline_math_detection for good in layout_good]
+        run_detection = [(not good or do_inline_math_detection) for good in layout_good]
         page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good]
 
         # Note: run_detection is longer than page_images, since it has a value for each page, not just good ones
@@ -218,7 +225,7 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat
                 page_lines[document_page.page_id].extend(
                     self.merge_provider_lines_inline_math(
                         provider_lines,
-                        [b for _,b in math_detection_boxes],
+                        merged_detection_boxes,
                         image_size,
                         page_size
                     )
@@ -365,7 +372,7 @@ def determine_math_lines(
             max_overlap = np.max(overlap_row) / inline_box.area
 
             # Avoid small or nonoverlapping inline math regions
-            if max_overlap <= self.line_inline_math_overlap_threshold or inline_box.area < self.inline_math_minimum_area:
+            if max_overlap <= self.line_inline_math_overlap_threshold:
                 continue
 
             # Ignore vertical lines
@@ -386,67 +393,65 @@ def add_math_span_format(self, provider_line):
     def merge_provider_lines_inline_math(
         self,
         provider_lines: List[ProviderOutput],
-        inline_math_lines: List[TextBox],
+        text_lines: List[TextBox],
         image_size,
         page_size
     ):
         # When provider lines is empty or no inline math detected, return provider lines
-        if not provider_lines or not inline_math_lines:
+        if not provider_lines or not text_lines:
             return provider_lines
 
         horizontal_provider_lines = [
             (j, provider_line) for j, provider_line in enumerate(provider_lines)
-            if provider_line.line.polygon.height < provider_line.line.polygon.width * 3 # Multiply to account for small blocks inside equations, but filter out big vertical lines
+            if provider_line.line.polygon.height < provider_line.line.polygon.width * 5 # Multiply to account for small blocks inside equations, but filter out big vertical lines
         ]
         provider_line_boxes = [p.line.polygon.bbox for _, p in horizontal_provider_lines]
-        math_line_boxes = [PolygonBox(polygon=m.polygon).rescale(image_size, page_size).bbox for m in inline_math_lines]
+        math_line_boxes = [PolygonBox(polygon=m.polygon).rescale(image_size, page_size).bbox for m in text_lines]
 
-        overlaps = matrix_intersection_area(math_line_boxes, provider_line_boxes)
+        overlaps = matrix_intersection_area(provider_line_boxes, math_line_boxes)
 
         # Find potential merges
-        merge_lines = []
-        for i in range(len(math_line_boxes)):
-            merge_line = []
-            math_line_polygon = PolygonBox(polygon=inline_math_lines[i].polygon).rescale(image_size, page_size)
-            max_overlap = np.max(overlaps[i])
-            if max_overlap <= self.line_inline_math_overlap_threshold:
+        merge_lines = defaultdict(list)
+        for i in range(len(provider_line_boxes)):
+            max_overlap_pct = np.max(overlaps[i]) / max(1, horizontal_provider_lines[i][1].line.polygon.area)
+            if max_overlap_pct <= self.line_inline_min_overlap_pct:
                 continue
 
             best_overlap = np.argmax(overlaps[i])
-            best_overlap_line = horizontal_provider_lines[best_overlap]
-            best_overlap_y1 = best_overlap_line[1].line.polygon.y_start
-
-            nonzero_idxs = np.nonzero(overlaps[i] > self.line_inline_math_overlap_threshold)[0]
-            for idx in nonzero_idxs:
-                provider_idx, provider_line = horizontal_provider_lines[idx]
-                provider_line_y1 = provider_line.line.polygon.y_start
-
-                should_merge_line = False
-                if abs(provider_line_y1 - best_overlap_y1) <= self.inline_math_line_vertical_merge_threshold:
-                    should_merge_line = True
-
-                line_overlaps = self.find_overlapping_math_chars(provider_line, math_line_polygon, remove_chars=not should_merge_line)
-
-                # Do not merge if too far above/below (but remove characters)
-                if line_overlaps and should_merge_line:
-                    # Add the index of the provider line to the merge line
-                    merge_line.append(provider_idx)
-
-            if len(merge_line) > 0:
-                merge_lines.append(merge_line)
+            merge_lines[best_overlap].append(i)
+
+        # Filter to get rid of detected lines that include multiple provider lines
+        filtered_merge_lines = {}
+        for line_idx in merge_lines:
+            first_line = horizontal_provider_lines[merge_lines[line_idx][0]][1].line.polygon
+            all_close = all([
+                (
+                    abs(horizontal_provider_lines[ml][1].line.polygon.y_start - first_line.y_start) < self.inline_math_line_vertical_merge_threshold
+                    or
+                    abs(horizontal_provider_lines[ml][1].line.polygon.y_end - first_line.y_end) < self.inline_math_line_vertical_merge_threshold
+                )
+                for ml in
+                merge_lines[line_idx]
+            ])
+            if all_close:
+                filtered_merge_lines[line_idx] = merge_lines[line_idx]
 
         # Handle the merging
         already_merged = set()
-        potential_merges = set([m for merge_line in merge_lines for m in merge_line])
+        potential_merges = set(chain.from_iterable(filtered_merge_lines.values()))
         out_provider_lines = [(i, p) for i, p in enumerate(provider_lines) if i not in potential_merges]
-        for merge_section in merge_lines:
+        for line_idx in filtered_merge_lines:
+            text_line = text_lines[line_idx]
+            merge_section = filtered_merge_lines[line_idx]
             merge_section = [m for m in merge_section if m not in already_merged]
             if len(merge_section) == 0:
                 continue
             elif len(merge_section) == 1:
                 line_idx = merge_section[0]
                 merged_line = provider_lines[line_idx]
-                self.add_math_span_format(merged_line)
+                # Only add math format to single lines if the detected line is math
+                if text_line.math:
+                    self.add_math_span_format(merged_line)
                 out_provider_lines.append((line_idx, merged_line))
                 already_merged.add(merge_section[0])
                 continue
@@ -461,6 +466,7 @@ def merge_provider_lines_inline_math(
                 else:
                     # Combine the spans of the provider line with the merged line
                     merged_line = merged_line.merge(provider_line)
+                    # Add math regardless, since we assume heavily broken lines are math lines
                     self.add_math_span_format(merged_line)
                 already_merged.add(idx) # Prevent double merging
             out_provider_lines.append((min_idx, merged_line))

diff --git a/marker/config/parser.py b/marker/config/parser.py
@@ -65,7 +65,7 @@ def generate_config_dict(self) -> Dict[str, any]:
                 case "languages":
                     config["languages"] = v.split(",")
                 case "config_json":
-                    with open(v, "r") as f:
+                    with open(v, "r", encoding="utf-8") as f:
                         config.update(json.load(f))
                 case "disable_multiprocessing":
                     config["pdftext_workers"] = 1

diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py
@@ -6,7 +6,7 @@
 from marker.processors import BaseProcessor
 from marker.processors.llm import BaseLLMSimpleBlockProcessor
 from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
-from marker.util import assign_config
+from marker.util import assign_config, download_font
 
 
 class BaseConverter:
@@ -15,6 +15,9 @@ def __init__(self, config: Optional[BaseModel | dict] = None):
         self.config = config
         self.llm_service = None
 
+        # Download render font, needed for some providers
+        download_font()
+
     def __call__(self, *args, **kwargs):
         raise NotImplementedError
 

diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -1,5 +1,4 @@
 import os
-
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disables a tokenizers warning
 
 from collections import defaultdict
@@ -43,6 +42,8 @@
 from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
 from marker.processors.order import OrderProcessor
 from marker.services.gemini import GoogleGeminiService
+from marker.processors.line_merge import LineMergeProcessor
+from marker.processors.llm.llm_inlinemath import LLMInlineMathProcessor
 
 
 class PdfConverter(BaseConverter):
@@ -62,6 +63,7 @@ class PdfConverter(BaseConverter):
     ] = False
     default_processors: Tuple[BaseProcessor, ...] = (
         OrderProcessor,
+        LineMergeProcessor,
         BlockquoteProcessor,
         CodeProcessor,
         DocumentTOCProcessor,
@@ -82,6 +84,7 @@ class PdfConverter(BaseConverter):
         LLMImageDescriptionProcessor,
         LLMEquationProcessor,
         LLMHandwritingProcessor,
+        LLMInlineMathProcessor,
         ReferenceProcessor,
         DebugProcessor,
     )

diff --git a/marker/logger.py b/marker/logger.py
@@ -7,3 +7,7 @@ def configure_logging():
 
     logging.getLogger('PIL').setLevel(logging.ERROR)
     warnings.simplefilter(action='ignore', category=FutureWarning)
+
+    logging.getLogger('fontTools.subset').setLevel(logging.ERROR)
+    logging.getLogger('fontTools.ttLib.ttFont').setLevel(logging.ERROR)
+    logging.getLogger('weasyprint').setLevel(logging.CRITICAL)
diff --git a/marker/processors/debug.py b/marker/processors/debug.py
@@ -2,7 +2,6 @@
 import os
 from typing import Annotated
 
-import requests
 from PIL import Image, ImageDraw, ImageFont
 
 from marker.processors import BaseProcessor
@@ -36,14 +35,7 @@ class DebugProcessor(BaseProcessor):
         bool,
         "Whether to dump block debug data.",
     ] = False
-    render_font: Annotated[
-        str,
-        "The path to the font to use for rendering debug images.",
-    ] = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
-    font_dl_path: Annotated[
-        str,
-        "The path to download the font from.",
-    ] = "/~https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
+
 
     def __call__(self, document: Document):
         # Remove extension from doc name
@@ -72,16 +64,21 @@ def draw_pdf_debug_images(self, document: Document):
 
             line_bboxes = []
             span_bboxes = []
+            line_ids = []
             for child in page.children:
+                # Skip any blocks that have been removed
+                if child.removed:
+                    continue
+
                 if child.block_type == BlockTypes.Line:
                     bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
                     line_bboxes.append(bbox)
+                    line_ids.append(child.block_id)
                 elif child.block_type == BlockTypes.Span:
                     bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
                     span_bboxes.append(bbox)
 
-            self.render_on_image(line_bboxes, png_image, color="blue", draw_bbox=True, label_font_size=24)
-            self.render_on_image(span_bboxes, png_image, color="green", draw_bbox=True, label_font_size=24)
+            self.render_on_image(line_bboxes, png_image, color="blue", draw_bbox=True, label_font_size=24, labels=[str(i) for i in line_ids])
 
             png_image = self.render_layout_boxes(page, png_image)
 
@@ -146,17 +143,6 @@ def dump_block_debug_data(self, document: Document):
         with open(debug_file, "w+") as f:
             json.dump(debug_data, f)
 
-    def get_font_path(self) -> str:
-        if not os.path.exists(self.render_font):
-            os.makedirs(os.path.dirname(self.render_font), exist_ok=True)
-            font_dl_path = f"{self.font_dl_path}/{os.path.basename(self.render_font)}"
-            with requests.get(font_dl_path, stream=True) as r, open(self.render_font, 'wb') as f:
-                r.raise_for_status()
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        return self.render_font
-
     def get_text_size(self, text, font):
         im = Image.new(mode="P", size=(0, 0))
         draw = ImageDraw.Draw(im)
@@ -165,7 +151,7 @@ def get_text_size(self, text, font):
 
     def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list = 'red', draw_bbox=True):
         draw = ImageDraw.Draw(image)
-        font_path = self.get_font_path()
+        font_path = settings.FONT_PATH
         label_font = ImageFont.truetype(font_path, label_font_size)
 
         for i, bbox in enumerate(bboxes):