VikParuchuri · Feb 19, 2025
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -1,12 +1,8 @@
 import os
 
-from marker.services.gemini import GoogleGeminiService
-
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disables a tokenizers warning
 
-import inspect
 from collections import defaultdict
-from functools import cache
 from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
 
 from marker.processors import BaseProcessor
@@ -46,6 +42,7 @@
 from marker.util import strings_to_classes
 from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
 from marker.processors.order import OrderProcessor
+from marker.services.gemini import GoogleGeminiService
 
 
 class PdfConverter(BaseConverter):

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -1,6 +1,6 @@
 from typing import Annotated, List, Optional, Tuple
 
-from marker.models import TexifyPredictor
+from surya.texify import TexifyPredictor
 from marker.processors import BaseProcessor
 from marker.processors.util import add_math_spans_to_line
 from marker.schema import BlockTypes
@@ -80,11 +80,28 @@ def __call__(self, document: Document):
 
             block = document.get_block(equation_d["block_id"])
             if isinstance(block, Equation):
+                prediction = self.inline_to_block(prediction)
                 block.html = prediction
             else:
                 block.structure = []
                 add_math_spans_to_line(prediction, block, equation_d["page"])
 
+    def inline_to_block(self, latex: str):
+        """Promote a lone inline <math> element to display="block"; any other input is returned stripped but otherwise unchanged."""
+        stripped = latex.strip()
+
+        # Only rewrite when the whole string is exactly one <math>...</math> element with a bare opening tag.
+        is_single_math = (
+            stripped.count("<math") == 1
+            and stripped.startswith("<math>")
+            and stripped.endswith("</math>")
+        )
+        if not is_single_math:
+            return stripped
+
+        return stripped.replace("<math>", '<math display="block">')
+
+
     def get_batch_size(self):
         if self.texify_batch_size is not None:
             return self.texify_batch_size

diff --git a/marker/processors/llm/llm_equation.py b/marker/processors/llm/llm_equation.py
@@ -101,6 +101,7 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
         if len(html_equation) < len(text) * .5:
             block.update_metadata(llm_error_count=1)
             return
+
         block.html = html_equation
 
 class EquationSchema(BaseModel):

diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py
@@ -24,6 +24,7 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
     text_math_rewriting_prompt = r"""You are a text correction expert specializing in accurately reproducing text from images.
 You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
 Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
+
 The number of output lines MUST match the number of input lines.  Stay as faithful to the original text as possible.
 
 **Instructions:**
@@ -32,14 +33,13 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
 2. Analyze the extracted lines.
 3. For each extracted line, compare it to the corresponding line in the image.
 4. Correct any errors in the extracted line, including:
-    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
+    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Use the `<math>` and `</math>` tags to surround inline math properly.  Make sure the opening and closing tags appear in pairs, on the same line.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
-5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
-6. Ensure that inline math is properly formatted with inline math tags.
-7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.  There are exactly {line_count} input lines.
-8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
-9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
+5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.  The formatting 
+6. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.  There are exactly {line_count} input lines.
+7. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.  Each line should be in HTML format. Only use the math, br, a, i, b, sup, sub, and span tags.
+8. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
 
 **Example:**
 
@@ -148,7 +148,12 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
             return
 
         corrected_lines = response["corrected_lines"]
-        if not corrected_lines or len(corrected_lines) != len(blocks):
+        balanced_math = all([line.count("<math") == line.count("</math>") for line in corrected_lines])
+        if any([
+            not corrected_lines,
+            len(corrected_lines) != len(blocks),
+            not balanced_math
+        ]):
             blocks[0].update_metadata(llm_error_count=1)
             return
 

diff --git a/marker/processors/util.py b/marker/processors/util.py
@@ -1,3 +1,5 @@
+import re
+
 from bs4 import BeautifulSoup
 
 from marker.schema import BlockTypes
@@ -6,6 +8,14 @@
 from marker.schema.text import Line
 
 
+def escape_latex_commands(text: str) -> str:
+    r"""Return text with newline, tab, carriage return, form feed and backspace characters spelled as \n, \t, \r, \f, \b."""
+    # NOTE(review): presumably these show up when LaTeX such as \nabla, \theta, \frac or \beta was escape-decoded upstream - confirm.
+    control_to_escape = {'\n': '\\n', '\t': '\\t', '\r': '\\r', '\f': '\\f', '\b': '\\b'}
+    # Single pass: no replacement contains a control character, so already-mapped output is never rewritten again.
+    return text.translate(str.maketrans(control_to_escape))
+
+
 def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
     SpanClass = get_block_class(BlockTypes.Span)
     corrected_spans = text_to_spans(corrected_text)
@@ -49,9 +59,12 @@ def text_to_spans(text):
         url = element.attrs.get('href') if hasattr(element, 'attrs') else None
 
         if element.name in tag_types:
+            text = element.get_text()
+            if element.name == "math":
+                text = escape_latex_commands(text)
             spans.append({
                 'type': tag_types[element.name],
-                'content': element.get_text(),
+                'content': text,
                 'url': url
             })
         elif element.string:

diff --git a/marker/services/__init__.py b/marker/services/__init__.py
@@ -11,7 +11,7 @@ class BaseService:
     timeout: Annotated[
         int,
         "The timeout to use for the service."
-    ] = 15
+    ] = 30
     max_retries: Annotated[
         int,
         "The maximum number of retries to use for the service."

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.5.3"
+version = "1.5.4"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -26,7 +26,7 @@ torch = "^2.5.1"
 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "~0.11.1"
+surya-ocr = "~0.12.0"
 regex = "^2024.4.28"
 pdftext = "~0.6.0"
 markdownify = "^0.13.1"