Inline math fix

VikParuchuri · Feb 19, 2025 · 96ed9df · 96ed9df
1 parent a1649ef
commit 96ed9df
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 12 deletions.
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -1,12 +1,8 @@
 import os
 
-from marker.services.gemini import GoogleGeminiService
-
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disables a tokenizers warning
 
-import inspect
 from collections import defaultdict
-from functools import cache
 from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
 
 from marker.processors import BaseProcessor
@@ -46,6 +42,7 @@
 from marker.util import strings_to_classes
 from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
 from marker.processors.order import OrderProcessor
+from marker.services.gemini import GoogleGeminiService
 
 
 class PdfConverter(BaseConverter):

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -80,11 +80,28 @@ def __call__(self, document: Document):
 
             block = document.get_block(equation_d["block_id"])
             if isinstance(block, Equation):
+                prediction = self.inline_to_block(prediction)
                 block.html = prediction
             else:
                 block.structure = []
                 add_math_spans_to_line(prediction, block, equation_d["page"])
 
+    def inline_to_block(self, latex: str):
+        """Promote a lone ``<math>...</math>`` wrapper to a block-display math tag.
+
+        The input is stripped of surrounding whitespace first. If the stripped
+        string contains exactly one ``<math`` opening, begins with the bare
+        ``<math>`` tag (no attributes) and ends with ``</math>``, the opening
+        tag is rewritten to ``<math display="block">``. In every other case the
+        stripped string is returned without further changes.
+
+        Args:
+            latex: Markup string for a single equation prediction.
+
+        Returns:
+            The stripped string, with the opening tag switched to block display
+            when the conditions above hold.
+        """
+        latex = latex.strip()
+        # Counting "<math" (without ">") also catches opening tags that carry attributes.
+        math_count = latex.count("<math")
+        math_start = latex.startswith("<math>")
+        math_end = latex.endswith("</math>")
+        # Bail out unless the whole string is one plain <math>...</math> element.
+        if any([
+            math_count != 1,
+            not math_start,
+            not math_end
+        ]):
+            return latex
+
+        # Only one "<math" exists and it is the leading "<math>", so this touches just that tag.
+        latex = latex.replace("<math>", '<math display="block">')
+        return latex
+
+
     def get_batch_size(self):
         if self.texify_batch_size is not None:
             return self.texify_batch_size

diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py
@@ -24,6 +24,7 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
     text_math_rewriting_prompt = r"""You are a text correction expert specializing in accurately reproducing text from images.
 You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
 Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
+
 The number of output lines MUST match the number of input lines.  Stay as faithful to the original text as possible.
 
 **Instructions:**
@@ -32,14 +33,13 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
 2. Analyze the extracted lines.
 3. For each extracted line, compare it to the corresponding line in the image.
 4. Correct any errors in the extracted line, including:
-    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
+    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Use the `<math>` and `</math>` tags to surround inline math properly.  Make sure the opening and closing tags appear in pairs, on the same line.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
-5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
-6. Ensure that inline math is properly formatted with inline math tags.
-7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.  There are exactly {line_count} input lines.
-8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
-9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
+5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.  The formatting 
+6. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.  There are exactly {line_count} input lines.
+7. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.  Each line should be in HTML format. Only use the math, br, a, i, b, sup, sub, and span tags.
+8. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
 
 **Example:**
 
@@ -148,7 +148,12 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
             return
 
         corrected_lines = response["corrected_lines"]
-        if not corrected_lines or len(corrected_lines) != len(blocks):
+        balanced_math = all([line.count("<math") == line.count("</math>") for line in corrected_lines])
+        if any([
+            not corrected_lines,
+            len(corrected_lines) != len(blocks),
+            not balanced_math
+        ]):
             blocks[0].update_metadata(llm_error_count=1)
             return
 

diff --git a/marker/processors/util.py b/marker/processors/util.py
@@ -1,3 +1,5 @@
+import re
+
 from bs4 import BeautifulSoup
 
 from marker.schema import BlockTypes
@@ -49,9 +51,15 @@ def text_to_spans(text):
         url = element.attrs.get('href') if hasattr(element, 'attrs') else None
 
         if element.name in tag_types:
+            text = element.get_text()
+            if element.name == "math":
+                text = (text
+                        .replace('\n', '\\n')
+                        .replace('\t', '\\t')
+                        .replace('\r', '\\r'))
             spans.append({
                 'type': tag_types[element.name],
-                'content': element.get_text(),
+                'content': text,
                 'url': url
             })
         elif element.string: