Skip to content

Commit

Permalink
Inline math fix
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 19, 2025
1 parent a1649ef commit 96ed9df
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 12 deletions.
5 changes: 1 addition & 4 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import os

from marker.services.gemini import GoogleGeminiService

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

import inspect
from collections import defaultdict
from functools import cache
from typing import Annotated, Any, Dict, List, Optional, Type, Tuple

from marker.processors import BaseProcessor
Expand Down Expand Up @@ -46,6 +42,7 @@
from marker.util import strings_to_classes
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
from marker.processors.order import OrderProcessor
from marker.services.gemini import GoogleGeminiService


class PdfConverter(BaseConverter):
Expand Down
17 changes: 17 additions & 0 deletions marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,28 @@ def __call__(self, document: Document):

block = document.get_block(equation_d["block_id"])
if isinstance(block, Equation):
prediction = self.inline_to_block(prediction)
block.html = prediction
else:
block.structure = []
add_math_spans_to_line(prediction, block, equation_d["page"])

def inline_to_block(self, latex: str):
    """Promote a lone ``<math>`` wrapper to a block-display math tag.

    The input is stripped of surrounding whitespace first; the stripped
    string is what gets returned in every case.

    The tag is rewritten only when all of the following hold:
      * the text contains exactly one ``<math`` opening,
      * the text begins with the bare ``<math>`` tag (no attributes),
      * the text ends with ``</math>``.

    Anything else (several math spans, surrounding text, or an opening
    tag that already carries attributes) is returned without changes.

    Args:
        latex: HTML-ish markup for an equation, as produced by the
            equation predictor.

    Returns:
        The stripped markup, with ``<math>`` replaced by
        ``<math display="block">`` when the conditions above are met.
    """
    stripped = latex.strip()

    # A single expression wrapped end-to-end by one attribute-less tag.
    is_single_wrapped = (
        stripped.count("<math") == 1
        and stripped.startswith("<math>")
        and stripped.endswith("</math>")
    )
    if not is_single_wrapped:
        return stripped

    return stripped.replace("<math>", '<math display="block">')


def get_batch_size(self):
if self.texify_batch_size is not None:
return self.texify_batch_size
Expand Down
19 changes: 12 additions & 7 deletions marker/processors/llm/llm_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
text_math_rewriting_prompt = r"""You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
The number of output lines MUST match the number of input lines. Stay as faithful to the original text as possible.
**Instructions:**
Expand All @@ -32,14 +33,13 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
2. Analyze the extracted lines.
3. For each extracted line, compare it to the corresponding line in the image.
4. Correct any errors in the extracted line, including:
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Use the `<math>` and `</math>` tags to surround inline math properly. Make sure the opening and closing tags appear in pairs, on the same line.
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
6. Ensure that inline math is properly formatted with inline math tags.
7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. There are exactly {line_count} input lines.
8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. The formatting
6. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. There are exactly {line_count} input lines.
7. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. Each line should be in HTML format. Only use the math, br, a, i, b, sup, sub, and span tags.
8. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
**Example:**
Expand Down Expand Up @@ -148,7 +148,12 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
return

corrected_lines = response["corrected_lines"]
if not corrected_lines or len(corrected_lines) != len(blocks):
balanced_math = all([line.count("<math") == line.count("</math>") for line in corrected_lines])
if any([
not corrected_lines,
len(corrected_lines) != len(blocks),
not balanced_math
]):
blocks[0].update_metadata(llm_error_count=1)
return

Expand Down
10 changes: 9 additions & 1 deletion marker/processors/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from bs4 import BeautifulSoup

from marker.schema import BlockTypes
Expand Down Expand Up @@ -49,9 +51,15 @@ def text_to_spans(text):
url = element.attrs.get('href') if hasattr(element, 'attrs') else None

if element.name in tag_types:
text = element.get_text()
if element.name == "math":
text = (text
.replace('\n', '\\n')
.replace('\t', '\\t')
.replace('\r', '\\r'))
spans.append({
'type': tag_types[element.name],
'content': element.get_text(),
'content': text,
'url': url
})
elif element.string:
Expand Down

0 comments on commit 96ed9df

Please sign in to comment.