Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #564

Merged
merged 4 commits into from
Feb 19, 2025
Merged

Dev #564

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import os

from marker.services.gemini import GoogleGeminiService

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

import inspect
from collections import defaultdict
from functools import cache
from typing import Annotated, Any, Dict, List, Optional, Type, Tuple

from marker.processors import BaseProcessor
Expand Down Expand Up @@ -46,6 +42,7 @@
from marker.util import strings_to_classes
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
from marker.processors.order import OrderProcessor
from marker.services.gemini import GoogleGeminiService


class PdfConverter(BaseConverter):
Expand Down
19 changes: 18 additions & 1 deletion marker/processors/equation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Annotated, List, Optional, Tuple

from marker.models import TexifyPredictor
from surya.texify import TexifyPredictor
from marker.processors import BaseProcessor
from marker.processors.util import add_math_spans_to_line
from marker.schema import BlockTypes
Expand Down Expand Up @@ -80,11 +80,28 @@ def __call__(self, document: Document):

block = document.get_block(equation_d["block_id"])
if isinstance(block, Equation):
prediction = self.inline_to_block(prediction)
block.html = prediction
else:
block.structure = []
add_math_spans_to_line(prediction, block, equation_d["page"])

def inline_to_block(self, latex: str):
    """
    Promote a lone ``<math>`` element to block display.

    The input is first stripped of surrounding whitespace. If the stripped
    text opens with ``<math>``, closes with ``</math>`` and contains exactly
    one ``<math`` occurrence, the opening tag is rewritten to
    ``<math display="block">``. Any other input is returned stripped but
    otherwise untouched.
    """
    stripped = latex.strip()

    # All three conditions must hold for the text to be a single math element.
    is_single_math = (
        stripped.count("<math") == 1
        and stripped.startswith("<math>")
        and stripped.endswith("</math>")
    )
    if not is_single_math:
        return stripped

    return stripped.replace("<math>", '<math display="block">')


def get_batch_size(self):
if self.texify_batch_size is not None:
return self.texify_batch_size
Expand Down
1 change: 1 addition & 0 deletions marker/processors/llm/llm_equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
if len(html_equation) < len(text) * .5:
block.update_metadata(llm_error_count=1)
return

block.html = html_equation

class EquationSchema(BaseModel):
Expand Down
19 changes: 12 additions & 7 deletions marker/processors/llm/llm_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
text_math_rewriting_prompt = r"""You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.

The number of output lines MUST match the number of input lines. Stay as faithful to the original text as possible.

**Instructions:**
Expand All @@ -32,14 +33,13 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
2. Analyze the extracted lines.
3. For each extracted line, compare it to the corresponding line in the image.
4. Correct any errors in the extracted line, including:
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Use the `<math>` and `</math>` tags to surround inline math properly. Make sure the opening and closing tags appear in pairs, on the same line.
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
6. Ensure that inline math is properly formatted with inline math tags.
7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. There are exactly {line_count} input lines.
8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. The formatting
6. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. There are exactly {line_count} input lines.
7. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. Each line should be in HTML format. Only use the math, br, a, i, b, sup, sub, and span tags.
8. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.

**Example:**

Expand Down Expand Up @@ -148,7 +148,12 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
return

corrected_lines = response["corrected_lines"]
if not corrected_lines or len(corrected_lines) != len(blocks):
balanced_math = all([line.count("<math") == line.count("</math>") for line in corrected_lines])
if any([
not corrected_lines,
len(corrected_lines) != len(blocks),
not balanced_math
]):
blocks[0].update_metadata(llm_error_count=1)
return

Expand Down
15 changes: 14 additions & 1 deletion marker/processors/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from bs4 import BeautifulSoup

from marker.schema import BlockTypes
Expand All @@ -6,6 +8,14 @@
from marker.schema.text import Line


def escape_latex_commands(text: str):
    r"""
    Replace newline, tab and carriage-return characters with the two-character
    sequences ``\n``, ``\t`` and ``\r`` (a backslash followed by a letter).

    NOTE(review): presumably this restores LaTeX commands such as ``\nabla``
    or ``\theta`` whose leading characters were decoded as control
    characters upstream -- confirm against the callers.
    """
    # One pass over the string; none of the replacement outputs contain a
    # character that is itself a key, so this matches chained replacements.
    control_to_literal = str.maketrans({
        "\n": "\\n",
        "\t": "\\t",
        "\r": "\\r",
    })
    return text.translate(control_to_literal)


def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
SpanClass = get_block_class(BlockTypes.Span)
corrected_spans = text_to_spans(corrected_text)
Expand Down Expand Up @@ -49,9 +59,12 @@ def text_to_spans(text):
url = element.attrs.get('href') if hasattr(element, 'attrs') else None

if element.name in tag_types:
text = element.get_text()
if element.name == "math":
text = escape_latex_commands(text)
spans.append({
'type': tag_types[element.name],
'content': element.get_text(),
'content': text,
'url': url
})
elif element.string:
Expand Down
2 changes: 1 addition & 1 deletion marker/services/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class BaseService:
timeout: Annotated[
int,
"The timeout to use for the service."
] = 15
] = 30
max_retries: Annotated[
int,
"The maximum number of retries to use for the service."
Expand Down
47 changes: 24 additions & 23 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "1.5.3"
version = "1.5.4"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <github@vikas.sh>"]
readme = "README.md"
Expand All @@ -26,7 +26,7 @@ torch = "^2.5.1"
tqdm = "^4.66.1"
ftfy = "^6.1.1"
rapidfuzz = "^3.8.1"
surya-ocr = "~0.11.1"
surya-ocr = "~0.12.0"
regex = "^2024.4.28"
pdftext = "~0.6.0"
markdownify = "^0.13.1"
Expand Down
Loading