Dev #573

Merged: 58 commits, Feb 28, 2025

Commits
5a1110d  add document provider (docx etc) (iammosespaulr, Jan 21, 2025)
c22edc4  Merge remote-tracking branch 'origin/dev' into dev-mose/input-formats-2 (iammosespaulr, Jan 22, 2025)
f296cc4  fix document styles, cleanup and add working spreadsheets (iammosespaulr, Jan 22, 2025)
19ae1c0  Merge remote-tracking branch 'origin/dev' into dev-mose/input-formats-2 (iammosespaulr, Jan 22, 2025)
d5a844a  add powerpoint (iammosespaulr, Jan 23, 2025)
5795994  fix powerpoint temp file (iammosespaulr, Jan 23, 2025)
0a36932  Merge remote-tracking branch 'origin/dev' into dev-mose/input-formats-2 (iammosespaulr, Jan 23, 2025)
c1182e6  update registry [skip ci] (iammosespaulr, Jan 23, 2025)
86e8fa1  Merge remote-tracking branch 'origin/dev' into dev-mose/input-formats-2 (iammosespaulr, Jan 24, 2025)
18d90bc  Merge remote-tracking branch 'origin/master' into dev-mose/input-form… (iammosespaulr, Jan 24, 2025)
3e3dfac  Merge remote-tracking branch 'origin/dev' into dev-mose/input-formats-2 (iammosespaulr, Feb 5, 2025)
f29c225  update poetry lock [skip ci] (iammosespaulr, Feb 5, 2025)
4d105da  add epub and html support (iammosespaulr, Feb 6, 2025)
2f71b20  simplify tempfiles (iammosespaulr, Feb 6, 2025)
e46155a  Add some tests (VikParuchuri, Feb 20, 2025)
d142598  Iterate on inline math (VikParuchuri, Feb 20, 2025)
a3932ed  handle line merging for inline math better (VikParuchuri, Feb 21, 2025)
87670d7  Update inline math line merges (VikParuchuri, Feb 21, 2025)
84bbd99  Misc fixes (VikParuchuri, Feb 21, 2025)
8ebafc9  Fix JSON parsing (VikParuchuri, Feb 21, 2025)
d07122d  Test line merging (VikParuchuri, Feb 21, 2025)
e1cf77a  Merge pull request #571 from VikParuchuri/inline_math (VikParuchuri, Feb 21, 2025)
82e632f  Fix HTML (VikParuchuri, Feb 21, 2025)
35f4bd8  Add more tests (VikParuchuri, Feb 21, 2025)
631a7f9  Merge dev (VikParuchuri, Feb 21, 2025)
3a35265  Merge pull request #501 from VikParuchuri/dev-mose/input-formats-2 (VikParuchuri, Feb 21, 2025)
1d46f0e  Fix tests (VikParuchuri, Feb 21, 2025)
ab5f396  Merge branch 'dev-mose/input-formats-2' into dev (VikParuchuri, Feb 21, 2025)
72863e2  Fix utf-8 encoding for JSON config files (vicenciomf2, Feb 22, 2025)
bfc960e  Merge pull request #574 from vicenciomf2/master (VikParuchuri, Feb 22, 2025)
237325f  Inline math updates (VikParuchuri, Feb 24, 2025)
6d7d95e  Bump README (VikParuchuri, Feb 24, 2025)
9d91954  Test fixes (VikParuchuri, Feb 24, 2025)
6e0afa2  Bump version (VikParuchuri, Feb 24, 2025)
118c12b  Make merges simpler (VikParuchuri, Feb 24, 2025)
c2dd7e1  Adjust merging (VikParuchuri, Feb 24, 2025)
0c8fc1f  Improve streamlit app (VikParuchuri, Feb 25, 2025)
f2ddfcd  Improve rotated tables (VikParuchuri, Feb 25, 2025)
d8a6a1e  Inline math fix (VikParuchuri, Feb 25, 2025)
991bb17  Additional inline math fixes (VikParuchuri, Feb 25, 2025)
97176d7  Superscripts (VikParuchuri, Feb 25, 2025)
beaad47  Update inline math (VikParuchuri, Feb 26, 2025)
790b244  Fixes to processor (VikParuchuri, Feb 26, 2025)
f8a3c8c  Add superscripts (VikParuchuri, Feb 26, 2025)
e2a2861  Fix superscripts (VikParuchuri, Feb 26, 2025)
ce2ebfa  Update subscripts (VikParuchuri, Feb 26, 2025)
92c395f  Fix superscripts with llm mode (VikParuchuri, Feb 26, 2025)
c5d433a  Bump pdftext (VikParuchuri, Feb 26, 2025)
b100408  Update llm services (VikParuchuri, Feb 26, 2025)
4962303  Minor benchmark updates (VikParuchuri, Feb 28, 2025)
4c29e75  Enable config in CLI app (VikParuchuri, Feb 28, 2025)
65e91d1  Iterate on llm processors (VikParuchuri, Feb 28, 2025)
9f5e5f7  Make weasyprint optional (VikParuchuri, Feb 28, 2025)
ab354b0  Ensure math can wrap properly (VikParuchuri, Feb 28, 2025)
d16455f  Bump surya version (VikParuchuri, Feb 28, 2025)
aad174b  Skip some lines (VikParuchuri, Feb 28, 2025)
4b2c1a4  Update surya (VikParuchuri, Feb 28, 2025)
b586f78  Bump pdftext, fix tests (VikParuchuri, Feb 28, 2025)
Files changed
2 changes: 1 addition & 1 deletion .github/workflows/benchmarks.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install python dependencies
         run: |
           pip install poetry
-          poetry install
+          poetry install --extras "full"
       - name: Run benchmark test
         run: |
           poetry run python benchmarks/overall/overall.py --max_rows 5
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
       - name: Install python dependencies
         run: |
           pip install poetry
-          poetry install
+          poetry install --extras "full"
      - name: Run tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -15,7 +15,7 @@ jobs:
       - name: Install python dependencies
         run: |
           pip install poetry
-          poetry install
+          poetry install --extras "full"
       - name: Build package
         run: |
           poetry build
2 changes: 1 addition & 1 deletion .github/workflows/scripts.yml
@@ -14,7 +14,7 @@ jobs:
       - name: Install python dependencies
         run: |
           pip install poetry
-          poetry install
+          poetry install --extras "full"
       - name: Download benchmark data
         run: |
           wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
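All four workflows switch from a plain `poetry install` to installing the new optional dependency group. For readers unfamiliar with poetry extras, the group is declared in `pyproject.toml` roughly like this (an illustrative sketch; the exact package list lives in the repo):

```toml
[tool.poetry.extras]
# Optional dependencies for non-PDF input formats (package names illustrative)
full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"]
```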
12 changes: 10 additions & 2 deletions README.md
@@ -1,8 +1,8 @@
 # Marker
 
-Marker converts PDFs and images to markdown, JSON, and HTML quickly and accurately.
+Marker converts documents to markdown, JSON, and HTML quickly and accurately.
 
-- Supports a range of documents in all languages
+- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
 - Formats tables, forms, equations, inline math, links, references, and code blocks
 - Extracts and saves images
 - Removes headers/footers/other artifacts
@@ -66,6 +66,12 @@ Install with:
 pip install marker-pdf
 ```
 
+If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:
+
+```shell
+pip install marker-pdf[full]
+```
+
 # Usage
 
 First, some configuration:
@@ -95,6 +101,7 @@ Options:
 - `--output_format [markdown|json|html]`: Specify the format for the output results.
 - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
 - `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
+- `--redo_inline_math`: If you want the highest quality inline math conversion, use this along with `--use_llm`.
 - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
 - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
 - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
@@ -320,6 +327,7 @@ When running with the `--use_llm` flag, you have a choice of services you can use:
 - `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration.
 - `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
 - `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
+- `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`.
 
 These services may have additional optional configuration as well - you can see it by viewing the classes.
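Putting the new README options together, a typical high-quality inline-math invocation might look like this (a sketch using marker's single-file CLI entry point, with the flags documented above):

```shell
marker_single document.pdf --output_format markdown --use_llm --redo_inline_math
```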
7 changes: 6 additions & 1 deletion benchmarks/overall/display/dataset.py
@@ -29,7 +29,12 @@ def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_typ
 
         method_cls = METHOD_REGISTRY[method]()
         md = result["markdown"][idx][method]
-        method_img = method_cls.render(result["markdown"][idx][method])
+        try:
+            method_img = method_cls.render(result["markdown"][idx][method])
+        except Exception as e:
+            # This can happen when the markdown is None
+            method_img = PIL.Image.new("RGB", (200, 200))
+
 
         row[f"{method}_md"] = md
         row[f"{method}_img"] = method_img
75 changes: 35 additions & 40 deletions benchmarks/overall/elo.py
@@ -1,9 +1,12 @@
 import json
 import random
 import time
+import os
 from dataclasses import dataclass
 from typing import List, Dict, Tuple, Literal
 from PIL import Image
+from collections import defaultdict
+import tabulate
 
 import click
 import datasets
@@ -48,7 +51,7 @@
 
 Notes on scoring:
 - Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
-- Bad markdown will have major missing text segments from the markdown or completely unreadable formatting.
+- Bad markdown will have major missing text segments from the markdown or completely unreadable formatting. It may also have key values that are different from the values in the image.
 
 Output json, like in the example below.
 
@@ -63,15 +66,15 @@
 ```markdown
 # Section 1
 This is some markdown extracted from a document. Here is a block equation:
-$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
+$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 124}{t}$$
 ```
 Output
 ```json
 {
     "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
     "version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
-    "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation. The formatting in version b is slightly different from the image.",
-    "comparison": "Version A is better than version B. The text and formatting in version A matches the image better than version B.",
+    "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation. The formatting in version b is slightly different from the image. The value 124 is also different from the image.",
+    "comparison": "Version A is better than version B. The text and formatting in version A matches the image better than version B. Version B also has an incorrect value.",
     "winner": "version_a",
 }
 ```
@@ -105,6 +108,11 @@ def __call__(
         version_a: str,
         version_b: str
     ) -> str | None:
+        if version_a is None and version_b is not None:
+            return "version_b"
+        elif version_b is None and version_a is not None:
+            return "version_a"
+
         hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b)
         try:
             rating = self.llm_rater(img, hydrated_prompt)
@@ -128,12 +136,14 @@ def llm_response_wrapper(
     response_schema,
 ):
     client = genai.Client(
-        api_key=settings.GOOGLE_API_KEY,
-        http_options={"timeout": 60000}
+        http_options={"timeout": 60000},
+        vertexai=True,
+        project=os.getenv("VERTEX_PROJECT_ID"),
+        location=os.getenv("VERTEX_LOCATION"),
     )
     try:
         responses = client.models.generate_content(
-            model="gemini-2.0-flash",
+            model="gemini-2.0-flash-001",
             contents=prompt,
             config={
                 "temperature": 0,
@@ -150,35 +160,19 @@ def llm_response_wrapper(
         print(f"Error: {e}")
         return
 
-@dataclass
-class Method:
-    name: str
-    rating: float = 1500
-    k_factor: float = 32
-
-
-class EloSystem:
-    def __init__(self, player_names: List[str]):
-        self.methods = {name: Method(name) for name in player_names}
-
-    def expected_score(self, rating_a: float, rating_b: float) -> float:
-        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
-
-    def update_ratings(self, winner: str, loser: str) -> Tuple[float, float]:
-        method_a = self.methods[winner]
-        method_b = self.methods[loser]
-
-        expected_a = self.expected_score(method_a.rating, method_b.rating)
-        expected_b = self.expected_score(method_b.rating, method_a.rating)
-
-        # Winner gets score of 1, loser gets 0
-        method_a.rating += method_a.k_factor * (1 - expected_a)
-        method_b.rating += method_b.k_factor * (0 - expected_b)
-
-        return method_a.rating, method_b.rating
+def display_win_rates_table(win_rates: dict):
+    table = []
+    headers = ["Method A", "Method B", "Wins", "Losses", "Win %"]
+    for method_a, method_b_dict in win_rates.items():
+        row = [method_a]
+        for method_b, results in method_b_dict.items():
+            row = [method_a, method_b, results["win"], results["loss"], (results["win"] / (results["win"] + results["loss"])) * 100]
+            table.append(row)
+    print(tabulate.tabulate(table, headers=headers, tablefmt="pretty"))
 
 
-@click.command("Calculate ELO scores for document conversion methods")
+@click.command("Calculate win rates for document conversion methods")
 @click.argument("dataset", type=str)
 @click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix")
 @click.option("--row_samples", type=int, default=2, help="Number of samples per row")
@@ -191,10 +185,10 @@ def main(
 ):
     ds = datasets.load_dataset(dataset, split="train")
     method_lst = methods.split(",")
-    elo = EloSystem(method_lst)
+    win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst}
     comparer = Comparer()
 
-    for i in tqdm(range(min(len(ds), max_rows)), desc="Calculating ELO"):
+    for i in tqdm(range(min(len(ds), max_rows)), desc="Calculating win rates..."):
        row = ds[i]
        # Avoid any bias in ordering
        random.shuffle(method_lst)
@@ -211,14 +205,15 @@ def main(
             continue
 
         if winner == "version_a":
-            elo.update_ratings(method_a, method_b)
+            win_rates[method_a][method_b]["win"] += 1
+            win_rates[method_b][method_a]["loss"] += 1
         else:
-            elo.update_ratings(method_b, method_a)
+            win_rates[method_b][method_a]["win"] += 1
+            win_rates[method_a][method_b]["loss"] += 1
         if i % 10 == 0:
-            print(elo.methods)
+            display_win_rates_table(win_rates)
 
-    # Print out ratings
-    print(elo.methods)
+    display_win_rates_table(win_rates)
 
 
 if __name__ == "__main__":
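To make the new bookkeeping concrete, here is a minimal sketch (hypothetical method names, not part of the diff) of how the nested defaultdict accumulates pairwise wins and losses:

```python
from collections import defaultdict

methods = ["marker", "mathpix"]  # hypothetical method names
win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in methods}

# Record one judged comparison where "marker" beats "mathpix"
win_rates["marker"]["mathpix"]["win"] += 1
win_rates["mathpix"]["marker"]["loss"] += 1

results = win_rates["marker"]["mathpix"]
print(results["win"] / (results["win"] + results["loss"]) * 100)  # 100.0
```

Unlike the removed Elo ratings, these pairwise win percentages don't depend on match ordering and are directly interpretable, at the cost of not producing a single global ranking.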
14 changes: 13 additions & 1 deletion benchmarks/overall/methods/marker.py
@@ -1,7 +1,9 @@
+import os
 import tempfile
 import time
 
 from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+from marker.config.parser import ConfigParser
 from marker.converters.pdf import PdfConverter
 
 
@@ -11,9 +13,19 @@ class MarkerMethod(BaseMethod):
 
     def __call__(self, sample) -> BenchmarkResult:
         pdf_bytes = sample["pdf"]  # This is a single page PDF
+        parser = ConfigParser({
+            "page_range": "0",
+            "disable_tqdm": True,
+            "use_llm": self.use_llm,
+            "redo_inline_math": self.use_llm,
+            "llm_service": "marker.services.vertex.GoogleVertexService",
+            "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
+        })
+
         block_converter = PdfConverter(
             artifact_dict=self.model_dict,
-            config={"page_range": [0], "disable_tqdm": True, "use_llm": self.use_llm}
+            config=parser.generate_config_dict(),
+            llm_service=parser.get_llm_service()
         )
 
         with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
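For context, the ConfigParser pattern in this diff mirrors marker's normal Python API. A minimal standalone sketch (assuming `create_model_dict` from `marker.models`, with no LLM service configured and a hypothetical input path):

```python
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

# Same pattern as the benchmark: build the config through ConfigParser
parser = ConfigParser({"page_range": "0", "disable_tqdm": True})

converter = PdfConverter(
    artifact_dict=create_model_dict(),  # loads the marker/surya models
    config=parser.generate_config_dict(),
)
rendered = converter("input.pdf")  # hypothetical input file
```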
91 changes: 91 additions & 0 deletions benchmarks/overall/methods/olmocr.py
@@ -0,0 +1,91 @@
import base64
import json
import tempfile
import time
from io import BytesIO

import torch
from PIL import Image

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


def convert_single_page(filename: str, model, processor, device):
    from olmocr.data.renderpdf import render_pdf_to_base64png
    from olmocr.prompts import build_finetuning_prompt
    from olmocr.prompts.anchor import get_anchor_text

    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)

    # Build the prompt, using document metadata
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Build the full prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template and processor
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    # Generate the output
    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=8192,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode the output
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(
        new_tokens, skip_special_tokens=True
    )[0]

    try:
        text_output = json.loads(text_output)
        text = text_output["natural_text"]
    except Exception:
        try:
            text = text_output.split("natural_text")[1].strip()
        except Exception:
            text = ""

    return text


class OlmOCRMethod(BaseMethod):
    olmocr_model: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        pdf_bytes = sample["pdf"]  # This is a single page PDF

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            start = time.time()
            result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device)
            total = time.time() - start

        return {
            "markdown": result,
            "time": total
        }
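The new benchmark class expects `olmocr_model` to be a dict holding a loaded model and processor. A plausible loader looks like this (a sketch; the checkpoint names are assumptions based on the public olmOCR release and are not part of this PR):

```python
import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

def load_olmocr() -> dict:
    # olmOCR is a Qwen2-VL fine-tune; checkpoint names are assumed here
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
    ).eval()
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    if torch.cuda.is_available():
        model = model.to("cuda")
    return {"model": model, "processor": processor}
```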