Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add support for caret #96

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 53 additions & 19 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,13 @@
import pdfminer.settings
import pdfminer.utils

from .types import Page, Outline, AnnotationType, Annotation, Document, RGB
from .types import Page, Outline, AnnotationType, Annotation, Document, RGB, ANNOT_SUBTYPES, IGNORED_ANNOT_SUBTYPES
from .utils import cleanup_text, decode_datetime

pdfminer.settings.STRICT = False

logger = logging.getLogger('pdfannots')

ANNOT_SUBTYPES: typ.Dict[PSLiteral, AnnotationType] = {
PSLiteralTable.intern(e.name): e for e in AnnotationType}
"""Mapping from PSliteral to our own enumerant, for supported annotation types."""

IGNORED_ANNOT_SUBTYPES = \
frozenset(PSLiteralTable.intern(n) for n in (
'Link', # Links are used for internal document links (e.g. to other pages).
'Popup', # Controls the on-screen appearance of other annotations. TODO: we may want to
# check for an optional 'Contents' field for alternative human-readable contents.
))
"""Annotation types that we ignore without issuing a warning."""


def _mkannotation(
pa: typ.Dict[str, typ.Any],
Expand Down Expand Up @@ -85,7 +73,7 @@ def _mkannotation(
rect = pdftypes.resolve1(pa.get('Rect'))

# QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
# Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
# Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation.
quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))

author = pdftypes.resolve1(pa.get('T'))
Expand All @@ -103,8 +91,13 @@ def _mkannotation(
createds = pdfminer.utils.decode_text(createds)
created = decode_datetime(createds)

name = pdftypes.resolve1(pa.get('NM')).decode('utf-8')

in_reply_to = pdftypes.resolve1(pa.get('IRT'))

return Annotation(page, annot_type, quadpoints, rect,
contents, author=author, created=created, color=rgb)
contents, author=author, created=created, color=rgb,
name=name, in_reply_to=in_reply_to)


def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
Expand Down Expand Up @@ -261,7 +254,9 @@ def capture_char(self, text: str) -> None:
# Locate and remove the annotation's existing context subscription.
assert last_charseq != 0
i = bisect.bisect_left(self.context_subscribers, (last_charseq,))
assert 0 <= i < len(self.context_subscribers)
if not (0 <= i < len(self.context_subscribers)):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why/when does this happen?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was specific to one of the pdf files that I have. I haven't had a chance to debug it yet.

logger.warning("Annotation %s lost context subscription", a)
continue
(found_charseq, found_annot) = self.context_subscribers.pop(i)
assert found_charseq == last_charseq
assert found_annot is a
Expand Down Expand Up @@ -329,6 +324,47 @@ def render(self, item: LTItem) -> None:
self.capture_char(text)


def _irt_name(annot: Annotation) -> typ.Optional[str]:
    """Return the 'NM' name of the annotation that `annot` replies to, or None.

    None is returned when the annotation has no in-reply-to dictionary, or when
    that dictionary carries no 'NM' entry (rather than failing on the lookup).
    """
    irt = annot.in_reply_to
    if not irt:
        return None

    raw_name = irt.get('NM')
    if raw_name is None:
        return None

    # Decode the same way the `name` attribute is decoded in _mkannotation, so
    # that the two can be compared for equality.
    if isinstance(raw_name, bytes):
        return raw_name.decode('utf-8')
    return raw_name


def _find_and_modify_replace_annots(
        annots: typ.List[Annotation],
        outlines: typ.List[Outline]
) -> typ.Tuple[typ.List[Annotation], typ.List[Outline]]:
    """Post-process a page's annotations, folding reply targets into StrikeOuts.

    Every annotation gets its postprocess() hook called. For each StrikeOut
    annotation that is "in reply to" another named annotation, the contents of
    that target annotation are copied onto the StrikeOut, and the target is
    dropped from the returned list.

    Arguments:
        annots    Annotations of a single page
        outlines  Outlines of the same page

    Returns a tuple (annots, outlines) of new lists: the annotations without
    the merged reply targets, and the outlines that have a non-empty title.
    """
    # Maps the name of a reply target to the StrikeOut annotation replying to it.
    strikeout_by_target: typ.Dict[str, Annotation] = {}

    for a in annots:
        if a.subtype == AnnotationType.StrikeOut:
            target_name = _irt_name(a)
            if target_name:
                strikeout_by_target[target_name] = a

        # Give the annotation a chance to update its internals
        a.postprocess()

    remaining = []
    for a in annots:
        strikeout_a = strikeout_by_target.get(a.name)
        if strikeout_a is None:
            remaining.append(a)
            continue

        # This annotation is the target of a StrikeOut's reply: move its
        # contents onto the StrikeOut, and drop it from the output.
        # NOTE(review): the target is presumably always a Caret (replacement
        # text), but its subtype is not checked here -- confirm.
        strikeout_a.contents = a.contents

    titled_outlines = [o for o in outlines if o.title]

    return remaining, titled_outlines


def process_file(
file: typ.BinaryIO,
*, # Subsequent arguments are keyword-only
Expand Down Expand Up @@ -418,9 +454,7 @@ def emit_progress(msg: str) -> None:
page.annots.sort()
page.outlines.sort()

# Give the annotations a chance to update their internals
for a in page.annots:
a.postprocess()
page.annots, page.outlines = _find_and_modify_replace_annots(page.annots, page.outlines)

emit_progress("\n")

Expand Down
2 changes: 2 additions & 0 deletions pdfannots/printer/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def annot_to_dict(
"prior_outline": getattr(doc.nearest_outline(annot.pos), 'title', None),
"text": annot.gettext(remove_hyphens),
"contents": annot.contents,
"pre_context": annot.pre_context,
"post_context": annot.post_context,
"author": annot.author,
"created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None,
"color": ('#' + annot.color.ashex()) if annot.color else None
Expand Down
27 changes: 22 additions & 5 deletions pdfannots/printer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import typing as typ

from . import Printer
from ..types import RGB, AnnotationType, Pos, Annotation, Document
from ..types import RGB, AnnotationType, Pos, Annotation, Document, ANNOT_SUBTYPES

logger = logging.getLogger('pdfannots')

Expand Down Expand Up @@ -223,7 +223,7 @@ def format_annot(
if annot.contents else [])

if annot.has_context():
assert annot.subtype == AnnotationType.StrikeOut
assert annot.subtype == AnnotationType.StrikeOut or annot.subtype == AnnotationType.Caret
text = self.merge_strikeout_context(annot, text)

# we are either printing: item text and item contents, or one of the two
Expand Down Expand Up @@ -276,7 +276,8 @@ def emit_body(

class GroupedMarkdownPrinter(MarkdownPrinter):
ANNOT_NITS = frozenset({
AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline})
AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Caret,
AnnotationType.Underline})
ALL_SECTIONS = ["highlights", "comments", "nits"]

def __init__(
Expand Down Expand Up @@ -355,5 +356,21 @@ def fmt_header(name: str, level: int = 2) -> str:
if nits and secname == 'nits':
yield fmt_header("Nits")
for a in nits:
extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None
yield self.format_annot(a, document, extra)
extra = None

if a.subtype == AnnotationType.StrikeOut:
irt_type = None

if a.in_reply_to:
irt_subtype = a.in_reply_to.get('Subtype')
if irt_subtype:
irt_type = ANNOT_SUBTYPES[irt_subtype]

if a.contents and irt_type == AnnotationType.Caret:
extra = "suggested replacement"
else:
extra = "suggested deletion"
elif a.subtype == AnnotationType.Caret:
extra = "suggested insertion"

yield self.format_annot(a, document, extra)
35 changes: 30 additions & 5 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from pdfminer.layout import LTComponent, LTText
from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteralTable, PSLiteral

from .utils import merge_lines

Expand All @@ -33,6 +34,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float):
self.y0 = y0
self.y1 = y1

def __repr__(self) -> str:
return '<Box x0:%f x1:%f y0:%f y1:%f>' % (self.x0, self.x1, self.y0, self.y1)

@staticmethod
def from_item(item: LTComponent) -> Box:
"""Construct a Box from the bounding box of a given PDF component."""
Expand Down Expand Up @@ -260,6 +264,7 @@ class AnnotationType(enum.Enum):
Squiggly = enum.auto()
StrikeOut = enum.auto()
Underline = enum.auto()
Caret = enum.auto()

# A single rectangle, that is abused by some Apple tools to render custom
# highlights. We do not attempt to capture the affected text.
Expand All @@ -281,8 +286,10 @@ class Annotation(ObjectWithPos):
created Timestamp the annotation was created
color RGB color of the annotation
last_charseq Sequence number of the most recent character in text
name Unique identifier for the annotation
in_reply_to Name of the annotation this annotation is in reply to

Attributes updated only for StrikeOut annotations:
Attributes updated for StrikeOut and Caret annotations:
pre_context Text captured just prior to the beginning of 'text'
post_context Text captured just after the end of 'text'
"""
Expand All @@ -302,7 +309,9 @@ def __init__(
contents: typ.Optional[str] = None,
author: typ.Optional[str] = None,
created: typ.Optional[datetime.datetime] = None,
color: typ.Optional[RGB] = None):
color: typ.Optional[RGB] = None,
in_reply_to: typ.Optional[PDFObjRef] = None,
name: typ.Optional[str] = None):

# Construct boxes from quadpoints
boxes = []
Expand Down Expand Up @@ -334,12 +343,15 @@ def __init__(
self.post_context = None
self.boxes = boxes
self.last_charseq = 0
self.name = name
self.in_reply_to = in_reply_to

def __repr__(self) -> str:
return ('<Annotation %s %r%s%s>' %
return ('<Annotation %s %r%s%s%s>' %
(self.subtype.name, self.pos,
" '%s'" % self.contents[:10] if self.contents else '',
" '%s'" % ''.join(self.text[:10]) if self.text else ''))
" '%s'" % ''.join(self.text[:10]) if self.text else '',
" IRT" if self.in_reply_to else ''))

def capture(self, text: str, charseq: int = 0) -> None:
"""Capture text (while rendering the PDF page)."""
Expand All @@ -363,7 +375,7 @@ def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:

def wants_context(self) -> bool:
"""Returns true if this annotation type should include context."""
return self.subtype == AnnotationType.StrikeOut
return self.subtype == AnnotationType.StrikeOut or self.subtype == AnnotationType.Caret

def set_pre_context(self, pre_context: str) -> None:
assert self.pre_context is None
Expand Down Expand Up @@ -504,3 +516,16 @@ def ashex(self) -> str:

def __str__(self) -> str:
return f"RGB({self.ashex()})"


# Built once at import time: one entry per AnnotationType member, keyed by the
# interned literal for the member's name.
ANNOT_SUBTYPES: typ.Dict[PSLiteral, AnnotationType] = {
    PSLiteralTable.intern(subtype.name): subtype
    for subtype in AnnotationType
}
"""Mapping from PSliteral to our own enumerant, for supported annotation types."""

IGNORED_ANNOT_SUBTYPES = frozenset(
    map(PSLiteralTable.intern, (
        'Link',   # Links are used for internal document links (e.g. to other pages).
        'Popup',  # Controls the on-screen appearance of other annotations. TODO: we may want to
                  # check for an optional 'Contents' field for alternative human-readable contents.
    ))
)
"""Annotation types that we ignore without issuing a warning."""