0xabu · suyashmahar · Jun 26, 2024 · Jun 26, 2024 · 0xabu · Jun 26, 2024
diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
@@ -22,25 +22,13 @@
 import pdfminer.settings
 import pdfminer.utils
 
-from .types import Page, Outline, AnnotationType, Annotation, Document, RGB
+from .types import Page, Outline, AnnotationType, Annotation, Document, RGB, ANNOT_SUBTYPES, IGNORED_ANNOT_SUBTYPES
 from .utils import cleanup_text, decode_datetime
 
 pdfminer.settings.STRICT = False
 
 logger = logging.getLogger('pdfannots')
 
-ANNOT_SUBTYPES: typ.Dict[PSLiteral, AnnotationType] = {
-    PSLiteralTable.intern(e.name): e for e in AnnotationType}
-"""Mapping from PSliteral to our own enumerant, for supported annotation types."""
-
-IGNORED_ANNOT_SUBTYPES = \
-    frozenset(PSLiteralTable.intern(n) for n in (
-        'Link',   # Links are used for internal document links (e.g. to other pages).
-        'Popup',  # Controls the on-screen appearance of other annotations. TODO: we may want to
-                  # check for an optional 'Contents' field for alternative human-readable contents.
-    ))
-"""Annotation types that we ignore without issuing a warning."""
-
 
 def _mkannotation(
     pa: typ.Dict[str, typ.Any],
@@ -85,7 +73,7 @@ def _mkannotation(
     rect = pdftypes.resolve1(pa.get('Rect'))
 
     # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
-    # Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
+    # Squiggly), where they give the quadrilaterals (boxes) covered; a Caret has only a Rect.
     quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))
 
     author = pdftypes.resolve1(pa.get('T'))
@@ -103,8 +91,13 @@ def _mkannotation(
         createds = pdfminer.utils.decode_text(createds)
         created = decode_datetime(createds)
 
+    name = pdftypes.resolve1(pa.get('NM'))  # optional entry: None when the PDF omits it
+    name = name.decode('utf-8') if isinstance(name, bytes) else None
+    in_reply_to = pdftypes.resolve1(pa.get('IRT'))
+
     return Annotation(page, annot_type, quadpoints, rect,
-                      contents, author=author, created=created, color=rgb)
+                      contents, author=author, created=created, color=rgb,
+                      name=name, in_reply_to=in_reply_to)
 
 
 def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
@@ -261,7 +254,9 @@ def capture_char(self, text: str) -> None:
                     # Locate and remove the annotation's existing context subscription.
                     assert last_charseq != 0
                     i = bisect.bisect_left(self.context_subscribers, (last_charseq,))
-                    assert 0 <= i < len(self.context_subscribers)
+                    if not (0 <= i < len(self.context_subscribers)):
+                        logger.warning("Annotation %s lost context subscription", a)
+                        continue
                     (found_charseq, found_annot) = self.context_subscribers.pop(i)
                     assert found_charseq == last_charseq
                     assert found_annot is a
@@ -329,6 +324,47 @@ def render(self, item: LTItem) -> None:
                 self.capture_char(text)
 
 
+def _find_and_modify_replace_annots(
+    annots: typ.List[Annotation],
+    outlines: typ.List[Outline],
+) -> typ.Tuple[typ.List[Annotation], typ.List[Outline]]:
+    """Post-process one page's annotations, merging "replace text" annotation pairs.
+
+    A replacement is stored as a Caret annotation holding the new text, plus a StrikeOut
+    annotation whose IRT (in-reply-to) entry refers to that Caret. The Caret's contents
+    are copied onto the StrikeOut, and the Caret itself is dropped from the result.
+
+    Calls postprocess() on every annotation. Returns new lists: the annotations without
+    the merged Carets, and the outlines that have a non-empty title.
+    """
+    # Maps the name (NM) of each replied-to annotation to the StrikeOut replying to it.
+    strikeouts_by_target: typ.Dict[str, Annotation] = {}
+
+    for a in annots:
+        irt = a.in_reply_to
+        if a.subtype == AnnotationType.StrikeOut and isinstance(irt, dict):
+            # NM is an optional entry, so the replied-to annotation may be unnamed.
+            target_name = pdftypes.resolve1(irt.get('NM'))
+            if isinstance(target_name, bytes) and target_name:
+                strikeouts_by_target[target_name.decode('utf-8')] = a
+
+        # Give the annotations a chance to update their internals
+        a.postprocess()
+
+    kept = []
+    for a in annots:
+        strikeout = strikeouts_by_target.get(a.name) if a.name else None
+        if strikeout is not None and a.subtype == AnnotationType.Caret:
+            # Fold the Caret's replacement text into its StrikeOut, and drop the Caret.
+            strikeout.contents = a.contents
+        else:
+            kept.append(a)
+
+    # Outlines with an empty title are dropped, as before.
+    remaining_outlines = [o for o in outlines if o.title]
+    return kept, remaining_outlines
+
+
 def process_file(
     file: typ.BinaryIO,
     *,  # Subsequent arguments are keyword-only
@@ -418,9 +454,7 @@ def emit_progress(msg: str) -> None:
         page.annots.sort()
         page.outlines.sort()
 
-        # Give the annotations a chance to update their internals
-        for a in page.annots:
-            a.postprocess()
+        page.annots, page.outlines = _find_and_modify_replace_annots(page.annots, page.outlines)
 
     emit_progress("\n")
 

diff --git a/pdfannots/printer/json.py b/pdfannots/printer/json.py
@@ -21,6 +21,8 @@ def annot_to_dict(
         "prior_outline": getattr(doc.nearest_outline(annot.pos), 'title', None),
         "text": annot.gettext(remove_hyphens),
         "contents": annot.contents,
+        "pre_context": annot.pre_context,
+        "post_context": annot.post_context,
         "author": annot.author,
         "created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None,
         "color": ('#' + annot.color.ashex()) if annot.color else None

diff --git a/pdfannots/printer/markdown.py b/pdfannots/printer/markdown.py
@@ -4,7 +4,7 @@
 import typing as typ
 
 from . import Printer
-from ..types import RGB, AnnotationType, Pos, Annotation, Document
+from ..types import RGB, AnnotationType, Pos, Annotation, Document, ANNOT_SUBTYPES
 
 logger = logging.getLogger('pdfannots')
 
@@ -223,7 +223,7 @@ def format_annot(
                    if annot.contents else [])
 
         if annot.has_context():
-            assert annot.subtype == AnnotationType.StrikeOut
+            assert annot.subtype == AnnotationType.StrikeOut or annot.subtype == AnnotationType.Caret
             text = self.merge_strikeout_context(annot, text)
 
         # we are either printing: item text and item contents, or one of the two
@@ -276,7 +276,8 @@ def emit_body(
 
 class GroupedMarkdownPrinter(MarkdownPrinter):
     ANNOT_NITS = frozenset({
-        AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline})
+        AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Caret,
+        AnnotationType.Underline})
     ALL_SECTIONS = ["highlights", "comments", "nits"]
 
     def __init__(
@@ -355,5 +356,21 @@ def fmt_header(name: str, level: int = 2) -> str:
             if nits and secname == 'nits':
                 yield fmt_header("Nits")
                 for a in nits:
-                    extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None
-                    yield self.format_annot(a, document, extra)
+                    extra = None
+
+                    if a.subtype == AnnotationType.StrikeOut:
+                        # Find the type of the annotation this one replies to, if any. The
+                        # map only covers supported subtypes, so look it up with get().
+                        irt_type = None
+                        if a.in_reply_to:
+                            irt_type = ANNOT_SUBTYPES.get(a.in_reply_to.get('Subtype'))
+
+                        # A StrikeOut with contents that replies to a Caret is a replacement.
+                        if a.contents and irt_type == AnnotationType.Caret:
+                            extra = "suggested replacement"
+                        else:
+                            extra = "suggested deletion"
+                    elif a.subtype == AnnotationType.Caret:
+                        extra = "suggested insertion"
+
+                    yield self.format_annot(a, document, extra)
diff --git a/pdfannots/types.py b/pdfannots/types.py
@@ -9,6 +9,7 @@
 
 from pdfminer.layout import LTComponent, LTText
 from pdfminer.pdftypes import PDFObjRef
+from pdfminer.psparser import PSLiteralTable, PSLiteral
 
 from .utils import merge_lines
 
@@ -33,6 +34,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float):
         self.y0 = y0
         self.y1 = y1
 
+    def __repr__(self) -> str:  # debugging aid; fields are printed as x0, x1, y0, y1
+        return '<Box x0:%f x1:%f y0:%f y1:%f>' % (self.x0, self.x1, self.y0, self.y1)
+
     @staticmethod
     def from_item(item: LTComponent) -> Box:
         """Construct a Box from the bounding box of a given PDF component."""
@@ -260,6 +264,7 @@ class AnnotationType(enum.Enum):
     Squiggly = enum.auto()
     StrikeOut = enum.auto()
     Underline = enum.auto()
+    Caret = enum.auto()
 
     # A single rectangle, that is abused by some Apple tools to render custom
     # highlights. We do not attempt to capture the affected text.
@@ -281,8 +286,10 @@ class Annotation(ObjectWithPos):
         created      Timestamp the annotation was created
         color        RGB color of the annotation
         last_charseq Sequence number of the most recent character in text
+        name         Unique identifier of the annotation (the PDF 'NM' entry), if present
+        in_reply_to  Resolved 'IRT' entry: the annotation that this one is a reply to, if any
 
-    Attributes updated only for StrikeOut annotations:
+    Attributes updated for StrikeOut and Caret annotations:
         pre_context  Text captured just prior to the beginning of 'text'
         post_context Text captured just after the end of 'text'
     """
@@ -302,7 +309,9 @@ def __init__(
             contents: typ.Optional[str] = None,
             author: typ.Optional[str] = None,
             created: typ.Optional[datetime.datetime] = None,
-            color: typ.Optional[RGB] = None):
+            color: typ.Optional[RGB] = None,
+            in_reply_to: typ.Optional[PDFObjRef] = None,
+            name: typ.Optional[str] = None):
 
         # Construct boxes from quadpoints
         boxes = []
@@ -334,12 +343,15 @@ def __init__(
         self.post_context = None
         self.boxes = boxes
         self.last_charseq = 0
+        self.name = name
+        self.in_reply_to = in_reply_to
 
     def __repr__(self) -> str:
-        return ('<Annotation %s %r%s%s>' %
+        return ('<Annotation %s %r%s%s%s>' %
                 (self.subtype.name, self.pos,
                  " '%s'" % self.contents[:10] if self.contents else '',
-                 " '%s'" % ''.join(self.text[:10]) if self.text else ''))
+                 " '%s'" % ''.join(self.text[:10]) if self.text else '',
+                 " IRT" if self.in_reply_to else ''))  # flags a reply to another annotation
 
     def capture(self, text: str, charseq: int = 0) -> None:
         """Capture text (while rendering the PDF page)."""
@@ -363,7 +375,7 @@ def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:
 
     def wants_context(self) -> bool:
         """Returns true if this annotation type should include context."""
-        return self.subtype == AnnotationType.StrikeOut
+        return self.subtype in (AnnotationType.StrikeOut, AnnotationType.Caret)
 
     def set_pre_context(self, pre_context: str) -> None:
         assert self.pre_context is None
@@ -504,3 +516,16 @@ def ashex(self) -> str:
 
     def __str__(self) -> str:
         return f"RGB({self.ashex()})"
+
+
+ANNOT_SUBTYPES: typ.Dict[PSLiteral, AnnotationType] = {
+    PSLiteralTable.intern(e.name): e for e in AnnotationType}
+"""Mapping from PSLiteral to our own enumerant, for supported annotation types."""
+
+IGNORED_ANNOT_SUBTYPES = \
+    frozenset(PSLiteralTable.intern(n) for n in (
+        'Link',   # Links are used for internal document links (e.g. to other pages).
+        'Popup',  # Controls the on-screen appearance of other annotations. TODO: we may want to
+                  # check for an optional 'Contents' field for alternative human-readable contents.
+    ))
+"""Annotation types that we ignore without issuing a warning."""