Merge pull request #70 from 0xabu/skimpdf

Minor usability improvements with Skim PDF
0xabu · Mar 30, 2023 · c829231 · c829231
2 parents 17bd36c + b32d8a0
commit c829231
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 3 deletions.
diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
@@ -384,9 +384,11 @@ def emit_progress(msg: str) -> None:
         # Construct Annotation objects, and append them to the page.
         for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []:
             if isinstance(pa, pdftypes.PDFObjRef):
-                annot = _mkannotation(pdftypes.dict_value(pa), page)
-                if annot is not None:
-                    page.annots.append(annot)
+                annot_dict = pdftypes.dict_value(pa)
+                if annot_dict:  # Would be empty if pa is a broken ref
+                    annot = _mkannotation(annot_dict, page)
+                    if annot is not None:
+                        page.annots.append(annot)
             else:
                 logger.warning("Unknown annotation: %s", pa)
 
@@ -404,6 +406,10 @@ def emit_progress(msg: str) -> None:
         page.annots.sort()
         page.outlines.sort()
 
+        # Give the annotations a chance to update their internals
+        for a in page.annots:
+            a.postprocess()
+
     emit_progress("\n")
 
     device.close()

diff --git a/pdfannots/types.py b/pdfannots/types.py
@@ -385,6 +385,15 @@ def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]:
         return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False),
                 merge_lines(self.post_context or '', remove_hyphens, strip_space=False))
 
+    def postprocess(self) -> None:
+        """Update internal state once all text and context have been captured."""
+        # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
+        # default initial contents are a copy of the selected text. Unless the user edits each
+        # annotation, the same words would appear as both text and contents (e.g., for simple
+        # highlights and strikeouts), so drop contents that merely repeat the (stripped) text.
+        if self.contents and self.text and ''.join(self.text).strip() == self.contents.strip():
+            self.contents = None
+
 
 UnresolvedPage = typ.Union[int, PDFObjRef]
 """A reference to a page that is *either* a page number, or a PDF object ID."""