Merge pull request #165 from wiseaidev/rm-py-2
cleanup python 2 syntax
marshalmiller authored Oct 3, 2022
2 parents 45395cc + c864a39 commit b928647
Showing 7 changed files with 32 additions and 38 deletions.
14 changes: 5 additions & 9 deletions linkrot/__init__.py
@@ -1,4 +1,3 @@

"""
Extract metadata and links from a local or remote PDF, and
optionally download all referenced PDFs.
@@ -52,12 +51,10 @@
from io import BytesIO
from urllib.request import Request, urlopen

unicode = str

logger = logging.getLogger(__name__)


class linkrot(object):
class linkrot:
"""
Main class which extracts infos from PDF
@@ -105,7 +102,7 @@ def __init__(self, uri):
self.stream = BytesIO(content)
except Exception as e:
raise DownloadError("Error downloading\
'%s' (%s)" % (uri, unicode(e)))
'{}' ({})".format(uri, str(e)))

else:
if not os.path.isfile(uri):
@@ -118,17 +115,16 @@ def __init__(self, uri):
try:
self.reader = PDFMinerBackend(self.stream)
except PDFSyntaxError as e:
raise PDFInvalidError("Invalid PDF (%s)" % unicode(e))
raise PDFInvalidError("Invalid PDF ({})".format(str(e)))

# Could try to create a TextReader
logger.info(unicode(e))
logger.info(str(e))
logger.info("Trying to create a TextReader backend...")
self.stream.seek(0)
self.reader = TextBackend(self.stream)
self.is_pdf = False
except Exception as e:
raise
raise PDFInvalidError("Invalid PDF (%s)" % unicode(e))
raise PDFInvalidError("Invalid PDF ({})".format(str(e)))

# Save metadata to user-supplied directory
self.summary = {
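A quick aside on the two idioms removed above: in Python 3, str is already a Unicode text type, so the unicode = str alias is dead code, and every class is a new-style class, so class linkrot(object): and class linkrot: behave identically. A minimal sketch of both points (illustrative names, not code from this repository):

    # Illustrative only -- why the deleted shims are no-ops on Python 3.
    unicode = str                 # the alias the commit removes: str is already Unicode text
    assert unicode("ünïcode") == "ünïcode"

    class WithExplicitBase(object):   # Python 2 habit: inherit object to get a new-style class
        pass

    class WithoutExplicitBase:        # Python 3: object is always the implicit base
        pass

    assert WithExplicitBase.__mro__[-1] is object
    assert WithoutExplicitBase.__mro__[-1] is object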
29 changes: 14 additions & 15 deletions linkrot/backends.py
@@ -31,17 +31,16 @@


logger = logging.getLogger(__name__)
unicode = str


def make_compat_str(in_str):
"""
Tries to guess encoding of [str/bytes] and
return a standard unicode string
return a string
"""
assert isinstance(in_str, (bytes, str, unicode))
assert isinstance(in_str, (bytes, str))
if not in_str:
return unicode()
return ""

# Chardet in Py3 works on bytes objects
if not isinstance(in_str, bytes):
@@ -50,7 +49,7 @@ def make_compat_str(in_str):
# Detect the encoding now
enc = chardet.detect(in_str)

# Decode the object into a unicode object
# Decode the object into a string object
try:
out_str = in_str.decode(enc["encoding"])
except UnicodeDecodeError as err:
@@ -59,12 +58,12 @@ def make_compat_str(in_str):
# Cleanup
if enc["encoding"] == "UTF-16BE":
# Remove byte order marks (BOM)
if out_str.startswith("\ufeff"):
if out_str.startswith("\\ufeff"):
out_str = out_str[1:]
return out_str


class Reference(object):
class Reference:
""" Generic Reference """

ref = ""
@@ -104,10 +103,10 @@ def __eq__(self, other):
return self.ref == other.ref

def __str__(self):
return "<%s: %s>" % (self.reftype, self.ref)
return "<{}: {}>".format(self.reftype, self.ref)


class ReaderBackend(object):
class ReaderBackend:
"""
Base class of all Readers (eg. for PDF files, text, etc.)
@@ -128,14 +127,14 @@ def get_metadata(self):

def metadata_key_cleanup(self, d, k):
""" Recursively clean metadata dictionaries """
if isinstance(d[k], (str, unicode)):
if isinstance(d[k], str):
d[k] = d[k].strip()
if not d[k]:
del d[k]
elif isinstance(d[k], (list, tuple)):
new_list = []
for item in d[k]:
if isinstance(item, (str, unicode)):
if isinstance(item, str):
if item.strip():
new_list.append(item.strip())
elif item:
@@ -158,14 +157,14 @@ def get_text(self):
def get_references(self, reftype=None, sort=False):
refs = self.references
if reftype:
refs = set([ref for ref in refs if ref.reftype == "pdf"])
refs = {ref for ref in refs if ref.reftype == "pdf"}
return sorted(refs) if sort else refs

def get_references_as_dict(self, reftype=None, sort=False):
ret = {}
refs = self.references
if reftype:
refs = set([ref for ref in refs if ref.reftype == "pdf"])
refs = {ref for ref in refs if ref.reftype == "pdf"}
for r in sorted(refs) if sort else refs:
if r.reftype in ret:
ret[r.reftype].append(r.ref)
@@ -190,7 +189,7 @@ def __init__(self, pdf_stream, password="", pagenos=None, maxpages=0):
for k in doc.info[0]:
v = doc.info[0][k]
# print(repr(v), type(v))
if isinstance(v, (bytes, str, unicode)):
if isinstance(v, (bytes, str)):
self.metadata[k] = make_compat_str(v)
elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
self.metadata[k] = make_compat_str(v.name)
@@ -280,7 +279,7 @@ def resolve_PDFObjRef(self, obj_ref):
if isinstance(obj_resolved, bytes):
obj_resolved = obj_resolved.decode("utf-8")

if isinstance(obj_resolved, (str, unicode)):
if isinstance(obj_resolved, str):
ref = obj_resolved
return Reference(ref, self.curpage)

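Most of the backends.py churn is the same mechanical swap of printf-style % interpolation for str.format, plus set comprehensions in the get_references helpers. For orientation, the formatting styles involved look like this (a self-contained sketch with made-up values; the f-string variant is a further Python 3.6+ option that this commit does not adopt):

    # Illustrative only -- equivalent ways to build the Reference repr string.
    reftype, ref = "url", "https://example.com/paper.pdf"

    old_style = "<%s: %s>" % (reftype, ref)        # printf-style, common in Python 2 code
    new_style = "<{}: {}>".format(reftype, ref)    # str.format, what the diff moves to
    f_string = f"<{reftype}: {ref}>"               # f-string, available since Python 3.6

    assert old_style == new_style == f_string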
7 changes: 3 additions & 4 deletions linkrot/cli.py
@@ -14,7 +14,6 @@
from linkrot.downloader import check_refs
from linkrot.archive import archive_links

parse_str = str

# print(sys.version)
# print("stdout encoding: %s" % sys.stdout.encoding)
@@ -126,14 +125,14 @@ def get_text_output(pdf, args):
metadata.pop(None, None)
for k, v in sorted(pdf.get_metadata().items()):
if v:
ret += "- %s = %s\n" % (k, parse_str(v).strip("/"))
ret += "- {} = {}\n".format(k, str(v).strip("/"))

# References
ref_cnt = pdf.get_references_count()
ret += "\nReferences: %s\n" % ref_cnt
refs = pdf.get_references_as_dict()
for k in refs:
ret += "- %s: %s\n" % (k.upper(), len(refs[k]))
ret += "- {}: {}\n".format(k.upper(), len(refs[k]))

# doi references
if k == 'url':
@@ -142,7 +141,7 @@ def get_text_output(pdf, args):
host = urlparse(u).hostname
if host and host.endswith(".doi.org"):
doi_ref.append(u)
ret += "- %s: %s\n" % ('DOI', len(doi_ref))
ret += "- {}: {}\n".format('DOI', len(doi_ref))

if args.verbose == 0:
if "pdf" in refs:
2 changes: 1 addition & 1 deletion linkrot/colorprint.py
@@ -27,6 +27,6 @@ def colorprint(color, s):
None
'''
output = "%s%s%s" % (color, s, ENDC)
output = "{}{}{}".format(color, s, ENDC)
print(output)
return output
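colorprint simply wraps the text in an ANSI escape sequence and resets the terminal attributes afterwards. A small usage sketch; the constant values below are the conventional ANSI codes and are assumptions here, not taken from colorprint.py:

    # Sketch of a colorprint-style helper; constant values are assumed, not from the repo.
    OKGREEN = "\033[92m"   # assumed: green foreground
    FAIL = "\033[91m"      # assumed: red foreground
    ENDC = "\033[0m"       # assumed: reset attributes

    def colorprint(color, s):
        output = "{}{}{}".format(color, s, ENDC)   # same concatenation as the diff above
        print(output)
        return output

    colorprint(OKGREEN, "200 - https://example.com")        # renders green on ANSI terminals
    colorprint(FAIL, "404 - https://example.com/missing")   # renders red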
12 changes: 6 additions & 6 deletions linkrot/downloader.py
@@ -64,9 +64,9 @@ def check_url(ref):
codes[status_code].append(ref)
if verbose:
if status_code == "200":
colorprint(OKGREEN, "%s - %s" % (status_code, url))
colorprint(OKGREEN, "{} - {}".format(status_code, url))
else:
colorprint(FAIL, "%s - %s" % (status_code, url))
colorprint(FAIL, "{} - {}".format(status_code, url))

# Start a threadpool and add the check-url tasks
try:
@@ -86,7 +86,7 @@ def check_url(ref):
output +="\n" +colorprint(OKGREEN, "%s working" % len(codes["200"]))
for c in sorted(codes):
if c != "200":
output +="\n" + colorprint(FAIL, "%s broken (reason: %s)" % (len(codes[c]), c))
output +="\n" + colorprint(FAIL, "{} broken (reason: {})".format(len(codes[c]), c))
for ref in codes[c]:
o = " - %s" % ref.ref
if ref.page > 0:
@@ -131,11 +131,11 @@ def download_url(url):
colorprint(FAIL, "Error downloading '%s' (%s)" %
(url, status_code))
except HTTPError as e:
colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.code))
colorprint(FAIL, "Error downloading '{}' ({})".format(url, e.code))
except URLError as e:
colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.reason))
colorprint(FAIL, "Error downloading '{}' ({})".format(url, e.reason))
except Exception as e:
colorprint(FAIL, "Error downloading '%s' (%s)" % (url, str(e)))
colorprint(FAIL, "Error downloading '{}' ({})".format(url, str(e)))

# Create directory
if not os.path.exists(output_directory):
4 changes: 2 additions & 2 deletions linkrot/extractor.py
@@ -71,7 +71,7 @@ def extract_arxiv(text):
res = re.findall(ARXIV_REGEX, text, re.MULTILINE) + re.findall(
ARXIV_REGEX2, text, re.MULTILINE
)
return set([r.strip(".") for r in res])
return {r.strip(".") for r in res}


def extract_doi(text):
@@ -85,7 +85,7 @@ def extract_doi(text):
"""

res = set(re.findall(DOI_REGEX, text, re.MULTILINE))
return set([r.strip(".") for r in res])
return {r.strip(".") for r in res}


if __name__ == "__main__":
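Both extractor.py changes are the same pattern: set([... for ...]) builds a throwaway list and then copies it into a set, while a set comprehension {... for ...} produces the set directly. A tiny self-contained sketch (made-up match results, not data from the repository):

    # Illustrative only -- the set-building idiom this commit modernizes.
    res = ["10.1000/xyz123.", "10.1000/xyz123", "arXiv:2210.00001."]

    py2_era = set([r.strip(".") for r in res])   # intermediate list, then a set
    py3_way = {r.strip(".") for r in res}        # set comprehension, no intermediate list

    assert py2_era == py3_way == {"10.1000/xyz123", "arXiv:2210.00001"}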
2 changes: 1 addition & 1 deletion linkrot/libs/xmp.py
@@ -28,7 +28,7 @@
}


class XmpParser(object):
class XmpParser:
"""
Parses an XMP string into a dictionary.
