diff --git a/linkrot/__init__.py b/linkrot/__init__.py index 5a78fe6..5328567 100644 --- a/linkrot/__init__.py +++ b/linkrot/__init__.py @@ -1,4 +1,3 @@ - """ Extract metadata and links from a local or remote PDF, and optionally download all referenced PDFs. @@ -52,12 +51,10 @@ from io import BytesIO from urllib.request import Request, urlopen -unicode = str - logger = logging.getLogger(__name__) -class linkrot(object): +class linkrot: """ Main class which extracts infos from PDF @@ -105,7 +102,7 @@ def __init__(self, uri): self.stream = BytesIO(content) except Exception as e: raise DownloadError("Error downloading\ - '%s' (%s)" % (uri, unicode(e))) + '{}' ({})".format(uri, str(e))) else: if not os.path.isfile(uri): @@ -118,17 +115,16 @@ def __init__(self, uri): try: self.reader = PDFMinerBackend(self.stream) except PDFSyntaxError as e: - raise PDFInvalidError("Invalid PDF (%s)" % unicode(e)) + raise PDFInvalidError("Invalid PDF ({})".format(str(e))) # Could try to create a TextReader - logger.info(unicode(e)) + logger.info(str(e)) logger.info("Trying to create a TextReader backend...") self.stream.seek(0) self.reader = TextBackend(self.stream) self.is_pdf = False except Exception as e: - raise - raise PDFInvalidError("Invalid PDF (%s)" % unicode(e)) + raise PDFInvalidError("Invalid PDF ({})".format(str(e))) # Save metadata to user-supplied directory self.summary = { diff --git a/linkrot/backends.py b/linkrot/backends.py index 2d021d5..12298ac 100644 --- a/linkrot/backends.py +++ b/linkrot/backends.py @@ -31,17 +31,16 @@ logger = logging.getLogger(__name__) -unicode = str def make_compat_str(in_str): """ Tries to guess encoding of [str/bytes] and - return a standard unicode string + return a string """ - assert isinstance(in_str, (bytes, str, unicode)) + assert isinstance(in_str, (bytes, str)) if not in_str: - return unicode() + return "" # Chardet in Py3 works on bytes objects if not isinstance(in_str, bytes): @@ -50,7 +49,7 @@ def make_compat_str(in_str): # 
Detect the encoding now enc = chardet.detect(in_str) - # Decode the object into a unicode object + # Decode the object into a string object try: out_str = in_str.decode(enc["encoding"]) except UnicodeDecodeError as err: @@ -59,12 +58,12 @@ # Cleanup if enc["encoding"] == "UTF-16BE": # Remove byte order marks (BOM) - if out_str.startswith("\ufeff"): + if out_str.startswith("\ufeff"): out_str = out_str[1:] return out_str -class Reference(object): +class Reference: """ Generic Reference """ ref = "" @@ -104,10 +103,10 @@ def __eq__(self, other): return self.ref == other.ref def __str__(self): - return "<%s: %s>" % (self.reftype, self.ref) + return "<{}: {}>".format(self.reftype, self.ref) -class ReaderBackend(object): +class ReaderBackend: """ Base class of all Readers (eg. for PDF files, text, etc.) @@ -128,14 +127,14 @@ def get_metadata(self): def metadata_key_cleanup(self, d, k): """ Recursively clean metadata dictionaries """ - if isinstance(d[k], (str, unicode)): + if isinstance(d[k], str): d[k] = d[k].strip() if not d[k]: del d[k] elif isinstance(d[k], (list, tuple)): new_list = [] for item in d[k]: - if isinstance(item, (str, unicode)): + if isinstance(item, str): if item.strip(): new_list.append(item.strip()) elif item: @@ -158,14 +157,14 @@ def get_text(self): def get_references(self, reftype=None, sort=False): refs = self.references if reftype: - refs = set([ref for ref in refs if ref.reftype == "pdf"]) + refs = {ref for ref in refs if ref.reftype == "pdf"} return sorted(refs) if sort else refs def get_references_as_dict(self, reftype=None, sort=False): ret = {} refs = self.references if reftype: - refs = set([ref for ref in refs if ref.reftype == "pdf"]) + refs = {ref for ref in refs if ref.reftype == "pdf"} for r in sorted(refs) if sort else refs: if r.reftype in ret: ret[r.reftype].append(r.ref) @@ -190,7 +189,7 @@ def __init__(self, pdf_stream, password="", pagenos=None, maxpages=0): for k in doc.info[0]: v = doc.info[0][k] 
# print(repr(v), type(v)) - if isinstance(v, (bytes, str, unicode)): + if isinstance(v, (bytes, str)): self.metadata[k] = make_compat_str(v) elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)): self.metadata[k] = make_compat_str(v.name) @@ -280,7 +279,7 @@ def resolve_PDFObjRef(self, obj_ref): if isinstance(obj_resolved, bytes): obj_resolved = obj_resolved.decode("utf-8") - if isinstance(obj_resolved, (str, unicode)): + if isinstance(obj_resolved, str): ref = obj_resolved return Reference(ref, self.curpage) diff --git a/linkrot/cli.py b/linkrot/cli.py index 6583c1c..bbab4bc 100644 --- a/linkrot/cli.py +++ b/linkrot/cli.py @@ -14,7 +14,6 @@ from linkrot.downloader import check_refs from linkrot.archive import archive_links -parse_str = str # print(sys.version) # print("stdout encoding: %s" % sys.stdout.encoding) @@ -126,14 +125,14 @@ def get_text_output(pdf, args): metadata.pop(None, None) for k, v in sorted(pdf.get_metadata().items()): if v: - ret += "- %s = %s\n" % (k, parse_str(v).strip("/")) + ret += "- {} = {}\n".format(k, str(v).strip("/")) # References ref_cnt = pdf.get_references_count() ret += "\nReferences: %s\n" % ref_cnt refs = pdf.get_references_as_dict() for k in refs: - ret += "- %s: %s\n" % (k.upper(), len(refs[k])) + ret += "- {}: {}\n".format(k.upper(), len(refs[k])) # doi references if k == 'url': @@ -142,7 +141,7 @@ def get_text_output(pdf, args): host = urlparse(u).hostname if host and host.endswith(".doi.org"): doi_ref.append(u) - ret += "- %s: %s\n" % ('DOI', len(doi_ref)) + ret += "- {}: {}\n".format('DOI', len(doi_ref)) if args.verbose == 0: if "pdf" in refs: diff --git a/linkrot/colorprint.py b/linkrot/colorprint.py index d6afd03..323ce93 100644 --- a/linkrot/colorprint.py +++ b/linkrot/colorprint.py @@ -27,6 +27,6 @@ def colorprint(color, s): None ''' - output = "%s%s%s" % (color, s, ENDC) + output = "{}{}{}".format(color, s, ENDC) print(output) return output diff --git a/linkrot/downloader.py b/linkrot/downloader.py index 
d51065f..d0436e6 100644 --- a/linkrot/downloader.py +++ b/linkrot/downloader.py @@ -64,9 +64,9 @@ def check_url(ref): codes[status_code].append(ref) if verbose: if status_code == "200": - colorprint(OKGREEN, "%s - %s" % (status_code, url)) + colorprint(OKGREEN, "{} - {}".format(status_code, url)) else: - colorprint(FAIL, "%s - %s" % (status_code, url)) + colorprint(FAIL, "{} - {}".format(status_code, url)) # Start a threadpool and add the check-url tasks try: @@ -86,7 +86,7 @@ def check_url(ref): output +="\n" +colorprint(OKGREEN, "%s working" % len(codes["200"])) for c in sorted(codes): if c != "200": - output +="\n" + colorprint(FAIL, "%s broken (reason: %s)" % (len(codes[c]), c)) + output +="\n" + colorprint(FAIL, "{} broken (reason: {})".format(len(codes[c]), c)) for ref in codes[c]: o = " - %s" % ref.ref if ref.page > 0: @@ -131,11 +131,11 @@ def download_url(url): colorprint(FAIL, "Error downloading '%s' (%s)" % (url, status_code)) except HTTPError as e: - colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.code)) + colorprint(FAIL, "Error downloading '{}' ({})".format(url, e.code)) except URLError as e: - colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.reason)) + colorprint(FAIL, "Error downloading '{}' ({})".format(url, e.reason)) except Exception as e: - colorprint(FAIL, "Error downloading '%s' (%s)" % (url, str(e))) + colorprint(FAIL, "Error downloading '{}' ({})".format(url, str(e))) # Create directory if not os.path.exists(output_directory): diff --git a/linkrot/extractor.py b/linkrot/extractor.py index da425f3..30b061a 100644 --- a/linkrot/extractor.py +++ b/linkrot/extractor.py @@ -71,7 +71,7 @@ def extract_arxiv(text): res = re.findall(ARXIV_REGEX, text, re.MULTILINE) + re.findall( ARXIV_REGEX2, text, re.MULTILINE ) - return set([r.strip(".") for r in res]) + return {r.strip(".") for r in res} def extract_doi(text): @@ -85,7 +85,7 @@ def extract_doi(text): """ res = set(re.findall(DOI_REGEX, text, re.MULTILINE)) - return 
set([r.strip(".") for r in res]) + return {r.strip(".") for r in res} if __name__ == "__main__": diff --git a/linkrot/libs/xmp.py b/linkrot/libs/xmp.py index f50484a..6998662 100644 --- a/linkrot/libs/xmp.py +++ b/linkrot/libs/xmp.py @@ -28,7 +28,7 @@ } -class XmpParser(object): +class XmpParser: """ Parses an XMP string into a dictionary.