Merge pull request #165 from wiseaidev/rm-py-2
cleanup python 2 syntax
marshalmiller authored Oct 3, 2022
2 parents 45395cc + c864a39 commit b928647
Showing 7 changed files with 32 additions and 38 deletions.
14 changes: 5 additions & 9 deletions linkrot/__init__.py
@@ -1,4 +1,3 @@

"""
Extract metadata and links from a local or remote PDF, and
optionally download all referenced PDFs.
@@ -52,12 +51,10 @@
from io import BytesIO
from urllib.request import Request, urlopen

unicode = str

logger = logging.getLogger(__name__)


class linkrot(object):
class linkrot:
"""
Main class which extracts infos from PDF
@@ -105,7 +102,7 @@ def __init__(self, uri):
self.stream = BytesIO(content)
except Exception as e:
raise DownloadError("Error downloading\
'%s' (%s)" % (uri, unicode(e)))
'{}' ({})".format(uri, str(e)))

else:
if not os.path.isfile(uri):
@@ -118,17 +115,16 @@ def __init__(self, uri):
try:
self.reader = PDFMinerBackend(self.stream)
except PDFSyntaxError as e:
raise PDFInvalidError("Invalid PDF (%s)" % unicode(e))
raise PDFInvalidError("Invalid PDF ({})".format(str(e)))

# Could try to create a TextReader
logger.info(unicode(e))
logger.info(str(e))
logger.info("Trying to create a TextReader backend...")
self.stream.seek(0)
self.reader = TextBackend(self.stream)
self.is_pdf = False
except Exception as e:
raise
raise PDFInvalidError("Invalid PDF (%s)" % unicode(e))
raise PDFInvalidError("Invalid PDF ({})".format(str(e)))

# Save metadata to user-supplied directory
self.summary = {
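A quick aside on the two idioms removed above: in Python 3, str is already a Unicode text type, so the unicode = str alias is dead code, and every class is a new-style class, so class linkrot(object): and class linkrot: behave identically. A minimal sketch of both points (illustrative names, not code from this repository):

    # Illustrative only -- why the deleted shims are no-ops on Python 3.
    unicode = str                 # the alias the commit removes: str is already Unicode text
    assert unicode("ünïcode") == "ünïcode"

    class WithExplicitBase(object):   # Python 2 habit: inherit object to get a new-style class
        pass

    class WithoutExplicitBase:        # Python 3: object is always the implicit base
        pass

    assert WithExplicitBase.__mro__[-1] is object
    assert WithoutExplicitBase.__mro__[-1] is object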
29 changes: 14 additions & 15 deletions linkrot/backends.py
@@ -31,17 +31,16 @@


logger = logging.getLogger(__name__)
unicode = str


def make_compat_str(in_str):
"""
Tries to guess encoding of [str/bytes] and
return a standard unicode string
return a string
"""
assert isinstance(in_str, (bytes, str, unicode))
assert isinstance(in_str, (bytes, str))
if not in_str:
return unicode()
return ""

# Chardet in Py3 works on bytes objects
if not isinstance(in_str, bytes):
@@ -50,7 +49,7 @@ def make_compat_str(in_str):
# Detect the encoding now
enc = chardet.detect(in_str)

# Decode the object into a unicode object
# Decode the object into a string object
try:
out_str = in_str.decode(enc["encoding"])
except UnicodeDecodeError as err:
@@ -59,12 +58,12 @@ def make_compat_str(in_str):
# Cleanup
if enc["encoding"] == "UTF-16BE":
# Remove byte order marks (BOM)
if out_str.startswith("\ufeff"):
if out_str.startswith("\\ufeff"):
out_str = out_str[1:]
return out_str


class Reference(object):
class Reference:
""" Generic Reference """

ref = ""
@@ -104,10 +103,10 @@ def __eq__(self, other):
return self.ref == other.ref

def __str__(self):
return "<%s: %s>" % (self.reftype, self.ref)
return "<{}: {}>".format(self.reftype, self.ref)


class ReaderBackend(object):
class ReaderBackend:
"""
Base class of all Readers (eg. for PDF files, text, etc.)
@@ -128,14 +127,14 @@ def get_metadata(self):

def metadata_key_cleanup(self, d, k):
""" Recursively clean metadata dictionaries """
if isinstance(d[k], (str, unicode)):
if isinstance(d[k], str):
d[k] = d[k].strip()
if not d[k]:
del d[k]
elif isinstance(d[k], (list, tuple)):
new_list = []
for item in d[k]:
if isinstance(item, (str, unicode)):
if isinstance(item, str):
if item.strip():
new_list.append(item.strip())
elif item:
@@ -158,14 +157,14 @@ def get_text(self):
def get_references(self, reftype=None, sort=False):
refs = self.references
if reftype:
refs = set([ref for ref in refs if ref.reftype == "pdf"])
refs = {ref for ref in refs if ref.reftype == "pdf"}
return sorted(refs) if sort else refs

def get_references_as_dict(self, reftype=None, sort=False):
ret = {}
refs = self.references
if reftype:
refs = set([ref for ref in refs if ref.reftype == "pdf"])
refs = {ref for ref in refs if ref.reftype == "pdf"}
for r in sorted(refs) if sort else refs:
if r.reftype in ret:
ret[r.reftype].append(r.ref)
@@ -190,7 +189,7 @@ def __init__(self, pdf_stream, password="", pagenos=None, maxpages=0):
for k in doc.info[0]:
v = doc.info[0][k]
# print(repr(v), type(v))
if isinstance(v, (bytes, str, unicode)):
if isinstance(v, (bytes, str)):
self.metadata[k] = make_compat_str(v)
elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
self.metadata[k] = make_compat_str(v.name)
@@ -280,7 +279,7 @@ def resolve_PDFObjRef(self, obj_ref):
if isinstance(obj_resolved, bytes):
obj_resolved = obj_resolved.decode("utf-8")

if isinstance(obj_resolved, (str, unicode)):
if isinstance(obj_resolved, str):
ref = obj_resolved
return Reference(ref, self.curpage)

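Most of the backends.py churn is the same mechanical swap of printf-style % interpolation for str.format, plus set comprehensions in the get_references helpers. For orientation, the formatting styles involved look like this (a self-contained sketch with made-up values; the f-string variant is a further Python 3.6+ option that this commit does not adopt):

    # Illustrative only -- equivalent ways to build the Reference repr string.
    reftype, ref = "url", "https://example.com/paper.pdf"

    old_style = "<%s: %s>" % (reftype, ref)        # printf-style, common in Python 2 code
    new_style = "<{}: {}>".format(reftype, ref)    # str.format, what the diff moves to
    f_string = f"<{reftype}: {ref}>"               # f-string, available since Python 3.6

    assert old_style == new_style == f_string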
7 changes: 3 additions & 4 deletions linkrot/cli.py
@@ -14,7 +14,6 @@
from linkrot.downloader import check_refs
from linkrot.archive import archive_links

parse_str = str

# print(sys.version)
# print("stdout encoding: %s" % sys.stdout.encoding)
@@ -126,14 +125,14 @@ def get_text_output(pdf, args):
metadata.pop(None, None)
for k, v in sorted(pdf.get_metadata().items()):
if v:
ret += "- %s = %s\n" % (k, parse_str(v).strip("/"))
ret += "- {} = {}\n".format(k, str(v).strip("/"))

# References
ref_cnt = pdf.get_references_count()
ret += "\nReferences: %s\n" % ref_cnt
refs = pdf.get_references_as_dict()
for k in refs:
ret += "- %s: %s\n" % (k.upper(), len(refs[k]))
ret += "- {}: {}\n".format(k.upper(), len(refs[k]))

# doi references
if k == 'url':
@@ -142,7 +141,7 @@ def get_text_output(pdf, args):
host = urlparse(u).hostname
if host and host.endswith(".doi.org"):
doi_ref.append(u)
ret += "- %s: %s\n" % ('DOI', len(doi_ref))
ret += "- {}: {}\n".format('DOI', len(doi_ref))

if args.verbose == 0:
if "pdf" in refs:
2 changes: 1 addition & 1 deletion linkrot/colorprint.py
@@ -27,6 +27,6 @@ def colorprint(color, s):
None
'''
output = "%s%s%s" % (color, s, ENDC)
output = "{}{}{}".format(color, s, ENDC)
print(output)
return output
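colorprint simply wraps the text in an ANSI escape sequence and resets the terminal attributes afterwards. A small usage sketch; the constant values below are the conventional ANSI codes and are assumptions here, not taken from colorprint.py:

    # Sketch of a colorprint-style helper; constant values are assumed, not from the repo.
    OKGREEN = "\033[92m"   # assumed: green foreground
    FAIL = "\033[91m"      # assumed: red foreground
    ENDC = "\033[0m"       # assumed: reset attributes

    def colorprint(color, s):
        output = "{}{}{}".format(color, s, ENDC)   # same concatenation as the diff above
        print(output)
        return output

    colorprint(OKGREEN, "200 - https://example.com")        # renders green on ANSI terminals
    colorprint(FAIL, "404 - https://example.com/missing")   # renders red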
12 changes: 6 additions & 6 deletions linkrot/downloader.py
@@ -64,9 +64,9 @@ def check_url(ref):
codes[status_code].append(ref)
if verbose:
if status_code == "200":
colorprint(OKGREEN, "%s - %s" % (status_code, url))
colorprint(OKGREEN, "{} - {}".format(status_code, url))
else:
colorprint(FAIL, "%s - %s" % (status_code, url))
colorprint(FAIL, "{} - {}".format(status_code, url))

# Start a threadpool and add the check-url tasks
try:
@@ -86,7 +86,7 @@ def check_url(ref):
output +="\n" +colorprint(OKGREEN, "%s working" % len(codes["200"]))
for c in sorted(codes):
if c != "200":
output +="\n" + colorprint(FAIL, "%s broken (reason: %s)" % (len(codes[c]), c))
output +="\n" + colorprint(FAIL, "{} broken (reason: {})".format(len(codes[c]), c))
for ref in codes[c]:
o = " - %s" % ref.ref
if ref.page > 0:
@@ -131,11 +131,11 @@ def download_url(url):
colorprint(FAIL, "Error downloading '%s' (%s)" %
(url, status_code))
except HTTPError as e:
colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.code))
colorprint(FAIL, "Error downloading '{}' ({})".format(url, e.code))
except URLError as e:
colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.reason))
colorprint(FAIL, "Error downloading '{}' ({})".format(url, e.reason))
except Exception as e:
colorprint(FAIL, "Error downloading '%s' (%s)" % (url, str(e)))
colorprint(FAIL, "Error downloading '{}' ({})".format(url, str(e)))

# Create directory
if not os.path.exists(output_directory):
4 changes: 2 additions & 2 deletions linkrot/extractor.py
@@ -71,7 +71,7 @@ def extract_arxiv(text):
res = re.findall(ARXIV_REGEX, text, re.MULTILINE) + re.findall(
ARXIV_REGEX2, text, re.MULTILINE
)
return set([r.strip(".") for r in res])
return {r.strip(".") for r in res}


def extract_doi(text):
@@ -85,7 +85,7 @@ def extract_doi(text):
"""

res = set(re.findall(DOI_REGEX, text, re.MULTILINE))
return set([r.strip(".") for r in res])
return {r.strip(".") for r in res}


if __name__ == "__main__":
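Both extractor.py changes are the same pattern: set([... for ...]) builds a throwaway list and then copies it into a set, while a set comprehension {... for ...} produces the set directly. A tiny self-contained sketch (made-up match results, not data from the repository):

    # Illustrative only -- the set-building idiom this commit modernizes.
    res = ["10.1000/xyz123.", "10.1000/xyz123", "arXiv:2210.00001."]

    py2_era = set([r.strip(".") for r in res])   # intermediate list, then a set
    py3_way = {r.strip(".") for r in res}        # set comprehension, no intermediate list

    assert py2_era == py3_way == {"10.1000/xyz123", "arXiv:2210.00001"}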
2 changes: 1 addition & 1 deletion linkrot/libs/xmp.py
@@ -28,7 +28,7 @@
}


class XmpParser(object):
class XmpParser:
"""
Parses an XMP string into a dictionary.
