-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtmlval.py
104 lines (82 loc) · 3.71 KB
/
htmlval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf8 -*-
import logging
import bs4
from pelican import signals
from pelican.generators import ArticlesGenerator, PagesGenerator, TemplatePagesGenerator
# Names of generator attributes that are looked up when collecting documents
# to validate. A generator that lacks one of these attributes, or holds an
# empty value under it, simply contributes nothing for that name.
# Both "draft_translations" and "drafts_translations" are listed on purpose:
# each spelling is probed independently.
TYPES_TO_PROCESS = [
    "articles",
    "pages",
    "drafts",
    "hidden_pages",
    "hidden_articles",
    "translations",
    "hidden_translations",
    "draft_translations",
    "drafts_translations",
]
def _dangling_fragment_links(soup_doc):
    """Return one issue message per same-page link that points at nothing.

    A link is "same-page" when its ``href`` starts with ``#`` and is not the
    bare ``#``. Its fragment counts as resolved when, following the HTML
    standard's fragment lookup, it matches:

    * the ``id`` of any element, or the ``name`` of an ``<a>`` element,
      either verbatim or after percent-decoding; or
    * ``top`` (case-insensitively), which always refers to the top of the page.

    :param soup_doc: parsed ``bs4.BeautifulSoup`` document to inspect.
    :return: list of human-readable issue strings (empty when all links resolve).
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import unquote

    targets = {tag["id"] for tag in soup_doc.find_all(id=True)}
    # ``name`` cannot be passed as a keyword (it is find_all's tag-name
    # parameter), so the attribute filter has to go through ``attrs``.
    targets.update(
        tag["name"] for tag in soup_doc.find_all("a", attrs={"name": True})
    )

    issues = []
    for anchor in soup_doc.find_all("a", href=True):
        url = anchor["href"]
        if not url.startswith("#") or url == "#":
            continue
        fragment = url[1:]
        decoded = unquote(fragment)
        if fragment in targets or decoded in targets or decoded.lower() == "top":
            continue
        issues.append(f"'{anchor}' backlink has no referent")
    return issues


def _summary_issues(instance):
    """Return issue messages about the summary of *instance*.

    Reports a missing summary, or a summary longer than the
    ``SUMMARY_MAX_LENGTH`` setting (when that setting is present and truthy).

    :param instance: content object exposing ``summary``, ``content``,
        ``settings`` and ``has_summary``.
    :return: list of human-readable issue strings (possibly empty).
    """
    # Read the property once; the original re-evaluated it for every use.
    summary = instance.summary
    if not summary:
        return ["Missing summary"]

    # NOTE(review): Pelican's settings docs describe SUMMARY_MAX_LENGTH as a
    # word count, whereas this compares it against the summary's character
    # length — confirm that is intended.
    max_length = instance.settings.get('SUMMARY_MAX_LENGTH')
    if not max_length or len(summary) <= max_length:
        return []

    if instance.content != summary:
        return [f"Summary length is {len(summary)}/{max_length}"]

    # The summary is the whole content, i.e. nothing was actually summarized.
    issues = [f"Unsummarized article length is {len(summary)}/{max_length}"]
    if instance.has_summary:
        issues.append("Article incorrectly has 'has_summary' set to True!")
    return issues


def process_content(instance, generator=None):
    """
    Pelican callback: validate the rendered HTML of one content object.

    Checks that every same-page ``#fragment`` link has a referent and, for
    anything that is neither a draft nor handled by a ``PagesGenerator``,
    that a summary exists and fits ``SUMMARY_MAX_LENGTH``. All findings for
    the document are emitted as a single ``logging.error`` record.

    :param instance: content object; nothing is done when its ``_content``
        is ``None``.
    :param generator: generator that owns *instance*; only used to skip the
        summary checks for ``PagesGenerator`` content.
    :return: ``None``; the only output is the log record.
    """
    if instance._content is None:
        return

    # TODO: This is too slow
    # The whole document is parsed (no SoupStrainer restricted to <a>)
    # because link targets can be ids on arbitrary elements.
    soup_doc = bs4.BeautifulSoup(instance._content, 'html.parser')
    issues = _dangling_fragment_links(soup_doc)

    # Disabled check, kept for reference. Hash text isn't respected by
    # first-child, so this would catch inline links too:
    #   soup_doc.select("blockquote > p > a:first-child:not(.cite)")
    #   -> "Blockquote begins with plain link, probably meant to be a citation"

    if instance.status != "draft" and not isinstance(generator, PagesGenerator):
        issues.extend(_summary_issues(instance))

    if issues:
        report = "\n".join(issues)
        logging.error(f"HTML validation errors in {instance.relative_source_path}:\n{report}")
def all_generators_finalized(generators):
    """Validate every document held by the article/page/template-page generators.

    The signature matches Pelican's ``all_generators_finalized`` signal,
    which passes the list of generators. Generators of any other type are
    skipped with a debug log entry.

    :param generators: iterable of Pelican generator instances.
    :return: ``None``; findings are logged by ``process_content``.
    """
    document_generators = (ArticlesGenerator, PagesGenerator, TemplatePagesGenerator)
    for generator in generators:
        if not isinstance(generator, document_generators):
            logging.debug("htmlval: Unhandled generator %s", generator)
            continue
        # Walk each known attribute directly instead of concatenating the
        # lists with sum(..., []), which copies the accumulator every step.
        for attr in TYPES_TO_PROCESS:
            # Missing or empty attributes contribute no documents.
            for document in getattr(generator, attr, None) or ():
                process_content(document, generator)
def article_writer_finalized(generator, writer):
    """Validate every document held by *generator*.

    Used as the handler for both the article and the page
    ``*_writer_finalized`` signals.

    :param generator: generator whose ``TYPES_TO_PROCESS`` attributes are
        scanned for documents.
    :param writer: supplied by the signal; not used here.
    :return: ``None``; findings are logged by ``process_content``.
    """
    # Iterate each attribute directly rather than concatenating with
    # sum(..., []), which re-copies the growing list on every addition.
    for attr in TYPES_TO_PROCESS:
        # Missing or empty attributes contribute no documents.
        for document in getattr(generator, attr, None) or ():
            process_content(document, generator)
def register():
    """
    Plugin entry point (part of the Pelican plugin API).

    Hooks ``article_writer_finalized`` onto the article and page
    writer-finalized signals, in that order.
    """
    # Alternative hook points that are currently disabled:
    #   - signals.content_object_init
    #   - signals.all_generators_finalized -> all_generators_finalized
    #   - signals.article_generator_write_article / page_generator_write_page,
    #     each calling process_content(content, gen) per document
    hooked_signals = (
        signals.article_writer_finalized,
        signals.page_writer_finalized,
    )
    for hooked_signal in hooked_signals:
        hooked_signal.connect(article_writer_finalized)