-
-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathgeneratesitemap.py
executable file
·449 lines (392 loc) · 15.6 KB
/
generatesitemap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
#!/usr/bin/env -S python3 -B
#
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2024 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import sys
import re
import os
import os.path
import subprocess
from datetime import datetime
def gatherfiles(extensionsToInclude) :
    """Walks the directory tree rooted at the current directory,
    collecting all files whose extension is in the given set.

    Keyword arguments:
    extensionsToInclude - a set of the file extensions to include in sitemap
    """
    if not extensionsToInclude :
        return []
    discovered = []
    for root, _, filenames in os.walk(".") :
        discovered.extend(
            os.path.join(root, name)
            for name in filenames
            if getFileExtension(name) in extensionsToInclude
        )
    return discovered
# Filenames that web servers treat as the directory's default document.
INDEX_FILENAMES = { "index.html", "index.shtml" }

def sortname(f, dropExtension=False) :
    """Partial url to sort by, which strips out the filename
    if the filename is index.html.

    Keyword arguments:
    f - Filename with path
    dropExtension - true to drop extensions of .html from the filename when sorting
    """
    lastSlash = f.rfind("/")
    if 0 <= lastSlash < len(f) - 1 and f[lastSlash+1:] in INDEX_FILENAMES :
        # Directory index page: sort by the directory itself.
        return f[:lastSlash+1]
    if f in INDEX_FILENAMES :
        # Site root index page.
        return ""
    if dropExtension and f.endswith(".html") :
        return f[:-5]
    return f
def urlsort(files, dropExtension=False) :
    """Sorts the urls in place with a primary sort by depth in the
    website, and a secondary sort alphabetically.

    Keyword arguments:
    files - list of files to include in sitemap
    dropExtension - true to drop extensions of .html from the filename when sorting
    """
    # One tuple-keyed sort is equivalent to the classic pair of stable
    # sorts (alphabetical first, then depth).
    files.sort(key=lambda name : (name.count("/"), sortname(name, dropExtension)))
# Case-insensitive, multiline, dot-matches-newline — meta tags may span lines.
RE_FLAGS = re.I | re.M | re.S
# Captures the attribute text of each <meta ...> tag.
RE_META_TAG = re.compile(r"<meta([^>]*)>", flags=RE_FLAGS)

def hasMetaRobotsNoindex(f) :
    """Checks whether an html file contains
    <meta name="robots" content="noindex"> or
    any equivalent directive including a noindex.
    Only checks head of html since required to be
    in the head if specified.

    Keyword arguments:
    f - Filename including path
    """
    try:
        with open(f, "r", errors="surrogateescape") as html :
            page = html.read()
        # Limit the scan to the document head; fall back to the start
        # of the body if no closing head tag is present.
        boundary = re.search("</head>", page, flags=re.I)
        if not boundary :
            boundary = re.search("<body>", page, flags=re.I)
        if boundary :
            metaTags = RE_META_TAG.findall(page, endpos=boundary.start())
        else :
            metaTags = RE_META_TAG.findall(page)
        for attributes in metaTags :
            isRobots = re.search("name\\s*=\\s*\"\\s*robots", attributes, flags=re.I)
            if isRobots and re.search("content\\s*=\\s*\".*noindex", attributes, flags=re.I) :
                return True
        return False
    except OSError:
        print("WARNING: OS error while checking for noindex directive in:", f)
        print("Assuming", f, "doesn't have noindex directive.")
        return False
def getFileExtension(f) :
    """Gets the file extension, and returns it (in all
    lowercase). Returns None if file has no extension.

    Keyword arguments:
    f - file name possibly with path
    """
    dot = f.rfind(".")
    # A dot before the last path separator belongs to a directory
    # name, not an extension.
    if dot < 0 or f.rfind("/") >= dot :
        return None
    return f[dot+1:].lower()
# File extensions treated as HTML for noindex checking and
# extension-dropping purposes.
HTML_EXTENSIONS = { "html", "htm", "shtml" }

def isHTMLFile(f) :
    """Checks if the file is an HTML file,
    which currently means has an extension of html
    or htm.

    Keyword arguments:
    f - file name including path relative from the root of the website.
    """
    # Relies on getFileExtension returning the lowercased extension
    # (or None, which is never in HTML_EXTENSIONS).
    return getFileExtension(f) in HTML_EXTENSIONS
def createExtensionSet(includeHTML, includePDF, additionalExt) :
    """Creates a set of file extensions for the file types to include
    in the sitemap.

    Keyword arguments:
    includeHTML - boolean, which if true indicates that all html related extensions
        should be included.
    includePDF - boolean, which if true results in inclusion of the extension pdf
    additionalExt - a set of additional file extensions to include
    """
    # Copy the caller's set first: the previous implementation aliased
    # additionalExt when includeHTML was false, so adding "pdf" below
    # mutated the caller's set as a side effect.
    fileExtensionsToInclude = set(additionalExt)
    if includeHTML :
        fileExtensionsToInclude |= HTML_EXTENSIONS
    if includePDF :
        fileExtensionsToInclude.add("pdf")
    return fileExtensionsToInclude
def robotsBlocked(f, blockedPaths=()) :
    """Checks if robots are blocked from accessing the
    url, either by a robots.txt disallow rule or by a
    meta robots noindex directive in the file itself.

    Keyword arguments:
    f - file name including path relative from the root of the website.
    blockedPaths - a sequence of paths blocked by robots.txt
        (default changed from a mutable [] to an immutable tuple;
        behavior for callers is unchanged).
    """
    if blockedPaths :
        # Paths from os.walk start with "."; robots.txt rules do not.
        candidate = f.removeprefix(".")
        for prefix in blockedPaths :
            if candidate.startswith(prefix) :
                return True
    # Only HTML files can carry a meta robots noindex directive.
    if not isHTMLFile(f) :
        return False
    return hasMetaRobotsNoindex(f)
def parseRobotsTxt(robotsFile="robots.txt") :
    """Parses a robots.txt if present in the root of the
    site, and returns a list of disallowed paths. It only
    includes paths disallowed for *.

    Keyword arguments:
    robotsFile - the name of the robots.txt, which in production
    must be robots.txt (the default). The parameter is to enable
    unit testing with different robots.txt files."""
    disallowed = []
    try:
        if os.path.isfile(robotsFile) :
            with open(robotsFile, "r", errors="surrogateescape") as robots :
                # inStarBlock: currently inside a block whose
                # user-agent group includes "*".
                # sawRule: at least one allow/disallow rule seen since
                # the last user-agent line (a subsequent user-agent
                # line then starts a NEW block rather than extending
                # the current group).
                inStarBlock = False
                sawRule = False
                for line in robots :
                    hashAt = line.find("#")
                    if hashAt > 0 :
                        line = line[:hashAt]
                    line = line.strip()
                    lowered = line.lower()
                    if lowered.startswith("user-agent:") :
                        if len(line) > 11 and line[11:].strip() == "*" :
                            inStarBlock = True
                            sawRule = False
                        elif sawRule :
                            inStarBlock = False
                            sawRule = False
                    elif inStarBlock :
                        if lowered.startswith("allow:") :
                            sawRule = True
                        elif lowered.startswith("disallow:") :
                            sawRule = True
                            if len(line) > 9 :
                                path = line[9:].strip()
                                # Skip malformed rules with embedded whitespace.
                                if path and " " not in path and "\t" not in path :
                                    disallowed.append(path)
    except OSError:
        print("WARNING: OS error while parsing robots.txt")
        print("Assuming nothing disallowed.")
    return disallowed
def lastmod(f) :
    """Determines the date when the file was last modified and
    returns a string with the date formatted as required for
    the lastmod tag in an xml sitemap.

    Keyword arguments:
    f - filename
    """
    # %cI is the committer date in strict ISO-8601 format.
    result = subprocess.run(
        ['git', 'log', '-1', '--format=%cI', f],
        stdout=subprocess.PIPE,
        universal_newlines=True
    )
    timestamp = result.stdout.strip()
    if timestamp :
        return timestamp
    # Fall back to the current local time for files git has no
    # history for (e.g., untracked files).
    return datetime.now().astimezone().replace(microsecond=0).isoformat()
def urlstring(f, baseUrl, dropExtension=False) :
    """Forms a string with the full url from a filename and base url.

    Keyword arguments:
    f - filename
    baseUrl - address of the root of the website
    dropExtension - true to drop extensions of .html from the filename in urls
    """
    relative = f[1:] if f[0] == "." else f
    relative = sortname(relative, dropExtension)
    baseEndsWithSlash = len(baseUrl) >= 1 and baseUrl[-1] == "/"
    pathStartsWithSlash = len(relative) >= 1 and relative[0] == "/"
    # Ensure exactly one slash joins the base url and the path.
    if pathStartsWithSlash and baseEndsWithSlash :
        relative = relative[1:]
    elif not pathStartsWithSlash and not baseEndsWithSlash :
        relative = "/" + relative
    return baseUrl + relative
# Template for a single <url> element of an xml sitemap:
# {0} is the page's full url, {1} its lastmod timestamp.
xmlSitemapEntryTemplate = """<url>
<loc>{0}</loc>
<lastmod>{1}</lastmod>
</url>"""
def removeTime(dateString) :
    """Removes the time from a date-time.

    Keyword arguments:
    dateString - The date-time.
    """
    # An ISO-8601 date occupies the first 10 characters (YYYY-MM-DD).
    return dateString[0:10]
def xmlEscapeCharacters(f):
    """Escapes any characters that XML requires escaped, such as
    ampersands, etc.

    Keyword arguments:
    f - the filename
    """
    # NOTE(review): the scraped version of this block had its XML
    # entities decoded (e.g., replace("&", "&") was a no-op and the
    # final replacement target became an unterminated triple quote),
    # which both broke the syntax and removed the escaping entirely.
    # Restored the five standard XML entity replacements.
    # Ampersand MUST be escaped first so the "&" introduced by the
    # later entities is not itself re-escaped.
    return f.replace(
        "&", "&amp;"
    ).replace(
        "<", "&lt;"
    ).replace(
        ">", "&gt;"
    ).replace(
        "'", "&apos;"
    ).replace(
        '"', "&quot;"
    )
def xmlSitemapEntry(f, baseUrl, dateString, dropExtension=False, dateOnly=False) :
    """Forms a string with an entry formatted for an xml sitemap
    including lastmod date.

    Keyword arguments:
    f - filename
    baseUrl - address of the root of the website
    dateString - lastmod date correctly formatted
    dropExtension - true to drop extensions of .html from the filename in urls
    dateOnly - true to include only the date portion (no time) in lastmod
    """
    loc = urlstring(xmlEscapeCharacters(f), baseUrl, dropExtension)
    stamp = removeTime(dateString) if dateOnly else dateString
    return xmlSitemapEntryTemplate.format(loc, stamp)
def writeTextSitemap(files, baseUrl, dropExtension=False) :
    """Writes a plain text sitemap to the file sitemap.txt.

    Keyword Arguments:
    files - a list of filenames
    baseUrl - the base url to the root of the website
    dropExtension - true to drop extensions of .html from the filename in urls
    """
    # One url per line, as required by the plain-text sitemap format.
    with open("sitemap.txt", "w") as sitemap :
        sitemap.writelines(
            urlstring(f, baseUrl, dropExtension) + "\n" for f in files
        )
def writeXmlSitemap(files, baseUrl, dropExtension=False, dateOnly=False) :
    """Writes an xml sitemap to the file sitemap.xml.

    Keyword Arguments:
    files - a list of filenames
    baseUrl - the base url to the root of the website
    dropExtension - true to drop extensions of .html from the filename in urls
    dateOnly - true to write only the date (no time) in lastmod tags
    """
    with open("sitemap.xml", "w") as sitemap :
        sitemap.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        sitemap.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        for f in files :
            entry = xmlSitemapEntry(f, baseUrl, lastmod(f), dropExtension, dateOnly)
            sitemap.write(entry + "\n")
        sitemap.write('</urlset>\n')
def set_outputs(names_values) :
    """Sets the GitHub Action outputs.

    Keyword arguments:
    names_values - Dictionary of output names with values
    """
    output_file = os.environ.get("GITHUB_OUTPUT")
    if output_file is not None :
        # Append name=value lines to the file GitHub provides.
        with open(output_file, "a") as f :
            for name, value in names_values.items() :
                print("{0}={1}".format(name, value), file=f)
    else : # Fall-back to deprecated set-output for non-updated runners
        for name, value in names_values.items() :
            print("::set-output name={0}::{1}".format(name, value))
def sanitize_path(websiteRoot) :
    """Verifies specified website root is within the current working directory.

    Keyword arguments:
    websiteRoot - the root of the website relative to current working directory
    """
    workingDir = os.getcwd()
    resolved = os.path.realpath(websiteRoot)
    # The common path equals the working directory exactly when the
    # resolved root lies inside (or equals) the working directory.
    if os.path.commonpath([workingDir, resolved]) == workingDir :
        # resolved is absolute, so join simply returns it.
        return os.path.join(workingDir, resolved)
    print("ERROR: Specified website root directory appears to be outside of current working directory. Exiting....")
    exit(1)
def adjust_path(path):
    """Checks that path is formatted as expected, adjusting if necessary.

    Keyword arguments:
    path - the path to check and adjust
    """
    # Normalize Windows separators and drop a single leading dot.
    normalized = path.replace("\\", "/").removeprefix(".")
    if not normalized:
        return "/"
    return normalized if normalized[0] == "/" else "/" + normalized
def main(
        websiteRoot,
        baseUrl,
        includeHTML,
        includePDF,
        sitemapFormat,
        additionalExt,
        dropExtension,
        dateOnly,
        excludePaths
    ) :
    """The main function of the generate-sitemap GitHub Action.

    Keyword arguments:
    websiteRoot - The path to the root of the website relative
        to the root of the repository.
    baseUrl - The URL of the website.
    includeHTML - A boolean that controls whether to include HTML
        files in the sitemap.
    includePDF - A boolean that controls whether to include PDF
        files in the sitemap.
    sitemapFormat - A string either: xml or txt.
    additionalExt - A set of additional user-defined filename
        extensions for inclusion in the sitemap.
    dropExtension - A boolean that controls whether to drop .html from
        URLs that are to html files (e.g., GitHub Pages will serve
        an html file if URL doesn't include the .html extension).
    dateOnly - If true, includes only the date but not the time in XML
        sitemaps, otherwise includes full date and time in lastmods
        within XML sitemaps.
    excludePaths - A set of paths to exclude from the sitemap, which can
        include directories (relative from the root) or even full
        paths to individual files.
    """
    repo_root = os.getcwd()
    # Validate the website root and make it the working directory,
    # so the os.walk in gatherfiles is relative to the site root.
    os.chdir(sanitize_path(websiteRoot))
    # Fixes "dubious ownership" warning related to
    # how the actions working directory is mounted
    # inside container actions.
    subprocess.run(['git', 'config', '--global', '--add', 'safe.directory', repo_root])
    if len(excludePaths) > 0:
        # Normalize user exclusions (leading slash, forward slashes)
        # to the same form as robots.txt disallow paths.
        excludePaths = { adjust_path(path) for path in excludePaths}
    # Exclusions are the union of robots.txt disallows and user paths.
    blockedPaths = set(parseRobotsTxt()) | excludePaths
    allFiles = gatherfiles(createExtensionSet(includeHTML, includePDF, additionalExt))
    files = [ f for f in allFiles if not robotsBlocked(f, blockedPaths) ]
    urlsort(files, dropExtension)
    pathToSitemap = websiteRoot
    if pathToSitemap[-1] != "/" :
        pathToSitemap += "/"
    if sitemapFormat == "xml" :
        writeXmlSitemap(files, baseUrl, dropExtension, dateOnly)
        pathToSitemap += "sitemap.xml"
    else :
        writeTextSitemap(files, baseUrl, dropExtension)
        pathToSitemap += "sitemap.txt"
    # Report results back to the workflow via action outputs.
    set_outputs({
        "sitemap-path" : pathToSitemap,
        "url-count" : len(files),
        "excluded-count" : len(allFiles)-len(files)
    })
if __name__ == "__main__" :
    # Positional command-line arguments (presumably supplied by the
    # GitHub Action wrapper — confirm against action.yml):
    #   1: website root   2: base url          3: include html (true/false)
    #   4: include pdf    5: sitemap format    6: additional extensions
    #   7: drop .html     8: date only         9: paths to exclude
    # Extension and path lists accept comma- or space-separated values;
    # extensions may also be given with leading dots.
    main(
        websiteRoot = sys.argv[1],
        baseUrl = sys.argv[2],
        includeHTML = sys.argv[3].lower() == "true",
        includePDF = sys.argv[4].lower() == "true",
        sitemapFormat = sys.argv[5],
        additionalExt = set(sys.argv[6].lower().replace(",", " ").replace(".", " ").split()),
        dropExtension = sys.argv[7].lower() == "true",
        dateOnly = sys.argv[8].lower() == "true",
        excludePaths = set(sys.argv[9].replace(",", " ").split())
    )