Skip to content

Commit

Permalink
Fix squidfunk#8012 - Privacy plugin crashes on HTTP errors
Browse files Browse the repository at this point in the history
  • Loading branch information
Cimon Lucas (LCM) committed Feb 18, 2025
1 parent 2752b9e commit 80f7ced
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 102 deletions.
105 changes: 54 additions & 51 deletions material/plugins/privacy/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,13 @@
from .config import PrivacyConfig
from .parser import FragmentParser

# Timeout in seconds applied to outbound HTTP requests when fetching
# external assets, so a stalled server can't hang the build indefinitely
DEFAULT_TIMEOUT_IN_SECS = 5

# -----------------------------------------------------------------------------
# Classes
# -----------------------------------------------------------------------------


# Privacy plugin
class PrivacyPlugin(BasePlugin[PrivacyConfig]):

Expand All @@ -66,7 +69,7 @@ def on_config(self, config):
self.assets_expr_map = {
".css": r"url\(\s*([\"']?)(?P<url>http?[^)'\"]+)\1\s*\)",
".js": r"[\"'](?P<url>http[^\"']+\.(?:css|js(?:on)?))[\"']",
**self.config.assets_expr_map
**self.config.assets_expr_map,
}

# Process external style sheets and scripts (run latest) - run this after
Expand All @@ -89,7 +92,7 @@ def on_files(self, files, *, config):
# downloaded. Create and enqueue a job for each external asset.
for url in self._parse_media(initiator):
if not self._is_excluded(url, initiator):
file = self._queue(url, config, concurrent = True)
file = self._queue(url, config, concurrent=True)

# If site URL is not given, ensure that Mermaid.js is always
# present. This is a special case, as Material for MkDocs
Expand All @@ -110,7 +113,7 @@ def on_files(self, files, *, config):
for path in config.extra_css:
url = urlparse(path)
if not self._is_excluded(url):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Process external script files
for script in config.extra_javascript:
Expand All @@ -120,7 +123,7 @@ def on_files(self, files, *, config):
# Enqueue a job if the script needs to be downloaded
url = urlparse(script.path)
if not self._is_excluded(url):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Process external images in page (run latest) - this stage is the earliest
# we can start processing external images, since images are the most common
Expand All @@ -137,15 +140,14 @@ def on_page_content(self, html, *, page, config, files):

# Find all external images and download them if not excluded
for match in re.findall(
r"<img[^>]+src=['\"]?http[^>]+>",
html, flags = re.I | re.M
r"<img[^>]+src=['\"]?http[^>]+>", html, flags=re.I | re.M
):
el = self._parse_fragment(match)

# Create and enqueue job to fetch external image
url = urlparse(el.get("src"))
if not self._is_excluded(url, page.file):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Sync all concurrent jobs
def on_env(self, env, *, config, files):
Expand Down Expand Up @@ -194,9 +196,7 @@ def on_post_build(self, *, config):
for file in self.assets:
_, extension = posixpath.splitext(file.dest_uri)
if extension in [".css", ".js"]:
self.pool_jobs.append(self.pool.submit(
self._patch, file
))
self.pool_jobs.append(self.pool.submit(self._patch, file))

# Otherwise just copy external asset to output directory
else:
Expand Down Expand Up @@ -226,11 +226,9 @@ def _is_excluded(self, url: URL, initiator: File | None = None):
# If initiator is given, format for printing
via = ""
if initiator:
via = "".join([
Fore.WHITE, Style.DIM,
f"in '{initiator.src_uri}' ",
Style.RESET_ALL
])
via = "".join(
[Fore.WHITE, Style.DIM, f"in '{initiator.src_uri}' ", Style.RESET_ALL]
)

# Print warning if fetching is not enabled
if not self.config.assets_fetch:
Expand All @@ -257,8 +255,7 @@ def _parse_fragment(self, fragment: str):
# quote, we need to catch this here, as we're using pretty basic
# regular expression based extraction
raise PluginError(
f"Could not parse due to possible syntax error in HTML: \n\n"
+ fragment
f"Could not parse due to possible syntax error in HTML: \n\n" + fragment
)

# Parse and extract all external assets from a media file using a preset
Expand All @@ -274,8 +271,8 @@ def _parse_media(self, initiator: File) -> list[URL]:
return []

# Find and extract all external asset URLs
expr = re.compile(self.assets_expr_map[extension], flags = re.I | re.M)
with open(initiator.abs_src_path, encoding = "utf-8-sig") as f:
expr = re.compile(self.assets_expr_map[extension], flags=re.I | re.M)
with open(initiator.abs_src_path, encoding="utf-8-sig") as f:
results = re.finditer(expr, f.read())
return [urlparse(result.group("url")) for result in results]

Expand Down Expand Up @@ -332,7 +329,9 @@ def replace(match: Match):
# Find and replace all external asset URLs in current page
return re.sub(
r"<(?:(?:a|link|image)[^>]+href|(?:script|img)[^>]+src)=['\"]?http[^>]+>",
replace, output, flags = re.I | re.M
replace,
output,
flags=re.I | re.M,
)

# -------------------------------------------------------------------------
Expand All @@ -348,11 +347,11 @@ def _print(self, el: Element):
el.attrib[name] = temp

# Return void or opening tag as string, strip closing tag
data = tostring(el, encoding = "unicode")
return data.replace(" />", ">").replace(f"=\"{temp}\"", "")
data = tostring(el, encoding="unicode")
return data.replace(" />", ">").replace(f'="{temp}"', "")

# Enqueue external asset for download, if not already done
def _queue(self, url: URL, config: MkDocsConfig, concurrent = False):
def _queue(self, url: URL, config: MkDocsConfig, concurrent=False):
path = self._path_from_url(url)
full = posixpath.join(self.config.assets_fetch_dir, path)

Expand All @@ -373,9 +372,7 @@ def _queue(self, url: URL, config: MkDocsConfig, concurrent = False):
# the caller must only ensure to reconcile the concurrent jobs.
_, extension = posixpath.splitext(url.path)
if extension and concurrent:
self.pool_jobs.append(self.pool.submit(
self._fetch, file, config
))
self.pool_jobs.append(self.pool.submit(self._fetch, file, config))

# Fetch external asset synchronously, as it either has no extension
# or is fetched from a context in which replacements are done
Expand Down Expand Up @@ -404,18 +401,28 @@ def _fetch(self, file: File, config: MkDocsConfig):

# Download external asset
log.info(f"Downloading external file: {file.url}")
res = requests.get(file.url, headers = {

# Set user agent explicitly, so Google Fonts gives us *.woff2
# files, which according to caniuse.com is the only format we
# need to download as it covers the entire range of browsers
# we're officially supporting.
"User-Agent": " ".join([
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
"AppleWebKit/537.36 (KHTML, like Gecko)",
"Chrome/98.0.4758.102 Safari/537.36"
])
})
try:
res = requests.get(
file.url,
headers={
# Set user agent explicitly, so Google Fonts gives us *.woff2
# files, which according to caniuse.com is the only format we
# need to download as it covers the entire range of browsers
# we're officially supporting.
"User-Agent": " ".join(
[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
"AppleWebKit/537.36 (KHTML, like Gecko)",
"Chrome/98.0.4758.102 Safari/537.36",
]
)
},
timeout=DEFAULT_TIMEOUT_IN_SECS,
)
res.raise_for_status()
except Exception as error: # this could be a ConnectionError or an HTTPError
log.error(f"Could not retrieve {file.url}: {error}")
return

# Compute expected file extension and append if missing
mime = res.headers["content-type"].split(";")[0]
Expand All @@ -433,9 +440,7 @@ def _fetch(self, file: File, config: MkDocsConfig):
os.symlink(os.path.basename(path), file.abs_src_path)
except OSError as e:
if e.errno != errno.EEXIST:
log.warning(
f"Couldn't create symbolic link: {file.src_uri}"
)
log.warning(f"Couldn't create symbolic link: {file.src_uri}")

# Fall back for when the symlink could not be created. This
# means that the plugin will download the original file on
Expand Down Expand Up @@ -464,11 +469,11 @@ def _fetch(self, file: File, config: MkDocsConfig):
# Parse and enqueue dependent external assets
for url in self._parse_media(file):
if not self._is_excluded(url, file):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Patch all links to external assets in the given file
def _patch(self, initiator: File):
with open(initiator.abs_src_path, encoding = "utf-8-sig") as f:
with open(initiator.abs_src_path, encoding="utf-8-sig") as f:

# Replace callback
def replace(match: Match):
Expand Down Expand Up @@ -515,10 +520,7 @@ def replace(match: Match):
expr = re.compile(self.assets_expr_map[extension], re.I | re.M)

# Resolve links to external assets in file
self._save_to_file(
initiator.abs_dest_path,
expr.sub(replace, f.read())
)
self._save_to_file(initiator.abs_dest_path, expr.sub(replace, f.read()))

# -------------------------------------------------------------------------

Expand All @@ -540,7 +542,7 @@ def _path_from_url(self, url: URL):
path = f"{name}.{digest}{extension}"

# Create and return URL without leading double slashes
url = url._replace(scheme = "", query = "", fragment = "", path = path)
url = url._replace(scheme="", query="", fragment="", path=path)
return url.geturl()[2:]

# Create a file for the given path
Expand All @@ -549,17 +551,18 @@ def _path_to_file(self, path: str, config: MkDocsConfig):
posixpath.join(self.config.assets_fetch_dir, unquote(path)),
os.path.abspath(self.config.cache_dir),
config.site_dir,
False
False,
)

# Create a file on the system with the given content
def _save_to_file(self, path: str, content: str | bytes):
os.makedirs(os.path.dirname(path), exist_ok = True)
os.makedirs(os.path.dirname(path), exist_ok=True)
if isinstance(content, str):
content = bytes(content, "utf-8")
with open(path, "wb") as f:
f.write(content)


# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------
Expand All @@ -577,5 +580,5 @@ def _save_to_file(self, path: str, content: str | bytes):
"image/svg+xml": ".svg",
"image/webp": ".webp",
"text/javascript": ".js",
"text/css": ".css"
"text/css": ".css",
}
Loading

0 comments on commit 80f7ced

Please sign in to comment.