Skip to content

Commit

Permalink
Fix squidfunk#8012 - Privacy plugin crashes on HTTP errors
Browse files Browse the repository at this point in the history
  • Loading branch information
Cimon Lucas (LCM) committed Feb 18, 2025
1 parent 2752b9e commit 80f7ced
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 102 deletions.
105 changes: 54 additions & 51 deletions material/plugins/privacy/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,13 @@
from .config import PrivacyConfig
from .parser import FragmentParser

# Timeout in seconds applied to outbound HTTP requests when fetching
# external assets, so a stalled server can't hang the build indefinitely
DEFAULT_TIMEOUT_IN_SECS = 5

# -----------------------------------------------------------------------------
# Classes
# -----------------------------------------------------------------------------


# Privacy plugin
class PrivacyPlugin(BasePlugin[PrivacyConfig]):

Expand All @@ -66,7 +69,7 @@ def on_config(self, config):
self.assets_expr_map = {
".css": r"url\(\s*([\"']?)(?P<url>http?[^)'\"]+)\1\s*\)",
".js": r"[\"'](?P<url>http[^\"']+\.(?:css|js(?:on)?))[\"']",
**self.config.assets_expr_map
**self.config.assets_expr_map,
}

# Process external style sheets and scripts (run latest) - run this after
Expand All @@ -89,7 +92,7 @@ def on_files(self, files, *, config):
# downloaded. Create and enqueue a job for each external asset.
for url in self._parse_media(initiator):
if not self._is_excluded(url, initiator):
file = self._queue(url, config, concurrent = True)
file = self._queue(url, config, concurrent=True)

# If site URL is not given, ensure that Mermaid.js is always
# present. This is a special case, as Material for MkDocs
Expand All @@ -110,7 +113,7 @@ def on_files(self, files, *, config):
for path in config.extra_css:
url = urlparse(path)
if not self._is_excluded(url):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Process external script files
for script in config.extra_javascript:
Expand All @@ -120,7 +123,7 @@ def on_files(self, files, *, config):
# Enqueue a job if the script needs to be downloaded
url = urlparse(script.path)
if not self._is_excluded(url):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Process external images in page (run latest) - this stage is the earliest
# we can start processing external images, since images are the most common
Expand All @@ -137,15 +140,14 @@ def on_page_content(self, html, *, page, config, files):

# Find all external images and download them if not excluded
for match in re.findall(
r"<img[^>]+src=['\"]?http[^>]+>",
html, flags = re.I | re.M
r"<img[^>]+src=['\"]?http[^>]+>", html, flags=re.I | re.M
):
el = self._parse_fragment(match)

# Create and enqueue job to fetch external image
url = urlparse(el.get("src"))
if not self._is_excluded(url, page.file):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Sync all concurrent jobs
def on_env(self, env, *, config, files):
Expand Down Expand Up @@ -194,9 +196,7 @@ def on_post_build(self, *, config):
for file in self.assets:
_, extension = posixpath.splitext(file.dest_uri)
if extension in [".css", ".js"]:
self.pool_jobs.append(self.pool.submit(
self._patch, file
))
self.pool_jobs.append(self.pool.submit(self._patch, file))

# Otherwise just copy external asset to output directory
else:
Expand Down Expand Up @@ -226,11 +226,9 @@ def _is_excluded(self, url: URL, initiator: File | None = None):
# If initiator is given, format for printing
via = ""
if initiator:
via = "".join([
Fore.WHITE, Style.DIM,
f"in '{initiator.src_uri}' ",
Style.RESET_ALL
])
via = "".join(
[Fore.WHITE, Style.DIM, f"in '{initiator.src_uri}' ", Style.RESET_ALL]
)

# Print warning if fetching is not enabled
if not self.config.assets_fetch:
Expand All @@ -257,8 +255,7 @@ def _parse_fragment(self, fragment: str):
# quote, we need to catch this here, as we're using pretty basic
# regular expression based extraction
raise PluginError(
f"Could not parse due to possible syntax error in HTML: \n\n"
+ fragment
f"Could not parse due to possible syntax error in HTML: \n\n" + fragment
)

# Parse and extract all external assets from a media file using a preset
Expand All @@ -274,8 +271,8 @@ def _parse_media(self, initiator: File) -> list[URL]:
return []

# Find and extract all external asset URLs
expr = re.compile(self.assets_expr_map[extension], flags = re.I | re.M)
with open(initiator.abs_src_path, encoding = "utf-8-sig") as f:
expr = re.compile(self.assets_expr_map[extension], flags=re.I | re.M)
with open(initiator.abs_src_path, encoding="utf-8-sig") as f:
results = re.finditer(expr, f.read())
return [urlparse(result.group("url")) for result in results]

Expand Down Expand Up @@ -332,7 +329,9 @@ def replace(match: Match):
# Find and replace all external asset URLs in current page
return re.sub(
r"<(?:(?:a|link|image)[^>]+href|(?:script|img)[^>]+src)=['\"]?http[^>]+>",
replace, output, flags = re.I | re.M
replace,
output,
flags=re.I | re.M,
)

# -------------------------------------------------------------------------
Expand All @@ -348,11 +347,11 @@ def _print(self, el: Element):
el.attrib[name] = temp

# Return void or opening tag as string, strip closing tag
data = tostring(el, encoding = "unicode")
return data.replace(" />", ">").replace(f"=\"{temp}\"", "")
data = tostring(el, encoding="unicode")
return data.replace(" />", ">").replace(f'="{temp}"', "")

# Enqueue external asset for download, if not already done
def _queue(self, url: URL, config: MkDocsConfig, concurrent = False):
def _queue(self, url: URL, config: MkDocsConfig, concurrent=False):
path = self._path_from_url(url)
full = posixpath.join(self.config.assets_fetch_dir, path)

Expand All @@ -373,9 +372,7 @@ def _queue(self, url: URL, config: MkDocsConfig, concurrent = False):
# the caller must only ensure to reconcile the concurrent jobs.
_, extension = posixpath.splitext(url.path)
if extension and concurrent:
self.pool_jobs.append(self.pool.submit(
self._fetch, file, config
))
self.pool_jobs.append(self.pool.submit(self._fetch, file, config))

# Fetch external asset synchronously, as it either has no extension
# or is fetched from a context in which replacements are done
Expand Down Expand Up @@ -404,18 +401,28 @@ def _fetch(self, file: File, config: MkDocsConfig):

# Download external asset
log.info(f"Downloading external file: {file.url}")
res = requests.get(file.url, headers = {

# Set user agent explicitly, so Google Fonts gives us *.woff2
# files, which according to caniuse.com is the only format we
# need to download as it covers the entire range of browsers
# we're officially supporting.
"User-Agent": " ".join([
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
"AppleWebKit/537.36 (KHTML, like Gecko)",
"Chrome/98.0.4758.102 Safari/537.36"
])
})
try:
res = requests.get(
file.url,
headers={
# Set user agent explicitly, so Google Fonts gives us *.woff2
# files, which according to caniuse.com is the only format we
# need to download as it covers the entire range of browsers
# we're officially supporting.
"User-Agent": " ".join(
[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
"AppleWebKit/537.36 (KHTML, like Gecko)",
"Chrome/98.0.4758.102 Safari/537.36",
]
)
},
timeout=DEFAULT_TIMEOUT_IN_SECS,
)
res.raise_for_status()
except Exception as error: # this could be a ConnectionError or an HTTPError
log.error(f"Could not retrieve {file.url}: {error}")
return

# Compute expected file extension and append if missing
mime = res.headers["content-type"].split(";")[0]
Expand All @@ -433,9 +440,7 @@ def _fetch(self, file: File, config: MkDocsConfig):
os.symlink(os.path.basename(path), file.abs_src_path)
except OSError as e:
if e.errno != errno.EEXIST:
log.warning(
f"Couldn't create symbolic link: {file.src_uri}"
)
log.warning(f"Couldn't create symbolic link: {file.src_uri}")

# Fall back for when the symlink could not be created. This
# means that the plugin will download the original file on
Expand Down Expand Up @@ -464,11 +469,11 @@ def _fetch(self, file: File, config: MkDocsConfig):
# Parse and enqueue dependent external assets
for url in self._parse_media(file):
if not self._is_excluded(url, file):
self._queue(url, config, concurrent = True)
self._queue(url, config, concurrent=True)

# Patch all links to external assets in the given file
def _patch(self, initiator: File):
with open(initiator.abs_src_path, encoding = "utf-8-sig") as f:
with open(initiator.abs_src_path, encoding="utf-8-sig") as f:

# Replace callback
def replace(match: Match):
Expand Down Expand Up @@ -515,10 +520,7 @@ def replace(match: Match):
expr = re.compile(self.assets_expr_map[extension], re.I | re.M)

# Resolve links to external assets in file
self._save_to_file(
initiator.abs_dest_path,
expr.sub(replace, f.read())
)
self._save_to_file(initiator.abs_dest_path, expr.sub(replace, f.read()))

# -------------------------------------------------------------------------

Expand All @@ -540,7 +542,7 @@ def _path_from_url(self, url: URL):
path = f"{name}.{digest}{extension}"

# Create and return URL without leading double slashes
url = url._replace(scheme = "", query = "", fragment = "", path = path)
url = url._replace(scheme="", query="", fragment="", path=path)
return url.geturl()[2:]

# Create a file for the given path
Expand All @@ -549,17 +551,18 @@ def _path_to_file(self, path: str, config: MkDocsConfig):
posixpath.join(self.config.assets_fetch_dir, unquote(path)),
os.path.abspath(self.config.cache_dir),
config.site_dir,
False
False,
)

# Create a file on the system with the given content
def _save_to_file(self, path: str, content: str | bytes):
os.makedirs(os.path.dirname(path), exist_ok = True)
os.makedirs(os.path.dirname(path), exist_ok=True)
if isinstance(content, str):
content = bytes(content, "utf-8")
with open(path, "wb") as f:
f.write(content)


# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------
Expand All @@ -577,5 +580,5 @@ def _save_to_file(self, path: str, content: str | bytes):
"image/svg+xml": ".svg",
"image/webp": ".webp",
"text/javascript": ".js",
"text/css": ".css"
"text/css": ".css",
}
Loading

0 comments on commit 80f7ced

Please sign in to comment.