Python-Markdown · pawamoy · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025 · Feb 27, 2025
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * DRY fix in `abbr` extension by introducing method `create_element` (#1483).
 * Clean up test directory some removing some redundant tests and port
   non-redundant cases to the newer test framework.
+* Improved performance of the raw HTML post-processor (#1510).
 
 ### Fixed
 

diff --git a/markdown/core.py b/markdown/core.py
@@ -50,7 +50,7 @@ class Markdown:
     Attributes:
         Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
         Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
-        Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
+        Markdown.block_level_elements (set[str]): Set of HTML tags which get treated as block-level elements.
             See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
         Markdown.registeredExtensions (list[Extension]): List of extensions which have called
             [`registerExtension`][markdown.Markdown.registerExtension] during setup.
@@ -113,7 +113,7 @@ def __init__(self, **kwargs):
         ]
         """ List of characters which get the backslash escape treatment. """
 
-        self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()
+        self.block_level_elements: set[str] = BLOCK_LEVEL_ELEMENTS.copy()
 
         self.registeredExtensions: list[Extension] = []
         self.docType = ""  # TODO: Maybe delete this. It does not appear to be used anymore.

diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
@@ -42,7 +42,7 @@ class HTMLExtractorExtra(HTMLExtractor):
 
     def __init__(self, md: Markdown, *args, **kwargs):
         # All block-level tags.
-        self.block_level_tags = set(md.block_level_elements.copy())
+        self.block_level_tags = md.block_level_elements.copy()
         # Block-level tags in which the content only gets span level parsing
         self.span_tags = set(
             ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'summary', 'td', 'th']

diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
@@ -28,7 +28,6 @@
 
 from __future__ import annotations
 
-from collections import OrderedDict
 from typing import TYPE_CHECKING, Any
 from . import util
 import re
@@ -73,37 +72,26 @@ class RawHtmlPostprocessor(Postprocessor):
 
     def run(self, text: str) -> str:
         """ Iterate over html stash and restore html. """
-        replacements = OrderedDict()
-        for i in range(self.md.htmlStash.html_counter):
-            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
-            if self.isblocklevel(html):
-                replacements["<p>{}</p>".format(
-                    self.md.htmlStash.get_placeholder(i))] = html
-            replacements[self.md.htmlStash.get_placeholder(i)] = html
-
         def substitute_match(m: re.Match[str]) -> str:
-            key = m.group(0)
-
-            if key not in replacements:
-                if key[3:-4] in replacements:
-                    return f'<p>{ replacements[key[3:-4]] }</p>'
-                else:
-                    return key
-
-            return replacements[key]
-
-        if replacements:
+            if key := m.group(1):
+                wrapped = True
+            else:
+                key = m.group(2)
+                wrapped = False
+            if (key := int(key)) >= self.md.htmlStash.html_counter:
+                return m.group(0)
+            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[key])
+            if not wrapped or self.isblocklevel(html):
+                return pattern.sub(substitute_match, html)
+            return pattern.sub(substitute_match, f"<p>{html}</p>")
+
+        if self.md.htmlStash.html_counter:
             base_placeholder = util.HTML_PLACEHOLDER % r'([0-9]+)'
             pattern = re.compile(f'<p>{ base_placeholder }</p>|{ base_placeholder }')
-            processed_text = pattern.sub(substitute_match, text)
+            return pattern.sub(substitute_match, text)
         else:
             return text
 
-        if processed_text == text:
-            return processed_text
-        else:
-            return self.run(processed_text)
-
     def isblocklevel(self, html: str) -> bool:
         """ Check is block of HTML is block-level. """
         m = self.BLOCK_LEVEL_REGEX.match(html)

diff --git a/markdown/util.py b/markdown/util.py
@@ -29,7 +29,7 @@
 import warnings
 from functools import wraps, lru_cache
 from itertools import count
-from typing import TYPE_CHECKING, Generic, Iterator, NamedTuple, TypeVar, TypedDict, overload
+from typing import TYPE_CHECKING, Callable, Generic, Iterator, NamedTuple, TypeVar, TypedDict, overload
 
 if TYPE_CHECKING:  # pragma: no cover
     from markdown import Markdown
@@ -44,7 +44,269 @@
 """
 
 
-BLOCK_LEVEL_ELEMENTS: list[str] = [
+# TODO: Raise errors from list methods in the future.
+# Later, remove this class entirely and use a regular set.
+class _BlockLevelElements:
+    def __init__(self, elements: list[str], /) -> None:
+        self._list = elements.copy()
+        self._set = set(self._list)
+
+    def __add__(self, other: list[str], /) -> list[str]:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # Using `+` means user expects a list back.
+        return self._list + other
+
+    def __and__(self, other: set[str], /) -> set[str]:
+        # Using `&` means user expects a set back.
+        return self._set & other
+
+    def __contains__(self, item: str, /) -> bool:
+        return item in self._set
+
+    def __delitem__(self, key: int, /) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        element = self._list[key]
+        del self._list[key]
+        self._set.remove(element)
+
+    def __getitem__(self, index: int, /) -> str:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        return self._list[index]
+
+    def __iadd__(self, other: list[str], /) -> set[str]:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # In-place addition should update both list and set.
+        self._list += other
+        self._set.update(set(other))
+        return self  # type: ignore[return-value]
+
+    def __iand__(self, other: set[str], /) -> set[str]:
+        # In-place intersection should update both list and set.
+        self._list = [element for element in self._list if element in other]
+        self._set &= other
+        return self  # type: ignore[return-value]
+
+    def __ior__(self, other: set[str], /) -> set[str]:
+        # In-place union should update both list and set.
+        for element in other:
+            if element not in self._set:
+                self._list.append(element)
+        self._set |= other
+        return self  # type: ignore[return-value]
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._list)
+
+    def __len__(self) -> int:
+        # Length of the list, for backwards compatibility.
+        # If used as a set, both lengths will be the same.
+        return len(self._list)
+
+    def __or__(self, value: set[str], /) -> set[str]:
+        # Using `|` means user expects a set back.
+        return self._set | value
+
+    def __rand__(self, value: set[str], /) -> set[str]:
+        # Using `&` means user expects a set back.
+        return value & self._set
+
+    def __ror__(self, value: set[str], /) -> set[str]:
+        # Using `|` means user expects a set back.
+        return value | self._set
+
+    def __rsub__(self, value: set[str], /) -> set[str]:
+        # Using `-` means user expects a set back.
+        return value - self._set
+
+    def __rxor__(self, value: set[str], /) -> set[str]:
+        # Using `^` means user expects a set back.
+        return value ^ self._set
+
+    def __sub__(self, value: set[str], /) -> set[str]:
+        # Using `-` means user expects a set back.
+        return self._set - value
+
+    def __xor__(self, value: set[str], /) -> set[str]:
+        # Using `^` means user expects a set back.
+        return self._set ^ value
+
+    def __reversed__(self) -> Iterator[str]:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        return reversed(self._list)
+
+    def __setitem__(self, key: int, value: str, /) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # In-place item-setting should update both list and set.
+        old = self._list[key]
+        self._list[key] = value
+        self._set.discard(old)
+        self._set.add(value)
+
+    def __str__(self) -> str:
+        return str(self._set)
+
+    def add(self, element: str, /) -> None:
+        # In-place addition should update both list and set.
+        self._set.add(element)
+        self._list.append(element)
+
+    def append(self, element: str, /) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # In-place addition should update both list and set.
+        self._list.append(element)
+        self._set.add(element)
+
+    def clear(self) -> None:
+        self._list.clear()
+        self._set.clear()
+
+    def copy(self) -> _BlockLevelElements:
+        # We're not sure yet whether the user wants to use it as a set or list.
+        return _BlockLevelElements(self._list)
+
+    def count(self, value: str, /) -> int:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # Count in list, for backwards compatibility.
+        # If used as a set, both counts will be the same (1).
+        return self._list.count(value)
+
+    def difference(self, *others: set[str]) -> set[str]:
+        # User expects a set back.
+        return self._set.difference(*others)
+
+    def difference_update(self, *others: set[str]) -> None:
+        # In-place difference should update both list and set.
+        self._set.difference_update(*others)
+        self._list.clear()
+        self._list.extend(sorted(self._set))
+
+    def discard(self, element: str, /) -> None:
+        # In-place discard should update both list and set.
+        self._set.discard(element)
+        while True:
+            try:
+                self._list.remove(element)
+            except ValueError:
+                break
+
+    def extend(self, elements: list[str], /) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # In-place extension should update both list and set.
+        for element in elements:
+            if element not in self._list:
+                self._list.append(element)
+        self._set.update(elements)
+
+    def index(self, value, start: int = 0, stop: int = sys.maxsize, /):
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        return self._list.index(value, start, stop)
+
+    def insert(self, index: int, element: str, /) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        # In-place insertion should update both list and set.
+        self._list.insert(index, element)
+        self._set.add(element)
+
+    def intersection(self, *others: set[str]) -> set[str]:
+        # User expects a set back.
+        return self._set.intersection(*others)
+
+    def intersection_update(self, *others: set[str]) -> None:
+        # In-place intersection should update both list and set.
+        self._set.intersection_update(*others)
+        self._list.clear()
+        self._list.extend(sorted(self._set))
+
+    def isdisjoint(self, other: set[str], /) -> bool:
+        return self._set.isdisjoint(other)
+
+    def issubset(self, other: set[str], /) -> bool:
+        return self._set.issubset(other)
+
+    def issuperset(self, other: set[str], /) -> bool:
+        return self._set.issuperset(other)
+
+    def pop(self, index: int = -1, /) -> str:
+        # In-place pop should update both list and set.
+        element = self._list.pop(index)
+        self._set.remove(element)
+        return element
+
+    def remove(self, element: str, /) -> None:
+        # In-place removal should update both list and set.
+        self._list.remove(element)
+        self._set.remove(element)
+
+    def reverse(self) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        self._list.reverse()
+
+    def sort(self, /, *, key: Callable | None = None, reverse: bool = False) -> None:
+        warnings.warn(
+            "Using block level elements as a list is deprecated, use it as a set instead.",
+            DeprecationWarning,
+        )
+        self._list.sort(key=key, reverse=reverse)
+
+    def symmetric_difference(self, other: set[str], /) -> set[str]:
+        # User expects a set back.
+        return self._set.symmetric_difference(other)
+
+    def symmetric_difference_update(self, other: set[str], /) -> None:
+        # In-place symmetric difference should update both list and set.
+        self._set.symmetric_difference_update(other)
+        self._list.clear()
+        self._list.extend(sorted(self._set))
+
+    def union(self, *others: set[str]) -> set[str]:
+        # User expects a set back.
+        return self._set.union(*others)
+
+    def update(self, *others: set[str]) -> None:
+        # In-place union should update both list and set.
+        self._set.update(*others)
+        self._list.clear()
+        self._list.extend(sorted(self._set))
+
+
+# Type it as `set[str]` to express our intent for it to be used as such.
+BLOCK_LEVEL_ELEMENTS: set[str] = _BlockLevelElements([
     # Elements which are invalid to wrap in a `<p>` tag.
     # See https://w3c.github.io/html/grouping-content.html#the-p-element
     'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
@@ -56,9 +318,9 @@
     'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
     'style', 'summary', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video',
     'center'
-]
+])  # type: ignore[assignment]
 """
-List of HTML tags which get treated as block-level elements. Same as the `block_level_elements`
+Set of HTML tags which get treated as block-level elements. Same as the `block_level_elements`
 attribute of the [`Markdown`][markdown.Markdown] class. Generally one should use the
 attribute on the class. This remains for compatibility with older extensions.
 """

diff --git a/tests/test_apis.py b/tests/test_apis.py
@@ -920,7 +920,7 @@ class TestBlockAppend(unittest.TestCase):
     def testBlockAppend(self):
         """ Test that appended escapes are only in the current instance. """
         md = markdown.Markdown()
-        md.block_level_elements.append('test')
+        md.block_level_elements.add('test')
         self.assertEqual('test' in md.block_level_elements, True)
         md2 = markdown.Markdown()
         self.assertEqual('test' not in md2.block_level_elements, True)