Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize raw HTML post-processor #1510

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* DRY fix in `abbr` extension by introducing method `create_element` (#1483).
* Clean up test directory some removing some redundant tests and port
non-redundant cases to the newer test framework.
* Improved performance of the raw HTML post-processor (#1510).

### Fixed

Expand Down
4 changes: 2 additions & 2 deletions markdown/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class Markdown:
Attributes:
Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
Markdown.block_level_elements (set[str]): Set of HTML tags which get treated as block-level elements.
See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
Markdown.registeredExtensions (list[Extension]): List of extensions which have called
[`registerExtension`][markdown.Markdown.registerExtension] during setup.
Expand Down Expand Up @@ -113,7 +113,7 @@ def __init__(self, **kwargs):
]
""" List of characters which get the backslash escape treatment. """

self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()
self.block_level_elements: set[str] = BLOCK_LEVEL_ELEMENTS.copy()

self.registeredExtensions: list[Extension] = []
self.docType = "" # TODO: Maybe delete this. It does not appear to be used anymore.
Expand Down
2 changes: 1 addition & 1 deletion markdown/extensions/md_in_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class HTMLExtractorExtra(HTMLExtractor):

def __init__(self, md: Markdown, *args, **kwargs):
# All block-level tags.
self.block_level_tags = set(md.block_level_elements.copy())
self.block_level_tags = md.block_level_elements.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed this, but this was already being converted to a set in this extension. Maybe we should keep that behavior rather than leave it as our custom type.

# Block-level tags in which the content only gets span level parsing
self.span_tags = set(
['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'summary', 'td', 'th']
Expand Down
40 changes: 14 additions & 26 deletions markdown/postprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

from __future__ import annotations

from collections import OrderedDict
from typing import TYPE_CHECKING, Any
from . import util
import re
Expand Down Expand Up @@ -73,37 +72,26 @@ class RawHtmlPostprocessor(Postprocessor):

def run(self, text: str) -> str:
""" Iterate over html stash and restore html. """
replacements = OrderedDict()
for i in range(self.md.htmlStash.html_counter):
html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
if self.isblocklevel(html):
replacements["<p>{}</p>".format(
self.md.htmlStash.get_placeholder(i))] = html
replacements[self.md.htmlStash.get_placeholder(i)] = html

def substitute_match(m: re.Match[str]) -> str:
key = m.group(0)

if key not in replacements:
if key[3:-4] in replacements:
return f'<p>{ replacements[key[3:-4]] }</p>'
else:
return key

return replacements[key]

if replacements:
if key := m.group(1):
wrapped = True
else:
key = m.group(2)
wrapped = False
if (key := int(key)) >= self.md.htmlStash.html_counter:
return m.group(0)
html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[key])
if not wrapped or self.isblocklevel(html):
return pattern.sub(substitute_match, html)
return pattern.sub(substitute_match, f"<p>{html}</p>")

if self.md.htmlStash.html_counter:
base_placeholder = util.HTML_PLACEHOLDER % r'([0-9]+)'
pattern = re.compile(f'<p>{ base_placeholder }</p>|{ base_placeholder }')
processed_text = pattern.sub(substitute_match, text)
return pattern.sub(substitute_match, text)
else:
return text

if processed_text == text:
return processed_text
else:
return self.run(processed_text)

def isblocklevel(self, html: str) -> bool:
""" Check is block of HTML is block-level. """
m = self.BLOCK_LEVEL_REGEX.match(html)
Expand Down
270 changes: 266 additions & 4 deletions markdown/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import warnings
from functools import wraps, lru_cache
from itertools import count
from typing import TYPE_CHECKING, Generic, Iterator, NamedTuple, TypeVar, TypedDict, overload
from typing import TYPE_CHECKING, Callable, Generic, Iterator, NamedTuple, TypeVar, TypedDict, overload

if TYPE_CHECKING: # pragma: no cover
from markdown import Markdown
Expand All @@ -44,7 +44,269 @@
"""


BLOCK_LEVEL_ELEMENTS: list[str] = [
# TODO: Raise errors from list methods in the future.
# Later, remove this class entirely and use a regular set.
class _BlockLevelElements:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elsewhere the type is specified as set. However, as this is not a subclass of set, that would be incorrect. Also, later when we remove this, then we will have a set. Perhaps this class should properly reflect that now. Unfortunately, there are no specialized container datatypes for sets in collections. And sets are not a Sequence Type either. Not really sure how best to address this.

Copy link
Contributor Author

@pawamoy pawamoy Feb 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My feeling was that it was OK to lie about the type, so that users actually get type warnings when they use it as a list instead of a set. That's one more (very effective) way of communicating the change. And since we implement all set functionality, the lie is not too big 🙂

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could do that, but some users run type validation on their code and this could result in bug reports. Either we need to fully document our reasoning for the inconsistency in the code comments or we need to make a change to the code.

def __init__(self, elements: list[str], /) -> None:
self._list = elements.copy()
self._set = set(self._list)

def __add__(self, other: list[str], /) -> list[str]:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
Comment on lines +55 to +58
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I emit the same warning every time, so that users only get it once. The message goes straight to the point: don't use it as a list.

# Using `+` means user expects a list back.
return self._list + other
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Methods with no side-effects return a list or set directly, not a copy of our hybrid instance. The reason is we assume that as soon as a user explicitly uses it as a list (or set), they will only use it as a list (or set), so no need to continue providing both interfaces.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not necessarily. The same object is likely to be accessed by multiple extensions. It is reasonable to assume that some newly updated extensions will assume a set and some older (not yet updated) extensions will assume a list. Whatever solution we chose needs to continue to work with both throughout the life of the Markdown class instance.

Copy link
Contributor Author

@pawamoy pawamoy Feb 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I thought about that, though since these methods return a new object (that's what I meant with "no side-effect", which wasn't super clear), I figured there are very few chances multiple extensions would actually share references to such newly created lists/sets (each new list/set would probably be isolated to the extension that created it). Anyway, I can definitely change that to return a _BlockLevelElements instance, to be more robust 🙂

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I missed that this comment was in a specific method which returns a modified object. Yeah, in this case, returning the same object type makes sense.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries, I wasn't very clear about it. @facelessuser expressed the same concern, so we might want to return a new instance of _BlockLevelElements, just to be safe. Let me know what you prefer.


def __and__(self, other: set[str], /) -> set[str]:
# Using `&` means user expects a set back.
return self._set & other

def __contains__(self, item: str, /) -> bool:
return item in self._set

def __delitem__(self, key: int, /) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
element = self._list[key]
del self._list[key]
self._set.remove(element)

def __getitem__(self, index: int, /) -> str:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
return self._list[index]

def __iadd__(self, other: list[str], /) -> set[str]:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
# In-place addition should update both list and set.
self._list += other
self._set.update(set(other))
return self # type: ignore[return-value]

def __iand__(self, other: set[str], /) -> set[str]:
# In-place intersection should update both list and set.
self._list = [element for element in self._list if element in other]
self._set &= other
return self # type: ignore[return-value]

def __ior__(self, other: set[str], /) -> set[str]:
# In-place union should update both list and set.
for element in other:
if element not in self._set:
self._list.append(element)
self._set |= other
return self # type: ignore[return-value]

def __iter__(self) -> Iterator[str]:
return iter(self._list)

def __len__(self) -> int:
# Length of the list, for backwards compatibility.
# If used as a set, both lengths will be the same.
return len(self._list)

def __or__(self, value: set[str], /) -> set[str]:
# Using `|` means user expects a set back.
return self._set | value

def __rand__(self, value: set[str], /) -> set[str]:
# Using `&` means user expects a set back.
return value & self._set

def __ror__(self, value: set[str], /) -> set[str]:
# Using `|` means user expects a set back.
return value | self._set

def __rsub__(self, value: set[str], /) -> set[str]:
# Using `-` means user expects a set back.
return value - self._set

def __rxor__(self, value: set[str], /) -> set[str]:
# Using `^` means user expects a set back.
return value ^ self._set

def __sub__(self, value: set[str], /) -> set[str]:
# Using `-` means user expects a set back.
return self._set - value

def __xor__(self, value: set[str], /) -> set[str]:
# Using `^` means user expects a set back.
return self._set ^ value

def __reversed__(self) -> Iterator[str]:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
return reversed(self._list)

def __setitem__(self, key: int, value: str, /) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
# In-place item-setting should update both list and set.
old = self._list[key]
self._list[key] = value
self._set.discard(old)
self._set.add(value)

def __str__(self) -> str:
return str(self._set)

def add(self, element: str, /) -> None:
# In-place addition should update both list and set.
self._set.add(element)
self._list.append(element)

def append(self, element: str, /) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
# In-place addition should update both list and set.
self._list.append(element)
self._set.add(element)

def clear(self) -> None:
self._list.clear()
self._set.clear()

def copy(self) -> _BlockLevelElements:
# We're not sure yet whether the user wants to use it as a set or list.
return _BlockLevelElements(self._list)

def count(self, value: str, /) -> int:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
# Count in list, for backwards compatibility.
# If used as a set, both counts will be the same (1).
return self._list.count(value)

def difference(self, *others: set[str]) -> set[str]:
# User expects a set back.
return self._set.difference(*others)

def difference_update(self, *others: set[str]) -> None:
# In-place difference should update both list and set.
self._set.difference_update(*others)
self._list.clear()
self._list.extend(sorted(self._set))

def discard(self, element: str, /) -> None:
# In-place discard should update both list and set.
self._set.discard(element)
while True:
try:
self._list.remove(element)
except ValueError:
break

def extend(self, elements: list[str], /) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
# In-place extension should update both list and set.
for element in elements:
if element not in self._list:
self._list.append(element)
self._set.update(elements)

def index(self, value, start: int = 0, stop: int = sys.maxsize, /):
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
return self._list.index(value, start, stop)

def insert(self, index: int, element: str, /) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
# In-place insertion should update both list and set.
self._list.insert(index, element)
self._set.add(element)

def intersection(self, *others: set[str]) -> set[str]:
# User expects a set back.
return self._set.intersection(*others)

def intersection_update(self, *others: set[str]) -> None:
# In-place intersection should update both list and set.
self._set.intersection_update(*others)
self._list.clear()
self._list.extend(sorted(self._set))
Comment on lines +250 to +251
Copy link
Contributor Author

@pawamoy pawamoy Feb 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took a shortcut, but we should probably maintain the list order, while preventing duplicates.


def isdisjoint(self, other: set[str], /) -> bool:
return self._set.isdisjoint(other)

def issubset(self, other: set[str], /) -> bool:
return self._set.issubset(other)

def issuperset(self, other: set[str], /) -> bool:
return self._set.issuperset(other)

def pop(self, index: int = -1, /) -> str:
# In-place pop should update both list and set.
element = self._list.pop(index)
self._set.remove(element)
return element

def remove(self, element: str, /) -> None:
# In-place removal should update both list and set.
self._list.remove(element)
self._set.remove(element)

def reverse(self) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
self._list.reverse()

def sort(self, /, *, key: Callable | None = None, reverse: bool = False) -> None:
warnings.warn(
"Using block level elements as a list is deprecated, use it as a set instead.",
DeprecationWarning,
)
self._list.sort(key=key, reverse=reverse)

def symmetric_difference(self, other: set[str], /) -> set[str]:
# User expects a set back.
return self._set.symmetric_difference(other)

def symmetric_difference_update(self, other: set[str], /) -> None:
# In-place symmetric difference should update both list and set.
self._set.symmetric_difference_update(other)
self._list.clear()
self._list.extend(sorted(self._set))
Comment on lines +294 to +295
Copy link
Contributor Author

@pawamoy pawamoy Feb 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same shortcut, though we should maintain list order by first removing then adding elements, preventing duplicates, probably.


def union(self, *others: set[str]) -> set[str]:
# User expects a set back.
return self._set.union(*others)

def update(self, *others: set[str]) -> None:
# In-place union should update both list and set.
self._set.update(*others)
self._list.clear()
self._list.extend(sorted(self._set))
Comment on lines +304 to +305
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shortcut again, we should maintain order and only append elements, preventing duplicates.



# Type it as `set[str]` to express our intent for it to be used as such.
BLOCK_LEVEL_ELEMENTS: set[str] = _BlockLevelElements([
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't guarantee I didn't make subtle mistakes, so it should probably need proper testing.

If this all seems too complicated, happy to try something simpler (it was implemented quickly, no problem with discarding all this 😄).

Copy link
Collaborator

@facelessuser facelessuser Feb 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should definitely get tested. Eventually it'll go away as well. There's probably ways in which people can still mess things up, but I think this tries to provide a reasonable attempt to keep stuff from breaking. I'm open to other ideas if someone has them as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, I'll start writing tests 🙂 Just wanted initial feedback before tackling them 😊

# Elements which are invalid to wrap in a `<p>` tag.
# See https://w3c.github.io/html/grouping-content.html#the-p-element
'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
Expand All @@ -56,9 +318,9 @@
'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
'style', 'summary', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video',
'center'
]
]) # type: ignore[assignment]
"""
List of HTML tags which get treated as block-level elements. Same as the `block_level_elements`
Set of HTML tags which get treated as block-level elements. Same as the `block_level_elements`
attribute of the [`Markdown`][markdown.Markdown] class. Generally one should use the
attribute on the class. This remains for compatibility with older extensions.
"""
Expand Down
2 changes: 1 addition & 1 deletion tests/test_apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,7 +920,7 @@ class TestBlockAppend(unittest.TestCase):
def testBlockAppend(self):
""" Test that appended escapes are only in the current instance. """
md = markdown.Markdown()
md.block_level_elements.append('test')
md.block_level_elements.add('test')
self.assertEqual('test' in md.block_level_elements, True)
md2 = markdown.Markdown()
self.assertEqual('test' not in md2.block_level_elements, True)
Expand Down
Loading
Loading