Skip to content

Commit

Permalink
Null character should translate to REPLACEMENT CHARACTER (U+FFFD)
Browse files Browse the repository at this point in the history
Fixes #124
  • Loading branch information
facelessuser committed Mar 16, 2019
1 parent 7774795 commit 70d328e
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 8 deletions.
2 changes: 2 additions & 0 deletions docs/src/markdown/about/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
- **NEW**: Allow `:contains()` to accept a list of text to search for.
- **FIX**: Don't install test files when installing the `soupsieve` package.
- **FIX**: Improve efficiency of `:contains()` comparison.
- **FIX**: Null characters should translate to the Unicode REPLACEMENT CHARACTER (`U+FFFD`) according to the spec. This
applies to CSS escaped NULL characters as well.

## 1.8.0

Expand Down
16 changes: 8 additions & 8 deletions soupsieve/css_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from collections import OrderedDict
from .util import SelectorSyntaxError

UNICODE_REPLACEMENT_CHAR = 0xFFFD

# Simple pseudo classes that take no parameters
PSEUDO_SIMPLE = {
":any-link",
Expand Down Expand Up @@ -240,21 +242,19 @@ def css_unescape(content, string=False):
def replace(m):
"""Replace with the appropriate substitute."""

return util.uchr(int(m.group(1)[1:], 16)) if m.group(1) else m.group(2)[1:]

def replace_string(m):
"""Replace with the appropriate substitute for a string."""

if m.group(1):
value = util.uchr(int(m.group(1)[1:], 16))
codepoint = int(m.group(1)[1:], 16)
if codepoint == 0:
codepoint = UNICODE_REPLACEMENT_CHAR
value = util.uchr(codepoint)
elif m.group(2):
value = m.group(2)[1:]
else:
value = ''

return value

return RE_CSS_ESC.sub(replace, content) if not string else RE_CSS_STR_ESC.sub(replace_string, content)
return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)


class SelectorPattern(object):
Expand Down Expand Up @@ -376,7 +376,7 @@ class CSSParser(object):
def __init__(self, selector, custom=None, flags=0):
"""Initialize."""

self.pattern = selector
self.pattern = selector.replace('\x00', '\ufffd')
self.flags = flags
self.debug = self.flags & util.DEBUG
self.quirks = self.flags & util._QUIRKS
Expand Down
21 changes: 21 additions & 0 deletions tests/test_level1/test_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ class TestClass(util.TestCase):
</div>
"""

# Browsers normally replace NULL with `\uFFFD`, but some of the parsers
# we test simply strip NULL out, so we simulate the browser behavior by inserting
# `\uFFFD` directly to ensure consistent results in our tests across parsers.
MARKUP_NULL = """
<div>
<p>Some text <span id="1" class="\ufffdfoo"> in a paragraph</span>.
<a id="2" class="\ufffdbar" href="http://google.com">Link</a>
</p>
</div>
"""

def test_class(self):
"""Test class."""

Expand All @@ -35,6 +46,16 @@ def test_type_and_class(self):
flags=util.HTML
)

def test_type_and_class_escaped_null(self):
"""Test type and class with an escaped null character."""

self.assert_selector(
self.MARKUP_NULL,
r"a.\0 bar",
["2"],
flags=util.HTML
)

def test_malformed_class(self):
"""Test malformed class."""

Expand Down
36 changes: 36 additions & 0 deletions tests/test_level2/test_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,22 @@ class TestAttribute(util.TestCase):
</div>
"""

# Browsers normally replace NULL with `\uFFFD`, but some of the parsers
# we test simply strip NULL out, so we simulate the browser behavior by inserting
# `\uFFFD` directly to ensure consistent results in our tests across parsers.
MARKUP_NULL = """
<div id="div">
<p id="0">Some text <span id="1"> in a paragraph</span>.</p>
<a id="2" href="http://google.com">Link</a>
<span id="3">Direct child</span>
<pre id="\ufffdpre">
<span id="4">Child 1</span>
<span id="5">Child 2</span>
<span id="6">Child 3</span>
</pre>
</div>
"""

def test_attribute(self):
"""Test attribute."""

Expand Down Expand Up @@ -150,6 +166,26 @@ def test_attribute_escaped_newline(self):
flags=util.HTML
)

def test_attribute_equal_literal_null(self):
"""Test attribute with value that equals specified value with a literal null character."""

self.assert_selector(
self.MARKUP_NULL,
'[id="\x00pre"]',
["\ufffdpre"],
flags=util.HTML
)

def test_attribute_equal_escaped_null(self):
"""Test attribute with value that equals specified value with an escaped null character."""

self.assert_selector(
self.MARKUP_NULL,
r'[id="\0 pre"]',
["\ufffdpre"],
flags=util.HTML
)

def test_invalid_tag(self):
"""
Test invalid tag.
Expand Down

0 comments on commit 70d328e

Please sign in to comment.