diff --git a/docs/src/markdown/_snippets/links.txt b/docs/src/markdown/_snippets/links.txt index cfb30047..9e2f5aae 100644 --- a/docs/src/markdown/_snippets/links.txt +++ b/docs/src/markdown/_snippets/links.txt @@ -1,5 +1,6 @@ [aspell]: https://github.com/GNUAspell/aspell [bs4]: https://beautiful-soup-4.readthedocs.io/en/latest/# +[contains-draft]: https://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors [custom-extensions-1]: https://drafts.csswg.org/css-extensions-1/ [html5lib]: https://github.com/html5lib/html5lib-python [lxml]: https://github.com/lxml/lxml diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index defd93ab..37bf3026 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -1,8 +1,10 @@ # Changelog -## Latest +## 1.9.0 +- **NEW**: Allow `:contains()` to accept a list of text to search for. - **FIX**: Don't install test files when installing the `soupsieve` package. +- **FIX**: Improve efficiency of `:contains()` comparison. ## 1.8.0 diff --git a/docs/src/markdown/about/development.md b/docs/src/markdown/about/development.md index d6f969f1..5af0ed6d 100644 --- a/docs/src/markdown/about/development.md +++ b/docs/src/markdown/about/development.md @@ -239,7 +239,7 @@ Attribute | Description `selectors` | Contains a tuple of `SelectorList` objects for each pseudo-class selector part of the compound selector: `#!css :is()`, `#!css :not()`, `#!css :has()`, etc. `relation` | This will contain a `SelectorList` object with one `Selector` object, which could in turn chain an additional relation depending on the complexity of the compound selector. For instance, `div > p + a` would be a `Selector` for `a` that contains a `relation` for `p` (another `SelectorList` object) which also contains a relation of `div`. When matching, we would match that the tag is `a`, and then walk its relation chain verifying that they all match. 
In this case, the relation chain would be a direct, previous sibling of `p`, which has a direct parent of `div`. A `:has()` pseudo-class would walk this in the opposite order. `div:has(> p + a)` would verify `div`, and then check for a child of `p` with a sibling of `a`. `rel_type` | `rel_type` is attached to relational selectors. In the case of `#!css div > p + a`, the relational selectors of `div` and `p` would get a relational type of `>` and `+` respectively. `:has()` relational `rel_type` are preceded with `:` to signify a forward looking relation. -`contains` | Contains a tuple of strings of content to match in an element. +`contains` | Contains a tuple of [`SelectorContains`](#selectorcontains) objects. Each object contains the list of text to match an element's content against. `lang` | Contains a tuple of [`SelectorLang`](#selectorlang) objects. `flags` | Selector flags that used to signal a type of selector is present. @@ -288,6 +288,20 @@ Attribute | Description `pattern` | Contains a `re` regular expression object that matches the desired attribute value. `xml_type_pattern` | As the default `type` pattern is case insensitive, when the attribute value is `type` and a case sensitivity has not been explicitly defined, a secondary case sensitive `type` pattern is compiled for use with XML documents when detected. +### `SelectorContains` + +```py3 +class SelectorContains: + """Selector contains rule.""" + + def __init__(self, text): + """Initialize.""" +``` + +Attribute | Description +------------------- | ----------- +`text` | A tuple of acceptable text that an element should match. An element only needs to match at least one. 
+ ### `SelectorNth` ```py3 diff --git a/docs/src/markdown/selectors.md b/docs/src/markdown/selectors.md index 785777d8..9cf8ecba 100644 --- a/docs/src/markdown/selectors.md +++ b/docs/src/markdown/selectors.md @@ -412,10 +412,15 @@ Selects any `#!html `, `#!html `, o input:checked ``` -### `:contains` {:#:contains} +### `:contains()` {:#:contains} Selects elements that contain the text provided text. Text can be found in either itself, or its descendants. +Contains was originally included in a [CSS early draft][contains-draft], but was in the end dropped from the draft. +Soup Sieve implements it how it was originally proposed in the draft with the addition that `:contains()` can accept +either a single value, or a comma separated list of values. An element needs only to match at least one of the items +in the comma separated list to be considered matching. + !!! warning "Contains" `:contains()` is an expensive operation as it scans all the text nodes of an element under consideration, which includes all descendants. Using highly specific selectors can reduce how often it is evaluated. 
diff --git a/soupsieve/__meta__.py b/soupsieve/__meta__.py index ce903a1c..2cbe51ad 100644 --- a/soupsieve/__meta__.py +++ b/soupsieve/__meta__.py @@ -186,5 +186,5 @@ def parse_version(ver, pre=False): return Version(major, minor, micro, release, pre, post, dev) -__version_info__ = Version(1, 8, 1, ".dev") +__version_info__ = Version(1, 9, 0, ".dev") __version__ = __version_info__._get_canonical() diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py index 7bb01d1f..923eb845 100644 --- a/soupsieve/css_match.py +++ b/soupsieve/css_match.py @@ -811,10 +811,17 @@ def match_contains(self, el, contains): """Match element if it contains text.""" match = True - for c in contains: - if c not in self.get_text(el): + content = None + for contain_list in contains: + if content is None: + content = self.get_text(el) + found = False + for text in contain_list.text: + if text in content: + found = True + break + if not found: match = False - break return match def match_default(self, el): diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index c6626625..a537d5f7 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -149,13 +149,13 @@ \({ws}*(?P{nth}|even|odd)){ws}*\) '''.format(ws=WSC, nth=NTH) # Pseudo class language (`:lang("*-de", en)`) -PAT_PSEUDO_LANG = r':lang\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE) +PAT_PSEUDO_LANG = r':lang\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE) # Pseudo class direction (`:dir(ltr)`) PAT_PSEUDO_DIR = r':dir\({ws}*(?Pltr|rtl){ws}*\)'.format(ws=WSC) # Combining characters (`>`, `~`, ` `, `+`, `,`) PAT_COMBINE = r'{wsc}*?(?P[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC) # Extra: Contains (`:contains(text)`) -PAT_PSEUDO_CONTAINS = r':contains\({ws}*(?P{value}){ws}*\)'.format(ws=WSC, value=VALUE) +PAT_PSEUDO_CONTAINS = r':contains\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE) # Regular expressions # CSS escape 
pattern @@ -166,8 +166,8 @@ r'(?P[-+])?(?P[0-9]+n?|n)(?:(?<=n){ws}*(?P[-+]){ws}*(?P[0-9]+))?'.format(ws=WSC), re.I ) -# Pattern to iterate multiple languages. -RE_LANG = re.compile(r'(?:(?P{value})|(?P{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X) +# Pattern to iterate multiple values. +RE_VALUES = re.compile(r'(?:(?P{value})|(?P{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X) # Whitespace checks RE_WS = re.compile(WS) RE_WS_BEGIN = re.compile('^{}*'.format(WSC)) @@ -751,21 +751,27 @@ def parse_class_id(self, sel, m, has_selector): def parse_pseudo_contains(self, sel, m, has_selector): """Parse contains.""" - content = m.group('value') - if content.startswith(("'", '"')): - content = css_unescape(content[1:-1], True) - else: - content = css_unescape(content) - sel.contains.append(content) + values = m.group('values') + patterns = [] + for token in RE_VALUES.finditer(values): + if token.group('split'): + continue + value = token.group('value') + if value.startswith(("'", '"')): + value = css_unescape(value[1:-1], True) + else: + value = css_unescape(value) + patterns.append(value) + sel.contains.append(ct.SelectorContains(tuple(patterns))) has_selector = True return has_selector def parse_pseudo_lang(self, sel, m, has_selector): """Parse pseudo language.""" - lang = m.group('lang') + values = m.group('values') patterns = [] - for token in RE_LANG.finditer(lang): + for token in RE_VALUES.finditer(values): if token.group('split'): continue value = token.group('value') diff --git a/soupsieve/css_types.py b/soupsieve/css_types.py index b690d8f4..d426287a 100644 --- a/soupsieve/css_types.py +++ b/soupsieve/css_types.py @@ -7,6 +7,7 @@ 'SelectorNull', 'SelectorTag', 'SelectorAttribute', + 'SelectorContains', 'SelectorNth', 'SelectorLang', 'SelectorList', @@ -234,6 +235,19 @@ def __init__(self, attribute, prefix, pattern, xml_type_pattern): ) +class SelectorContains(Immutable): + """Selector contains rule.""" + + __slots__ = ("text", "_hash") + + def __init__(self, 
text): + """Initialize.""" + + super(SelectorContains, self).__init__( + text=text + ) + + class SelectorNth(Immutable): """Selector nth type.""" @@ -324,6 +338,7 @@ def pickle_register(obj): pickle_register(SelectorNull) pickle_register(SelectorTag) pickle_register(SelectorAttribute) +pickle_register(SelectorContains) pickle_register(SelectorNth) pickle_register(SelectorLang) pickle_register(SelectorList) diff --git a/tests/test_api.py b/tests/test_api.py index d810c924..c389870b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -457,9 +457,9 @@ def test_copy_pickle(self): # We force a pattern that contains all custom types: # `Selector`, `NullSelector`, `SelectorTag`, `SelectorAttribute`, # `SelectorNth`, `SelectorLang`, `SelectorList`, `Namespaces`, - # and `CustomSelectors`. + # `SelectorContains`, and `CustomSelectors`. p1 = sv.compile( - 'p.class#id[id]:nth-child(2):lang(en):focus', + 'p.class#id[id]:nth-child(2):lang(en):focus:contains("text", "other text")', {'html': 'http://www.w3.org/TR/html4/'}, custom={':--header': 'h1, h2, h3, h4, h5, h6'} ) @@ -469,7 +469,7 @@ def test_copy_pickle(self): # Test that we pull the same one from cache p2 = sv.compile( - 'p.class#id[id]:nth-child(2):lang(en):focus', + 'p.class#id[id]:nth-child(2):lang(en):focus:contains("text", "other text")', {'html': 'http://www.w3.org/TR/html4/'}, custom={':--header': 'h1, h2, h3, h4, h5, h6'} ) @@ -477,7 +477,7 @@ def test_copy_pickle(self): # Test that we compile a new one when providing a different flags p3 = sv.compile( - 'p.class#id[id]:nth-child(2):lang(en):focus', + 'p.class#id[id]:nth-child(2):lang(en):focus:contains("text", "other text")', {'html': 'http://www.w3.org/TR/html4/'}, custom={':--header': 'h1, h2, h3, h4, h5, h6'}, flags=0x10 diff --git a/tests/test_extra/test_contains.py b/tests/test_extra/test_contains.py index 5b68ddb4..09e146ed 100644 --- a/tests/test_extra/test_contains.py +++ b/tests/test_extra/test_contains.py @@ -66,6 +66,46 @@ def 
test_contains_quoted_with_escaped_newline_with_carriage_return(self): flags=util.HTML ) + def test_contains_list(self): + """Test contains list.""" + + self.assert_selector( + self.MARKUP, + 'body span:contains("does not exist", "that")', + ['2'], + flags=util.HTML + ) + + def test_contains_multiple(self): + """Test contains multiple.""" + + self.assert_selector( + self.MARKUP, + 'body span:contains("th"):contains("at")', + ['2'], + flags=util.HTML + ) + + def test_contains_multiple_not_match(self): + """Test contains multiple with "not" and with a match.""" + + self.assert_selector( + self.MARKUP, + 'body span:not(:contains("does not exist")):contains("that")', + ['2'], + flags=util.HTML + ) + + def test_contains_multiple_not_no_match(self): + """Test contains multiple with "not" and no match.""" + + self.assert_selector( + self.MARKUP, + 'body span:not(:contains("that")):contains("that")', + [], + flags=util.HTML + ) + def test_contains_with_descendants(self): """Test that contains returns descendants as well as the top level that contain."""