diff --git a/docs/src/markdown/_snippets/links.txt b/docs/src/markdown/_snippets/links.txt
index cfb30047..9e2f5aae 100644
--- a/docs/src/markdown/_snippets/links.txt
+++ b/docs/src/markdown/_snippets/links.txt
@@ -1,5 +1,6 @@
[aspell]: /~https://github.com/GNUAspell/aspell
[bs4]: https://beautiful-soup-4.readthedocs.io/en/latest/#
+[contains-draft]: https://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
[custom-extensions-1]: https://drafts.csswg.org/css-extensions-1/
[html5lib]: /~https://github.com/html5lib/html5lib-python
[lxml]: /~https://github.com/lxml/lxml
diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md
index defd93ab..37bf3026 100644
--- a/docs/src/markdown/about/changelog.md
+++ b/docs/src/markdown/about/changelog.md
@@ -1,8 +1,10 @@
# Changelog
-## Latest
+## 1.9.0
+- **NEW**: Allow `:contains()` to accept a list of text to search for.
- **FIX**: Don't install test files when installing the `soupsieve` package.
+- **FIX**: Improve efficiency of `:contains()` comparison.
## 1.8.0
diff --git a/docs/src/markdown/about/development.md b/docs/src/markdown/about/development.md
index d6f969f1..5af0ed6d 100644
--- a/docs/src/markdown/about/development.md
+++ b/docs/src/markdown/about/development.md
@@ -239,7 +239,7 @@ Attribute | Description
`selectors` | Contains a tuple of `SelectorList` objects for each pseudo-class selector part of the compound selector: `#!css :is()`, `#!css :not()`, `#!css :has()`, etc.
`relation` | This will contain a `SelectorList` object with one `Selector` object, which could in turn chain an additional relation depending on the complexity of the compound selector. For instance, `div > p + a` would be a `Selector` for `a` that contains a `relation` for `p` (another `SelectorList` object) which also contains a relation of `div`. When matching, we would match that the tag is `a`, and then walk its relation chain verifying that they all match. In this case, the relation chain would be a direct, previous sibling of `p`, which has a direct parent of `div`. A `:has()` pseudo-class would walk this in the opposite order. `div:has(> p + a)` would verify `div`, and then check for a child of `p` with a sibling of `a`.
`rel_type` | `rel_type` is attached to relational selectors. In the case of `#!css div > p + a`, the relational selectors of `div` and `p` would get a relational type of `>` and `+` respectively. `:has()` relational `rel_type` are preceded with `:` to signify a forward looking relation.
-`contains` | Contains a tuple of strings of content to match in an element.
+`contains` | Contains a tuple of [`SelectorContains`](#selectorcontains) objects. Each object contains the list of text to match an element's content against.
`lang` | Contains a tuple of [`SelectorLang`](#selectorlang) objects.
`flags` | Selector flags that used to signal a type of selector is present.
@@ -288,6 +288,20 @@ Attribute | Description
`pattern` | Contains a `re` regular expression object that matches the desired attribute value.
`xml_type_pattern` | As the default `type` pattern is case insensitive, when the attribute value is `type` and a case sensitivity has not been explicitly defined, a secondary case sensitive `type` pattern is compiled for use with XML documents when detected.
+### `SelectorContains`
+
+```py3
+class SelectorContains:
+ """Selector contains rule."""
+
+ def __init__(self, text):
+ """Initialize."""
+```
+
+Attribute | Description
+------------------- | -----------
+`text` | A tuple of acceptable text that an element should match. An element only needs to match at least one.
+
### `SelectorNth`
```py3
diff --git a/docs/src/markdown/selectors.md b/docs/src/markdown/selectors.md
index 785777d8..9cf8ecba 100644
--- a/docs/src/markdown/selectors.md
+++ b/docs/src/markdown/selectors.md
@@ -412,10 +412,15 @@ Selects any `#!html `, `#!html `, o
input:checked
```
-### `:contains` {:#:contains}
+### `:contains()` {:#:contains}
Selects elements that contain the text provided text. Text can be found in either itself, or its descendants.
+`:contains()` was originally included in an [early CSS draft][contains-draft], but was ultimately dropped from it.
+Soup Sieve implements it as originally proposed in that draft, with the addition that `:contains()` can accept
+either a single value or a comma separated list of values. An element needs to match only one of the items
+in the comma separated list to be considered matching.
+
!!! warning "Contains"
`:contains()` is an expensive operation as it scans all the text nodes of an element under consideration, which
includes all descendants. Using highly specific selectors can reduce how often it is evaluated.
diff --git a/soupsieve/__meta__.py b/soupsieve/__meta__.py
index ce903a1c..2cbe51ad 100644
--- a/soupsieve/__meta__.py
+++ b/soupsieve/__meta__.py
@@ -186,5 +186,5 @@ def parse_version(ver, pre=False):
return Version(major, minor, micro, release, pre, post, dev)
-__version_info__ = Version(1, 8, 1, ".dev")
+__version_info__ = Version(1, 9, 0, ".dev")
__version__ = __version_info__._get_canonical()
diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py
index 7bb01d1f..923eb845 100644
--- a/soupsieve/css_match.py
+++ b/soupsieve/css_match.py
@@ -811,10 +811,17 @@ def match_contains(self, el, contains):
"""Match element if it contains text."""
match = True
- for c in contains:
- if c not in self.get_text(el):
+ content = None
+ for contain_list in contains:
+ if content is None:
+ content = self.get_text(el)
+ found = False
+ for text in contain_list.text:
+ if text in content:
+ found = True
+ break
+ if not found:
match = False
- break
return match
def match_default(self, el):
diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py
index c6626625..a537d5f7 100644
--- a/soupsieve/css_parser.py
+++ b/soupsieve/css_parser.py
@@ -149,13 +149,13 @@
\({ws}*(?P{nth}|even|odd)){ws}*\)
'''.format(ws=WSC, nth=NTH)
# Pseudo class language (`:lang("*-de", en)`)
-PAT_PSEUDO_LANG = r':lang\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE)
+PAT_PSEUDO_LANG = r':lang\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE)
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = r':dir\({ws}*(?Pltr|rtl){ws}*\)'.format(ws=WSC)
# Combining characters (`>`, `~`, ` `, `+`, `,`)
PAT_COMBINE = r'{wsc}*?(?P[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
# Extra: Contains (`:contains(text)`)
-PAT_PSEUDO_CONTAINS = r':contains\({ws}*(?P{value}){ws}*\)'.format(ws=WSC, value=VALUE)
+PAT_PSEUDO_CONTAINS = r':contains\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE)
# Regular expressions
# CSS escape pattern
@@ -166,8 +166,8 @@
r'(?P[-+])?(?P[0-9]+n?|n)(?:(?<=n){ws}*(?P[-+]){ws}*(?P[0-9]+))?'.format(ws=WSC),
re.I
)
-# Pattern to iterate multiple languages.
-RE_LANG = re.compile(r'(?:(?P{value})|(?P{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
+# Pattern to iterate multiple values.
+RE_VALUES = re.compile(r'(?:(?P{value})|(?P{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
# Whitespace checks
RE_WS = re.compile(WS)
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
@@ -751,21 +751,27 @@ def parse_class_id(self, sel, m, has_selector):
def parse_pseudo_contains(self, sel, m, has_selector):
"""Parse contains."""
- content = m.group('value')
- if content.startswith(("'", '"')):
- content = css_unescape(content[1:-1], True)
- else:
- content = css_unescape(content)
- sel.contains.append(content)
+ values = m.group('values')
+ patterns = []
+ for token in RE_VALUES.finditer(values):
+ if token.group('split'):
+ continue
+ value = token.group('value')
+ if value.startswith(("'", '"')):
+ value = css_unescape(value[1:-1], True)
+ else:
+ value = css_unescape(value)
+ patterns.append(value)
+ sel.contains.append(ct.SelectorContains(tuple(patterns)))
has_selector = True
return has_selector
def parse_pseudo_lang(self, sel, m, has_selector):
"""Parse pseudo language."""
- lang = m.group('lang')
+ values = m.group('values')
patterns = []
- for token in RE_LANG.finditer(lang):
+ for token in RE_VALUES.finditer(values):
if token.group('split'):
continue
value = token.group('value')
diff --git a/soupsieve/css_types.py b/soupsieve/css_types.py
index b690d8f4..d426287a 100644
--- a/soupsieve/css_types.py
+++ b/soupsieve/css_types.py
@@ -7,6 +7,7 @@
'SelectorNull',
'SelectorTag',
'SelectorAttribute',
+ 'SelectorContains',
'SelectorNth',
'SelectorLang',
'SelectorList',
@@ -234,6 +235,19 @@ def __init__(self, attribute, prefix, pattern, xml_type_pattern):
)
+class SelectorContains(Immutable):
+ """Selector contains rule."""
+
+ __slots__ = ("text", "_hash")
+
+ def __init__(self, text):
+ """Initialize."""
+
+ super(SelectorContains, self).__init__(
+ text=text
+ )
+
+
class SelectorNth(Immutable):
"""Selector nth type."""
@@ -324,6 +338,7 @@ def pickle_register(obj):
pickle_register(SelectorNull)
pickle_register(SelectorTag)
pickle_register(SelectorAttribute)
+pickle_register(SelectorContains)
pickle_register(SelectorNth)
pickle_register(SelectorLang)
pickle_register(SelectorList)
diff --git a/tests/test_api.py b/tests/test_api.py
index d810c924..c389870b 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -457,9 +457,9 @@ def test_copy_pickle(self):
# We force a pattern that contains all custom types:
# `Selector`, `NullSelector`, `SelectorTag`, `SelectorAttribute`,
# `SelectorNth`, `SelectorLang`, `SelectorList`, `Namespaces`,
- # and `CustomSelectors`.
+ # `SelectorContains`, and `CustomSelectors`.
p1 = sv.compile(
- 'p.class#id[id]:nth-child(2):lang(en):focus',
+ 'p.class#id[id]:nth-child(2):lang(en):focus:contains("text", "other text")',
{'html': 'http://www.w3.org/TR/html4/'},
custom={':--header': 'h1, h2, h3, h4, h5, h6'}
)
@@ -469,7 +469,7 @@ def test_copy_pickle(self):
# Test that we pull the same one from cache
p2 = sv.compile(
- 'p.class#id[id]:nth-child(2):lang(en):focus',
+ 'p.class#id[id]:nth-child(2):lang(en):focus:contains("text", "other text")',
{'html': 'http://www.w3.org/TR/html4/'},
custom={':--header': 'h1, h2, h3, h4, h5, h6'}
)
@@ -477,7 +477,7 @@ def test_copy_pickle(self):
# Test that we compile a new one when providing a different flags
p3 = sv.compile(
- 'p.class#id[id]:nth-child(2):lang(en):focus',
+ 'p.class#id[id]:nth-child(2):lang(en):focus:contains("text", "other text")',
{'html': 'http://www.w3.org/TR/html4/'},
custom={':--header': 'h1, h2, h3, h4, h5, h6'},
flags=0x10
diff --git a/tests/test_extra/test_contains.py b/tests/test_extra/test_contains.py
index 5b68ddb4..09e146ed 100644
--- a/tests/test_extra/test_contains.py
+++ b/tests/test_extra/test_contains.py
@@ -66,6 +66,46 @@ def test_contains_quoted_with_escaped_newline_with_carriage_return(self):
flags=util.HTML
)
+ def test_contains_list(self):
+ """Test contains list."""
+
+ self.assert_selector(
+ self.MARKUP,
+ 'body span:contains("does not exist", "that")',
+ ['2'],
+ flags=util.HTML
+ )
+
+ def test_contains_multiple(self):
+ """Test contains multiple."""
+
+ self.assert_selector(
+ self.MARKUP,
+ 'body span:contains("th"):contains("at")',
+ ['2'],
+ flags=util.HTML
+ )
+
+ def test_contains_multiple_not_match(self):
+ """Test contains multiple with "not" and with a match."""
+
+ self.assert_selector(
+ self.MARKUP,
+ 'body span:not(:contains("does not exist")):contains("that")',
+ ['2'],
+ flags=util.HTML
+ )
+
+ def test_contains_multiple_not_no_match(self):
+ """Test contains multiple with "not" and no match."""
+
+ self.assert_selector(
+ self.MARKUP,
+ 'body span:not(:contains("that")):contains("that")',
+ [],
+ flags=util.HTML
+ )
+
def test_contains_with_descendants(self):
"""Test that contains returns descendants as well as the top level that contain."""