Skip to content

Commit

Permalink
str checks use plain string instead of re.Pattern (unionai-oss#1729)
Browse files Browse the repository at this point in the history
Signed-off-by: cosmicBboy <niels.bantilan@gmail.com>
  • Loading branch information
cosmicBboy authored and max-raphael committed Jan 24, 2025
1 parent 3701945 commit aec6dac
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 11 deletions.
8 changes: 4 additions & 4 deletions pandera/api/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check":
:param kwargs: key-word arguments passed into the `Check` initializer.
"""
try:
pattern_mod = re.compile(pattern)
re.compile(pattern)
except TypeError as exc:
raise ValueError(
f'pattern="{pattern}" cannot be compiled as regular expression'
Expand All @@ -446,7 +446,7 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check":
kwargs,
error=f"str_matches('{pattern}')",
statistics={"pattern": pattern},
pattern=pattern_mod,
pattern=pattern,
)

@classmethod
Expand All @@ -459,7 +459,7 @@ def str_contains(
:param kwargs: key-word arguments passed into the `Check` initializer.
"""
try:
pattern_mod = re.compile(pattern)
re.compile(pattern)
except TypeError as exc:
raise ValueError(
f'pattern="{pattern}" cannot be compiled as regular expression'
Expand All @@ -469,7 +469,7 @@ def str_contains(
kwargs,
error=f"str_contains('{pattern}')",
statistics={"pattern": pattern},
pattern=pattern_mod,
pattern=pattern,
)

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion pandera/backends/polars/builtin_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def str_matches(
)
def str_contains(
data: PolarsData,
pattern: re.Pattern,
pattern: Union[str, re.Pattern],
) -> pl.LazyFrame:
"""Ensure that a pattern can be found in the string.
Expand Down
8 changes: 2 additions & 6 deletions pandera/backends/pyspark/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""PySpark implementation of built-in checks"""

import re
from typing import Any, Iterable, TypeVar

import pyspark.sql.types as pst
Expand Down Expand Up @@ -280,9 +279,7 @@ def notin(
error="str_contains('{pattern}')",
)
@register_input_datatypes(acceptable_datatypes=convert_to_list(STRING_TYPE))
def str_contains(
data: PysparkDataframeColumnObject, pattern: re.Pattern
) -> bool:
def str_contains(data: PysparkDataframeColumnObject, pattern: str) -> bool:
"""Ensure that a pattern can be found within each row.
Note that this can be a compute-intensive check on large datasets, so use it with caution.
Expand All @@ -291,9 +288,8 @@ def str_contains(
to access the dataframe is "dataframe" and column name using "column_name".
:param pattern: Regular expression pattern to use for searching
"""

return (
data.dataframe.filter(~col(data.column_name).rlike(pattern.pattern))
data.dataframe.filter(~col(data.column_name).rlike(pattern))
.limit(1)
.count()
== 0
Expand Down

0 comments on commit aec6dac

Please sign in to comment.