From 4ea812f1679ff52a536a584652ccea2769a80948 Mon Sep 17 00:00:00 2001 From: Marc Date: Sat, 7 Aug 2021 22:30:46 +0200 Subject: [PATCH 1/2] add support for regex flags in .re() and .re_first() methods --- docs/usage.rst | 9 ++++++++ parsel/selector.py | 43 ++++++++++++++++++++++++++++++------ parsel/utils.py | 9 ++++++-- tests/test_selector.py | 49 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 99 insertions(+), 11 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 55e6a313..af882848 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -359,6 +359,15 @@ Use it to extract just the first matching string:: >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)') 'My image 1 ' +You can also use compiled regular expressions with both methods:: + + >>> import re + >>> regex = re.compile(r'Name:\s*(.*)') + >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(regex) + 'My image 1' + +As well as adding regex flags with the ``flags`` argument. + .. _topics-selectors-relative-xpaths: Working with relative XPaths diff --git a/parsel/selector.py b/parsel/selector.py index 0a6530fc..26e9d9dd 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -119,7 +119,10 @@ def css(self, query: str) -> "SelectorList[_SelectorType]": return self.__class__(flatten([x.css(query) for x in self])) def re( - self, regex: Union[str, Pattern[str]], replace_entities: bool = True + self, + regex: Union[str, Pattern[str]], + replace_entities: bool = True, + flags: int = 0, ) -> List[str]: """ Call the ``.re()`` method for each element in this list and return @@ -129,8 +132,14 @@ def re( corresponding character (except for ``&`` and ``<``. Passing ``replace_entities`` as ``False`` switches off these replacements. + + It is possible to provide regex flags using the ``flags`` argument. They + will be applied only if the provided regex is not a compiled regular + expression. 
""" - return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) + return flatten( + [x.re(regex, replace_entities=replace_entities, flags=flags) for x in self] + ) @typing.overload def re_first( @@ -138,6 +147,7 @@ def re_first( regex: Union[str, Pattern[str]], default: None = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: pass @@ -147,6 +157,7 @@ def re_first( regex: Union[str, Pattern[str]], default: str, replace_entities: bool = True, + flags: int = 0, ) -> str: pass @@ -155,6 +166,7 @@ def re_first( regex: Union[str, Pattern[str]], default: Optional[str] = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: """ Call the ``.re()`` method for the first element in this list and @@ -168,7 +180,7 @@ def re_first( replacements. """ for el in iflatten( - x.re(regex, replace_entities=replace_entities) for x in self + x.re(regex, replace_entities=replace_entities, flags=flags) for x in self ): return el return default @@ -358,21 +370,30 @@ def _css2xpath(self, query: str) -> Any: return self._csstranslator.css_to_xpath(query) def re( - self, regex: Union[str, Pattern[str]], replace_entities: bool = True + self, + regex: Union[str, Pattern[str]], + replace_entities: bool = True, + flags: int = 0, ) -> List[str]: """ Apply the given regex and return a list of unicode strings with the matches. ``regex`` can be either a compiled regular expression or a string which - will be compiled to a regular expression using ``re.compile(regex)``. + will be compiled to a regular expression using ``re.compile()``. By default, character entity references are replaced by their corresponding character (except for ``&`` and ``<``). Passing ``replace_entities`` as ``False`` switches off these replacements. + + It is possible to provide regex flags using the ``flags`` argument. They + will be applied only if the provided regex is not a compiled regular + expression. 
""" - return extract_regex(regex, self.get(), replace_entities=replace_entities) + return extract_regex( + regex, self.get(), replace_entities=replace_entities, flags=flags + ) @typing.overload def re_first( @@ -380,6 +401,7 @@ def re_first( regex: Union[str, Pattern[str]], default: None = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: pass @@ -389,6 +411,7 @@ def re_first( regex: Union[str, Pattern[str]], default: str, replace_entities: bool = True, + flags: int = 0, ) -> str: pass @@ -397,6 +420,7 @@ def re_first( regex: Union[str, Pattern[str]], default: Optional[str] = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: """ Apply the given regex and return the first unicode string which @@ -407,9 +431,14 @@ def re_first( corresponding character (except for ``&`` and ``<``). Passing ``replace_entities`` as ``False`` switches off these replacements. + + It is possible to provide regex flags using the ``flags`` argument. They + will be applied only if the provided regex is not a compiled regular + expression. 
""" return next( - iflatten(self.re(regex, replace_entities=replace_entities)), default + iflatten(self.re(regex, replace_entities=replace_entities, flags=flags)), + default, ) def get(self) -> str: diff --git a/parsel/utils.py b/parsel/utils.py index 94d27079..d9e4f991 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -57,15 +57,20 @@ def _is_listlike(x: Any) -> bool: def extract_regex( - regex: Union[str, Pattern[str]], text: str, replace_entities: bool = True + regex: Union[str, Pattern[str]], + text: str, + replace_entities: bool = True, + flags: int = 0, ) -> List[str]: """Extract a list of unicode strings from the given text/encoding using the following policies: + * if the regex is a string it will be compiled using the provided flags * if the regex contains a named group called "extract" that will be returned * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ if isinstance(regex, str): - regex = re.compile(regex, re.UNICODE) + flags |= re.UNICODE + regex = re.compile(regex, flags) if "extract" in regex.groupindex: # named group diff --git a/tests/test_selector.py b/tests/test_selector.py index 75c0a1e0..97ee103b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -318,7 +318,7 @@ def test_re_first(self) -> None: self.assertEqual(sel.re_first(r"foo"), None) self.assertEqual(sel.re_first(r"foo", default="bar"), "bar") - def test_extract_first_re_default(self) -> None: + def test_re_first_default(self) -> None: """Test if re_first() returns default value when no results found""" body = '' sel = self.sscls(text=body) @@ -330,6 +330,30 @@ def test_extract_first_re_default(self) -> None: sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), "missing" ) + def test_re_first_flags(self) -> None: + body = """ + + """ + sel = self.sscls(text=body) + + self.assertEqual( + 
sel.xpath("//script/text()").re_first(r"example\(\) ({.*})"), None + ) + self.assertEqual( + sel.xpath("//script/text()").re_first( + r"example\(\) ({.*})", flags=re.DOTALL + ), + """{ + "name": "Adrian", + "points": 3, + }""", + ) + def test_select_unicode_query(self) -> None: body = "

" sel = self.sscls(text=body) @@ -710,7 +734,6 @@ def test_re_replace_entities(self) -> None: self.assertEqual( x.xpath("//script")[0].re(name_re, replace_entities=False), [expected] ) - self.assertEqual( x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected, @@ -724,6 +747,28 @@ def test_re_intl(self) -> None: x = self.sscls(text=body) self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"]) + def test_re_flags(self) -> None: + body = """ + + """ + sel = self.sscls(text=body) + + self.assertEqual(sel.xpath("//script/text()").re(r"example\(\) ({.*})"), []) + self.assertEqual( + sel.xpath("//script/text()").re(r"example\(\) ({.*})", flags=re.DOTALL), + [ + """{ + "name": "Adrian", + "points": 3, + }""" + ], + ) + def test_selector_over_text(self) -> None: hs = self.sscls(text="lala") self.assertEqual(hs.extract(), "lala") From df0f41b7c504bc6eb68484f07fb9a040cd083365 Mon Sep 17 00:00:00 2001 From: Marc Date: Sat, 7 Aug 2021 22:51:56 +0200 Subject: [PATCH 2/2] fix docs typo --- docs/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.rst b/docs/usage.rst index af882848..398b0fc7 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -364,7 +364,7 @@ You can also use compiled regular expressions with both methods:: >>> import re >>> regex = re.compile(r'Name:\s*(.*)') >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(regex) - 'My image 1' + 'My image 1 ' As well as adding regex flags with the ``flags`` argument.