diff --git a/parsel/selector.py b/parsel/selector.py index 2027599..1092210 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -1,6 +1,7 @@ """XPath and JMESPath selectors based on the lxml and jmespath Python packages.""" +import builtins import json import typing import warnings @@ -141,7 +142,9 @@ def __getitem__( def __getstate__(self) -> None: raise TypeError("can't pickle SelectorList objects") - def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]": + def jmespath( + self, query: str, *, type: Optional[str] = None, **kwargs: Any + ) -> "SelectorList[_SelectorType]": """ Call the ``.jmespath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. @@ -153,12 +156,16 @@ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]": selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) """ - return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self])) + return self.__class__( + flatten([x.jmespath(query, type=type, **kwargs) for x in self]) + ) def xpath( self, xpath: str, namespaces: Optional[Mapping[str, str]] = None, + *, + type: Optional[str] = None, **kwargs: Any, ) -> "SelectorList[_SelectorType]": """ @@ -178,17 +185,26 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__( - flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) + flatten( + [ + x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) + for x in self + ] + ) ) - def css(self, query: str) -> "SelectorList[_SelectorType]": + def css( + self, + query: str, + type: Optional[str] = None, + ) -> "SelectorList[_SelectorType]": """ Call the ``.css()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. ``query`` is the same argument as the one in :meth:`Selector.css` """ - return self.__class__(flatten([x.css(query) for x in self])) + return self.__class__(flatten([x.css(query, type=type) for x in self])) def re( self, regex: Union[str, Pattern[str]], replace_entities: bool = True @@ -532,6 +548,7 @@ def _get_root( def jmespath( self: _SelectorType, query: str, + type: Optional[str] = None, **kwargs: Any, ) -> SelectorList[_SelectorType]: """ @@ -565,9 +582,9 @@ def jmespath( def make_selector(x: Any) -> _SelectorType: # closure function if isinstance(x, str): - return self.__class__(text=x, _expr=query, type="text") + return self.__class__(text=x, _expr=query, type=type or "text") else: - return self.__class__(root=x, _expr=query) + return self.__class__(root=x, _expr=query, type=type) result = [make_selector(x) for x in result] return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) @@ -576,6 +593,7 @@ def xpath( self: _SelectorType, query: str, namespaces: Optional[Mapping[str, str]] = None, + type: Optional[str] = None, **kwargs: Any, ) -> SelectorList[_SelectorType]: """ @@ -625,7 +643,7 @@ def xpath( except etree.XPathError as exc: raise ValueError(f"XPath error: {exc} in {query}") - if type(result) is not list: + if builtins.type(result) is not list: result = [result] result = [ @@ -633,13 +651,15 @@ def xpath( root=x, _expr=query, namespaces=self.namespaces, - type=_xml_or_html(self.type), + type=type or _xml_or_html(self.type), ) for x in result ] return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) - def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: + def css( + self: _SelectorType, query: str, type: Optional[str] = None + ) -> SelectorList[_SelectorType]: """ Apply the given CSS selector and return a :class:`SelectorList` instance. @@ -652,7 +672,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: """ if self.type not in ("html", "xml", "text"): raise ValueError(f"Cannot use css on a Selector of type {self.type!r}") - return self.xpath(self._css2xpath(query)) + return self.xpath(self._css2xpath(query), type=type) def _css2xpath(self, query: str) -> str: type = _xml_or_html(self.type) diff --git a/tests/test_selector.py b/tests/test_selector.py index 96713f9..db8d099 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,6 +1007,24 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) + def test_remove_selector_from_nested_html(self) -> None: + json_str = """{ + "title": "hello world", + "body": "
hello world
" + } + """ + expect_result = "hello world
" + sel = self.sscls(text=json_str) + # We need to force the selector type to HTML to make that functionality + # readily available. + html_sel = sel.jmespath("body", type="html")[0] + self.assertEqual(html_sel.type, "html") + li_sel_list = html_sel.css("style") + li_sel_list.drop() + self.assertEqual(html_sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(html_sel.type, "html") + def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( text="