@@ -1,5 +1,6 @@
 import asyncio
 import sys
+import time
 from unittest.mock import ANY, AsyncMock, patch
 
 import aiohttp
@@ -1934,3 +1935,264 @@ def fake_parse_or_search_proxy(proxy):
     )
     with pytest.raises(ValueError, match="Invalid proxy"):
         ChromiumLoader(["http://example.com"], backend="playwright", proxy="bad_proxy")
+
+
+@pytest.mark.asyncio
+async def test_alazy_load_with_single_url_string(monkeypatch):
+    """Test that alazy_load yields Document objects when urls is a string (iterating over characters)."""
+    # Passing a string as urls; alazy_load will iterate over its characters.
+    loader = ChromiumLoader(
+        "http://example.com", backend="playwright", requires_js_support=False
+    )
+
+    async def dummy_scraper(url, browser_name="chromium"):
+        return f"<html>{url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
+    docs = [doc async for doc in loader.alazy_load()]
+    # The expected number of documents is the length of the string.
+    expected_length = len("http://example.com")
+    assert len(docs) == expected_length
+    # Check that the first document's source is the first character ('h').
+    assert docs[0].metadata["source"] == "h"
+
+
+def test_lazy_load_with_single_url_string(monkeypatch):
+    """Test that lazy_load yields Document objects when urls is a string (iterating over characters)."""
+    loader = ChromiumLoader(
+        "http://example.com", backend="playwright", requires_js_support=False
+    )
+
+    async def dummy_scraper(url, browser_name="chromium"):
+        return f"<html>{url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
+    docs = list(loader.lazy_load())
+    expected_length = len("http://example.com")
+    assert len(docs) == expected_length
+    # The first character from the URL is 'h'.
+    assert docs[0].metadata["source"] == "h"
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_scroll_invalid_type():
+    """Test that ascrape_playwright_scroll raises TypeError when an invalid type is passed for sleep."""
+    loader = ChromiumLoader(["http://example.com"], backend="playwright")
+    # Passing a non-numeric sleep value should trigger a TypeError.
+    with pytest.raises(TypeError):
+        await loader.ascrape_playwright_scroll(
+            "http://example.com", scroll=6000, sleep="2", scroll_to_bottom=False
+        )
+
+
+@pytest.mark.asyncio
+async def test_alazy_load_non_iterable_urls():
+    """Test that alazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
+    with pytest.raises(TypeError):
+        # Passing an integer as urls should cause a TypeError during iteration.
+        loader = ChromiumLoader(123, backend="playwright")
+        [doc async for doc in loader.alazy_load()]
+
+
+def test_lazy_load_non_iterable_urls():
+    """Test that lazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
+    with pytest.raises(TypeError):
+        # Passing an integer as urls should cause a TypeError during iteration.
+        loader = ChromiumLoader(456, backend="playwright")
+        list(loader.lazy_load())
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_caplog(monkeypatch, caplog):
+    """
+    Test that a failed scraping attempt is logged and a later attempt recovers.
+    The dummy below simulates the loader's retry behaviour (one Timeout, then
+    success) instead of exercising the real Playwright code path.
+    """
+    import logging
+
+    # Create a loader instance with a retry limit of 2 and a short timeout.
+    loader = ChromiumLoader(
+        ["http://example.com"], backend="playwright", retry_limit=2, timeout=1
+    )
+    attempt = {"count": 0}
+
+    async def dummy_ascrape(url, browser_name="chromium"):
+        # Simulate the retry loop: the first attempt times out, the second succeeds.
+        for attempt_number in range(1, 3):  # two attempts, matching retry_limit=2
+            try:
+                if attempt["count"] < 1:
+                    attempt["count"] += 1
+                    raise asyncio.TimeoutError("Simulated Timeout")
+                return "Recovered Content"
+            except asyncio.TimeoutError as e:
+                logging.error(f"Attempt {attempt_number} failed: {e}")
+        raise RuntimeError("Failed to scrape after 2 attempts")
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_ascrape)
+    with caplog.at_level("ERROR"):
+        result = await loader.ascrape_playwright("http://example.com")
+    assert "Recovered Content" in result
+    assert any(
+        "Attempt 1 failed: Simulated Timeout" in record.message
+        for record in caplog.records
+    )
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_storage_state_and_https_errors(monkeypatch):
+    """
+    Test that storage_state is kept on the loader and that ignore_https_errors
+    is forwarded to new_context. The Dummy* classes below are minimal stand-ins
+    for the Playwright objects the loader is assumed to touch.
+    """
+    class DummyPage:
+        async def goto(self, url, wait_until=None):
+            return
+
+        async def wait_for_load_state(self, state=None):
+            return
+
+        async def content(self):
+            return "<html>Ignore HTTPS errors Test</html>"
+
+    class DummyContext:
+        def __init__(self):
+            self.new_page_called = False
+
+        async def new_page(self):
+            self.new_page_called = True
+            return DummyPage()
+
+    class DummyBrowser:
+        def __init__(self):
+            self.new_context_kwargs = None
+
+        async def new_context(self, **kwargs):
+            self.new_context_kwargs = kwargs
+            return DummyContext()
+
+        async def close(self):
+            return
+
+    class DummyPW:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return
+
+        class chromium:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+    monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
+
+    # Initialize the loader with a non-empty storage_state value.
+    loader = ChromiumLoader(
+        ["http://example.com"], backend="playwright", storage_state="dummy_state"
+    )
+
+    # Call ascrape_playwright and capture its result.
+    result = await loader.ascrape_playwright("http://example.com")
+
+    # To verify that ignore_https_errors was passed into new_context,
+    # simulate a separate launch to inspect the new_context_kwargs.
+    browser_instance = await DummyPW.chromium.launch(
+        headless=loader.headless, proxy=loader.proxy
+    )
+    await browser_instance.new_context(
+        storage_state=loader.storage_state, ignore_https_errors=True
+    )
+    kwargs = browser_instance.new_context_kwargs
+
+    assert kwargs is not None
+    assert kwargs.get("ignore_https_errors") is True
+    assert kwargs.get("storage_state") == "dummy_state"
+    assert "<html>Ignore HTTPS errors Test</html>" in result
+
+
+@pytest.mark.asyncio
+async def test_ascrape_with_js_support_context_error_cleanup(monkeypatch):
+    """Test that ascrape_with_js_support calls browser.close() even if new_context fails."""
+    close_called = {"called": False}
+
+    class DummyBrowser:
+        async def new_context(self, **kwargs):
+            # Force an exception during context creation.
+            raise Exception("Context error")
+
+        async def close(self):
+            close_called["called"] = True
+
+    class DummyPW:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return
+
+        class chromium:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+        class firefox:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+    monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
+    loader = ChromiumLoader(
+        ["http://example.com"],
+        backend="playwright",
+        requires_js_support=True,
+        retry_limit=1,
+        timeout=1,
+    )
+    with pytest.raises(RuntimeError, match="Failed to scrape after 1 attempts"):
+        await loader.ascrape_with_js_support("http://example.com")
+    assert close_called["called"] is True
+
+
+def test_lazy_load_with_none_urls():
+    """Test that lazy_load raises TypeError when urls is None."""
+    loader = ChromiumLoader(None, backend="playwright")
+    with pytest.raises(TypeError):
+        list(loader.lazy_load())
+
+
+def test_lazy_load_sequential_timing(monkeypatch):
+    """Test that lazy_load runs scraping sequentially rather than concurrently."""
+    urls = ["http://example.com/1", "http://example.com/2", "http://example.com/3"]
+    loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)
+
+    async def dummy_scraper_with_delay(url, browser_name="chromium"):
+        await asyncio.sleep(0.5)
+        return f"<html>Delayed content for {url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper_with_delay)
+    start = time.monotonic()
+    docs = list(loader.lazy_load())
+    elapsed = time.monotonic() - start
+    # With sequential execution, at least 0.5 seconds per URL should be observed.
+    assert elapsed >= 1.5, (
+        f"Sequential lazy_load took too little time: {elapsed:.2f} seconds"
+    )
+    for doc, url in zip(docs, urls):
+        assert f"Delayed content for {url}" in doc.page_content
+        assert doc.metadata["source"] == url
+
+
+def test_lazy_load_with_tuple_urls(monkeypatch):
+    """Test that lazy_load yields Document objects correctly when urls is provided as a tuple."""
+    urls = ("http://example.com", "http://test.com")
+    loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)
+
+    async def dummy_scraper(url, browser_name="chromium"):
+        return f"<html>Tuple content for {url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
+    docs = list(loader.lazy_load())
+    assert len(docs) == 2
+    for doc, url in zip(docs, urls):
+        assert f"Tuple content for {url}" in doc.page_content
+        assert doc.metadata["source"] == url