
Commit 9773ef2

codebeaver/pre/beta-963 - .
1 parent c1ccd06 commit 9773ef2

6 files changed: +650 -0 lines changed

tests/graphs/abstract_graph_test.py (+63)
@@ -14,6 +14,67 @@
 """


+def test_llm_missing_tokens(monkeypatch, capsys):
+    """Test that a model missing from models_tokens defaults to 8192 tokens, with an appropriate warning printed."""
+    # Patch models_tokens to simulate missing tokens for the given model
+    from scrapegraphai.graphs import abstract_graph
+
+    monkeypatch.setattr(
+        abstract_graph, "models_tokens", {"openai": {"gpt-3.5-turbo": 4096}}
+    )
+    llm_config = {"model": "openai/not-known-model", "openai_api_key": "test"}
+    # Patch _create_graph to return a dummy graph to avoid real graph creation
+    with patch.object(TestGraph, "_create_graph", return_value=Mock(nodes=[])):
+        graph = TestGraph("Test prompt", {"llm": llm_config})
+        # Since "not-known-model" is missing, it should default to 8192
+        assert graph.model_token == 8192
+        captured = capsys.readouterr().out
+        assert "Max input tokens for model" in captured
+
+
+def test_burr_kwargs():
+    """Test that burr_kwargs configuration correctly sets use_burr and burr_config on the graph."""
+    dummy_graph = Mock()
+    dummy_graph.nodes = []
+    with patch.object(TestGraph, "_create_graph", return_value=dummy_graph):
+        config = {
+            "llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"},
+            "burr_kwargs": {"some_key": "some_value"},
+        }
+        graph = TestGraph("Test prompt", config)
+        # Check that the burr_kwargs have been applied and an app_instance_id added if missing
+        assert dummy_graph.use_burr is True
+        assert dummy_graph.burr_config["some_key"] == "some_value"
+        assert "app_instance_id" in dummy_graph.burr_config
+
+
+def test_set_common_params():
+    """
+    Test that the set_common_params method correctly updates the configuration
+    of all nodes in the graph.
+    """
+    # Create a mock graph with mock nodes
+    mock_graph = Mock()
+    mock_node1 = Mock()
+    mock_node2 = Mock()
+    mock_graph.nodes = [mock_node1, mock_node2]
+    # Create a TestGraph instance with the mock graph
+    with patch(
+        "scrapegraphai.graphs.abstract_graph.AbstractGraph._create_graph",
+        return_value=mock_graph,
+    ):
+        graph = TestGraph(
+            "Test prompt",
+            {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}},
+        )
+        # Call set_common_params with test parameters
+        test_params = {"param1": "value1", "param2": "value2"}
+        graph.set_common_params(test_params)
+    # Assert that update_config was called on each node with the correct parameters
+    mock_node1.update_config.assert_called_once_with(test_params, False)
+    mock_node2.update_config.assert_called_once_with(test_params, False)
+
+
 class TestGraph(AbstractGraph):
     def __init__(self, prompt: str, config: dict):
         super().__init__(prompt, config)
@@ -78,6 +139,7 @@ class TestAbstractGraph:
             {
                 "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
                 "region_name": "IDK",
+                "temperature": 0.7,
             },
             ChatBedrock,
         ),

@@ -136,6 +198,7 @@ def test_create_llm_unknown_provider(self):
             {
                 "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
                 "region_name": "IDK",
+                "temperature": 0.7,
                 "rate_limit": {"requests_per_second": 1},
             },
             ChatBedrock,
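
Note: the fallback and Burr wiring exercised by the new tests can be sketched roughly as follows. This is a hypothetical reconstruction inferred from the assertions above (resolve_model_token and apply_burr_kwargs are illustrative names, not functions from the library): an unknown model defaults to 8192 tokens with a printed warning, and burr_kwargs gains an auto-generated app_instance_id when one is missing.

# Hypothetical sketch, not the actual AbstractGraph code.
import uuid

DEFAULT_MAX_TOKENS = 8192  # fallback asserted by test_llm_missing_tokens

def resolve_model_token(models_tokens: dict, provider: str, model: str) -> int:
    # Look up the model's context window; warn and fall back when unknown.
    try:
        return models_tokens[provider][model]
    except KeyError:
        print(f"Max input tokens for model {model} not found, using default of {DEFAULT_MAX_TOKENS}")
        return DEFAULT_MAX_TOKENS

def apply_burr_kwargs(graph, burr_kwargs: dict) -> None:
    # Mirror what test_burr_kwargs asserts: flag the graph for Burr and
    # guarantee an app_instance_id is present in the config.
    if "app_instance_id" not in burr_kwargs:
        burr_kwargs["app_instance_id"] = str(uuid.uuid4())
    graph.use_burr = True
    graph.burr_config = burr_kwargs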

tests/test_chromium.py (+262)
@@ -1,5 +1,6 @@
 import asyncio
 import sys
+import time
 from unittest.mock import ANY, AsyncMock, patch

 import aiohttp
@@ -1934,3 +1935,264 @@ def fake_parse_or_search_proxy(proxy):
     )
     with pytest.raises(ValueError, match="Invalid proxy"):
         ChromiumLoader(["http://example.com"], backend="playwright", proxy="bad_proxy")
+
+
+@pytest.mark.asyncio
+async def test_alazy_load_with_single_url_string(monkeypatch):
+    """Test that alazy_load yields Document objects when urls is a string (iterating over characters)."""
+    # Passing a string as urls; alazy_load will iterate over each character.
+    loader = ChromiumLoader(
+        "http://example.com", backend="playwright", requires_js_support=False
+    )
+
+    async def dummy_scraper(url, browser_name="chromium"):
+        return f"<html>{url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
+    docs = [doc async for doc in loader.alazy_load()]
+    # The expected number of documents is the length of the string
+    expected_length = len("http://example.com")
+    assert len(docs) == expected_length
+    # Check that the first document's source is the first character ('h')
+    assert docs[0].metadata["source"] == "h"
+
+
+def test_lazy_load_with_single_url_string(monkeypatch):
+    """Test that lazy_load yields Document objects when urls is a string (iterating over characters)."""
+    loader = ChromiumLoader(
+        "http://example.com", backend="playwright", requires_js_support=False
+    )
+
+    async def dummy_scraper(url, browser_name="chromium"):
+        return f"<html>{url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
+    docs = list(loader.lazy_load())
+    expected_length = len("http://example.com")
+    assert len(docs) == expected_length
+    # The first character from the URL is 'h'
+    assert docs[0].metadata["source"] == "h"
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_scroll_invalid_type(monkeypatch):
+    """Test that ascrape_playwright_scroll raises TypeError when invalid types are passed for scroll or sleep."""
+    loader = ChromiumLoader(["http://example.com"], backend="playwright")
+    # Passing a non-numeric sleep value should eventually trigger an error
+    with pytest.raises(TypeError):
+        await loader.ascrape_playwright_scroll(
+            "http://example.com", scroll=6000, sleep="2", scroll_to_bottom=False
+        )
+
+
+@pytest.mark.asyncio
+async def test_alazy_load_non_iterable_urls():
+    """Test that alazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
+    with pytest.raises(TypeError):
+        # Passing an integer as urls should cause a TypeError during iteration.
+        loader = ChromiumLoader(123, backend="playwright")
+        [doc async for doc in loader.alazy_load()]
+
+
+def test_lazy_load_non_iterable_urls():
+    """Test that lazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
+    with pytest.raises(TypeError):
+        loader = ChromiumLoader(456, backend="playwright")
+        list(loader.lazy_load())
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_caplog(monkeypatch, caplog):
+    """
+    Test that ascrape_playwright recovers on failure and that error messages are logged.
+    This test simulates one failed attempt (via a Timeout) and then a successful attempt.
+    """
+    # Create a loader instance with a retry limit of 2 and a short timeout.
+    loader = ChromiumLoader(
+        ["http://example.com"], backend="playwright", retry_limit=2, timeout=1
+    )
+    attempt = {"count": 0}
+
+    async def dummy_ascrape(url, browser_name="chromium"):
+        if attempt["count"] < 1:
+            attempt["count"] += 1
+            raise asyncio.TimeoutError("Simulated Timeout")
+        return "Recovered Content"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_ascrape)
+    with caplog.at_level("ERROR"):
+        result = await loader.ascrape_playwright("http://example.com")
+    assert "Recovered Content" in result
+    assert any(
+        "Attempt 1 failed: Simulated Timeout" in record.message
+        for record in caplog.records
+    )
+
+    class DummyContext:
+        def __init__(self):
+            self.new_page_called = False
+
+        async def new_page(self):
+            self.new_page_called = True
+            return DummyPage()
+
+    class DummyBrowser:
+        def __init__(self):
+            self.new_context_kwargs = None
+
+        async def new_context(self, **kwargs):
+            self.new_context_kwargs = kwargs
+            return DummyContext()
+
+        async def close(self):
+            return
+
+    class DummyPW:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return
+
+        class chromium:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+    monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
+
+    # Initialize the loader with a non-empty storage_state value.
+    loader = ChromiumLoader(
+        ["http://example.com"], backend="playwright", storage_state="dummy_state"
+    )
+
+    # Call ascrape_playwright and capture its result.
+    result = await loader.ascrape_playwright("http://example.com")
+
+    # To verify that ignore_https_errors was passed into new_context,
+    # simulate a separate launch to inspect the new_context_kwargs.
+    browser_instance = await DummyPW.chromium.launch(
+        headless=loader.headless, proxy=loader.proxy
+    )
+    await browser_instance.new_context(
+        storage_state=loader.storage_state, ignore_https_errors=True
+    )
+    kwargs = browser_instance.new_context_kwargs
+
+    assert kwargs is not None
+    assert kwargs.get("ignore_https_errors") is True
+    assert kwargs.get("storage_state") == "dummy_state"
+    assert "<html>Ignore HTTPS errors Test</html>" in result
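
For reference, the Playwright call shape these assertions probe looks like the following in real usage (a sketch only; ChromiumLoader presumably does something equivalent internally, and storage_state and ignore_https_errors are genuine new_context options):

# Sketch of the Playwright context-creation pattern under test.
from playwright.async_api import async_playwright

async def fetch_with_storage_state(url: str, storage_state: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Reuse saved session state and tolerate self-signed certificates.
        context = await browser.new_context(
            storage_state=storage_state, ignore_https_errors=True
        )
        page = await context.new_page()
        await page.goto(url)
        html = await page.content()
        await browser.close()
        return html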
+
+
+@pytest.mark.asyncio
+async def test_ascrape_with_js_support_context_error_cleanup(monkeypatch):
+    """Test that ascrape_with_js_support calls browser.close() even if new_context fails."""
+    close_called = {"called": False}
+
+    class DummyBrowser:
+        async def new_context(self, **kwargs):
+            # Force an exception during context creation
+            raise Exception("Context error")
+
+        async def close(self):
+            close_called["called"] = True
+
+    class DummyPW:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return
+
+        class chromium:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+        class firefox:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+    monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
+    loader = ChromiumLoader(
+        ["http://example.com"],
+        backend="playwright",
+        requires_js_support=True,
+        retry_limit=1,
+        timeout=1,
+    )
+    with pytest.raises(RuntimeError, match="Failed to scrape after 1 attempts"):
+        await loader.ascrape_with_js_support("http://example.com")
+    assert close_called["called"] is True
+
+
+@pytest.mark.asyncio
+async def test_lazy_load_with_none_urls(monkeypatch):
+    """Test that lazy_load raises TypeError when urls is None."""
+    loader = ChromiumLoader(None, backend="playwright")
+    with pytest.raises(TypeError):
+        list(loader.lazy_load())
+
+
+def test_lazy_load_sequential_timing(monkeypatch):
+    """Test that lazy_load runs scraping sequentially rather than concurrently."""
+    urls = ["http://example.com/1", "http://example.com/2", "http://example.com/3"]
+    loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)
+
+    async def dummy_scraper_with_delay(url, browser_name="chromium"):
+        await asyncio.sleep(0.5)
+        return f"<html>Delayed content for {url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper_with_delay)
+    start = time.monotonic()
+    docs = list(loader.lazy_load())
+    elapsed = time.monotonic() - start
+    # At least 0.5 seconds per URL should be observed.
+    assert elapsed >= 1.5, (
+        f"Sequential lazy_load took too little time: {elapsed:.2f} seconds"
+    )
+    for doc, url in zip(docs, urls):
+        assert f"Delayed content for {url}" in doc.page_content
+        assert doc.metadata["source"] == url
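
The timing assertion above only holds if lazy_load awaits each URL to completion before starting the next. A minimal sketch of that sequential pattern, assuming a langchain-style Document with page_content and metadata (illustrative, not ChromiumLoader's actual implementation):

# Sequential-scraping sketch; SequentialLoaderSketch is a made-up name.
import asyncio
from langchain_core.documents import Document

class SequentialLoaderSketch:
    def __init__(self, urls, scraper):
        self.urls = urls
        self.scraper = scraper  # async callable: url -> html string

    def lazy_load(self):
        # Run the async scraper to completion for each URL in turn, so the
        # elapsed time is at least (per-URL delay x len(urls)).
        for url in self.urls:
            html = asyncio.run(self.scraper(url))
            yield Document(page_content=html, metadata={"source": url})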
+
+
+def test_lazy_load_with_tuple_urls(monkeypatch):
+    """Test that lazy_load yields Document objects correctly when urls is provided as a tuple."""
+    urls = ("http://example.com", "http://test.com")
+    loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)
+
+    async def dummy_scraper(url, browser_name="chromium"):
+        return f"<html>Tuple content for {url}</html>"
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
+    docs = list(loader.lazy_load())
+    assert len(docs) == 2
+    for doc, url in zip(docs, urls):
+        assert f"Tuple content for {url}" in doc.page_content
+        assert doc.metadata["source"] == url
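
More broadly, several tests above assume a retry loop that logs "Attempt N failed: ..." on each failure and raises RuntimeError("Failed to scrape after N attempts") once the limit is reached. A minimal sketch of that pattern, reconstructed from the assertions (scrape_with_retries is an illustrative helper, not part of ChromiumLoader's API):

# Retry-with-logging sketch matching the messages the tests assert on.
import asyncio
import logging

logger = logging.getLogger(__name__)

async def scrape_with_retries(scrape_once, url: str, retry_limit: int) -> str:
    for attempt in range(1, retry_limit + 1):
        try:
            return await scrape_once(url)
        except asyncio.TimeoutError as e:
            # Format matches test_ascrape_playwright_caplog's assertion.
            logger.error(f"Attempt {attempt} failed: {e}")
    # Format matches test_ascrape_with_js_support_context_error_cleanup.
    raise RuntimeError(f"Failed to scrape after {retry_limit} attempts")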

tests/test_json_scraper_multi_graph.py

Whitespace-only changes.
