Skip to content

Commit 16e5913

Browse files
authored
Update pypi to opensource (#93)
* update docs * update pyproject.toml * fix typing * fix formatting * fix return ellipsis
1 parent 7b5bdeb commit 16e5913

File tree

11 files changed

+186
-115
lines changed

11 files changed

+186
-115
lines changed

.env.example

-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
DENDRITE_API_KEY=
2-
OPENAI_API_KEY=
31
ANTHROPIC_API_KEY=
42

53
# If using Browserbase

.github/workflows/ci.yml

-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ jobs:
4949
test:
5050
runs-on: ubuntu-latest
5151
env:
52-
DENDRITE_API_KEY: ${{secrets.DENDRITE_API_KEY}}
5352
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
5453
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
5554
steps:

dendrite/browser/async_api/dendrite_browser.py

+49-29
Original file line numberDiff line numberDiff line change
@@ -61,23 +61,16 @@ class AsyncDendrite(
6161
AsyncDendrite is a class that manages a browser instance using Playwright, allowing
6262
interactions with web pages using natural language.
6363
64-
This class handles initialization with API keys for Dendrite, OpenAI, and Anthropic, manages browser
65-
contexts, and provides methods for navigation, authentication, and other browser-related tasks.
64+
This class handles initialization with configuration options, manages browser contexts,
65+
and provides methods for navigation, authentication, and other browser-related tasks.
6666
6767
Attributes:
68-
id (UUID): The unique identifier for the AsyncDendrite instance.
69-
auth_data (Optional[AuthSession]): The authentication session data for the browser.
70-
dendrite_api_key (str): The API key for Dendrite, used for interactions with the Dendrite API.
71-
playwright_options (dict): Options for configuring the Playwright browser instance.
72-
playwright (Optional[Playwright]): The Playwright instance managing the browser.
68+
id (str): The unique identifier for the AsyncDendrite instance.
7369
browser_context (Optional[BrowserContext]): The current browser context, which may include cookies and other session data.
7470
active_page_manager (Optional[PageManager]): The manager responsible for handling active pages within the browser context.
7571
user_id (Optional[str]): The user ID associated with the browser session.
76-
browser_api_client (BrowserAPIClient): The API client used for communicating with the Dendrite API.
77-
api_config (APIConfig): The configuration for the language models, including API keys for OpenAI and Anthropic.
78-
79-
Raises:
80-
Exception: If any of the required API keys (Dendrite, OpenAI, Anthropic) are not provided or found in the environment variables.
72+
logic_engine (AsyncLogicEngine): The engine used for processing natural language interactions.
73+
closed (bool): Whether the browser instance has been closed.
8174
"""
8275

8376
def __init__(
@@ -94,10 +87,14 @@ def __init__(
9487
Initialize AsyncDendrite with optional domain authentication.
9588
9689
Args:
97-
playwright_options: Options for configuring Playwright
98-
remote_config: Remote browser provider configuration
99-
config: Configuration object
100-
auth: List of domains or single domain to load authentication state for
90+
playwright_options (dict): Options for configuring Playwright browser instance.
91+
Defaults to non-headless mode with stealth arguments.
92+
remote_config (Optional[Providers]): Remote browser provider configuration.
93+
Defaults to None for local browser.
94+
config (Optional[Config]): Configuration object for the instance.
95+
Defaults to a new Config instance.
96+
auth (Optional[Union[List[str], str]]): List of domains or single domain
97+
to load authentication state for. Defaults to None.
10198
"""
10299
self._impl = self._get_impl(remote_config)
103100
self._playwright_options = playwright_options
@@ -195,19 +192,23 @@ async def goto(
195192
expected_page: str = "",
196193
) -> AsyncPage:
197194
"""
198-
Navigates to the specified URL, optionally in a new tab
195+
Navigates to the specified URL, optionally in a new tab.
199196
200197
Args:
201-
url (str): The URL to navigate to.
202-
new_tab (bool, optional): Whether to open the URL in a new tab. Defaults to False.
203-
timeout (Optional[float], optional): The maximum time (in milliseconds) to wait for the page to load. Defaults to 15000.
204-
expected_page (str, optional): A description of the expected page type for verification. Defaults to an empty string.
198+
url (str): The URL to navigate to. If no protocol is specified, https:// will be added.
199+
new_tab (bool): Whether to open the URL in a new tab. Defaults to False.
200+
timeout (Optional[float]): The maximum time in milliseconds to wait for navigation.
201+
Defaults to 15000ms. Navigation will continue even if timeout occurs.
202+
expected_page (str): A description of the expected page type for verification.
203+
If provided, will verify the loaded page matches the description.
204+
Defaults to empty string (no verification).
205205
206206
Returns:
207207
AsyncPage: The page object after navigation.
208208
209209
Raises:
210-
Exception: If there is an error during navigation or if the expected page type is not found.
210+
IncorrectOutcomeError: If expected_page is provided and the loaded page
211+
doesn't match the expected description.
211212
"""
212213
# Check if the URL has a protocol
213214
if not re.match(r"^\w+://", url):
@@ -245,8 +246,13 @@ async def scroll_to_bottom(
245246
"""
246247
Scrolls to the bottom of the current page.
247248
248-
Returns:
249-
None
249+
Args:
250+
timeout (float): Maximum time in milliseconds to attempt scrolling.
251+
Defaults to 30000ms.
252+
scroll_increment (int): Number of pixels to scroll in each step.
253+
Defaults to 1000 pixels.
254+
no_progress_limit (int): Number of consecutive attempts with no progress
255+
before stopping. Defaults to 3 attempts.
250256
"""
251257
active_page = await self.get_active_page()
252258
await active_page.scroll_to_bottom(
@@ -305,10 +311,12 @@ async def add_cookies(self, cookies):
305311
Adds cookies to the current browser context.
306312
307313
Args:
308-
cookies (List[Dict[str, Any]]): A list of cookies to be added to the browser context.
314+
cookies (List[Dict[str, Any]]): A list of cookie objects to be added.
315+
Each cookie should be a dictionary with standard cookie attributes
316+
(name, value, domain, etc.).
309317
310318
Raises:
311-
Exception: If the browser context is not initialized.
319+
DendriteException: If the browser context is not initialized.
312320
"""
313321
if not self.browser_context:
314322
raise DendriteException("Browser context not initialized")
@@ -446,8 +454,16 @@ async def save_auth(self, url: str) -> None:
446454
"""
447455
Save authentication state for a specific domain.
448456
457+
This method captures and stores the current browser context's storage state
458+
(cookies and origin data) for the specified domain. The state can later be
459+
used to restore authentication.
460+
449461
Args:
450-
domain (str): Domain to save authentication for (e.g., "github.com")
462+
url (str): URL or domain to save authentication for (e.g., "github.com"
463+
or "https://github.com"). The domain will be extracted from the URL.
464+
465+
Raises:
466+
DendriteException: If the browser context is not initialized.
451467
"""
452468
if not self.browser_context:
453469
raise DendriteException("Browser context not initialized")
@@ -482,11 +498,15 @@ async def setup_auth(
482498
message: str = "Please log in to the website. Once done, press Enter to continue...",
483499
) -> None:
484500
"""
485-
Set up authentication for a specific URL.
501+
Set up authentication for a specific URL by guiding the user through login.
502+
503+
This method opens a browser window, navigates to the specified URL, waits for
504+
the user to complete the login process, and then saves the authentication state.
486505
487506
Args:
488507
url (str): URL to navigate to for login
489-
message (str): Message to show while waiting for user input
508+
message (str): Message to show while waiting for user to complete login.
509+
Defaults to standard login instruction message.
490510
"""
491511
# Extract domain from URL
492512
# domain = urlparse(url).netloc

dendrite/browser/sync_api/dendrite_browser.py

+49-29
Original file line numberDiff line numberDiff line change
@@ -58,23 +58,16 @@ class Dendrite(
5858
Dendrite is a class that manages a browser instance using Playwright, allowing
5959
interactions with web pages using natural language.
6060
61-
This class handles initialization with API keys for Dendrite, OpenAI, and Anthropic, manages browser
62-
contexts, and provides methods for navigation, authentication, and other browser-related tasks.
61+
This class handles initialization with configuration options, manages browser contexts,
62+
and provides methods for navigation, authentication, and other browser-related tasks.
6363
6464
Attributes:
65-
id (UUID): The unique identifier for the Dendrite instance.
66-
auth_data (Optional[AuthSession]): The authentication session data for the browser.
67-
dendrite_api_key (str): The API key for Dendrite, used for interactions with the Dendrite API.
68-
playwright_options (dict): Options for configuring the Playwright browser instance.
69-
playwright (Optional[Playwright]): The Playwright instance managing the browser.
65+
id (str): The unique identifier for the Dendrite instance.
7066
browser_context (Optional[BrowserContext]): The current browser context, which may include cookies and other session data.
7167
active_page_manager (Optional[PageManager]): The manager responsible for handling active pages within the browser context.
7268
user_id (Optional[str]): The user ID associated with the browser session.
73-
browser_api_client (BrowserAPIClient): The API client used for communicating with the Dendrite API.
74-
api_config (APIConfig): The configuration for the language models, including API keys for OpenAI and Anthropic.
75-
76-
Raises:
77-
Exception: If any of the required API keys (Dendrite, OpenAI, Anthropic) are not provided or found in the environment variables.
69+
logic_engine (LogicEngine): The engine used for processing natural language interactions.
70+
closed (bool): Whether the browser instance has been closed.
7871
"""
7972

8073
def __init__(
@@ -88,10 +81,14 @@ def __init__(
8881
Initialize Dendrite with optional domain authentication.
8982
9083
Args:
91-
playwright_options: Options for configuring Playwright
92-
remote_config: Remote browser provider configuration
93-
config: Configuration object
94-
auth: List of domains or single domain to load authentication state for
84+
playwright_options (dict): Options for configuring Playwright browser instance.
85+
Defaults to non-headless mode with stealth arguments.
86+
remote_config (Optional[Providers]): Remote browser provider configuration.
87+
Defaults to None for local browser.
88+
config (Optional[Config]): Configuration object for the instance.
89+
Defaults to a new Config instance.
90+
auth (Optional[Union[List[str], str]]): List of domains or single domain
91+
to load authentication state for. Defaults to None.
9592
"""
9693
self._impl = self._get_impl(remote_config)
9794
self._playwright_options = playwright_options
@@ -182,19 +179,23 @@ def goto(
182179
expected_page: str = "",
183180
) -> Page:
184181
"""
185-
Navigates to the specified URL, optionally in a new tab
182+
Navigates to the specified URL, optionally in a new tab.
186183
187184
Args:
188-
url (str): The URL to navigate to.
189-
new_tab (bool, optional): Whether to open the URL in a new tab. Defaults to False.
190-
timeout (Optional[float], optional): The maximum time (in milliseconds) to wait for the page to load. Defaults to 15000.
191-
expected_page (str, optional): A description of the expected page type for verification. Defaults to an empty string.
185+
url (str): The URL to navigate to. If no protocol is specified, https:// will be added.
186+
new_tab (bool): Whether to open the URL in a new tab. Defaults to False.
187+
timeout (Optional[float]): The maximum time in milliseconds to wait for navigation.
188+
Defaults to 15000ms. Navigation will continue even if timeout occurs.
189+
expected_page (str): A description of the expected page type for verification.
190+
If provided, will verify the loaded page matches the description.
191+
Defaults to empty string (no verification).
192192
193193
Returns:
194194
Page: The page object after navigation.
195195
196196
Raises:
197-
Exception: If there is an error during navigation or if the expected page type is not found.
197+
IncorrectOutcomeError: If expected_page is provided and the loaded page
198+
doesn't match the expected description.
198199
"""
199200
if not re.match("^\\w+://", url):
200201
url = f"https://{url}"
@@ -227,8 +228,13 @@ def scroll_to_bottom(
227228
"""
228229
Scrolls to the bottom of the current page.
229230
230-
Returns:
231-
None
231+
Args:
232+
timeout (float): Maximum time in milliseconds to attempt scrolling.
233+
Defaults to 30000ms.
234+
scroll_increment (int): Number of pixels to scroll in each step.
235+
Defaults to 1000 pixels.
236+
no_progress_limit (int): Number of consecutive attempts with no progress
237+
before stopping. Defaults to 3 attempts.
232238
"""
233239
active_page = self.get_active_page()
234240
active_page.scroll_to_bottom(
@@ -276,10 +282,12 @@ def add_cookies(self, cookies):
276282
Adds cookies to the current browser context.
277283
278284
Args:
279-
cookies (List[Dict[str, Any]]): A list of cookies to be added to the browser context.
285+
cookies (List[Dict[str, Any]]): A list of cookie objects to be added.
286+
Each cookie should be a dictionary with standard cookie attributes
287+
(name, value, domain, etc.).
280288
281289
Raises:
282-
Exception: If the browser context is not initialized.
290+
DendriteException: If the browser context is not initialized.
283291
"""
284292
if not self.browser_context:
285293
raise DendriteException("Browser context not initialized")
@@ -410,8 +418,16 @@ def save_auth(self, url: str) -> None:
410418
"""
411419
Save authentication state for a specific domain.
412420
421+
This method captures and stores the current browser context's storage state
422+
(cookies and origin data) for the specified domain. The state can later be
423+
used to restore authentication.
424+
413425
Args:
414-
domain (str): Domain to save authentication for (e.g., "github.com")
426+
url (str): URL or domain to save authentication for (e.g., "github.com"
427+
or "https://github.com"). The domain will be extracted from the URL.
428+
429+
Raises:
430+
DendriteException: If the browser context is not initialized.
415431
"""
416432
if not self.browser_context:
417433
raise DendriteException("Browser context not initialized")
@@ -439,11 +455,15 @@ def setup_auth(
439455
message: str = "Please log in to the website. Once done, press Enter to continue...",
440456
) -> None:
441457
"""
442-
Set up authentication for a specific URL.
458+
Set up authentication for a specific URL by guiding the user through login.
459+
460+
This method opens a browser window, navigates to the specified URL, waits for
461+
the user to complete the login process, and then saves the authentication state.
443462
444463
Args:
445464
url (str): URL to navigate to for login
446-
message (str): Message to show while waiting for user input
465+
message (str): Message to show while waiting for user to complete login.
466+
Defaults to standard login instruction message.
447467
"""
448468
domain = get_domain_w_suffix(url)
449469
try:

dendrite/logic/config.py

+28
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,41 @@
1010

1111

1212
class Config:
13+
"""
14+
Configuration class for Dendrite that manages file paths and LLM settings.
15+
16+
This class handles the configuration of cache locations, authentication sessions,
17+
and LLM (Large Language Model) settings for the Dendrite system.
18+
19+
Attributes:
20+
cache_path (Path): Path to the cache directory
21+
llm_config (LLMConfig): Configuration for large language models
22+
extract_cache (FileCache): Cache for extracted script data
23+
element_cache (FileCache): Cache for element selectors
24+
storage_cache (FileCache): Cache for browser storage states
25+
auth_session_path (Path): Path to authentication session data
26+
"""
27+
1328
def __init__(
1429
self,
1530
root_path: Union[str, Path] = ".dendrite",
1631
cache_path: Union[str, Path] = "cache",
1732
auth_session_path: Union[str, Path] = "auth",
1833
llm_config: Optional[LLMConfig] = None,
1934
):
35+
"""
36+
Initialize the Config with specified paths and LLM configuration.
37+
38+
Args:
39+
root_path (Union[str, Path]): Base directory for all Dendrite data.
40+
Defaults to ".dendrite".
41+
cache_path (Union[str, Path]): Directory name for cache storage relative
42+
to root_path. Defaults to "cache".
43+
auth_session_path (Union[str, Path]): Directory name for authentication
44+
sessions relative to root_path. Defaults to "auth".
45+
llm_config (Optional[LLMConfig]): Configuration for language models.
46+
If None, creates a default LLMConfig instance.
47+
"""
2048
self.cache_path = root_path / Path(cache_path)
2149
self.llm_config = llm_config or LLMConfig()
2250
self.extract_cache = FileCache(Script, self.cache_path / "extract.json")

dendrite/logic/extract/compress_html.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from collections import Counter
44
from typing import List, Optional, Tuple, TypedDict, Union
55

6-
from bs4 import BeautifulSoup, NavigableString, PageElement
6+
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
77
from bs4.element import Tag
88

99
from dendrite.logic.dom.truncate import (
@@ -385,18 +385,25 @@ def traverse(tag: Union[BeautifulSoup, Tag]):
385385
# child.replace_with(
386386
# f"[...{', '.join(next_element_tags)} tags collapsed for readability...]")
387387

388-
def remove_double_nested(soup):
389-
for tag in soup.find_all(True):
388+
def remove_double_nested(
389+
soup: Union[BeautifulSoup, Tag]
390+
) -> Union[BeautifulSoup, Tag]:
391+
for tag in soup.find_all():
390392
# If a tag only contains a single child of the same type
391-
if len(tag.find_all(True, recursive=False)) == 1 and isinstance(
392-
tag.contents[0], Tag
393+
children = tag.find_all(recursive=False)
394+
if (
395+
len(children) == 1
396+
and tag.contents
397+
and isinstance(tag.contents[0], Tag)
393398
):
394399
child_tag = tag.contents[0]
395400
# move the contents of the child tag up to the parent
396401
tag.clear()
397402
tag.extend(child_tag.contents)
398-
if len(tag.find_all(True, recursive=False)) == 1 and isinstance(
399-
tag.contents[0], Tag
403+
if (
404+
len(tag.find_all(recursive=False)) == 1
405+
and tag.contents
406+
and isinstance(tag.contents[0], Tag)
400407
):
401408
remove_double_nested(tag)
402409

dendrite/logic/get_element/get_element.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
async def get_element(dto: GetElementsDTO, config: Config) -> GetElementResponse:
2222
if isinstance(dto.prompt, str):
2323
return await process_prompt(dto.prompt, dto, config)
24-
raise ...
24+
raise NotImplementedError("Prompt is not a string")
2525

2626

2727
async def process_prompt(

0 commit comments

Comments
 (0)