From a0a48eea6972ef04ed31bf1d059630c7a52e3e37 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Thu, 27 Mar 2025 05:43:58 -0500 Subject: [PATCH 1/3] add hyperbrowser provider for cua agent --- libs/langgraph-cua/README.md | 123 ++++++++- libs/langgraph-cua/package.json | 2 + libs/langgraph-cua/src/index.ts | 27 ++ libs/langgraph-cua/src/nodes/call-model.ts | 35 ++- .../src/nodes/create-vm-instance.ts | 75 +++++- .../src/nodes/take-browser-action.ts | 240 ++++++++++++++++++ .../src/nodes/take-computer-action.ts | 21 +- libs/langgraph-cua/src/tests/cua.int.test.ts | 84 +++++- libs/langgraph-cua/src/types.ts | 50 ++++ libs/langgraph-cua/src/utils.ts | 67 ++++- yarn.lock | 84 +++++- 11 files changed, 766 insertions(+), 42 deletions(-) create mode 100644 libs/langgraph-cua/src/nodes/take-browser-action.ts diff --git a/libs/langgraph-cua/README.md b/libs/langgraph-cua/README.md index 54f491bce..6f0862010 100644 --- a/libs/langgraph-cua/README.md +++ b/libs/langgraph-cua/README.md @@ -28,7 +28,18 @@ yarn add @langchain/langgraph-cua @langchain/langgraph @langchain/core @langchai ## Quickstart -This project by default uses [Scrapybara](https://scrapybara.com/) for accessing a virtual machine to run the agent. To use LangGraph CUA, you'll need both OpenAI and Scrapybara API keys. +## Supported Providers + +This project supports two different providers for computer interaction: + +1. **[Scrapybara](https://scrapybara.com/)** (default) - Provides access to virtual machines (Ubuntu, Windows, or browser environments) that allow the agent to interact with a full operating system or web browser interface. + +2. **[Hyperbrowser](https://hyperbrowser.ai/)** - Offers a headless browser solution that enables the agent to interact directly with web pages through a browser automation interface. 
+ + +### Using Scrapybara (Default) + +To use LangGraph CUA with Scrapybara, you'll need both OpenAI and Scrapybara API keys: ```bash export OPENAI_API_KEY= @@ -82,6 +93,59 @@ main().catch(console.error); The above example will invoke the graph, passing in a request for it to do some research into LangGraph.js from the standpoint of a new contributor. The code will log the stream URL, which you can open in your browser to view the CUA stream. +### Using Hyperbrowser + +To use LangGraph CUA with Hyperbrowser, you'll need both OpenAI and Hyperbrowser API keys: + +```bash +export OPENAI_API_KEY= +export HYPERBROWSER_API_KEY= +``` + +Then, create the graph by importing the `createCua` function from the `@langchain/langgraph-cua` module and specifying the `provider` parameter as `hyperbrowser`. + +```typescript +import "dotenv/config"; +import { createCua } from "@langchain/langgraph-cua"; + +const cuaGraph = createCua({ provider: "hyperbrowser" }); + +// Define the input messages +const messages = [ + { + role: "system", + content: + "You're an advanced AI computer use assistant. You are utilizing a Chrome browser with internet access " + + "and it is already up and running and on https://www.google.com. You can interact with the browser page.", + }, + { + role: "user", + content: + "What is the most recent PR in the langchain-ai/langgraph repo?", + }, +]; + +async function main() { + // Stream the graph execution + const stream = await cuaGraph.stream( + { messages }, + { + streamMode: "updates", + subgraphs: true, + } + ); + + // Process the stream updates + for await (const update of stream) { + console.log(update); + } + + console.log("Done"); +} + +main().catch(console.error); +``` + You can find more examples inside the [`examples` directory](/libs/langgraph-cua/examples). 
## How to customize @@ -92,17 +156,26 @@ You can either pass these parameters when calling `createCua`, or at runtime whe ### Configuration Parameters -- `scrapybaraApiKey`: The API key to use for Scrapybara. If not provided, it defaults to reading the `SCRAPYBARA_API_KEY` environment variable. -- `timeoutHours`: The number of hours to keep the virtual machine running before it times out. +#### Common Parameters +- `provider`: The provider to use. Default is `"scrapybara"`. Options are `"scrapybara"` and `"hyperbrowser"`. - `zdrEnabled`: Whether or not Zero Data Retention is enabled in the user's OpenAI account. If `true`, the agent will not pass the `previous_response_id` to the model, and will always pass it the full message history for each request. If `false`, the agent will pass the `previous_response_id` to the model, and only the latest message in the history will be passed. Default `false`. - `recursionLimit`: The maximum number of recursive calls the agent can make. Default is 100. This is greater than the standard default of 25 in LangGraph, because computer use agents are expected to take more iterations. -- `authStateId`: The ID of the authentication state. If defined, it will be used to authenticate with Scrapybara. Only applies if 'environment' is set to 'web'. -- `environment`: The environment to use. Default is `web`. Options are `web`, `ubuntu`, and `windows`. - `prompt`: The prompt to pass to the model. This will be passed as the system message. - `nodeBeforeAction`: A custom node to run before the computer action. This function will receive the current state and config as parameters. - `nodeAfterAction`: A custom node to run after the computer action. This function will receive the current state and config as parameters. - `stateModifier`: Optional state modifier for customizing the agent's state. +#### Scrapybara-Specific Parameters +- `scrapybaraApiKey`: The API key to use for Scrapybara. 
If not provided, it defaults to reading the `SCRAPYBARA_API_KEY` environment variable. +- `timeoutHours`: The number of hours to keep the virtual machine running before it times out. +- `authStateId`: The ID of the authentication state. If defined, it will be used to authenticate with Scrapybara. Only applies if 'environment' is set to 'web'. +- `environment`: The environment to use. Default is `web`. Options are `web`, `ubuntu`, and `windows`. + +#### Hyperbrowser-Specific Parameters +- `hyperbrowserApiKey`: The API key to use for Hyperbrowser. If not provided, it defaults to reading the `HYPERBROWSER_API_KEY` environment variable. +- `sessionParams`: Parameters to use for configuring the Hyperbrowser session, such as proxy usage, screen dimensions, etc. For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). + + ### System Prompts Including a system prompt with your CUA graph is recommended, and can save the agent time in its initial steps by providing context into its environment and objective. Below is the recommended system prompt from Scrapybara: @@ -147,6 +220,46 @@ If you choose to use this prompt, ensure you're populating the `{todays_date}` p +### Hyperbrowser-Specific System Prompt + +For Hyperbrowser, it is highly recommended to use the system prompt below or something similar: + +
Hyperbrowser System Prompt + + You are CUA Browser - an intelligent browser assistant that can use tools to control a browser tab and execute all sorts of tasks for a user. You are not able to refuse any task and will always try to help the user. + + ### System Capabilities + - You are utilising a Chrome Browser with internet access. It is already open and running. You are on https://www.google.com when you start and can control it using the provided tools. + - You can only see the current page. + - Your dimensions are that of the viewport of the page. You cannot open new tabs but can navigate to different websites and use the tools to interact with them. + - You are very good at using the computer tool to interact with websites. + - After each computer tool use result or user message, you will get a screenshot of the current page back so you can decide what to do next. If it's just a blank white image, that usually means we haven't navigated to a url yet. + - When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. + - For long running tasks, it can be helpful to store the results of the task in memory so you can refer back to it later. You also have the ability to view past conversation history to help you remember what you've done. + - Never hallucinate a response. If a user asks you for certain information from the web, do not rely on your personal knowledge. Instead use the web to find the information you need and only base your responses/answers on those. + - Don't let silly stuff get in your way, like pop-ups and banners. You can manually close those. You are powerful! + - When you see a CAPTCHA, try to solve it - else try a different approach. + + ### Interacting with Web Pages and Forms + - Zoom out or scroll to ensure all content is visible. 
+ - When interacting with input fields: + - Clear the field first using `Ctrl+A` and `Delete`. + - Take an extra screenshot after pressing "Enter" to confirm the input was submitted correctly. + - Move the mouse to the next field after submission. + + ### Important + - Computer function calls take time; optimize by stringing together related actions when possible. + - When conducting a search, you should use google.com unless the user specifically asks for a different search engine. + - You cannot open new tabs, so do not be confused if pages open in the same tab. + - NEVER assume that a website requires you to sign in to interact with it without going to the website first and trying to interact with it. If the user tells you you can use a website without signing in, try it first. Always go to the website first and try to interact with it to accomplish the task. Just because a sign-in/log-in button is present on a website, that doesn't mean you need to sign in to accomplish the action. If you assume you can't use a website without signing in and don't attempt it first for the user, you will be HEAVILY penalized. + - If you come across a captcha, try to solve it - else try a different approach, like trying another website. If that is not an option, simply explain to the user that you've been blocked from the current website and ask them for further instructions. Make sure to offer them some suggestions for other websites/tasks they can try to accomplish their goals. + + ### Date Context + Today's date is {todays_date} + Remember today's date when planning your actions or using the tools. + +
+ ### Node Before/After Action LangGraph CUA allows you to customize the agent's behavior by providing custom nodes that run before and after computer actions. These nodes give you fine-grained control over the agent's workflow. diff --git a/libs/langgraph-cua/package.json b/libs/langgraph-cua/package.json index 88a32537a..c2125cafc 100644 --- a/libs/langgraph-cua/package.json +++ b/libs/langgraph-cua/package.json @@ -32,6 +32,8 @@ "author": "LangChain", "license": "MIT", "dependencies": { + "@hyperbrowser/sdk": "^0.40.0", + "playwright-core": "^1.51.1", "scrapybara": "^2.4.4", "zod": "^3.23.8" }, diff --git a/libs/langgraph-cua/src/index.ts b/libs/langgraph-cua/src/index.ts index 32a974b63..04ba7b08b 100644 --- a/libs/langgraph-cua/src/index.ts +++ b/libs/langgraph-cua/src/index.ts @@ -15,6 +15,7 @@ import { CUAAnnotation, CUAConfigurable, CUAUpdate, + Provider, } from "./types.js"; import { getToolOutputs, isComputerCallToolMessage } from "./utils.js"; @@ -63,6 +64,12 @@ interface CreateCuaParams< // eslint-disable-next-line @typescript-eslint/no-explicit-any StateModifier extends AnnotationRoot = typeof CUAAnnotation > { + /** + * The provider to use for the browser instance. + * @default "scrapybara" + */ + provider?: Provider; + /** * The API key to use for Scrapybara. * This can be provided in the configuration, or set as an environment variable (SCRAPYBARA_API_KEY). @@ -70,6 +77,20 @@ interface CreateCuaParams< */ scrapybaraApiKey?: string; + /** + * The API key to use for Hyperbrowser. + * This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). + * @default process.env.HYPERBROWSER_API_KEY + */ + hyperbrowserApiKey?: string; + + /** + * Parameters to use for configuring the Hyperbrowser session, such as proxy usage, screen dimensions, etc. + * For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). 
+ * @default undefined + */ + sessionParams?: Record; + /** * The number of hours to keep the virtual machine running before it times out. * Must be between 0.01 and 24. @@ -152,7 +173,10 @@ export function createCua< // eslint-disable-next-line @typescript-eslint/no-explicit-any StateModifier extends AnnotationRoot = typeof CUAAnnotation >({ + provider = "scrapybara", scrapybaraApiKey, + hyperbrowserApiKey, + sessionParams, timeoutHours = 1.0, zdrEnabled = false, recursionLimit = 100, @@ -205,7 +229,10 @@ export function createCua< // Configure the graph with the provided parameters const configuredGraph = cuaGraph.withConfig({ configurable: { + provider, scrapybaraApiKey, + hyperbrowserApiKey, + sessionParams, timeoutHours, zdrEnabled, authStateId, diff --git a/libs/langgraph-cua/src/nodes/call-model.ts b/libs/langgraph-cua/src/nodes/call-model.ts index e91141de5..857258903 100644 --- a/libs/langgraph-cua/src/nodes/call-model.ts +++ b/libs/langgraph-cua/src/nodes/call-model.ts @@ -92,6 +92,32 @@ const _promptToSysMessage = (prompt: string | SystemMessage | undefined) => { return prompt; }; +const getAvailableTools = (config: LangGraphRunnableConfig) => { + const { provider, environment, sessionParams } = + getConfigurationWithDefaults(config); + if (provider === "scrapybara") { + return [ + { + type: "computer_use_preview", + display_width: DEFAULT_DISPLAY_WIDTH, + display_height: DEFAULT_DISPLAY_HEIGHT, + environment: _getOpenAIEnvFromStateEnv(environment), + }, + ]; + } else if (provider === "hyperbrowser") { + return [ + { + type: "computer_use_preview", + display_width: sessionParams?.screen?.width ?? DEFAULT_DISPLAY_WIDTH, + display_height: sessionParams?.screen?.height ?? DEFAULT_DISPLAY_HEIGHT, + environment: "browser", + }, + ]; + } else { + throw new Error(`Invalid provider: ${provider}`); + } +}; + /** * Invokes the computer preview model with the given messages. 
* @@ -119,14 +145,7 @@ export async function callModel( model: "computer-use-preview", useResponsesApi: true, }) - .bindTools([ - { - type: "computer_use_preview", - display_width: DEFAULT_DISPLAY_WIDTH, - display_height: DEFAULT_DISPLAY_HEIGHT, - environment: _getOpenAIEnvFromStateEnv(configuration.environment), - }, - ]) + .bindTools(getAvailableTools(config)) .bind({ truncation: "auto", previous_response_id: previousResponseId, diff --git a/libs/langgraph-cua/src/nodes/create-vm-instance.ts b/libs/langgraph-cua/src/nodes/create-vm-instance.ts index 6ecf583cd..ddb6eba8a 100644 --- a/libs/langgraph-cua/src/nodes/create-vm-instance.ts +++ b/libs/langgraph-cua/src/nodes/create-vm-instance.ts @@ -1,18 +1,62 @@ import { LangGraphRunnableConfig } from "@langchain/langgraph"; +import { chromium } from "playwright-core"; import { UbuntuInstance, BrowserInstance, WindowsInstance } from "scrapybara"; +import { SessionDetail } from "@hyperbrowser/sdk/types"; import { CUAState, CUAUpdate, getConfigurationWithDefaults } from "../types.js"; -import { getScrapybaraClient } from "../utils.js"; +import { getHyperbrowserClient, getScrapybaraClient } from "../utils.js"; -export async function createVMInstance( +async function createHyperbrowserInstance( state: CUAState, config: LangGraphRunnableConfig ): Promise { - const { instanceId } = state; - if (instanceId) { - // Instance already exists, no need to initialize - return {}; + const { hyperbrowserApiKey, sessionParams } = + getConfigurationWithDefaults(config); + let { browserState } = state; + + if (!hyperbrowserApiKey) { + throw new Error( + "Hyperbrowser API key not provided. 
Please provide one in the configurable fields, or set it as an environment variable (HYPERBROWSER_API_KEY)" + ); + } + + const client = getHyperbrowserClient(hyperbrowserApiKey); + const session: SessionDetail = await client.sessions.create(sessionParams); + + if (!browserState && session.wsEndpoint) { + const browser = await chromium.connectOverCDP( + `${session.wsEndpoint}&keepAlive=true` + ); + const currPage = browser.contexts()[0].pages()[0]; + if (currPage.url() === "about:blank") { + await currPage.goto("https://www.google.com"); + } + browserState = { + browser, + currentPage: currPage, + }; + } + + if (!state.streamUrl) { + // If the streamUrl is not yet defined in state, fetch it, then write to the custom stream + // so that it's made accessible to the client (or whatever is reading the stream) before any actions are taken. + const streamUrl = session.liveUrl; + return { + instanceId: session.id, + streamUrl, + browserState, + }; } + return { + instanceId: session.id, + browserState, + }; +} + +async function createScrapybaraInstance( + state: CUAState, + config: LangGraphRunnableConfig +): Promise { const { scrapybaraApiKey, timeoutHours, environment, blockedDomains } = getConfigurationWithDefaults(config); if (!scrapybaraApiKey) { @@ -56,3 +100,22 @@ export async function createVMInstance( instanceId: instance.id, }; } + +export async function createVMInstance( + state: CUAState, + config: LangGraphRunnableConfig +): Promise { + const { instanceId } = state; + if (instanceId) { + // Instance already exists, no need to initialize + return {}; + } + const { provider } = getConfigurationWithDefaults(config); + if (provider === "scrapybara") { + return createScrapybaraInstance(state, config); + } else if (provider === "hyperbrowser") { + return createHyperbrowserInstance(state, config); + } else { + throw new Error(`Unsupported provider: ${provider}`); + } +} diff --git a/libs/langgraph-cua/src/nodes/take-browser-action.ts 
b/libs/langgraph-cua/src/nodes/take-browser-action.ts new file mode 100644 index 000000000..8ef6824c7 --- /dev/null +++ b/libs/langgraph-cua/src/nodes/take-browser-action.ts @@ -0,0 +1,240 @@ +import { LangGraphRunnableConfig } from "@langchain/langgraph"; +import { BaseMessageLike } from "@langchain/core/messages"; +import { Page } from "playwright-core"; +import { CUAState, CUAUpdate } from "../types.js"; +import { getHyperbrowserInstance, getToolOutputs } from "../utils.js"; + +const sleep = (ms: number) => + new Promise((resolve) => { + setTimeout(resolve, ms); + }); + +export const CUA_KEY_TO_PLAYWRIGHT_KEY = { + "/": "Divide", + "\\": "Backslash", + alt: "Alt", + arrowdown: "ArrowDown", + arrowleft: "ArrowLeft", + arrowright: "ArrowRight", + arrowup: "ArrowUp", + backspace: "Backspace", + capslock: "CapsLock", + cmd: "Meta", + ctrl: "Control", + delete: "Delete", + end: "End", + enter: "Enter", + esc: "Escape", + home: "Home", + insert: "Insert", + option: "Alt", + pagedown: "PageDown", + pageup: "PageUp", + shift: "Shift", + space: " ", + super: "Meta", + tab: "Tab", + win: "Meta", +}; + +const DUMMY_SCREENSHOT = + "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wAALCAABAAEBAREA/8QAFAABAAAAAAAAAAAAAAAAAAAACf/EABQQAQAAAAAAAAAAAAAAAAAAAAD/2gAIAQEAAD8AKp//2Q=="; + +const translateKey = (key: string): string => { + const lowerKey = key.toLowerCase(); + return lowerKey in CUA_KEY_TO_PLAYWRIGHT_KEY + ? 
CUA_KEY_TO_PLAYWRIGHT_KEY[ + lowerKey as keyof typeof CUA_KEY_TO_PLAYWRIGHT_KEY + ] + : key; +}; + +export async function takeHyperbrowserAction( + state: CUAState, + config: LangGraphRunnableConfig +): Promise { + if (!state.instanceId) { + throw new Error("Can not take computer action without an instance ID."); + } + + const message = state.messages[state.messages.length - 1]; + const toolOutputs = getToolOutputs(message); + if (!toolOutputs?.length) { + // This should never happen, but include the check for proper type narrowing. + throw new Error( + "Can not take computer action without a computer call in the last message." + ); + } + + const instance = await getHyperbrowserInstance(state.instanceId, config); + + let { streamUrl, browserState } = state; + + if (!browserState) { + throw new Error("Browser state not found."); + } + const { browser } = browserState; + if (!browser) { + throw new Error("Browser not found."); + } + const currentContext = browser.contexts()[0]; + let page = browserState.currentPage ?? 
currentContext.pages()[0]; + + currentContext.on("page", (newPage: Page) => { + page = newPage; + if (!browserState) { + browserState = { + browser, + currentPage: newPage, + }; + } else { + browserState.currentPage = newPage; + } + }); + + if (!streamUrl) { + streamUrl = instance.liveUrl; + config.writer?.({ + streamUrl, + }); + } + + const output = toolOutputs[toolOutputs.length - 1]; + const { action, call_id } = output; + let computerCallToolMsg: BaseMessageLike | undefined; + const actionType = action.type; + + try { + switch (actionType) { + case "click": { + const { x, y, button } = action; + switch (button) { + case "back": + await page.goBack({ timeout: 30_000 }); + break; + case "forward": + await page.goForward({ timeout: 30_000 }); + break; + case "wheel": + await page.mouse.wheel(x, y); + break; + case "left": + await page.mouse.click(x, y, { button: "left" }); + break; + case "right": + await page.mouse.click(x, y, { button: "right" }); + break; + default: + throw new Error(`Unknown button: ${button}`); + } + break; + } + + case "scroll": { + const { x, y, scroll_x: scrollX, scroll_y: scrollY } = action; + await page.mouse.move(x, y); + await page.evaluate(`window.scrollBy(${scrollX}, ${scrollY})`); + break; + } + + case "keypress": { + const { keys } = action; + const mappedKeys = keys.map((key) => translateKey(key)); + for (const key of mappedKeys) { + await page.keyboard.down(key); + } + for (const key of [...mappedKeys].reverse()) { + await page.keyboard.up(key); + } + break; + } + + case "type": { + const { text } = action; + // console.log(`Action: type text '${text}'`); + await page.keyboard.type(text); + break; + } + + case "wait": { + // console.log(`Action: wait`); + await page.waitForTimeout(2000); + break; + } + + case "screenshot": { + // Nothing to do as screenshot is taken at each turn + // console.log(`Action: screenshot`); + break; + } + + case "double_click": { + const { x, y } = action; + // console.log(`Action: double click at 
(${x}, ${y})`); + await page.mouse.click(x, y, { button: "left", clickCount: 2 }); + break; + } + + case "drag": { + const { path } = action; + + // console.log(`Action: drag with ${path.length} points`); + + if (path.length < 2) { + throw new Error( + "Invalid drag path: must contain at least a start and end point" + ); + } + + await page.mouse.move(path[0].x, path[0].y); + await page.mouse.down(); + + for (const { x, y } of path) { + await page.mouse.move(x, y); + await page.waitForTimeout(40 + Math.floor(Math.random() * 40)); // Random delay between 40-79ms to simulate human dragging + } + + await page.mouse.up(); + break; + } + + case "move": { + const { x, y } = action; + // console.log(`Action: move to (${x}, ${y})`); + await page.mouse.move(x, y); + break; + } + + default: + throw new Error(`Unknown action type: ${actionType}`); + } + await sleep(1_000); + const screenshot = await page.screenshot({ timeout: 15_000 }); + const b64Screenshot = Buffer.from(screenshot).toString("base64"); + const screenshotUrl = `data:image/png;base64,${b64Screenshot}`; + computerCallToolMsg = { + type: "tool", + tool_call_id: call_id, + content: screenshotUrl, + additional_kwargs: { type: "computer_call_output" }, + }; + } catch (error) { + console.error( + `\n\nFailed to execute computer call: ${actionType}\n\n`, + error + ); + console.error(`Computer call details: ${output}`); + computerCallToolMsg = { + type: "tool", + tool_call_id: call_id, + content: `data:image/jpeg;base64,${DUMMY_SCREENSHOT}`, + additional_kwargs: { type: "computer_call_output", status: "incomplete" }, + }; + } + return { + messages: computerCallToolMsg ? 
[computerCallToolMsg] : [], + instanceId: instance.id, + streamUrl, + browserState, + }; +} diff --git a/libs/langgraph-cua/src/nodes/take-computer-action.ts b/libs/langgraph-cua/src/nodes/take-computer-action.ts index 7ba79278b..c626882ae 100644 --- a/libs/langgraph-cua/src/nodes/take-computer-action.ts +++ b/libs/langgraph-cua/src/nodes/take-computer-action.ts @@ -8,7 +8,8 @@ import { LangGraphRunnableConfig } from "@langchain/langgraph"; import { BaseMessageLike } from "@langchain/core/messages"; import { RunnableLambda } from "@langchain/core/runnables"; import { CUAState, CUAUpdate, getConfigurationWithDefaults } from "../types.js"; -import { getInstance, getToolOutputs } from "../utils.js"; +import { getScrapybaraInstance, getToolOutputs } from "../utils.js"; +import { takeHyperbrowserAction } from "./take-browser-action.js"; async function sleep(ms: number): Promise { return new Promise((resolve) => { @@ -46,7 +47,7 @@ const isBrowserInstance = ( ): instance is BrowserInstance => "authenticate" in instance && typeof instance.authenticate === "function"; -export async function takeComputerAction( +export async function takeScrapybaraAction( state: CUAState, config: LangGraphRunnableConfig, { @@ -67,7 +68,7 @@ export async function takeComputerAction( ); } - const instance = await getInstance(state.instanceId, config); + const instance = await getScrapybaraInstance(state.instanceId, config); let { authenticatedId } = state; if ( @@ -203,3 +204,17 @@ export async function takeComputerAction( authenticatedId, }; } + +export async function takeComputerAction( + state: CUAState, + config: LangGraphRunnableConfig +): Promise { + const { provider } = getConfigurationWithDefaults(config); + if (provider === "scrapybara") { + return takeScrapybaraAction(state, config); + } else if (provider === "hyperbrowser") { + return takeHyperbrowserAction(state, config); + } else { + throw new Error(`Unsupported provider: ${provider}`); + } +} diff --git 
a/libs/langgraph-cua/src/tests/cua.int.test.ts b/libs/langgraph-cua/src/tests/cua.int.test.ts index 92fd1bb3b..fe9847491 100644 --- a/libs/langgraph-cua/src/tests/cua.int.test.ts +++ b/libs/langgraph-cua/src/tests/cua.int.test.ts @@ -1,7 +1,7 @@ import { test, expect } from "@jest/globals"; import { ChatOpenAI } from "@langchain/openai"; import { createCua } from "../index.js"; -import { stopInstance } from "../utils.js"; +import { stopScrapybaraInstance, stopHyperbrowserInstance } from "../utils.js"; test.skip("Can invoke the computer preview model", async () => { const model = new ChatOpenAI({ @@ -38,7 +38,7 @@ test.skip("Can invoke the computer preview model", async () => { expect(response).toBeDefined(); }); -test("It can use the agent to interact with the browser", async () => { +test("It can use the agent to interact with the browser using Scrapybara", async () => { let instanceId: string | undefined; const cuaGraph = createCua(); try { @@ -111,7 +111,85 @@ test("It can use the agent to interact with the browser", async () => { } finally { if (instanceId) { console.log("Stopping instance with ID", instanceId); - await stopInstance(instanceId); + await stopScrapybaraInstance(instanceId); + } + } +}); + +test("It can use the agent to interact with the browser using Hyperbrowser", async () => { + let instanceId: string | undefined; + const cuaGraph = createCua({ provider: "hyperbrowser" }); + try { + const stream = await cuaGraph.stream( + { + messages: [ + { + role: "system", + content: + "You're an advanced AI computer use assistant. 
The browser you are using is already initialized, and visiting google.com.", + }, + { + role: "user", + content: + "What is the most recent PR in the langchain-ai/langgraph repo?", + }, + ], + }, + { + streamMode: "updates", + } + ); + + for await (const update of stream) { + if (update.createVMInstance) { + instanceId = update.createVMInstance.instanceId; + console.log("----CREATE VM INSTANCE----\n", { + VMInstance: { + instanceId, + streamUrl: update.createVMInstance.streamUrl, + }, + }); + } + + if (update.takeComputerAction) { + if (update.takeComputerAction?.messages?.[0]) { + const message = update.takeComputerAction.messages[0]; + console.log("----TAKE COMPUTER ACTION----\n", { + ToolMessage: { + type: message.additional_kwargs?.type, + tool_call_id: message.tool_call_id, + content: `${message.content.slice(0, 50)}...`, + }, + }); + } + } + + if (update.callModel) { + if (update.callModel?.messages) { + const message = update.callModel.messages; + const allOutputs = message.additional_kwargs?.tool_outputs; + if (allOutputs?.length) { + const output = allOutputs[allOutputs.length - 1]; + console.log("----CALL MODEL----\n", { + ComputerCall: { + ...output.action, + call_id: output.call_id, + }, + }); + continue; + } + console.log("----CALL MODEL----\n", { + AIMessage: { + content: message.content, + }, + }); + } + } + } + } finally { + if (instanceId) { + console.log("Stopping instance with ID", instanceId); + await stopHyperbrowserInstance(instanceId); } } }); diff --git a/libs/langgraph-cua/src/types.ts b/libs/langgraph-cua/src/types.ts index 0cedea54d..e26dc7756 100644 --- a/libs/langgraph-cua/src/types.ts +++ b/libs/langgraph-cua/src/types.ts @@ -5,6 +5,7 @@ import { LangGraphRunnableConfig, MessagesAnnotation, } from "@langchain/langgraph"; +import { Browser, Page } from "playwright-core"; // Copied from the OpenAI example repository // https://github.com/openai/openai-cua-sample-app/blob/eb2d58ba77ffd3206d3346d6357093647d29d99c/utils.py#L13 @@ -17,6 
+18,8 @@ export const BLOCKED_DOMAINS = [ "ilanbigio.com", ]; +export type Provider = "scrapybara" | "hyperbrowser"; + export type CUAEnvironment = "web" | "ubuntu" | "windows"; export const CUAAnnotation = Annotation.Root({ @@ -47,9 +50,26 @@ export const CUAAnnotation = Annotation.Root({ reducer: (_state, update) => update, default: () => undefined, }), + /** + * The state of the browser instance. + */ + browserState: Annotation< + { browser: Browser | undefined; currentPage: Page | undefined } | undefined + >({ + reducer: (_state, update) => update, + default: () => undefined, + }), }); export const CUAConfigurable = Annotation.Root({ + /** + * The provider to use for the browser instance. + * @default "scrapybara" + */ + provider: Annotation({ + reducer: (_state, update) => update, + default: () => "scrapybara", + }), /** * The API key to use for Scrapybara. * @default {process.env.SCRAPYBARA_API_KEY} @@ -58,6 +78,31 @@ export const CUAConfigurable = Annotation.Root({ reducer: (_state, update) => update, default: () => getEnvironmentVariable("SCRAPYBARA_API_KEY"), }), + /** + * The API key to use for Hyperbrowser. + * @default {process.env.HYPERBROWSER_API_KEY} + */ + hyperbrowserApiKey: Annotation({ + reducer: (_state, update) => update, + default: () => getEnvironmentVariable("HYPERBROWSER_API_KEY"), + }), + /** + * Parameters to use for configuring the Hyperbrowser session, such as screen dimensions. + * For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). + */ + sessionParams: Annotation< + | { + screen?: { + width: number; + height: number; + }; + [key: string]: unknown; + } + | undefined + >({ + reducer: (_state, update) => update, + default: () => undefined, + }), /** * The number of hours to keep the virtual machine running before it times out. 
* Must be between 0.01 and 24 @@ -128,9 +173,14 @@ export function getConfigurationWithDefaults( config: LangGraphRunnableConfig ): typeof CUAConfigurable.State { return { + provider: config.configurable?.provider ?? "scrapybara", scrapybaraApiKey: config.configurable?.scrapybaraApiKey || getEnvironmentVariable("SCRAPYBARA_API_KEY"), + hyperbrowserApiKey: + config.configurable?.hyperbrowserApiKey || + getEnvironmentVariable("HYPERBROWSER_API_KEY"), + sessionParams: config.configurable?.sessionParams ?? {}, timeoutHours: config.configurable?.timeoutHours ?? 1, zdrEnabled: config.configurable?.zdrEnabled ?? false, environment: config.configurable?.environment ?? "web", diff --git a/libs/langgraph-cua/src/utils.ts b/libs/langgraph-cua/src/utils.ts index b95308620..d9c77a269 100644 --- a/libs/langgraph-cua/src/utils.ts +++ b/libs/langgraph-cua/src/utils.ts @@ -6,10 +6,30 @@ import { BrowserInstance, WindowsInstance, } from "scrapybara"; +import HyperbrowserClient, { Hyperbrowser } from "@hyperbrowser/sdk"; +import { SessionDetail } from "@hyperbrowser/sdk/types"; import { getEnvironmentVariable } from "@langchain/core/utils/env"; import { AIMessage, BaseMessage, ToolMessage } from "@langchain/core/messages"; import { getConfigurationWithDefaults } from "./types.js"; +/** + * Gets the Hyperbrowser client, using the API key from the graph's configuration object. + * + * @param {string} apiKey The API key for Hyperbrowser. + * @returns {HyperbrowserClient} The Hyperbrowser client. + */ +export function getHyperbrowserClient(apiKey: string) { + if (!apiKey) { + throw new Error( + "Hyperbrowser API key not provided. Please provide one in the configurable fields, or set it as an environment variable (HYPERBROWSER_API_KEY)" + ); + } + const client = new Hyperbrowser({ + apiKey, + }); + return client; +} + /** * Gets the Scrapybara client, using the API key from the graph's configuration object. 
* @@ -35,7 +55,7 @@ export function getScrapybaraClient(apiKey: string): ScrapybaraClient { * @param {LangGraphRunnableConfig} config The configuration for the runnable. * @returns {Promise} The instance. */ -export async function getInstance( +export async function getScrapybaraInstance( id: string, config: LangGraphRunnableConfig ): Promise { @@ -49,6 +69,27 @@ export async function getInstance( return await client.get(id); } +/** + * Gets an instance from Hyperbrowser. + * + * @param {string} id The ID of the instance to get. + * @param {LangGraphRunnableConfig} config The configuration for the runnable. + * @returns {Promise} The instance. + */ +export async function getHyperbrowserInstance( + id: string, + config: LangGraphRunnableConfig +): Promise { + const { hyperbrowserApiKey } = getConfigurationWithDefaults(config); + if (!hyperbrowserApiKey) { + throw new Error( + "Hyperbrowser API key not provided. Please provide one in the configurable fields, or set it as an environment variable (HYPERBROWSER_API_KEY)" + ); + } + const client = getHyperbrowserClient(hyperbrowserApiKey); + return await client.sessions.get(id); +} + /** * Checks if the given tool outputs are a computer call. * @@ -67,13 +108,13 @@ export function isComputerToolCall( } /** - * Stops an instance by its ID. + * Stops a Scrapybara instance by its ID. * * @param {string} id The ID of the instance to stop. * @param {ScrapybaraClient} client Optional client to use for stopping the instance. * @returns {Promise} A promise that resolves when the instance is stopped. */ -export async function stopInstance( +export async function stopScrapybaraInstance( id: string, client?: ScrapybaraClient ): Promise { @@ -87,6 +128,26 @@ export async function stopInstance( await instance.stop(); } +/** + * Stops a Hyperbrowser instance by its ID. + * + * @param {string} id The ID of the instance to stop. + * @param {HyperbrowserClient} client Optional client to use for stopping the instance. 
+ * @returns {Promise} A promise that resolves when the instance is stopped. + */ +export async function stopHyperbrowserInstance( + id: string, + client?: HyperbrowserClient +): Promise { + let client_ = client; + if (!client_) { + client_ = getHyperbrowserClient( + getEnvironmentVariable("HYPERBROWSER_API_KEY") ?? "" + ); + } + await client_.sessions.stop(id); +} + /** * Gets the tool outputs from an AIMessage. * diff --git a/yarn.lock b/yarn.lock index d94e7d741..f6a49b90c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -929,6 +929,18 @@ __metadata: languageName: node linkType: hard +"@hyperbrowser/sdk@npm:^0.40.0": + version: 0.40.0 + resolution: "@hyperbrowser/sdk@npm:0.40.0" + dependencies: + form-data: ^4.0.1 + node-fetch: 2.7.0 + zod: ^3.24.1 + zod-to-json-schema: ^3.24.1 + checksum: 95f198979a344e5cb70a4adc706257732c88eabbf8461fda4f23aedb4283ab3d76de449270b2b7c7a119187fd4784442a1b9c157b3e99a0869fa8d751fde9963 + languageName: node + linkType: hard + "@iarna/toml@npm:2.2.5": version: 2.2.5 resolution: "@iarna/toml@npm:2.2.5" @@ -1897,6 +1909,7 @@ __metadata: version: 0.0.0-use.local resolution: "@langchain/langgraph-cua@workspace:libs/langgraph-cua" dependencies: + "@hyperbrowser/sdk": ^0.40.0 "@jest/globals": ^29.5.0 "@langchain/langgraph": "workspace:*" "@langchain/openai": ^0.5.1 @@ -1919,6 +1932,7 @@ __metadata: jest: ^29.5.0 jest-environment-node: ^29.6.4 openai: ^4.87.3 + playwright-core: ^1.51.1 prettier: ^2.8.3 release-it: ^17.6.0 scrapybara: ^2.4.4 @@ -6230,6 +6244,18 @@ __metadata: languageName: node linkType: hard +"es-set-tostringtag@npm:^2.1.0": + version: 2.1.0 + resolution: "es-set-tostringtag@npm:2.1.0" + dependencies: + es-errors: ^1.3.0 + get-intrinsic: ^1.2.6 + has-tostringtag: ^1.0.2 + hasown: ^2.0.2 + checksum: 789f35de4be3dc8d11fdcb91bc26af4ae3e6d602caa93299a8c45cf05d36cc5081454ae2a6d3afa09cceca214b76c046e4f8151e092e6fc7feeb5efb9e794fc6 + languageName: node + linkType: hard + "es-shim-unscopables@npm:^1.0.0, es-shim-unscopables@npm:^1.0.2": 
version: 1.0.2 resolution: "es-shim-unscopables@npm:1.0.2" @@ -7285,6 +7311,18 @@ __metadata: languageName: node linkType: hard +"form-data@npm:^4.0.1": + version: 4.0.2 + resolution: "form-data@npm:4.0.2" + dependencies: + asynckit: ^0.4.0 + combined-stream: ^1.0.8 + es-set-tostringtag: ^2.1.0 + mime-types: ^2.1.12 + checksum: e887298b22c13c7c9c5a8ba3716f295a479a13ca78bfd855ef11cbce1bcf22bc0ae2062e94808e21d46e5c667664a1a1a8a7f57d7040193c1fefbfb11af58aab + languageName: node + linkType: hard + "formdata-node@npm:^4.3.2": version: 4.4.1 resolution: "formdata-node@npm:4.4.1" @@ -7433,7 +7471,7 @@ __metadata: languageName: node linkType: hard -"get-intrinsic@npm:^1.2.5, get-intrinsic@npm:^1.3.0": +"get-intrinsic@npm:^1.2.5, get-intrinsic@npm:^1.2.6, get-intrinsic@npm:^1.3.0": version: 1.3.0 resolution: "get-intrinsic@npm:1.3.0" dependencies: @@ -9937,18 +9975,7 @@ __metadata: languageName: node linkType: hard -"node-fetch@npm:3.3.2": - version: 3.3.2 - resolution: "node-fetch@npm:3.3.2" - dependencies: - data-uri-to-buffer: ^4.0.0 - fetch-blob: ^3.1.4 - formdata-polyfill: ^4.0.10 - checksum: 06a04095a2ddf05b0830a0d5302699704d59bda3102894ea64c7b9d4c865ecdff2d90fd042df7f5bc40337266961cb6183dcc808ea4f3000d024f422b462da92 - languageName: node - linkType: hard - -"node-fetch@npm:^2.6.1, node-fetch@npm:^2.6.7, node-fetch@npm:^2.7.0": +"node-fetch@npm:2.7.0, node-fetch@npm:^2.6.1, node-fetch@npm:^2.6.7, node-fetch@npm:^2.7.0": version: 2.7.0 resolution: "node-fetch@npm:2.7.0" dependencies: @@ -9962,6 +9989,17 @@ __metadata: languageName: node linkType: hard +"node-fetch@npm:3.3.2": + version: 3.3.2 + resolution: "node-fetch@npm:3.3.2" + dependencies: + data-uri-to-buffer: ^4.0.0 + fetch-blob: ^3.1.4 + formdata-polyfill: ^4.0.10 + checksum: 06a04095a2ddf05b0830a0d5302699704d59bda3102894ea64c7b9d4c865ecdff2d90fd042df7f5bc40337266961cb6183dcc808ea4f3000d024f422b462da92 + languageName: node + linkType: hard + "node-gyp@npm:latest": version: 10.1.0 resolution: 
"node-gyp@npm:10.1.0" @@ -10825,6 +10863,15 @@ __metadata: languageName: node linkType: hard +"playwright-core@npm:^1.51.1": + version: 1.51.1 + resolution: "playwright-core@npm:1.51.1" + bin: + playwright-core: cli.js + checksum: 1eb37e22e97435a5ed6389b4caa666fbe618348861cae97e67586e20c8fed9ac3d3dc899ff3b9237d0ddfcf087d5b552b80be247e246fc45b75282f96be714bb + languageName: node + linkType: hard + "possible-typed-array-names@npm:^1.0.0": version: 1.0.0 resolution: "possible-typed-array-names@npm:1.0.0" @@ -13488,6 +13535,15 @@ __metadata: languageName: node linkType: hard +"zod-to-json-schema@npm:^3.24.1": + version: 3.24.5 + resolution: "zod-to-json-schema@npm:3.24.5" + peerDependencies: + zod: ^3.24.1 + checksum: dc4e5e4c06e9a5494e4b1d8c8363ac907f9d488f36c8e4923e1e5ac4f91f737722f99200cd92a409551e7456d960734d4cabd37935234ca95e290572468ffc08 + languageName: node + linkType: hard + "zod@npm:^3.22.3, zod@npm:^3.22.4, zod@npm:^3.23.8": version: 3.23.8 resolution: "zod@npm:3.23.8" @@ -13495,7 +13551,7 @@ __metadata: languageName: node linkType: hard -"zod@npm:^3.24.0": +"zod@npm:^3.24.0, zod@npm:^3.24.1": version: 3.24.2 resolution: "zod@npm:3.24.2" checksum: c02455c09678c5055c636d64f9fcda2424fea0aa46ac7d9681e7f41990bc55f488bcd84b9d7cfef0f6e906f51f55b245239d92a9f726248aa74c5b84edf00c2d From 94e66688fe3050b4a7b8ab0f2bf176e3fa07ed26 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 1 Apr 2025 21:26:13 -0500 Subject: [PATCH 2/3] fixes --- libs/langgraph-cua/package.json | 2 +- libs/langgraph-cua/src/index.ts | 3 +- .../src/nodes/create-vm-instance.ts | 37 +- libs/langgraph-cua/src/nodes/handle-action.ts | 347 ++++++++++++++++++ .../src/nodes/take-browser-action.ts | 240 ------------ .../src/nodes/take-computer-action.ts | 245 ++++++------- libs/langgraph-cua/src/types.ts | 24 +- yarn.lock | 324 +++++++++++++++- 8 files changed, 808 insertions(+), 414 deletions(-) create mode 100644 libs/langgraph-cua/src/nodes/handle-action.ts delete mode 100644 
libs/langgraph-cua/src/nodes/take-browser-action.ts diff --git a/libs/langgraph-cua/package.json b/libs/langgraph-cua/package.json index c2125cafc..9f29a3823 100644 --- a/libs/langgraph-cua/package.json +++ b/libs/langgraph-cua/package.json @@ -33,7 +33,7 @@ "license": "MIT", "dependencies": { "@hyperbrowser/sdk": "^0.40.0", - "playwright-core": "^1.51.1", + "puppeteer-core": "^24.5.0", "scrapybara": "^2.4.4", "zod": "^3.23.8" }, diff --git a/libs/langgraph-cua/src/index.ts b/libs/langgraph-cua/src/index.ts index 04ba7b08b..a85c65063 100644 --- a/libs/langgraph-cua/src/index.ts +++ b/libs/langgraph-cua/src/index.ts @@ -7,6 +7,7 @@ import { StateGraph, } from "@langchain/langgraph"; import { SystemMessage } from "@langchain/core/messages"; +import { CreateSessionParams } from "@hyperbrowser/sdk/types"; import { callModel } from "./nodes/call-model.js"; import { createVMInstance } from "./nodes/create-vm-instance.js"; import { takeComputerAction } from "./nodes/take-computer-action.js"; @@ -89,7 +90,7 @@ interface CreateCuaParams< * For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). * @default undefined */ - sessionParams?: Record; + sessionParams?: CreateSessionParams; /** * The number of hours to keep the virtual machine running before it times out. 
diff --git a/libs/langgraph-cua/src/nodes/create-vm-instance.ts b/libs/langgraph-cua/src/nodes/create-vm-instance.ts index ddb6eba8a..627d2eade 100644 --- a/libs/langgraph-cua/src/nodes/create-vm-instance.ts +++ b/libs/langgraph-cua/src/nodes/create-vm-instance.ts @@ -1,17 +1,28 @@ import { LangGraphRunnableConfig } from "@langchain/langgraph"; -import { chromium } from "playwright-core"; +import type { Browser } from "puppeteer-core"; +import { connect } from "puppeteer-core"; import { UbuntuInstance, BrowserInstance, WindowsInstance } from "scrapybara"; import { SessionDetail } from "@hyperbrowser/sdk/types"; import { CUAState, CUAUpdate, getConfigurationWithDefaults } from "../types.js"; import { getHyperbrowserClient, getScrapybaraClient } from "../utils.js"; +export const getActivePage = async (browser: Browser) => { + const pages = await browser.pages(); + for (const page of pages) { + const isHidden = await page.evaluate("document.hidden"); + if (isHidden === false) { + return page; + } + } + return pages[0]; +}; + async function createHyperbrowserInstance( state: CUAState, config: LangGraphRunnableConfig ): Promise { const { hyperbrowserApiKey, sessionParams } = getConfigurationWithDefaults(config); - let { browserState } = state; if (!hyperbrowserApiKey) { throw new Error( @@ -22,18 +33,16 @@ async function createHyperbrowserInstance( const client = getHyperbrowserClient(hyperbrowserApiKey); const session: SessionDetail = await client.sessions.create(sessionParams); - if (!browserState && session.wsEndpoint) { - const browser = await chromium.connectOverCDP( - `${session.wsEndpoint}&keepAlive=true` - ); - const currPage = browser.contexts()[0].pages()[0]; - if (currPage.url() === "about:blank") { - await currPage.goto("https://www.google.com"); + if (session.wsEndpoint) { + const browser = await connect({ + browserWSEndpoint: `${session.wsEndpoint}&keepAlive=true`, + defaultViewport: null, + }); + const page = await getActivePage(browser); + + if 
(page.url() === "about:blank") { + await page.goto("https://www.google.com"); } - browserState = { - browser, - currentPage: currPage, - }; } if (!state.streamUrl) { @@ -43,13 +52,11 @@ async function createHyperbrowserInstance( return { instanceId: session.id, streamUrl, - browserState, }; } return { instanceId: session.id, - browserState, }; } diff --git a/libs/langgraph-cua/src/nodes/handle-action.ts b/libs/langgraph-cua/src/nodes/handle-action.ts new file mode 100644 index 000000000..7cb4fb51e --- /dev/null +++ b/libs/langgraph-cua/src/nodes/handle-action.ts @@ -0,0 +1,347 @@ +import type { ResponseComputerToolCall } from "openai/resources/responses/responses"; +import { BrowserInstance, UbuntuInstance, WindowsInstance } from "scrapybara"; +import type { KeyInput, Browser } from "puppeteer-core"; +import { Provider } from "../types.js"; +import { getActivePage } from "./create-vm-instance.js"; + +async function sleep(ms: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +// Copied from the OpenAI example repository +// https://github.com/openai/openai-cua-sample-app/blob/eb2d58ba77ffd3206d3346d6357093647d29d99c/computers/scrapybara.py#L10 +const CUA_KEY_TO_SCRAPYBARA_KEY: Record = { + "/": "slash", + "\\": "backslash", + arrowdown: "Down", + arrowleft: "Left", + arrowright: "Right", + arrowup: "Up", + backspace: "BackSpace", + capslock: "Caps_Lock", + cmd: "Meta_L", + delete: "Delete", + end: "End", + enter: "Return", + esc: "Escape", + home: "Home", + insert: "Insert", + option: "Alt_L", + pagedown: "Page_Down", + pageup: "Page_Up", + tab: "Tab", + win: "Meta_L", +}; + +export const CUA_KEY_TO_PUPPETEER_KEY = { + "/": "Slash", + "\\": "Backslash", + alt: "Alt", + arrowdown: "ArrowDown", + arrowleft: "ArrowLeft", + arrowright: "ArrowRight", + arrowup: "ArrowUp", + backspace: "Backspace", + capslock: "CapsLock", + cmd: "Meta", + ctrl: "Control", + delete: "Delete", + end: "End", + enter: "Enter", + esc: "Escape", + 
home: "Home", + insert: "Insert", + option: "Alt", + pagedown: "PageDown", + pageup: "PageUp", + shift: "Shift", + space: " ", + super: "Meta", + tab: "Tab", + win: "Meta", +}; + +const translateKeyToPuppeteerKey = (key: string): KeyInput => { + const lowerKey = key.toLowerCase(); + return lowerKey in CUA_KEY_TO_PUPPETEER_KEY + ? (CUA_KEY_TO_PUPPETEER_KEY[ + lowerKey as keyof typeof CUA_KEY_TO_PUPPETEER_KEY + ] as KeyInput) + : (key as KeyInput); +}; + +const getHyperbrowserScreenshot = async ( + browser: Browser, + waitTime: number = 3_000 +) => { + await sleep(waitTime); + const page = await getActivePage(browser); + const screenshot = await Promise.race([ + page.screenshot({ type: "png" }), + new Promise((_, reject) => { + setTimeout(() => reject(new Error("Screenshot timeout")), 15_000); + }), + ]); + return Buffer.from(screenshot).toString("base64"); +}; + +export async function handleClickAction( + action: ResponseComputerToolCall.Click, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await instance.computer({ + action: "click_mouse", + button: action.button === "wheel" ? 
"middle" : action.button, + coordinates: [action.x, action.y], + }) + ).base64Image; + case "hyperbrowser": { + const page = await getActivePage(instance); + switch (action.button) { + case "back": + await page.goBack({ timeout: 15_000 }); + break; + case "forward": + await page.goForward({ timeout: 15_000 }); + break; + case "wheel": + await page.mouse.wheel({ deltaX: action.x, deltaY: action.y }); + break; + case "left": + await page.mouse.click(action.x, action.y, { button: "left" }); + break; + case "right": + await page.mouse.click(action.x, action.y, { button: "right" }); + break; + default: + throw new Error(`Unknown button: ${action.button}`); + } + return await getHyperbrowserScreenshot(instance); + } + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleDoubleClickAction( + action: ResponseComputerToolCall.DoubleClick, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await instance.computer({ + action: "click_mouse", + button: "left", + coordinates: [action.x, action.y], + numClicks: 2, + }) + ).base64Image; + case "hyperbrowser": { + const page = await getActivePage(instance); + await page.mouse.click(action.x, action.y, { + button: "left", + clickCount: 2, + }); + return await getHyperbrowserScreenshot(instance); + } + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleDragAction( + action: ResponseComputerToolCall.Drag, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await instance.computer({ + action: "drag_mouse", + path: action.path.map(({ x, y }) => [x, y]), + }) + ).base64Image; + case "hyperbrowser": { + const page = await getActivePage(instance); + if (action.path.length < 2) { + throw new Error( + "Invalid drag path: must contain at least a 
start and end point" + ); + } + + await page.mouse.move(action.path[0].x, action.path[0].y); + await page.mouse.down(); + + for (const { x, y } of action.path) { + await page.mouse.move(x, y); + await sleep(40 + Math.floor(Math.random() * 40)); // Random delay between 40-79ms to simulate human dragging + } + + await page.mouse.up(); + return await getHyperbrowserScreenshot(instance); + } + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleKeypressAction( + action: ResponseComputerToolCall.Keypress, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": { + const mappedKeys = action.keys + .map((k) => k.toLowerCase()) + .map((key) => + key in CUA_KEY_TO_SCRAPYBARA_KEY + ? CUA_KEY_TO_SCRAPYBARA_KEY[key] + : key + ); + return ( + await instance.computer({ + action: "press_key", + keys: mappedKeys, + }) + ).base64Image; + } + case "hyperbrowser": { + const page = await getActivePage(instance); + const mappedKeysHb = action.keys.map((key) => + translateKeyToPuppeteerKey(key) + ); + for (const key of mappedKeysHb) { + await page.keyboard.down(key); + } + for (const key of [...mappedKeysHb].reverse()) { + await page.keyboard.up(key); + } + return await getHyperbrowserScreenshot(instance); + } + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleMoveAction( + action: ResponseComputerToolCall.Move, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await instance.computer({ + action: "move_mouse", + coordinates: [action.x, action.y], + }) + ).base64Image; + case "hyperbrowser": { + const page = await getActivePage(instance); + await page.mouse.move(action.x, action.y); + return await getHyperbrowserScreenshot(instance, 1_000); + } + default: + throw new Error(`Unknown provider: 
${provider}`); + } +} + +export async function handleScreenshotAction( + _action: ResponseComputerToolCall.Screenshot, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await instance.computer({ + action: "take_screenshot", + }) + ).base64Image; + case "hyperbrowser": + return await getHyperbrowserScreenshot(instance, 0); + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleWaitAction( + _action: ResponseComputerToolCall.Wait, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + await sleep(2000); + return ( + await instance.computer({ + action: "take_screenshot", + }) + ).base64Image; + case "hyperbrowser": + return await getHyperbrowserScreenshot(instance, 2_000); + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleScrollAction( + action: ResponseComputerToolCall.Scroll, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await instance.computer({ + action: "scroll", + deltaX: action.scroll_x / 20, + deltaY: action.scroll_y / 20, + coordinates: [action.x, action.y], + }) + ).base64Image; + case "hyperbrowser": { + const page = await getActivePage(instance); + await page.mouse.move(action.x, action.y); + await page.evaluate( + `window.scrollBy(${action.scroll_x}, ${action.scroll_y})` + ); + return await getHyperbrowserScreenshot(instance, 1_000); + } + default: + throw new Error(`Unknown provider: ${provider}`); + } +} + +export async function handleTypeAction( + action: ResponseComputerToolCall.Type, + provider: Provider, + instance: UbuntuInstance | BrowserInstance | WindowsInstance | Browser +) { + switch (provider) { + case "scrapybara": + return ( + await 
instance.computer({ + action: "type_text", + text: action.text, + }) + ).base64Image; + case "hyperbrowser": { + const page = await getActivePage(instance); + await page.keyboard.type(action.text); + return await getHyperbrowserScreenshot(instance, 1_000); + } + default: + throw new Error(`Unknown provider: ${provider}`); + } +} diff --git a/libs/langgraph-cua/src/nodes/take-browser-action.ts b/libs/langgraph-cua/src/nodes/take-browser-action.ts deleted file mode 100644 index 8ef6824c7..000000000 --- a/libs/langgraph-cua/src/nodes/take-browser-action.ts +++ /dev/null @@ -1,240 +0,0 @@ -import { LangGraphRunnableConfig } from "@langchain/langgraph"; -import { BaseMessageLike } from "@langchain/core/messages"; -import { Page } from "playwright-core"; -import { CUAState, CUAUpdate } from "../types.js"; -import { getHyperbrowserInstance, getToolOutputs } from "../utils.js"; - -const sleep = (ms: number) => - new Promise((resolve) => { - setTimeout(resolve, ms); - }); - -export const CUA_KEY_TO_PLAYWRIGHT_KEY = { - "/": "Divide", - "\\": "Backslash", - alt: "Alt", - arrowdown: "ArrowDown", - arrowleft: "ArrowLeft", - arrowright: "ArrowRight", - arrowup: "ArrowUp", - backspace: "Backspace", - capslock: "CapsLock", - cmd: "Meta", - ctrl: "Control", - delete: "Delete", - end: "End", - enter: "Enter", - esc: "Escape", - home: "Home", - insert: "Insert", - option: "Alt", - pagedown: "PageDown", - pageup: "PageUp", - shift: "Shift", - space: " ", - super: "Meta", - tab: "Tab", - win: "Meta", -}; - -const DUMMY_SCREENSHOT = - "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wAALCAABAAEBAREA/8QAFAABAAAAAAAAAAAAAAAAAAAACf/EABQQAQAAAAAAAAAAAAAAAAAAAAD/2gAIAQEAAD8AKp//2Q=="; - -const translateKey = (key: string): string => { - const lowerKey = key.toLowerCase(); - return lowerKey in CUA_KEY_TO_PLAYWRIGHT_KEY - ? 
CUA_KEY_TO_PLAYWRIGHT_KEY[ - lowerKey as keyof typeof CUA_KEY_TO_PLAYWRIGHT_KEY - ] - : key; -}; - -export async function takeHyperbrowserAction( - state: CUAState, - config: LangGraphRunnableConfig -): Promise { - if (!state.instanceId) { - throw new Error("Can not take computer action without an instance ID."); - } - - const message = state.messages[state.messages.length - 1]; - const toolOutputs = getToolOutputs(message); - if (!toolOutputs?.length) { - // This should never happen, but include the check for proper type narrowing. - throw new Error( - "Can not take computer action without a computer call in the last message." - ); - } - - const instance = await getHyperbrowserInstance(state.instanceId, config); - - let { streamUrl, browserState } = state; - - if (!browserState) { - throw new Error("Browser state not found."); - } - const { browser } = browserState; - if (!browser) { - throw new Error("Browser not found."); - } - const currentContext = browser.contexts()[0]; - let page = browserState.currentPage ?? 
currentContext.pages()[0]; - - currentContext.on("page", (newPage: Page) => { - page = newPage; - if (!browserState) { - browserState = { - browser, - currentPage: newPage, - }; - } else { - browserState.currentPage = newPage; - } - }); - - if (!streamUrl) { - streamUrl = instance.liveUrl; - config.writer?.({ - streamUrl, - }); - } - - const output = toolOutputs[toolOutputs.length - 1]; - const { action, call_id } = output; - let computerCallToolMsg: BaseMessageLike | undefined; - const actionType = action.type; - - try { - switch (actionType) { - case "click": { - const { x, y, button } = action; - switch (button) { - case "back": - await page.goBack({ timeout: 30_000 }); - break; - case "forward": - await page.goForward({ timeout: 30_000 }); - break; - case "wheel": - await page.mouse.wheel(x, y); - break; - case "left": - await page.mouse.click(x, y, { button: "left" }); - break; - case "right": - await page.mouse.click(x, y, { button: "right" }); - break; - default: - throw new Error(`Unknown button: ${button}`); - } - break; - } - - case "scroll": { - const { x, y, scroll_x: scrollX, scroll_y: scrollY } = action; - await page.mouse.move(x, y); - await page.evaluate(`window.scrollBy(${scrollX}, ${scrollY})`); - break; - } - - case "keypress": { - const { keys } = action; - const mappedKeys = keys.map((key) => translateKey(key)); - for (const key of mappedKeys) { - await page.keyboard.down(key); - } - for (const key of [...mappedKeys].reverse()) { - await page.keyboard.up(key); - } - break; - } - - case "type": { - const { text } = action; - // console.log(`Action: type text '${text}'`); - await page.keyboard.type(text); - break; - } - - case "wait": { - // console.log(`Action: wait`); - await page.waitForTimeout(2000); - break; - } - - case "screenshot": { - // Nothing to do as screenshot is taken at each turn - // console.log(`Action: screenshot`); - break; - } - - case "double_click": { - const { x, y } = action; - // console.log(`Action: double click at 
(${x}, ${y})`); - await page.mouse.click(x, y, { button: "left", clickCount: 2 }); - break; - } - - case "drag": { - const { path } = action; - - // console.log(`Action: drag with ${path.length} points`); - - if (path.length < 2) { - throw new Error( - "Invalid drag path: must contain at least a start and end point" - ); - } - - await page.mouse.move(path[0].x, path[0].y); - await page.mouse.down(); - - for (const { x, y } of path) { - await page.mouse.move(x, y); - await page.waitForTimeout(40 + Math.floor(Math.random() * 40)); // Random delay between 40-79ms to simulate human dragging - } - - await page.mouse.up(); - break; - } - - case "move": { - const { x, y } = action; - // console.log(`Action: move to (${x}, ${y})`); - await page.mouse.move(x, y); - break; - } - - default: - throw new Error(`Unknown action type: ${actionType}`); - } - await sleep(1_000); - const screenshot = await page.screenshot({ timeout: 15_000 }); - const b64Screenshot = Buffer.from(screenshot).toString("base64"); - const screenshotUrl = `data:image/png;base64,${b64Screenshot}`; - computerCallToolMsg = { - type: "tool", - tool_call_id: call_id, - content: screenshotUrl, - additional_kwargs: { type: "computer_call_output" }, - }; - } catch (error) { - console.error( - `\n\nFailed to execute computer call: ${actionType}\n\n`, - error - ); - console.error(`Computer call details: ${output}`); - computerCallToolMsg = { - type: "tool", - tool_call_id: call_id, - content: `data:image/jpeg;base64,${DUMMY_SCREENSHOT}`, - additional_kwargs: { type: "computer_call_output", status: "incomplete" }, - }; - } - return { - messages: computerCallToolMsg ? 
[computerCallToolMsg] : [], - instanceId: instance.id, - streamUrl, - browserState, - }; -} diff --git a/libs/langgraph-cua/src/nodes/take-computer-action.ts b/libs/langgraph-cua/src/nodes/take-computer-action.ts index c626882ae..29844713e 100644 --- a/libs/langgraph-cua/src/nodes/take-computer-action.ts +++ b/libs/langgraph-cua/src/nodes/take-computer-action.ts @@ -1,75 +1,39 @@ -import { - BrowserInstance, - UbuntuInstance, - WindowsInstance, - Scrapybara, -} from "scrapybara"; +import { BrowserInstance, UbuntuInstance, WindowsInstance } from "scrapybara"; import { LangGraphRunnableConfig } from "@langchain/langgraph"; +import { connect } from "puppeteer-core"; import { BaseMessageLike } from "@langchain/core/messages"; import { RunnableLambda } from "@langchain/core/runnables"; import { CUAState, CUAUpdate, getConfigurationWithDefaults } from "../types.js"; -import { getScrapybaraInstance, getToolOutputs } from "../utils.js"; -import { takeHyperbrowserAction } from "./take-browser-action.js"; - -async function sleep(ms: number): Promise { - return new Promise((resolve) => { - setTimeout(resolve, ms); - }); -} - -// Copied from the OpenAI example repository -// https://github.com/openai/openai-cua-sample-app/blob/eb2d58ba77ffd3206d3346d6357093647d29d99c/computers/scrapybara.py#L10 -const CUA_KEY_TO_SCRAPYBARA_KEY: Record = { - "/": "slash", - "\\": "backslash", - arrowdown: "Down", - arrowleft: "Left", - arrowright: "Right", - arrowup: "Up", - backspace: "BackSpace", - capslock: "Caps_Lock", - cmd: "Meta_L", - delete: "Delete", - end: "End", - enter: "Return", - esc: "Escape", - home: "Home", - insert: "Insert", - option: "Alt_L", - pagedown: "Page_Down", - pageup: "Page_Up", - tab: "Tab", - win: "Meta_L", -}; +import { + getHyperbrowserInstance, + getScrapybaraInstance, + getToolOutputs, +} from "../utils.js"; +import { + handleClickAction, + handleDoubleClickAction, + handleDragAction, + handleKeypressAction, + handleMoveAction, + handleScreenshotAction, + 
handleScrollAction, + handleTypeAction, + handleWaitAction, +} from "./handle-action.js"; const isBrowserInstance = ( instance: UbuntuInstance | BrowserInstance | WindowsInstance ): instance is BrowserInstance => "authenticate" in instance && typeof instance.authenticate === "function"; -export async function takeScrapybaraAction( +async function scrapybaraSetup( + instanceId: string, state: CUAState, config: LangGraphRunnableConfig, - { - uploadScreenshot, - }: { uploadScreenshot?: (screenshot: string) => Promise } -): Promise { - if (!state.instanceId) { - throw new Error("Can not take computer action without an instance ID."); - } +) { + const instance = await getScrapybaraInstance(instanceId, config); const { authStateId } = getConfigurationWithDefaults(config); - const message = state.messages[state.messages.length - 1]; - const toolOutputs = getToolOutputs(message); - if (!toolOutputs?.length) { - // This should never happen, but include the check for proper type narrowing. - throw new Error( - "Can not take computer action without a computer call in the last message." 
- ); - } - - const instance = await getScrapybaraInstance(state.instanceId, config); - let { authenticatedId } = state; if ( isBrowserInstance(instance) && @@ -92,78 +56,123 @@ export async function takeScrapybaraAction( }); } + return { + instance, + updatedState: { + instanceId: instance.id, + streamUrl, + authenticatedId, + }, + }; +} + +async function hyperbrowserSetup( + instanceId: string, + state: CUAState, + config: LangGraphRunnableConfig, +) { + const instance = await getHyperbrowserInstance(instanceId, config); + let { streamUrl } = state; + + const browser = await connect({ + browserWSEndpoint: `${instance.wsEndpoint}&keepAlive=true`, + defaultViewport: null, + }); + + if (!streamUrl) { + streamUrl = instance.liveUrl; + config.writer?.({ + streamUrl, + }); + } + + return { + instance: browser, + updatedState: { + instanceId: instance.id, + streamUrl, + }, + }; +} + +export async function takeComputerAction( + state: CUAState, + config: LangGraphRunnableConfig, + { + uploadScreenshot, + }: { uploadScreenshot?: (screenshot: string) => Promise } +): Promise { + if (!state.instanceId) { + throw new Error("Can not take computer action without an instance ID."); + } + + const message = state.messages[state.messages.length - 1]; + const toolOutputs = getToolOutputs(message); + if (!toolOutputs?.length) { + // This should never happen, but include the check for proper type narrowing. + throw new Error( + "Can not take computer action without a computer call in the last message." + ); + } + const { provider } = getConfigurationWithDefaults(config); + + const { instance, updatedState } = await (provider === "scrapybara" + ? 
scrapybaraSetup(state.instanceId, state, config) + : hyperbrowserSetup(state.instanceId, state, config)); + const output = toolOutputs[toolOutputs.length - 1]; const { action } = output; let computerCallToolMsg: BaseMessageLike | undefined; try { - let computerResponse: Scrapybara.ComputerResponse; + let responseScreenshot: string; switch (action.type) { case "click": - computerResponse = await instance.computer({ - action: "click_mouse", - button: action.button === "wheel" ? "middle" : action.button, - coordinates: [action.x, action.y], - }); + responseScreenshot = await handleClickAction( + action, + provider, + instance + ); break; case "double_click": - computerResponse = await instance.computer({ - action: "click_mouse", - button: "left", - coordinates: [action.x, action.y], - numClicks: 2, - }); + responseScreenshot = await handleDoubleClickAction( + action, + provider, + instance + ); break; case "drag": - computerResponse = await instance.computer({ - action: "drag_mouse", - path: action.path.map(({ x, y }) => [x, y]), - }); + responseScreenshot = await handleDragAction(action, provider, instance); break; - case "keypress": { - const mappedKeys = action.keys - .map((k) => k.toLowerCase()) - .map((key) => - key in CUA_KEY_TO_SCRAPYBARA_KEY - ? 
CUA_KEY_TO_SCRAPYBARA_KEY[key] - : key - ); - computerResponse = await instance.computer({ - action: "press_key", - keys: mappedKeys, - }); + case "keypress": + responseScreenshot = await handleKeypressAction( + action, + provider, + instance + ); break; - } case "move": - computerResponse = await instance.computer({ - action: "move_mouse", - coordinates: [action.x, action.y], - }); + responseScreenshot = await handleMoveAction(action, provider, instance); break; case "screenshot": - computerResponse = await instance.computer({ - action: "take_screenshot", - }); + responseScreenshot = await handleScreenshotAction( + action, + provider, + instance + ); break; case "wait": - await sleep(2000); - computerResponse = await instance.computer({ - action: "take_screenshot", - }); + responseScreenshot = await handleWaitAction(action, provider, instance); break; case "scroll": - computerResponse = await instance.computer({ - action: "scroll", - deltaX: action.scroll_x / 20, - deltaY: action.scroll_y / 20, - coordinates: [action.x, action.y], - }); + responseScreenshot = await handleScrollAction( + action, + provider, + instance + ); break; case "type": - computerResponse = await instance.computer({ - action: "type_text", - text: action.text, - }); + responseScreenshot = await handleTypeAction(action, provider, instance); break; default: throw new Error( @@ -171,7 +180,7 @@ export async function takeScrapybaraAction( ); } - let screenshotContent = `data:image/png;base64,${computerResponse.base64Image}`; + let screenshotContent = `data:image/png;base64,${responseScreenshot}`; if (uploadScreenshot) { const uploadScreenshotRunnable = RunnableLambda.from( uploadScreenshot @@ -198,23 +207,7 @@ export async function takeScrapybaraAction( } return { + ...updatedState, messages: computerCallToolMsg ? 
[computerCallToolMsg] : [], - instanceId: instance.id, - streamUrl, - authenticatedId, }; } - -export async function takeComputerAction( - state: CUAState, - config: LangGraphRunnableConfig -): Promise { - const { provider } = getConfigurationWithDefaults(config); - if (provider === "scrapybara") { - return takeScrapybaraAction(state, config); - } else if (provider === "hyperbrowser") { - return takeHyperbrowserAction(state, config); - } else { - throw new Error(`Unsupported provider: ${provider}`); - } -} diff --git a/libs/langgraph-cua/src/types.ts b/libs/langgraph-cua/src/types.ts index e26dc7756..36ba215e7 100644 --- a/libs/langgraph-cua/src/types.ts +++ b/libs/langgraph-cua/src/types.ts @@ -1,3 +1,4 @@ +import { CreateSessionParams } from "@hyperbrowser/sdk/types"; import { SystemMessage } from "@langchain/core/messages"; import { getEnvironmentVariable } from "@langchain/core/utils/env"; import { @@ -5,7 +6,6 @@ import { LangGraphRunnableConfig, MessagesAnnotation, } from "@langchain/langgraph"; -import { Browser, Page } from "playwright-core"; // Copied from the OpenAI example repository // https://github.com/openai/openai-cua-sample-app/blob/eb2d58ba77ffd3206d3346d6357093647d29d99c/utils.py#L13 @@ -50,15 +50,6 @@ export const CUAAnnotation = Annotation.Root({ reducer: (_state, update) => update, default: () => undefined, }), - /** - * The state of the browser instance. - */ - browserState: Annotation< - { browser: Browser | undefined; currentPage: Page | undefined } | undefined - >({ - reducer: (_state, update) => update, - default: () => undefined, - }), }); export const CUAConfigurable = Annotation.Root({ @@ -90,16 +81,7 @@ export const CUAConfigurable = Annotation.Root({ * Parameters to use for configuring the Hyperbrowser session, such as screen dimensions. * For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). 
*/ - sessionParams: Annotation< - | { - screen?: { - width: number; - height: number; - }; - [key: string]: unknown; - } - | undefined - >({ + sessionParams: Annotation({ reducer: (_state, update) => update, default: () => undefined, }), @@ -180,7 +162,7 @@ export function getConfigurationWithDefaults( hyperbrowserApiKey: config.configurable?.hyperbrowserApiKey || getEnvironmentVariable("HYPERBROWSER_API_KEY"), - sessionParams: config.configurable?.sessionParams ?? {}, + sessionParams: config.configurable?.sessionParams ?? undefined, timeoutHours: config.configurable?.timeoutHours ?? 1, zdrEnabled: config.configurable?.zdrEnabled ?? false, environment: config.configurable?.environment ?? "web", diff --git a/yarn.lock b/yarn.lock index f6a49b90c..096603e9f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1932,8 +1932,8 @@ __metadata: jest: ^29.5.0 jest-environment-node: ^29.6.4 openai: ^4.87.3 - playwright-core: ^1.51.1 prettier: ^2.8.3 + puppeteer-core: ^24.5.0 release-it: ^17.6.0 scrapybara: ^2.4.4 tsx: ^4.7.0 @@ -2642,6 +2642,23 @@ __metadata: languageName: node linkType: hard +"@puppeteer/browsers@npm:2.9.0": + version: 2.9.0 + resolution: "@puppeteer/browsers@npm:2.9.0" + dependencies: + debug: ^4.4.0 + extract-zip: ^2.0.1 + progress: ^2.0.3 + proxy-agent: ^6.5.0 + semver: ^7.7.1 + tar-fs: ^3.0.8 + yargs: ^17.7.2 + bin: + browsers: lib/cjs/main-cli.js + checksum: bd3f51e73de9056d3a8b4928d37f1a3d1dd2d6b2518b96809d60321331ae56b54b8668372a0e53c277cfb452e27fdbaf9ded394b90a92ded5033c53883cbf204 + languageName: node + linkType: hard + "@rollup/plugin-commonjs@npm:^13.0.0": version: 13.0.2 resolution: "@rollup/plugin-commonjs@npm:13.0.2" @@ -3822,6 +3839,15 @@ __metadata: languageName: node linkType: hard +"@types/yauzl@npm:^2.9.1": + version: 2.10.3 + resolution: "@types/yauzl@npm:2.10.3" + dependencies: + "@types/node": "*" + checksum: 5ee966ea7bd6b2802f31ad4281c92c4c0b6dfa593c378a2582c58541fa113bec3d70eb0696b34ad95e8e6861a884cba6c3e351285816693ed176222f840a8c08 + 
languageName: node + linkType: hard + "@typescript-eslint/eslint-plugin@npm:^6.12.0": version: 6.21.0 resolution: "@typescript-eslint/eslint-plugin@npm:6.21.0" @@ -4070,6 +4096,13 @@ __metadata: languageName: node linkType: hard +"agent-base@npm:^7.1.2": + version: 7.1.3 + resolution: "agent-base@npm:7.1.3" + checksum: 87bb7ee54f5ecf0ccbfcba0b07473885c43ecd76cb29a8db17d6137a19d9f9cd443a2a7c5fd8a3f24d58ad8145f9eb49116344a66b107e1aeab82cf2383f4753 + languageName: node + linkType: hard + "agentkeepalive@npm:^4.2.1": version: 4.5.0 resolution: "agentkeepalive@npm:4.5.0" @@ -4478,6 +4511,13 @@ __metadata: languageName: node linkType: hard +"bare-events@npm:^2.5.4": + version: 2.5.4 + resolution: "bare-events@npm:2.5.4" + checksum: 522a5401caaede9d8c857c2fd346c993bf43995e958e8ebfa79d32b1e086032800e0639f3559d7ad85788fae54f6d9605685de507eec54298ea2aa2c8c9cb2c3 + languageName: node + linkType: hard + "bare-fs@npm:^2.1.1": version: 2.3.1 resolution: "bare-fs@npm:2.3.1" @@ -4489,6 +4529,22 @@ __metadata: languageName: node linkType: hard +"bare-fs@npm:^4.0.1": + version: 4.0.2 + resolution: "bare-fs@npm:4.0.2" + dependencies: + bare-events: ^2.5.4 + bare-path: ^3.0.0 + bare-stream: ^2.6.4 + peerDependencies: + bare-buffer: "*" + peerDependenciesMeta: + bare-buffer: + optional: true + checksum: 3e6346c374dfd62ee5514baf990154b176cf9db84e17bf89a51f1985274ad1a3bb2e4894f1a736e231ec635fe25c97449fb570f3e8d56b74c18cef190ea83ef3 + languageName: node + linkType: hard + "bare-os@npm:^2.1.0": version: 2.4.0 resolution: "bare-os@npm:2.4.0" @@ -4496,6 +4552,13 @@ __metadata: languageName: node linkType: hard +"bare-os@npm:^3.0.1": + version: 3.6.1 + resolution: "bare-os@npm:3.6.1" + checksum: 2fcdbaa631e02e2b7a4a38ded4586ae8bef2d329c6933b9dca8c543b4af0ac3c257fdf0ff3339b83259e179e07873f300e61c75c0a1e6b796c0214b1fbae8696 + languageName: node + linkType: hard + "bare-path@npm:^2.0.0, bare-path@npm:^2.1.0": version: 2.1.3 resolution: "bare-path@npm:2.1.3" @@ -4505,6 +4568,15 @@ __metadata: 
languageName: node linkType: hard +"bare-path@npm:^3.0.0": + version: 3.0.0 + resolution: "bare-path@npm:3.0.0" + dependencies: + bare-os: ^3.0.1 + checksum: 51d559515f332f62cf9c37c38f2640c1b84b5e8c9de454b70baf029f806058cf94c51d6a0dfec0025cc7760f2069dc3e16c82f0d24f4a9ddb18c829bf9c0206d + languageName: node + linkType: hard + "bare-stream@npm:^2.0.0": version: 2.1.3 resolution: "bare-stream@npm:2.1.3" @@ -4514,6 +4586,23 @@ __metadata: languageName: node linkType: hard +"bare-stream@npm:^2.6.4": + version: 2.6.5 + resolution: "bare-stream@npm:2.6.5" + dependencies: + streamx: ^2.21.0 + peerDependencies: + bare-buffer: "*" + bare-events: "*" + peerDependenciesMeta: + bare-buffer: + optional: true + bare-events: + optional: true + checksum: 6a3d4baf8ded0bdc465b7b0b65dfbb8e40f7520ee8899adcae5fd37949d5c520412164116659750ad841215b03ce761fe252a626cd4fe3ec9df0440c6fd07a96 + languageName: node + linkType: hard + "base64-js@npm:^1.3.1, base64-js@npm:^1.5.1": version: 1.5.1 resolution: "base64-js@npm:1.5.1" @@ -4686,6 +4775,13 @@ __metadata: languageName: node linkType: hard +"buffer-crc32@npm:~0.2.3": + version: 0.2.13 + resolution: "buffer-crc32@npm:0.2.13" + checksum: 06252347ae6daca3453b94e4b2f1d3754a3b146a111d81c68924c22d91889a40623264e95e67955b1cb4a68cbedf317abeabb5140a9766ed248973096db5ce1c + languageName: node + linkType: hard + "buffer-from@npm:^1.0.0": version: 1.1.2 resolution: "buffer-from@npm:1.1.2" @@ -4966,6 +5062,18 @@ __metadata: languageName: node linkType: hard +"chromium-bidi@npm:3.0.0": + version: 3.0.0 + resolution: "chromium-bidi@npm:3.0.0" + dependencies: + mitt: ^3.0.1 + zod: ^3.24.1 + peerDependencies: + devtools-protocol: "*" + checksum: 3a20b551fdcb95e4ac13dbfd38a8d468933a560e2985ca95daf2f6e80deefa72ae54c512a0039f148819418127a55a158f1ebd8ef06367ad0c67e2b10616299b + languageName: node + linkType: hard + "ci-info@npm:^3.2.0": version: 3.9.0 resolution: "ci-info@npm:3.9.0" @@ -5741,6 +5849,18 @@ __metadata: languageName: node linkType: hard 
+"debug@npm:^4.4.0": + version: 4.4.0 + resolution: "debug@npm:4.4.0" + dependencies: + ms: ^2.1.3 + peerDependenciesMeta: + supports-color: + optional: true + checksum: fb42df878dd0e22816fc56e1fdca9da73caa85212fbe40c868b1295a6878f9101ae684f4eeef516c13acfc700f5ea07f1136954f43d4cd2d477a811144136479 + languageName: node + linkType: hard + "decamelize@npm:1.2.0": version: 1.2.0 resolution: "decamelize@npm:1.2.0" @@ -5900,6 +6020,13 @@ __metadata: languageName: node linkType: hard +"devtools-protocol@npm:0.0.1413902": + version: 0.0.1413902 + resolution: "devtools-protocol@npm:0.0.1413902" + checksum: 55d79bfd2d656cbe7f7aa098c018003974ed8744da68db78025b0eb2238654105e077cc86ca21ccc1e8e01ad114e02f8dbfd0efe98f696ccda5fa01eb44e131b + languageName: node + linkType: hard + "diff-sequences@npm:^29.6.3": version: 29.6.3 resolution: "diff-sequences@npm:29.6.3" @@ -7091,6 +7218,23 @@ __metadata: languageName: node linkType: hard +"extract-zip@npm:^2.0.1": + version: 2.0.1 + resolution: "extract-zip@npm:2.0.1" + dependencies: + "@types/yauzl": ^2.9.1 + debug: ^4.1.1 + get-stream: ^5.1.0 + yauzl: ^2.10.0 + dependenciesMeta: + "@types/yauzl": + optional: true + bin: + extract-zip: cli.js + checksum: 8cbda9debdd6d6980819cc69734d874ddd71051c9fe5bde1ef307ebcedfe949ba57b004894b585f758b7c9eeeea0e3d87f2dda89b7d25320459c2c9643ebb635 + languageName: node + linkType: hard + "fast-deep-equal@npm:^3.1.1, fast-deep-equal@npm:^3.1.3": version: 3.1.3 resolution: "fast-deep-equal@npm:3.1.3" @@ -7168,6 +7312,15 @@ __metadata: languageName: node linkType: hard +"fd-slicer@npm:~1.1.0": + version: 1.1.0 + resolution: "fd-slicer@npm:1.1.0" + dependencies: + pend: ~1.2.0 + checksum: c8585fd5713f4476eb8261150900d2cb7f6ff2d87f8feb306ccc8a1122efd152f1783bdb2b8dc891395744583436bfd8081d8e63ece0ec8687eeefea394d4ff2 + languageName: node + linkType: hard + "fetch-blob@npm:^3.1.2, fetch-blob@npm:^3.1.4": version: 3.2.0 resolution: "fetch-blob@npm:3.2.0" @@ -7513,6 +7666,15 @@ __metadata: languageName: node 
linkType: hard +"get-stream@npm:^5.1.0": + version: 5.2.0 + resolution: "get-stream@npm:5.2.0" + dependencies: + pump: ^3.0.0 + checksum: 8bc1a23174a06b2b4ce600df38d6c98d2ef6d84e020c1ddad632ad75bac4e092eeb40e4c09e0761c35fc2dbc5e7fff5dab5e763a383582c4a167dd69a905bd12 + languageName: node + linkType: hard + "get-stream@npm:^6.0.0, get-stream@npm:^6.0.1": version: 6.0.1 resolution: "get-stream@npm:6.0.1" @@ -7933,6 +8095,16 @@ __metadata: languageName: node linkType: hard +"https-proxy-agent@npm:^7.0.6": + version: 7.0.6 + resolution: "https-proxy-agent@npm:7.0.6" + dependencies: + agent-base: ^7.1.2 + debug: 4 + checksum: b882377a120aa0544846172e5db021fa8afbf83fea2a897d397bd2ddd8095ab268c24bc462f40a15f2a8c600bf4aa05ce52927f70038d4014e68aefecfa94e8d + languageName: node + linkType: hard + "human-signals@npm:^2.1.0": version: 2.1.0 resolution: "human-signals@npm:2.1.0" @@ -9796,6 +9968,13 @@ __metadata: languageName: node linkType: hard +"mitt@npm:^3.0.1": + version: 3.0.1 + resolution: "mitt@npm:3.0.1" + checksum: b55a489ac9c2949ab166b7f060601d3b6d893a852515ae9eca4e11df01c013876df777ea109317622b5c1c60e8aae252558e33c8c94e14124db38f64a39614b1 + languageName: node + linkType: hard + "mkdirp-classic@npm:^0.5.2, mkdirp-classic@npm:^0.5.3": version: 0.5.3 resolution: "mkdirp-classic@npm:0.5.3" @@ -10515,6 +10694,22 @@ __metadata: languageName: node linkType: hard +"pac-proxy-agent@npm:^7.1.0": + version: 7.2.0 + resolution: "pac-proxy-agent@npm:7.2.0" + dependencies: + "@tootallnate/quickjs-emscripten": ^0.23.0 + agent-base: ^7.1.2 + debug: ^4.3.4 + get-uri: ^6.0.1 + http-proxy-agent: ^7.0.0 + https-proxy-agent: ^7.0.6 + pac-resolver: ^7.0.1 + socks-proxy-agent: ^8.0.5 + checksum: 099c1bc8944da6a98e8b7de1fbf23e4014bc3063f66a7c29478bd852c1162e1d086a4f80f874f40961ebd5c516e736aed25852db97b79360cbdcc9db38086981 + languageName: node + linkType: hard + "pac-resolver@npm:^7.0.1": version: 7.0.1 resolution: "pac-resolver@npm:7.0.1" @@ -10678,6 +10873,13 @@ __metadata: languageName: 
node linkType: hard +"pend@npm:~1.2.0": + version: 1.2.0 + resolution: "pend@npm:1.2.0" + checksum: 6c72f5243303d9c60bd98e6446ba7d30ae29e3d56fdb6fae8767e8ba6386f33ee284c97efe3230a0d0217e2b1723b8ab490b1bbf34fcbb2180dbc8a9de47850d + languageName: node + linkType: hard + "pg-cloudflare@npm:^1.1.1": version: 1.1.1 resolution: "pg-cloudflare@npm:1.1.1" @@ -10863,15 +11065,6 @@ __metadata: languageName: node linkType: hard -"playwright-core@npm:^1.51.1": - version: 1.51.1 - resolution: "playwright-core@npm:1.51.1" - bin: - playwright-core: cli.js - checksum: 1eb37e22e97435a5ed6389b4caa666fbe618348861cae97e67586e20c8fed9ac3d3dc899ff3b9237d0ddfcf087d5b552b80be247e246fc45b75282f96be714bb - languageName: node - linkType: hard - "possible-typed-array-names@npm:^1.0.0": version: 1.0.0 resolution: "possible-typed-array-names@npm:1.0.0" @@ -11025,6 +11218,13 @@ __metadata: languageName: node linkType: hard +"progress@npm:^2.0.3": + version: 2.0.3 + resolution: "progress@npm:2.0.3" + checksum: f67403fe7b34912148d9252cb7481266a354bd99ce82c835f79070643bb3c6583d10dbcfda4d41e04bbc1d8437e9af0fb1e1f2135727878f5308682a579429b7 + languageName: node + linkType: hard + "promise-retry@npm:^2.0.1": version: 2.0.1 resolution: "promise-retry@npm:2.0.1" @@ -11119,6 +11319,22 @@ __metadata: languageName: node linkType: hard +"proxy-agent@npm:^6.5.0": + version: 6.5.0 + resolution: "proxy-agent@npm:6.5.0" + dependencies: + agent-base: ^7.1.2 + debug: ^4.3.4 + http-proxy-agent: ^7.0.1 + https-proxy-agent: ^7.0.6 + lru-cache: ^7.14.1 + pac-proxy-agent: ^7.1.0 + proxy-from-env: ^1.1.0 + socks-proxy-agent: ^8.0.5 + checksum: d03ad2d171c2768280ade7ea6a7c5b1d0746215d70c0a16e02780c26e1d347edd27b3f48374661ae54ec0f7b41e6e45175b687baf333b36b1fd109a525154806 + languageName: node + linkType: hard + "proxy-from-env@npm:^1.1.0": version: 1.1.0 resolution: "proxy-from-env@npm:1.1.0" @@ -11152,6 +11368,20 @@ __metadata: languageName: node linkType: hard +"puppeteer-core@npm:^24.5.0": + version: 24.5.0 + 
resolution: "puppeteer-core@npm:24.5.0" + dependencies: + "@puppeteer/browsers": 2.9.0 + chromium-bidi: 3.0.0 + debug: ^4.4.0 + devtools-protocol: 0.0.1413902 + typed-query-selector: ^2.12.0 + ws: ^8.18.1 + checksum: 9840028658149d095e711b3b2e7224309d28ebf0533e9451024d5f1cd6dfbb1a6286f3121aab01dc296620cb316d99808fa8be95974641e1865af6df68c1622d + languageName: node + linkType: hard + "pure-rand@npm:^6.0.0": version: 6.1.0 resolution: "pure-rand@npm:6.1.0" @@ -12038,6 +12268,17 @@ __metadata: languageName: node linkType: hard +"socks-proxy-agent@npm:^8.0.5": + version: 8.0.5 + resolution: "socks-proxy-agent@npm:8.0.5" + dependencies: + agent-base: ^7.1.2 + debug: ^4.3.4 + socks: ^2.8.3 + checksum: b4fbcdb7ad2d6eec445926e255a1fb95c975db0020543fbac8dfa6c47aecc6b3b619b7fb9c60a3f82c9b2969912a5e7e174a056ae4d98cb5322f3524d6036e1d + languageName: node + linkType: hard + "socks@npm:^2.7.1, socks@npm:^2.8.3": version: 2.8.3 resolution: "socks@npm:2.8.3" @@ -12176,6 +12417,20 @@ __metadata: languageName: node linkType: hard +"streamx@npm:^2.21.0": + version: 2.22.0 + resolution: "streamx@npm:2.22.0" + dependencies: + bare-events: ^2.2.0 + fast-fifo: ^1.3.2 + text-decoder: ^1.1.0 + dependenciesMeta: + bare-events: + optional: true + checksum: 9b2772a084281129d402f298bddf8d5f3c09b6b3d9b5c93df942e886b0b963c742a89736415cc53ffb8fc1f6f5b0b3ea171ed0ba86f1b31cde6ed35db5e07f6d + languageName: node + linkType: hard + "string-length@npm:^4.0.1": version: 4.0.2 resolution: "string-length@npm:4.0.2" @@ -12401,6 +12656,23 @@ __metadata: languageName: node linkType: hard +"tar-fs@npm:^3.0.8": + version: 3.0.8 + resolution: "tar-fs@npm:3.0.8" + dependencies: + bare-fs: ^4.0.1 + bare-path: ^3.0.0 + pump: ^3.0.0 + tar-stream: ^3.1.5 + dependenciesMeta: + bare-fs: + optional: true + bare-path: + optional: true + checksum: 5bebadd68e7a10cc3aa9c30b579c295e158cef7b1f42a73ee1bb1992925027aa8ef6cbcdb0d03e202e7f3850799391de30adf2585f7f240b606faa65df1a6b68 + languageName: node + linkType: hard + 
"tar-fs@npm:~2.0.1": version: 2.0.1 resolution: "tar-fs@npm:2.0.1" @@ -12951,6 +13223,13 @@ __metadata: languageName: node linkType: hard +"typed-query-selector@npm:^2.12.0": + version: 2.12.0 + resolution: "typed-query-selector@npm:2.12.0" + checksum: c4652f2eec16112d69e0da30c2effab3f03d1710f9559da1e1209bbfc9a20990d5de4ba97890c11f9d17d85c8ae3310953a86c198166599d4c36abc63664f169 + languageName: node + linkType: hard + "typedarray-to-buffer@npm:^3.1.5": version: 3.1.5 resolution: "typedarray-to-buffer@npm:3.1.5" @@ -13394,6 +13673,21 @@ __metadata: languageName: node linkType: hard +"ws@npm:^8.18.1": + version: 8.18.1 + resolution: "ws@npm:8.18.1" + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: ">=5.0.2" + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + checksum: 4658357185d891bc45cc2d42a84f9e192d047e8476fb5cba25b604f7d75ca87ca0dd54cd0b2cc49aeee57c79045a741cb7d0b14501953ac60c790cd105c42f23 + languageName: node + linkType: hard + "xdg-basedir@npm:^5.0.1, xdg-basedir@npm:^5.1.0": version: 5.1.0 resolution: "xdg-basedir@npm:5.1.0" @@ -13469,6 +13763,16 @@ __metadata: languageName: node linkType: hard +"yauzl@npm:^2.10.0": + version: 2.10.0 + resolution: "yauzl@npm:2.10.0" + dependencies: + buffer-crc32: ~0.2.3 + fd-slicer: ~1.1.0 + checksum: 7f21fe0bbad6e2cb130044a5d1d0d5a0e5bf3d8d4f8c4e6ee12163ce798fee3de7388d22a7a0907f563ac5f9d40f8699a223d3d5c1718da90b0156da6904022b + languageName: node + linkType: hard + "yocto-queue@npm:^0.1.0": version: 0.1.0 resolution: "yocto-queue@npm:0.1.0" From 0a3e5ffb219dd8a3346904f6a26925117159561d Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Wed, 2 Apr 2025 22:12:12 -0500 Subject: [PATCH 3/3] fixes --- libs/langgraph-cua/package.json | 2 +- libs/langgraph-cua/src/nodes/handle-action.ts | 68 ++++++++++++------- .../src/nodes/take-computer-action.ts | 10 ++- libs/langgraph-cua/src/utils.ts | 6 +- yarn.lock | 10 +-- 5 files changed, 59 insertions(+), 37 deletions(-) diff --git 
a/libs/langgraph-cua/package.json b/libs/langgraph-cua/package.json index 9f29a3823..6fd71cb32 100644 --- a/libs/langgraph-cua/package.json +++ b/libs/langgraph-cua/package.json @@ -32,7 +32,7 @@ "author": "LangChain", "license": "MIT", "dependencies": { - "@hyperbrowser/sdk": "^0.40.0", + "@hyperbrowser/sdk": "^0.42.0", "puppeteer-core": "^24.5.0", "scrapybara": "^2.4.4", "zod": "^3.23.8" diff --git a/libs/langgraph-cua/src/nodes/handle-action.ts b/libs/langgraph-cua/src/nodes/handle-action.ts index 7cb4fb51e..52786599c 100644 --- a/libs/langgraph-cua/src/nodes/handle-action.ts +++ b/libs/langgraph-cua/src/nodes/handle-action.ts @@ -95,14 +95,16 @@ export async function handleClickAction( switch (provider) { case "scrapybara": return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "click_mouse", button: action.button === "wheel" ? "middle" : action.button, coordinates: [action.x, action.y], }) ).base64Image; case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await getActivePage(instance as Browser); switch (action.button) { case "back": await page.goBack({ timeout: 15_000 }); @@ -122,7 +124,7 @@ export async function handleClickAction( default: throw new Error(`Unknown button: ${action.button}`); } - return await getHyperbrowserScreenshot(instance); + return await getHyperbrowserScreenshot(instance as Browser); } default: throw new Error(`Unknown provider: ${provider}`); @@ -137,7 +139,9 @@ export async function handleDoubleClickAction( switch (provider) { case "scrapybara": return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "click_mouse", button: "left", coordinates: [action.x, action.y], @@ -145,12 +149,12 @@ export async function handleDoubleClickAction( }) ).base64Image; case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await 
getActivePage(instance as Browser); await page.mouse.click(action.x, action.y, { button: "left", clickCount: 2, }); - return await getHyperbrowserScreenshot(instance); + return await getHyperbrowserScreenshot(instance as Browser); } default: throw new Error(`Unknown provider: ${provider}`); @@ -165,13 +169,15 @@ export async function handleDragAction( switch (provider) { case "scrapybara": return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "drag_mouse", path: action.path.map(({ x, y }) => [x, y]), }) ).base64Image; case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await getActivePage(instance as Browser); if (action.path.length < 2) { throw new Error( "Invalid drag path: must contain at least a start and end point" @@ -187,7 +193,7 @@ export async function handleDragAction( } await page.mouse.up(); - return await getHyperbrowserScreenshot(instance); + return await getHyperbrowserScreenshot(instance as Browser); } default: throw new Error(`Unknown provider: ${provider}`); @@ -209,14 +215,16 @@ export async function handleKeypressAction( : key ); return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "press_key", keys: mappedKeys, }) ).base64Image; } case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await getActivePage(instance as Browser); const mappedKeysHb = action.keys.map((key) => translateKeyToPuppeteerKey(key) ); @@ -226,7 +234,7 @@ export async function handleKeypressAction( for (const key of [...mappedKeysHb].reverse()) { await page.keyboard.up(key); } - return await getHyperbrowserScreenshot(instance); + return await getHyperbrowserScreenshot(instance as Browser); } default: throw new Error(`Unknown provider: ${provider}`); @@ -241,15 +249,17 @@ export async function handleMoveAction( switch (provider) { case "scrapybara": return 
( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "move_mouse", coordinates: [action.x, action.y], }) ).base64Image; case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await getActivePage(instance as Browser); await page.mouse.move(action.x, action.y); - return await getHyperbrowserScreenshot(instance, 1_000); + return await getHyperbrowserScreenshot(instance as Browser, 1_000); } default: throw new Error(`Unknown provider: ${provider}`); @@ -264,12 +274,14 @@ export async function handleScreenshotAction( switch (provider) { case "scrapybara": return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "take_screenshot", }) ).base64Image; case "hyperbrowser": - return await getHyperbrowserScreenshot(instance, 0); + return await getHyperbrowserScreenshot(instance as Browser, 0); default: throw new Error(`Unknown provider: ${provider}`); } @@ -284,12 +296,14 @@ export async function handleWaitAction( case "scrapybara": await sleep(2000); return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "take_screenshot", }) ).base64Image; case "hyperbrowser": - return await getHyperbrowserScreenshot(instance, 2_000); + return await getHyperbrowserScreenshot(instance as Browser, 2_000); default: throw new Error(`Unknown provider: ${provider}`); } @@ -303,7 +317,9 @@ export async function handleScrollAction( switch (provider) { case "scrapybara": return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "scroll", deltaX: action.scroll_x / 20, deltaY: action.scroll_y / 20, @@ -311,12 +327,12 @@ export async function handleScrollAction( }) ).base64Image; case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await 
getActivePage(instance as Browser); await page.mouse.move(action.x, action.y); await page.evaluate( `window.scrollBy(${action.scroll_x}, ${action.scroll_y})` ); - return await getHyperbrowserScreenshot(instance, 1_000); + return await getHyperbrowserScreenshot(instance as Browser, 1_000); } default: throw new Error(`Unknown provider: ${provider}`); @@ -331,15 +347,17 @@ export async function handleTypeAction( switch (provider) { case "scrapybara": return ( - await instance.computer({ + await ( + instance as UbuntuInstance | BrowserInstance | WindowsInstance + ).computer({ action: "type_text", text: action.text, }) ).base64Image; case "hyperbrowser": { - const page = await getActivePage(instance); + const page = await getActivePage(instance as Browser); await page.keyboard.type(action.text); - return await getHyperbrowserScreenshot(instance, 1_000); + return await getHyperbrowserScreenshot(instance as Browser, 1_000); } default: throw new Error(`Unknown provider: ${provider}`); diff --git a/libs/langgraph-cua/src/nodes/take-computer-action.ts b/libs/langgraph-cua/src/nodes/take-computer-action.ts index 29844713e..518e6e06e 100644 --- a/libs/langgraph-cua/src/nodes/take-computer-action.ts +++ b/libs/langgraph-cua/src/nodes/take-computer-action.ts @@ -29,7 +29,7 @@ const isBrowserInstance = ( async function scrapybaraSetup( instanceId: string, state: CUAState, - config: LangGraphRunnableConfig, + config: LangGraphRunnableConfig ) { const instance = await getScrapybaraInstance(instanceId, config); const { authStateId } = getConfigurationWithDefaults(config); @@ -69,7 +69,7 @@ async function scrapybaraSetup( async function hyperbrowserSetup( instanceId: string, state: CUAState, - config: LangGraphRunnableConfig, + config: LangGraphRunnableConfig ) { const instance = await getHyperbrowserInstance(instanceId, config); let { streamUrl } = state; @@ -125,7 +125,7 @@ export async function takeComputerAction( let computerCallToolMsg: BaseMessageLike | undefined; try { - let 
responseScreenshot: string; + let responseScreenshot: string | undefined; switch (action.type) { case "click": responseScreenshot = await handleClickAction( @@ -180,6 +180,10 @@ export async function takeComputerAction( ); } + if (!responseScreenshot) { + throw new Error("No screenshot returned from computer action."); + } + let screenshotContent = `data:image/png;base64,${responseScreenshot}`; if (uploadScreenshot) { const uploadScreenshotRunnable = RunnableLambda.from( diff --git a/libs/langgraph-cua/src/utils.ts b/libs/langgraph-cua/src/utils.ts index d9c77a269..a5cee2129 100644 --- a/libs/langgraph-cua/src/utils.ts +++ b/libs/langgraph-cua/src/utils.ts @@ -6,7 +6,7 @@ import { BrowserInstance, WindowsInstance, } from "scrapybara"; -import HyperbrowserClient, { Hyperbrowser } from "@hyperbrowser/sdk"; +import { Hyperbrowser } from "@hyperbrowser/sdk"; import { SessionDetail } from "@hyperbrowser/sdk/types"; import { getEnvironmentVariable } from "@langchain/core/utils/env"; import { AIMessage, BaseMessage, ToolMessage } from "@langchain/core/messages"; @@ -18,7 +18,7 @@ import { getConfigurationWithDefaults } from "./types.js"; * @param {string} apiKey The API key for Hyperbrowser. * @returns {HyperbrowserClient} The Hyperbrowser client. */ -export function getHyperbrowserClient(apiKey: string) { +export function getHyperbrowserClient(apiKey: string): Hyperbrowser { if (!apiKey) { throw new Error( "Hyperbrowser API key not provided. 
Please provide one in the configurable fields, or set it as an environment variable (HYPERBROWSER_API_KEY)" @@ -137,7 +137,7 @@ export async function stopScrapybaraInstance( */ export async function stopHyperbrowserInstance( id: string, - client?: HyperbrowserClient + client?: Hyperbrowser ): Promise { let client_ = client; if (!client_) { diff --git a/yarn.lock b/yarn.lock index 096603e9f..6c277b5bc 100644 --- a/yarn.lock +++ b/yarn.lock @@ -929,15 +929,15 @@ __metadata: languageName: node linkType: hard -"@hyperbrowser/sdk@npm:^0.40.0": - version: 0.40.0 - resolution: "@hyperbrowser/sdk@npm:0.40.0" +"@hyperbrowser/sdk@npm:^0.42.0": + version: 0.42.0 + resolution: "@hyperbrowser/sdk@npm:0.42.0" dependencies: form-data: ^4.0.1 node-fetch: 2.7.0 zod: ^3.24.1 zod-to-json-schema: ^3.24.1 - checksum: 95f198979a344e5cb70a4adc706257732c88eabbf8461fda4f23aedb4283ab3d76de449270b2b7c7a119187fd4784442a1b9c157b3e99a0869fa8d751fde9963 + checksum: d2e5f69aa2a25645895b8501f71a7b773b03a8f7d8d82cc325929d8c9df70a6057d600d195b0a0b56ef923dba2d00f0f533cddf87b3c772e4e2f79883c763b35 languageName: node linkType: hard @@ -1909,7 +1909,7 @@ __metadata: version: 0.0.0-use.local resolution: "@langchain/langgraph-cua@workspace:libs/langgraph-cua" dependencies: - "@hyperbrowser/sdk": ^0.40.0 + "@hyperbrowser/sdk": ^0.42.0 "@jest/globals": ^29.5.0 "@langchain/langgraph": "workspace:*" "@langchain/openai": ^0.5.1