diff --git a/docs/examples/using_browser_profile.mdx b/docs/examples/using_browser_profile.mdx
new file mode 100644
index 000000000000..50ab1a22d7d7
--- /dev/null
+++ b/docs/examples/using_browser_profile.mdx
@@ -0,0 +1,50 @@
+---
+id: using-browser-profile
+title: Using browser profile
+---
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import CodeBlock from '@theme/CodeBlock';
+import ApiLink from '@site/src/components/ApiLink';
+
+import PlaywrightChromeSource from '!!raw-loader!./using_browser_profiles_playwright_chrome.ts';
+import PuppeteerChromeSource from '!!raw-loader!./using_browser_profiles_puppeteer_chrome.ts';
+import PlaywrightFirefoxSource from '!!raw-loader!./using_browser_profiles_playwright_firefox.ts';
+
+This example demonstrates how to run `PlaywrightCrawler` and `PuppeteerCrawler` using your local browser profile from [Chrome](https://www.google.com/intl/us/chrome/) or [Firefox](https://www.firefox.com/).
+
+Using browser profiles allows you to leverage existing login sessions, saved passwords, bookmarks, and other personalized browser data during crawling. This can be particularly useful for testing scenarios or when you need to access content that requires authentication.
+
+## Chrome browser
+
+To run a crawler with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser and looking at the `Profile Path` entry: its last segment is the profile name, and everything before it is the user data directory. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it is usually `Default`.
+
+:::warning Profile access limitation
+Due to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround.
+:::
+
+Make sure you don't have any running Chrome browser processes before running this code:
+
+<Tabs>
+<TabItem value="playwright" label="Playwright">
+<CodeBlock language="ts">
+    {PlaywrightChromeSource}
+</CodeBlock>
+</TabItem>
+<TabItem value="puppeteer" label="Puppeteer">
+<CodeBlock language="ts">
+    {PuppeteerChromeSource}
+</CodeBlock>
+</TabItem>
+</Tabs>
+
+## Firefox browser
+
+To find the path to your Firefox profile, enter `about:profiles` as a URL in your Firefox browser. Unlike Chrome, you can use your standard profile path directly without copying it first.
+
+Make sure you don't have any running Firefox browser processes before running this code:
+
+<CodeBlock language="ts">
+    {PlaywrightFirefoxSource}
+</CodeBlock>
diff --git a/docs/examples/using_browser_profiles_playwright_chrome.ts b/docs/examples/using_browser_profiles_playwright_chrome.ts
new file mode 100644
index 000000000000..68ee9015d6fd
--- /dev/null
+++ b/docs/examples/using_browser_profiles_playwright_chrome.ts
@@ -0,0 +1,48 @@
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { PlaywrightCrawler } from 'crawlee';
+
+// Profile name to use (usually 'Default' for single profile setups)
+const PROFILE_NAME = 'Default';
+
+// Path to Chrome user data directory (example for Windows)
+// `chrome://version/` shows the full "Profile Path"; use its parent directory here
+// (the path without the trailing profile name)
+const PROFILE_PATH = path.join(os.homedir(), 'AppData', 'Local', 'Google', 'Chrome', 'User Data');
+
+// Temp directory that will hold a copy of the profile to avoid Chrome's lock
+const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'crawlee-chrome-profile-'));
+
+try {
+    // Copy only the selected profile, not the whole user data directory
+    fs.cpSync(path.join(PROFILE_PATH, PROFILE_NAME), path.join(tempDir, PROFILE_NAME), { recursive: true });
+
+    const crawler = new PlaywrightCrawler({
+        launchContext: {
+            // Use the installed Chrome browser
+            useChrome: true,
+            // Set user data directory to the temp copy
+            userDataDir: tempDir,
+            launchOptions: {
+                headless: false,
+                // Slow down actions to mimic human behavior
+                slowMo: 200,
+                args: [
+                    // Use the specified profile
+                    `--profile-directory=${PROFILE_NAME}`,
+                ],
+            },
+        },
+        async requestHandler({ request, log }) {
+            log.info(`Visiting ${request.url}`);
+        },
+    });
+
+    await crawler.run(['https://crawlee.dev']);
+} finally {
+    // Always remove the temp copy, even if copying or crawling fails:
+    // it holds session data copied from the real profile
+    fs.rmSync(tempDir, { recursive: true, force: true });
+}
diff --git a/docs/examples/using_browser_profiles_playwright_firefox.ts b/docs/examples/using_browser_profiles_playwright_firefox.ts
new file mode 100644
index 000000000000..64ab3f03cd29
--- /dev/null
+++ b/docs/examples/using_browser_profiles_playwright_firefox.ts
@@ -0,0 +1,34 @@
+import os from 'node:os';
+import path from 'node:path';
+
+import { PlaywrightCrawler } from 'crawlee';
+import { firefox } from 'playwright';
+
+// Replace this with your actual Firefox profile name
+// Find it at about:profiles in Firefox
+const PROFILE_NAME = 'your-profile-name-here';
+
+// Path to Firefox profile directory (example for Windows)
+// Use `about:profiles` to find your profile path
+const PROFILE_PATH = path.join(os.homedir(), 'AppData', 'Roaming', 'Mozilla', 'Firefox', 'Profiles', PROFILE_NAME);
+
+const crawler = new PlaywrightCrawler({
+    launchContext: {
+        // Use Firefox browser
+        launcher: firefox,
+        // Path to your Firefox profile
+        userDataDir: PROFILE_PATH,
+        launchOptions: {
+            headless: false,
+            args: [
+                // Required to avoid version conflicts
+                '--allow-downgrade',
+            ],
+        },
+    },
+    async requestHandler({ request, log }) {
+        log.info(`Visiting ${request.url}`);
+    },
+});
+
+await crawler.run(['https://crawlee.dev']);
diff --git a/docs/examples/using_browser_profiles_puppeteer_chrome.ts b/docs/examples/using_browser_profiles_puppeteer_chrome.ts
new file mode 100644
index 000000000000..ba42cb99d4f6
--- /dev/null
+++ b/docs/examples/using_browser_profiles_puppeteer_chrome.ts
@@ -0,0 +1,48 @@
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { PuppeteerCrawler } from 'crawlee';
+
+// Profile name to use (usually 'Default' for single profile setups)
+const PROFILE_NAME = 'Default';
+
+// Path to Chrome user data directory (example for Windows)
+// `chrome://version/` shows the full "Profile Path"; use its parent directory here
+// (the path without the trailing profile name)
+const PROFILE_PATH = path.join(os.homedir(), 'AppData', 'Local', 'Google', 'Chrome', 'User Data');
+
+// Temp directory that will hold a copy of the profile to avoid Chrome's lock
+const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'crawlee-chrome-profile-'));
+
+try {
+    // Copy only the selected profile, not the whole user data directory
+    fs.cpSync(path.join(PROFILE_PATH, PROFILE_NAME), path.join(tempDir, PROFILE_NAME), { recursive: true });
+
+    const crawler = new PuppeteerCrawler({
+        launchContext: {
+            // Use the installed Chrome browser
+            useChrome: true,
+            launchOptions: {
+                headless: false,
+                // Set user data directory via Puppeteer launch options
+                userDataDir: tempDir,
+                // Slow down actions to mimic human behavior
+                slowMo: 200,
+                args: [
+                    // Use the specified profile
+                    `--profile-directory=${PROFILE_NAME}`,
+                ],
+            },
+        },
+        async requestHandler({ request, log }) {
+            log.info(`Visiting ${request.url}`);
+        },
+    });
+
+    await crawler.run(['https://crawlee.dev']);
+} finally {
+    // Always remove the temp copy, even if copying or crawling fails:
+    // it holds session data copied from the real profile
+    fs.rmSync(tempDir, { recursive: true, force: true });
+}