
Commit b59788e

Profiles: Support for running with existing profiles + saving profile after a login (#34)
Support for profiles via a mounted .tar.gz and --profile option + improved docs #18

* support creating profiles via the 'create-login-profile' command, with options for where to save the profile, username/password, and debug screenshot output; username and password can be entered (hidden) on the command line if omitted
* use patched pywb (yt-rules-improve branch) for fix
* bump browsertrix-behaviors to 0.1.0
* README: updates to include better getting started, behaviors and profile reference/examples
* bump version to 0.3.0!
1 parent: c9f8fe0
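As a rough end-to-end sketch of the workflow this commit enables (the host paths, login URL, and image tag are placeholders, and the crawl seed is assumed to be passed with the existing --url option; the updated README documents the exact commands):

    # create a profile by logging in to a site; username and password are prompted if not given
    docker run -v $PWD/profiles:/output/ -it webrecorder/browsertrix-crawler:0.3.0 create-login-profile --url "https://example.com/login"

    # run a crawl that loads the saved profile via the new --profile option
    docker run -v $PWD/profiles:/profiles/ -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler:0.3.0 crawl --url https://example.com/ --profile /profiles/profile.tar.gz

The profile is written to /output/profile.tar.gz by default and, when passed to the crawler, is extracted into a temporary directory that is used as the browser's userDataDir.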

8 files changed: +477, -82 lines changed
Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ ADD uwsgi.ini /app/
 ADD *.js /app/
 
 RUN ln -s /app/main.js /usr/bin/crawl
+RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile
 
 WORKDIR /crawls
 

README.md

Lines changed: 154 additions & 54 deletions
Large diffs are not rendered by default.

crawler.js

Lines changed: 32 additions & 19 deletions
@@ -5,6 +5,7 @@ const fetch = require("node-fetch");
 const AbortController = require("abort-controller");
 const path = require("path");
 const fs = require("fs");
+const os = require("os");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");
 const warcio = require("warcio");
@@ -44,6 +45,7 @@ class Crawler {
 
     this.userAgent = "";
     this.behaviorsLogDebug = false;
+    this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
 
     const params = require("yargs")
       .usage("browsertrix-crawler [options]")
@@ -279,6 +281,11 @@
         default: "autoplay,autofetch,siteSpecific",
         type: "string",
       },
+
+      "profile": {
+        describe: "Path to tar.gz file which will be extracted and used as the browser profile",
+        type: "string",
+      },
     };
   }
 
@@ -399,6 +406,10 @@
       argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
     }
 
+    if (argv.profile) {
+      child_process.execSync("tar xvfz " + argv.profile, {cwd: this.profileDir});
+    }
+
     return true;
   }
 
@@ -411,6 +422,7 @@
       "--disable-background-media-suspend",
       "--autoplay-policy=no-user-gesture-required",
       "--disable-features=IsolateOrigins,site-per-process",
+      "--disable-popup-blocking"
     ];
   }
 
@@ -420,7 +432,9 @@
       headless: this.params.headless,
       executablePath: CHROME_PATH,
       ignoreHTTPSErrors: true,
-      args: this.chromeArgs
+      args: this.chromeArgs,
+      userDataDir: this.profileDir,
+      defaultViewport: null,
     };
   }
 
@@ -436,31 +450,30 @@
       process.exit(1);
     }
   }
-
+
+  _behaviorLog({data, type}) {
+    switch (type) {
+    case "info":
+      console.log(JSON.stringify(data));
+      break;
+
+    case "debug":
+    default:
+      if (this.behaviorsLogDebug) {
+        console.log("behavior debug: " + JSON.stringify(data));
+      }
+    }
+  }
+
   async crawlPage({page, data}) {
     try {
       if (this.emulateDevice) {
         await page.emulate(this.emulateDevice);
       }
 
       if (this.behaviorOpts) {
-        await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
-          switch (type) {
-          case "info":
-            console.log(JSON.stringify(data));
-            break;
-
-          case "debug":
-          default:
-            if (this.behaviorsLogDebug) {
-              console.log("behavior debug: " + JSON.stringify(data));
-            }
-          }
-        });
-
-        await page.evaluateOnNewDocument(behaviors + `
-          self.__bx_behaviors.init(${this.behaviorOpts});
-        `);
+        await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
+        await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.behaviorOpts});`);
       }
 
       // run custom driver here

create-login-profile.js

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@ (new file)

#!/usr/bin/env node

const readline = require("readline");
const child_process = require("child_process");

const puppeteer = require("puppeteer-core");
const yargs = require("yargs");

function cliOpts() {
  return {
    "url": {
      describe: "The URL of the login page",
      type: "string",
      demandOption: true,
    },

    "user": {
      describe: "The username for the login. If not specified, will be prompted",
    },

    "password": {
      describe: "The password for the login. If not specified, will be prompted (recommended)",
    },

    "filename": {
      describe: "The filename for the profile tarball",
      default: "/output/profile.tar.gz",
    },

    "debugScreenshot": {
      describe: "If specified, take a screenshot after login and save as this filename"
    },

    "headless": {
      describe: "Run in headless mode, otherwise start xvfb",
      type: "boolean",
      default: false,
    },
  };
}


async function main() {
  const params = yargs
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts())
    .argv;

  if (!params.headless) {
    console.log("Launching XVFB");
    child_process.spawn("Xvfb", [
      process.env.DISPLAY,
      "-listen",
      "tcp",
      "-screen",
      "0",
      process.env.GEOMETRY,
      "-ac",
      "+extension",
      "RANDR"
    ]);
  }

  //await new Promise(resolve => setTimeout(resolve, 2000));

  const args = {
    headless: !!params.headless,
    executablePath: "google-chrome",
    ignoreHTTPSErrors: true,
    args: [
      "--no-xshm",
      "--no-sandbox",
      "--disable-background-media-suspend",
      "--autoplay-policy=no-user-gesture-required",
      "--disable-features=IsolateOrigins,site-per-process",
      "--user-data-dir=/tmp/profile"
    ]
  };

  if (!params.user) {
    params.user = await promptInput("Enter username: ");
  }

  if (!params.password) {
    params.password = await promptInput("Enter password: ", true);
  }

  const browser = await puppeteer.launch(args);

  const page = await browser.newPage();

  const waitUntil = ["load", "networkidle2"];

  await page.setCacheEnabled(false);

  console.log("loading");

  await page.goto(params.url, {waitUntil});

  console.log("loaded");

  let u, p;

  try {
    u = await page.waitForXPath("//input[contains(@name, 'user')]");

    p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");

  } catch (e) {
    if (params.debugScreenshot) {
      await page.screenshot({path: params.debugScreenshot});
    }
    console.log("Login form could not be found");
    await page.close();
    process.exit(1);
    return;
  }

  await u.type(params.user);

  await p.type(params.password);

  await Promise.allSettled([
    p.press("Enter"),
    page.waitForNavigation({waitUntil})
  ]);

  await page._client.send("Network.clearBrowserCache");

  if (params.debugScreenshot) {
    await page.screenshot({path: params.debugScreenshot});
  }

  await browser.close();

  console.log("creating profile");

  const profileFilename = params.filename || "/output/profile.tar.gz";

  child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: "/tmp/profile"});
  console.log("done");

  process.exit(0);
}

function promptInput(msg, hidden = false) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  if (hidden) {
    // from https://stackoverflow.com/a/59727173
    rl.input.on("keypress", function () {
      // get the number of characters entered so far:
      const len = rl.line.length;
      // move cursor back to the beginning of the input:
      readline.moveCursor(rl.output, -len, 0);
      // clear everything to the right of the cursor:
      readline.clearLine(rl.output, 1);
      // replace the original input with asterisks:
      for (let i = 0; i < len; i++) {
        rl.output.write("*");
      }
    });
  }

  return new Promise((resolve) => {
    rl.question(msg, function (res) {
      rl.close();
      resolve(res);
    });
  });
}

main();
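For reference, a plausible invocation of the new command inside the container (the login URL, username, and screenshot path are placeholders); the password is omitted here so the script prompts for it with hidden input:

    create-login-profile --url "https://example.com/login" --user myuser \
        --filename /output/profile.tar.gz --debugScreenshot /output/after-login.png

After the login form is submitted, the script clears the browser cache, optionally saves the debug screenshot, and packs the Chrome profile from /tmp/profile into the tarball named by --filename, ready to be passed to the crawler's --profile option.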

docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ version: '3.5'
 
 services:
   crawler:
-    image: webrecorder/browsertrix-crawler:0.3.0-beta.0
+    image: webrecorder/browsertrix-crawler:0.3.0
     build:
       context: ./
 

package.json

Lines changed: 3 additions & 3 deletions
@@ -1,13 +1,13 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.3.0-beta.0",
+  "version": "0.3.0",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <[email protected]>, Webrecorder Software",
   "license": "MIT",
   "dependencies": {
     "abort-controller": "^3.0.0",
-    "browsertrix-behaviors": "github:webrecorder/browsertrix-behaviors",
+    "browsertrix-behaviors": "^0.1.0",
     "node-fetch": "^2.6.1",
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.1",
@@ -20,6 +20,6 @@
     "eslint-plugin-react": "^7.22.0",
     "jest": "^26.6.3",
     "md5": "^2.3.0",
-    "warcio": "^1.4.2"
+    "warcio": "^1.4.3"
   }
 }

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
-pywb>=2.5.0
+#pywb>=2.5.0
+git+https://github.com/webrecorder/pywb@yt-rules-improve
 uwsgi
 wacz>=0.2.1
