
Commit 24e2c4d

Emma Dickson and a co-author authored this commit.
Create --combineWARC flag that combines generated WARCs into a single WARC up to rollover size (#33)

* generates combined WARCs in the collection root directory with suffixes `_0.warc`, `_1.warc`, etc.
* each combined WARC is limited to the size given in `--rolloverSize`; if that size is exceeded, a new WARC is created, otherwise data is appended to the previous WARC
* add test for --combineWARC flag
* add improved lint rules

Co-authored-by: Emma Dickson <[email protected]>
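For reference, the CI workflow change in this commit exercises both new flags; an equivalent local run with this repo's docker-compose setup would be:

    docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2

The deliberately small rollover size forces the rollover behavior to be exercised, which the new tests then verify.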
1 parent bc7f1ba commit 24e2c4d

File tree: 11 files changed, +201 −51 lines

.eslintrc.js

Lines changed: 18 additions & 2 deletions
@@ -11,5 +11,21 @@ module.exports = {
     "sourceType": "module"
   },
   "rules": {
-  }
-};
+    "indent": [
+      "error",
+      2
+    ],
+    "linebreak-style": [
+      "error",
+      "unix"
+    ],
+    "quotes": [
+      "error",
+      "double"
+    ],
+    "semi": [
+      "error",
+      "always"
+    ]
+  }
+};
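These four rules enforce 2-space indentation, Unix line endings, double quotes, and mandatory semicolons, which accounts for the mechanical quote-and-semicolon churn in crawler.js and the tests below. For instance, under these rules:

    const md5 = require('md5')    // errors: quotes, semi
    const md5 = require("md5");   // clean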

.eslintrc.yml

Lines changed: 0 additions & 15 deletions
This file was deleted.

.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ jobs:
       - name: build docker
         run: docker-compose build
       - name: run crawl
-        run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --workers 2
+        run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2
       - name: validate existing wacz
         run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
       - name: unzip wacz

README.md

Lines changed: 4 additions & 0 deletions
@@ -67,6 +67,10 @@ Options:
                         is done  [boolean] [default: false]
   --generateWACZ        If set, generate wacz for use with pywb after crawl
                         is done  [boolean] [default: false]
+  --combineWARC         If set, combine the individual warcs generated into a single warc after crawl
+                        is done  [boolean] [default: false]
+  --rolloverSize        If set, dictates the maximum size that a generated warc and combined warc can be
+                        [number] [default: 1000000000]
   --text                If set, extract the pages full text to be added to the pages.jsonl
                         file  [boolean] [default: false]
   --cwd                 Crawl working directory for captures (pywb root). If not

config.yaml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ recorder:
     dedup_policy: skip
     source_coll: live
     cache: always
+    rollover_size: ${ROLLOVER_SIZE}

 #autoindex: 10

crawler.js

Lines changed: 128 additions & 18 deletions
@@ -7,6 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");
+const warcio = require("warcio");

 const TextExtract = require("./textextract");
 const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
@@ -104,7 +105,7 @@ class Crawler {
   }

   bootstrap() {
-    let opts = {}
+    let opts = {};
     if (this.params.logging.includes("pywb")) {
       opts = {stdio: "inherit", cwd: this.params.cwd};
     }
@@ -120,8 +121,8 @@ class Crawler {

     child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);

-    opts.env = {...process.env, COLL: this.params.collection};
-
+    opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
+
     child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts);

     if (!this.params.headless) {
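Together with the config.yaml change above, this wires the flag through to pywb's recorder: the crawler exports ROLLOVER_SIZE into the uwsgi environment, and pywb substitutes it into rollover_size when loading config.yaml. As an illustration (value hypothetical), a run with --rolloverSize 10000 makes the recorder block effectively read:

    recorder:
        dedup_policy: skip
        source_coll: live
        cache: always
        rollover_size: 10000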
@@ -212,6 +213,19 @@ class Crawler {
         default: false,
       },

+      "combineWARC": {
+        alias: ["combinewarc", "combineWarc"],
+        describe: "If set, combine the warcs",
+        type: "boolean",
+        default: false,
+      },
+
+      "rolloverSize": {
+        describe: "If set, declare the rollover size",
+        default: 1000000000,
+        type: "number",
+      },
+
       "generateWACZ": {
         alias: ["generatewacz", "generateWacz"],
         describe: "If set, generate wacz",
@@ -426,15 +440,15 @@ class Crawler {
     if (this.behaviorOpts) {
       await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
         switch (type) {
-          case "info":
-            console.log(JSON.stringify(data));
-            break;
-
-          case "debug":
-          default:
-            if (this.behaviorsLogDebug) {
-              console.log("behavior debug: " + JSON.stringify(data));
-            }
+        case "info":
+          console.log(JSON.stringify(data));
+          break;
+
+        case "debug":
+        default:
+          if (this.behaviorsLogDebug) {
+            console.log("behavior debug: " + JSON.stringify(data));
+          }
         }
       });

@@ -448,7 +462,7 @@ class Crawler {


     const title = await page.title();
-    let text = '';
+    let text = "";
     if (this.params.text) {
       const client = await page.target().createCDPSession();
       const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
@@ -467,6 +481,28 @@ class Crawler {
       console.warn(e);
     }
   }
+
+  async createWARCInfo(filename) {
+    const warcVersion = "WARC/1.1";
+    const type = "warcinfo";
+    const packageFileJSON = JSON.parse(fs.readFileSync("../app/package.json"));
+    const pywb_version = fs.readFileSync("/usr/local/lib/python3.8/site-packages/pywb/version.py", "utf8").split("\n")[0].split("=")[1].trim().replace(/['"]+/g, "");
+    const warcioPackageJson = JSON.parse(fs.readFileSync("/app/node_modules/warcio/package.json"));
+
+    const info = {
+      "software": `Browsertrix-Crawler ${packageFileJSON["version"]} (with warcio.js ${warcioPackageJson} pywb ${pywb_version})`,
+      "format": "WARC File Format 1.1"
+    };
+
+    const record = await warcio.WARCRecord.createWARCInfo({filename, type, warcVersion}, info);
+    const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});
+    return buffer;
+  }
+
+  getFileSize(filename) {
+    var stats = fs.statSync(filename);
+    return stats.size;
+  }

   async crawl() {
     try {
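For readers unfamiliar with warcio.js, the new createWARCInfo helper builds the gzipped warcinfo record that is written at the head of each combined WARC. A minimal standalone sketch of the same two warcio calls, with a hypothetical filename and abbreviated info fields:

    const { WARCRecord, WARCSerializer } = require("warcio");

    // Build a gzipped warcinfo record for a (hypothetical) combined WARC file
    async function warcinfoBuffer() {
      const record = await WARCRecord.createWARCInfo(
        {filename: "my-coll_0.warc", type: "warcinfo", warcVersion: "WARC/1.1"},
        {"software": "Browsertrix-Crawler", "format": "WARC File Format 1.1"}
      );
      // Serialize to bytes ready to be written at the start of the file
      return await WARCSerializer.serialize(record, {gzip: true});
    }

One quirk worth noting: the software string in the diff interpolates the parsed warcioPackageJson object itself rather than its version field, so the warcio.js version will render as "[object Object]".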
@@ -505,6 +541,10 @@ class Crawler {
       // extra wait for all resources to land into WARCs
       console.log("Waiting 5s to ensure WARCs are finished");
       await this.sleep(5000);
+
+      if (this.params.combineWARC) {
+        await this.combineWARC();
+      }

       if (this.params.generateCDX) {
         console.log("Generate CDX");
@@ -594,16 +634,16 @@ class Crawler {
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
         if (this.params.text) {
           console.log("creating pages with full text");
-          header["hasText"] = true
+          header["hasText"] = true;
         }
         else{
           console.log("creating pages without full text");
-          header["hasText"] = false
+          header["hasText"] = false;
         }
-        const header_formatted = JSON.stringify(header).concat("\n")
+        const header_formatted = JSON.stringify(header).concat("\n");
         fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch(err) {
@@ -616,7 +656,7 @@ class Crawler {
     const row = {"id": id, "url": url, "title": title};

     if (text == true){
-      row['text'] = text_content
+      row["text"] = text_content;
     }

     const processedRow = JSON.stringify(row).concat("\n");
@@ -746,6 +786,76 @@ class Crawler {
       console.log(e);
     }
   }
+
+  async combineWARC() {
+    console.log("Combining the warcs");
+
+    // Get the list of created Warcs
+    const warcLists = fs.readdirSync(path.join(this.collDir, "archive"));
+
+    const fileSizeObjects = []; // Used to sort the created warc by fileSize
+
+    // Go through a list of the created warcs and create an array sorted by their filesize with the largest file first.
+    for (let i = 0; i < warcLists.length; i++) {
+      let fileName = path.join(this.collDir, "archive", warcLists[i]);
+      let fileSize = this.getFileSize(fileName);
+      fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
+      fileSizeObjects.sort(function(a, b){
+        return b.fileSize - a.fileSize;
+      });
+    }
+
+    const generatedCombinedWarcs = [];
+
+    // Used to name combined warcs, default to -1 for first increment
+    let combinedWarcNumber = -1;
+
+    // write combine WARC to collection root
+    let combinedWarcFullPath = "";
+
+    // Iterate through the sorted file size array.
+    for (let j = 0; j < fileSizeObjects.length; j++) {
+
+      // if need to rollover to new warc
+      let doRollover = false;
+
+      // set to true for first warc
+      if (combinedWarcNumber < 0) {
+        doRollover = true;
+      } else {
+        // Check the size of the existing combined warc.
+        const currentCombinedWarcSize = this.getFileSize(combinedWarcFullPath);
+
+        // If adding the current warc to the existing combined file creates a file smaller than the rollover size add the data to the combinedWarc
+        const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;
+
+        doRollover = (proposedWarcSize >= this.params.rolloverSize);
+      }
+
+      if (doRollover) {
+        // If adding the current warc to the existing combined file creates a file larger than the rollover size do the following:
+        // 1. increment the combinedWarcNumber
+        // 2. create the name of the new combinedWarcFile
+        // 3. Write the header out to the new file
+        // 4. Write out the current warc data to the combinedFile
+        combinedWarcNumber = combinedWarcNumber + 1;

+        const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc`;
+
+        // write combined warcs to root collection dir as they're output of a collection (like wacz)
+        combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
+
+        generatedCombinedWarcs.push(combinedWarcName);
+
+        const warcBuffer = await this.createWARCInfo(combinedWarcName);
+        fs.writeFileSync(combinedWarcFullPath, warcBuffer);
+      }
+
+      fs.appendFileSync(combinedWarcFullPath, fs.readFileSync(fileSizeObjects[j].fileName));
+    }
+
+    console.log(`Combined warcs saved as: ${generatedCombinedWarcs}`);
+  }
 }

 module.exports.Crawler = Crawler;
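In effect, combineWARC does first-fit packing over a largest-first ordering: each source WARC is appended to the current combined WARC unless that would reach --rolloverSize, in which case a new combined file (prefixed with a fresh warcinfo record) is started. A sketch of just that packing decision, with hypothetical names (warcinfo bytes ignored):

    // Return the total payload size of each combined WARC the algorithm
    // above would produce (a sketch, not the shipped code).
    function planCombines(sizes, rolloverSize) {
      const sorted = sizes.slice().sort((a, b) => b - a); // largest first
      const bins = [];
      for (const size of sorted) {
        const last = bins.length - 1;
        if (last < 0 || bins[last] + size >= rolloverSize) {
          bins.push(size);       // rollover: start a new combined WARC
        } else {
          bins[last] += size;    // append to the current combined WARC
        }
      }
      return bins;
    }

Two small observations: a single source WARC larger than the rollover size still gets its own combined file (it is appended after the rollover check), and the sort call sits inside the accumulation loop, re-sorting the list once per file; sorting once after the loop would behave identically.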

package.json

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
     "eslint": "^7.20.0",
     "eslint-plugin-react": "^7.22.0",
     "jest": "^26.6.3",
-    "md5": "^2.3.0"
+    "md5": "^2.3.0",
+    "warcio": "^1.4.2"
   }
 }

tests/combine_warc.test.js

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+const fs = require("fs");
+
+test("check that a combined warc file exists in the archive folder", () => {
+  const warcLists = fs.readdirSync("crawls/collections/wr-net");
+  var captureFound = 0;
+
+  for (var i = 0; i < warcLists.length; i++) {
+    if (warcLists[i].endsWith("_0.warc")){
+      captureFound = 1;
+    }
+  }
+  expect(captureFound).toEqual(1);
+});

tests/rollover.test.js

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+const fs = require("fs");
+const path = require("path");
+
+function getFileSize(filename) {
+  var stats = fs.statSync(filename);
+  return stats.size;
+}
+
+test("check that a combined warc file is under the rolloverSize", () => {
+  const warcLists = fs.readdirSync(path.join("crawls/collections/wr-net/wacz", "archive"));
+  var rolloverSize = 0;
+
+  for (var i = 0; i < warcLists.length; i++) {
+    var size = getFileSize(path.join("crawls/collections/wr-net/wacz/archive/", warcLists[i]));
+    if (size < 10000){
+      rolloverSize = 1;
+    }
+  }
+  expect(rolloverSize).toEqual(1);
+});
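Both new tests assume a crawl has already populated crawls/collections/wr-net using the flags from the CI workflow (--combineWARC --rolloverSize 10000); they then run through jest in the usual way, for example:

    npx jest tests/combine_warc.test.js tests/rollover.test.js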

tests/text.test.js

Lines changed: 9 additions & 9 deletions
@@ -1,19 +1,19 @@
 const fs = require("fs");
-const md5 = require('md5');
+const md5 = require("md5");


-test('check that the pages.jsonl file exists in the collection under the pages folder', () => {
-  expect(fs.existsSync('crawls/collections/wr-net/pages/pages.jsonl')).toBe(true);
+test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
+  expect(fs.existsSync("crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
 });

-test('check that the pages.jsonl file exists in the wacz under the pages folder', () => {
-  expect(fs.existsSync('crawls/collections/wr-net/wacz/pages/pages.jsonl')).toBe(true);
+test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
+  expect(fs.existsSync("crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
 });

-test('check that the hash in the pages folder and in the unzipped wacz folders match', () => {
-  const crawl_hash = md5(JSON.parse(fs.readFileSync('crawls/collections/wr-net/wacz/pages/pages.jsonl', 'utf8').split('\n')[1])['text']);
-  const wacz_hash = md5(JSON.parse(fs.readFileSync('crawls/collections/wr-net/pages/pages.jsonl', 'utf8').split('\n')[1])['text']);
-  const fixture_hash = md5(JSON.parse(fs.readFileSync('tests/fixtures/pages.jsonl', 'utf8').split('\n')[1])['text']);
+test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
+  const crawl_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
+  const wacz_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
+  const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);

   expect(wacz_hash).toEqual(fixture_hash);
   expect(wacz_hash).toEqual(crawl_hash);
