
Commit 24e2c4d

Emma Dickson and a co-author authored this commit.
Create --combineWARC flag that combines generated WARCs into a single WARC up to rollover size (#33)

* generates combined WARCs in the collection root directory with suffixes `_0.warc`, `_1.warc`, etc.
* each combined WARC is limited to the size given in `--rolloverSize`; if that size is exceeded, a new WARC is created, otherwise data is appended to the previous WARC
* add test for --combineWARC flag
* add improved lint rules

Co-authored-by: Emma Dickson <[email protected]>
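For reference, the CI workflow change in this commit exercises both new flags; an equivalent local run with this repo's docker-compose setup would be:

    docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2

The deliberately small rollover size forces the rollover behavior to be exercised, which the new tests then verify.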
1 parent bc7f1ba commit 24e2c4d

File tree: 11 files changed, +201 −51 lines

.eslintrc.js

Lines changed: 18 additions & 2 deletions
@@ -11,5 +11,21 @@ module.exports = {
     "sourceType": "module"
   },
   "rules": {
-  }
-};
+    "indent": [
+      "error",
+      2
+    ],
+    "linebreak-style": [
+      "error",
+      "unix"
+    ],
+    "quotes": [
+      "error",
+      "double"
+    ],
+    "semi": [
+      "error",
+      "always"
+    ]
+  }
+};
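These four rules enforce 2-space indentation, Unix line endings, double quotes, and mandatory semicolons, which accounts for the mechanical quote-and-semicolon churn in crawler.js and the tests below. For instance, under these rules:

    const md5 = require('md5')    // errors: quotes, semi
    const md5 = require("md5");   // clean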

.eslintrc.yml

Lines changed: 0 additions & 15 deletions
This file was deleted.

.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ jobs:
       - name: build docker
         run: docker-compose build
       - name: run crawl
-        run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --workers 2
+        run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2
       - name: validate existing wacz
         run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
       - name: unzip wacz

README.md

Lines changed: 4 additions & 0 deletions
@@ -67,6 +67,10 @@ Options:
                         is done  [boolean] [default: false]
   --generateWACZ        If set, generate wacz for use with pywb after crawl
                         is done  [boolean] [default: false]
+  --combineWARC         If set, combine the individual warcs generated into a single warc after crawl
+                        is done  [boolean] [default: false]
+  --rolloverSize        If set, dictates the maximum size that a generated warc and combined warc can be
+                        [number] [default: 1000000000]
   --text                If set, extract the pages full text to be added to the pages.jsonl
                         file  [boolean] [default: false]
   --cwd                 Crawl working directory for captures (pywb root). If not

config.yaml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ recorder:
     dedup_policy: skip
     source_coll: live
     cache: always
+    rollover_size: ${ROLLOVER_SIZE}

 #autoindex: 10

crawler.js

Lines changed: 128 additions & 18 deletions
@@ -7,6 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");
+const warcio = require("warcio");

 const TextExtract = require("./textextract");
 const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
@@ -104,7 +105,7 @@ class Crawler {
   }

   bootstrap() {
-    let opts = {}
+    let opts = {};
     if (this.params.logging.includes("pywb")) {
       opts = {stdio: "inherit", cwd: this.params.cwd};
     }
@@ -120,8 +121,8 @@ class Crawler {

     child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);

-    opts.env = {...process.env, COLL: this.params.collection};
-
+    opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
+
     child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts);

     if (!this.params.headless) {
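Together with the config.yaml change above, this wires the flag through to pywb's recorder: the crawler exports ROLLOVER_SIZE into the uwsgi environment, and pywb substitutes it into rollover_size when loading config.yaml. As an illustration (value hypothetical), a run with --rolloverSize 10000 makes the recorder block effectively read:

    recorder:
        dedup_policy: skip
        source_coll: live
        cache: always
        rollover_size: 10000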
@@ -212,6 +213,19 @@ class Crawler {
         default: false,
       },

+      "combineWARC": {
+        alias: ["combinewarc", "combineWarc"],
+        describe: "If set, combine the warcs",
+        type: "boolean",
+        default: false,
+      },
+
+      "rolloverSize": {
+        describe: "If set, declare the rollover size",
+        default: 1000000000,
+        type: "number",
+      },
+
       "generateWACZ": {
         alias: ["generatewacz", "generateWacz"],
         describe: "If set, generate wacz",
@@ -426,15 +440,15 @@ class Crawler {
     if (this.behaviorOpts) {
       await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
         switch (type) {
-          case "info":
-            console.log(JSON.stringify(data));
-            break;
-
-          case "debug":
-          default:
-            if (this.behaviorsLogDebug) {
-              console.log("behavior debug: " + JSON.stringify(data));
-            }
+        case "info":
+          console.log(JSON.stringify(data));
+          break;
+
+        case "debug":
+        default:
+          if (this.behaviorsLogDebug) {
+            console.log("behavior debug: " + JSON.stringify(data));
+          }
         }
       });

@@ -448,7 +462,7 @@ class Crawler {


     const title = await page.title();
-    let text = '';
+    let text = "";
     if (this.params.text) {
       const client = await page.target().createCDPSession();
       const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
@@ -467,6 +481,28 @@ class Crawler {
       console.warn(e);
     }
   }
+
+  async createWARCInfo(filename) {
+    const warcVersion = "WARC/1.1";
+    const type = "warcinfo";
+    const packageFileJSON = JSON.parse(fs.readFileSync("../app/package.json"));
+    const pywb_version = fs.readFileSync("/usr/local/lib/python3.8/site-packages/pywb/version.py", "utf8").split("\n")[0].split("=")[1].trim().replace(/['"]+/g, "");
+    const warcioPackageJson = JSON.parse(fs.readFileSync("/app/node_modules/warcio/package.json"));
+
+    const info = {
+      "software": `Browsertrix-Crawler ${packageFileJSON["version"]} (with warcio.js ${warcioPackageJson} pywb ${pywb_version})`,
+      "format": "WARC File Format 1.1"
+    };
+
+    const record = await warcio.WARCRecord.createWARCInfo({filename, type, warcVersion}, info);
+    const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});
+    return buffer;
+  }
+
+  getFileSize(filename) {
+    var stats = fs.statSync(filename);
+    return stats.size;
+  }

   async crawl() {
     try {
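For readers unfamiliar with warcio.js, the new createWARCInfo helper builds the gzipped warcinfo record that is written at the head of each combined WARC. A minimal standalone sketch of the same two warcio calls, with a hypothetical filename and abbreviated info fields:

    const { WARCRecord, WARCSerializer } = require("warcio");

    // Build a gzipped warcinfo record for a (hypothetical) combined WARC file
    async function warcinfoBuffer() {
      const record = await WARCRecord.createWARCInfo(
        {filename: "my-coll_0.warc", type: "warcinfo", warcVersion: "WARC/1.1"},
        {"software": "Browsertrix-Crawler", "format": "WARC File Format 1.1"}
      );
      // Serialize to bytes ready to be written at the start of the file
      return await WARCSerializer.serialize(record, {gzip: true});
    }

One quirk worth noting: the software string in the diff interpolates the parsed warcioPackageJson object itself rather than its version field, so the warcio.js version will render as "[object Object]".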
@@ -505,6 +541,10 @@ class Crawler {
       // extra wait for all resources to land into WARCs
       console.log("Waiting 5s to ensure WARCs are finished");
       await this.sleep(5000);
+
+      if (this.params.combineWARC) {
+        await this.combineWARC();
+      }

       if (this.params.generateCDX) {
         console.log("Generate CDX");
@@ -594,16 +634,16 @@ class Crawler {
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
         if (this.params.text) {
           console.log("creating pages with full text");
-          header["hasText"] = true
+          header["hasText"] = true;
         }
         else{
           console.log("creating pages without full text");
-          header["hasText"] = false
+          header["hasText"] = false;
         }
-        const header_formatted = JSON.stringify(header).concat("\n")
+        const header_formatted = JSON.stringify(header).concat("\n");
         fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch(err) {
@@ -616,7 +656,7 @@ class Crawler {
     const row = {"id": id, "url": url, "title": title};

     if (text == true){
-      row['text'] = text_content
+      row["text"] = text_content;
     }

     const processedRow = JSON.stringify(row).concat("\n");
@@ -746,6 +786,76 @@ class Crawler {
       console.log(e);
     }
   }
+
+  async combineWARC() {
+    console.log("Combining the warcs");
+
+    // Get the list of created Warcs
+    const warcLists = fs.readdirSync(path.join(this.collDir, "archive"));
+
+    const fileSizeObjects = []; // Used to sort the created warc by fileSize
+
+    // Go through a list of the created warcs and create an array sorted by their filesize with the largest file first.
+    for (let i = 0; i < warcLists.length; i++) {
+      let fileName = path.join(this.collDir, "archive", warcLists[i]);
+      let fileSize = this.getFileSize(fileName);
+      fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
+      fileSizeObjects.sort(function(a, b){
+        return b.fileSize - a.fileSize;
+      });
+    }
+
+    const generatedCombinedWarcs = [];
+
+    // Used to name combined warcs, default to -1 for first increment
+    let combinedWarcNumber = -1;
+
+    // write combine WARC to collection root
+    let combinedWarcFullPath = "";
+
+    // Iterate through the sorted file size array.
+    for (let j = 0; j < fileSizeObjects.length; j++) {
+
+      // if need to rollover to new warc
+      let doRollover = false;
+
+      // set to true for first warc
+      if (combinedWarcNumber < 0) {
+        doRollover = true;
+      } else {
+        // Check the size of the existing combined warc.
+        const currentCombinedWarcSize = this.getFileSize(combinedWarcFullPath);
+
+        // If adding the current warc to the existing combined file creates a file smaller than the rollover size add the data to the combinedWarc
+        const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;
+
+        doRollover = (proposedWarcSize >= this.params.rolloverSize);
+      }
+
+      if (doRollover) {
+        // If adding the current warc to the existing combined file creates a file larger than the rollover size do the following:
+        // 1. increment the combinedWarcNumber
+        // 2. create the name of the new combinedWarcFile
+        // 3. Write the header out to the new file
+        // 4. Write out the current warc data to the combinedFile
+        combinedWarcNumber = combinedWarcNumber + 1;

+        const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc`;
+
+        // write combined warcs to root collection dir as they're output of a collection (like wacz)
+        combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
+
+        generatedCombinedWarcs.push(combinedWarcName);
+
+        const warcBuffer = await this.createWARCInfo(combinedWarcName);
+        fs.writeFileSync(combinedWarcFullPath, warcBuffer);
+      }
+
+      fs.appendFileSync(combinedWarcFullPath, fs.readFileSync(fileSizeObjects[j].fileName));
+    }
+
+    console.log(`Combined warcs saved as: ${generatedCombinedWarcs}`);
+  }
 }

 module.exports.Crawler = Crawler;
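In effect, combineWARC does first-fit packing over a largest-first ordering: each source WARC is appended to the current combined WARC unless that would reach --rolloverSize, in which case a new combined file (prefixed with a fresh warcinfo record) is started. A sketch of just that packing decision, with hypothetical names (warcinfo bytes ignored):

    // Return the total payload size of each combined WARC the algorithm
    // above would produce (a sketch, not the shipped code).
    function planCombines(sizes, rolloverSize) {
      const sorted = sizes.slice().sort((a, b) => b - a); // largest first
      const bins = [];
      for (const size of sorted) {
        const last = bins.length - 1;
        if (last < 0 || bins[last] + size >= rolloverSize) {
          bins.push(size);       // rollover: start a new combined WARC
        } else {
          bins[last] += size;    // append to the current combined WARC
        }
      }
      return bins;
    }

Two small observations: a single source WARC larger than the rollover size still gets its own combined file (it is appended after the rollover check), and the sort call sits inside the accumulation loop, re-sorting the list once per file; sorting once after the loop would behave identically.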

package.json

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
     "eslint": "^7.20.0",
     "eslint-plugin-react": "^7.22.0",
     "jest": "^26.6.3",
-    "md5": "^2.3.0"
+    "md5": "^2.3.0",
+    "warcio": "^1.4.2"
   }
 }

tests/combine_warc.test.js

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+const fs = require("fs");
+
+test("check that a combined warc file exists in the archive folder", () => {
+  const warcLists = fs.readdirSync("crawls/collections/wr-net");
+  var captureFound = 0;
+
+  for (var i = 0; i < warcLists.length; i++) {
+    if (warcLists[i].endsWith("_0.warc")){
+      captureFound = 1;
+    }
+  }
+  expect(captureFound).toEqual(1);
+});

tests/rollover.test.js

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+const fs = require("fs");
+const path = require("path");
+
+function getFileSize(filename) {
+  var stats = fs.statSync(filename);
+  return stats.size;
+}
+
+test("check that a combined warc file is under the rolloverSize", () => {
+  const warcLists = fs.readdirSync(path.join("crawls/collections/wr-net/wacz", "archive"));
+  var rolloverSize = 0;
+
+  for (var i = 0; i < warcLists.length; i++) {
+    var size = getFileSize(path.join("crawls/collections/wr-net/wacz/archive/", warcLists[i]));
+    if (size < 10000){
+      rolloverSize = 1;
+    }
+  }
+  expect(rolloverSize).toEqual(1);
+});
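Both new tests assume a crawl has already populated crawls/collections/wr-net using the flags from the CI workflow (--combineWARC --rolloverSize 10000); they then run through jest in the usual way, for example:

    npx jest tests/combine_warc.test.js tests/rollover.test.js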

tests/text.test.js

Lines changed: 9 additions & 9 deletions
@@ -1,19 +1,19 @@
 const fs = require("fs");
-const md5 = require('md5');
+const md5 = require("md5");


-test('check that the pages.jsonl file exists in the collection under the pages folder', () => {
-  expect(fs.existsSync('crawls/collections/wr-net/pages/pages.jsonl')).toBe(true);
+test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
+  expect(fs.existsSync("crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
 });

-test('check that the pages.jsonl file exists in the wacz under the pages folder', () => {
-  expect(fs.existsSync('crawls/collections/wr-net/wacz/pages/pages.jsonl')).toBe(true);
+test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
+  expect(fs.existsSync("crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
 });

-test('check that the hash in the pages folder and in the unzipped wacz folders match', () => {
-  const crawl_hash = md5(JSON.parse(fs.readFileSync('crawls/collections/wr-net/wacz/pages/pages.jsonl', 'utf8').split('\n')[1])['text']);
-  const wacz_hash = md5(JSON.parse(fs.readFileSync('crawls/collections/wr-net/pages/pages.jsonl', 'utf8').split('\n')[1])['text']);
-  const fixture_hash = md5(JSON.parse(fs.readFileSync('tests/fixtures/pages.jsonl', 'utf8').split('\n')[1])['text']);
+test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
+  const crawl_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
+  const wacz_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
+  const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);

   expect(wacz_hash).toEqual(fixture_hash);
   expect(wacz_hash).toEqual(crawl_hash);
