@@ -7,6 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");
+const warcio = require("warcio");
 
 const TextExtract = require("./textextract");
 const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
@@ -104,7 +105,7 @@ class Crawler {
   }
 
   bootstrap() {
-    let opts = {}
+    let opts = {};
     if (this.params.logging.includes("pywb")) {
       opts = {stdio: "inherit", cwd: this.params.cwd};
     }
@@ -120,8 +121,8 @@
 
     child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
 
-    opts.env = {...process.env, COLL: this.params.collection};
-
+    opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
+
     child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts);
 
     if (!this.params.headless) {
@@ -212,6 +213,19 @@ class Crawler {
         default: false,
       },
 
+      "combineWARC": {
+        alias: ["combinewarc", "combineWarc"],
+        describe: "If set, combine the warcs",
+        type: "boolean",
+        default: false,
+      },
+
+      "rolloverSize": {
+        describe: "If set, declare the rollover size",
+        default: 1000000000,
+        type: "number",
+      },
+
       "generateWACZ": {
         alias: ["generatewacz", "generateWacz"],
         describe: "If set, generate wacz",
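For context (not part of the patch): a minimal sketch of how the two new options behave once parsed, assuming these definitions are handed to yargs like the crawler's other CLI flags; the sample argv values are invented.

// Illustration only: parse a sample argv against the new option definitions with yargs.
const yargs = require("yargs");

const argv = yargs.options({
  "combineWARC": {alias: ["combinewarc", "combineWarc"], type: "boolean", default: false},
  "rolloverSize": {describe: "If set, declare the rollover size", type: "number", default: 1000000000},
}).parse(["--combineWARC", "--rolloverSize", "500000000"]);

console.log(argv.combineWARC);  // true
console.log(argv.rolloverSize); // 500000000 (defaults to 1000000000 when the flag is omitted)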
@@ -426,15 +440,15 @@
     if (this.behaviorOpts) {
       await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
         switch (type) {
-          case "info":
-            console.log(JSON.stringify(data));
-            break;
-
-          case "debug":
-          default:
-            if (this.behaviorsLogDebug) {
-              console.log("behavior debug: " + JSON.stringify(data));
-            }
+        case "info":
+          console.log(JSON.stringify(data));
+          break;
+
+        case "debug":
+        default:
+          if (this.behaviorsLogDebug) {
+            console.log("behavior debug: " + JSON.stringify(data));
+          }
         }
       });
 
@@ -448,7 +462,7 @@
 
 
     const title = await page.title();
-    let text = '';
+    let text = "";
     if (this.params.text) {
       const client = await page.target().createCDPSession();
       const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
@@ -467,6 +481,28 @@
       console.warn(e);
     }
   }
+
+  async createWARCInfo(filename) {
+    const warcVersion = "WARC/1.1";
+    const type = "warcinfo";
+    const packageFileJSON = JSON.parse(fs.readFileSync("../app/package.json"));
+    const pywb_version = fs.readFileSync("/usr/local/lib/python3.8/site-packages/pywb/version.py", "utf8").split("\n")[0].split("=")[1].trim().replace(/['"]+/g, "");
+    const warcioPackageJson = JSON.parse(fs.readFileSync("/app/node_modules/warcio/package.json"));
+
+    const info = {
493+ "software" : `Browsertrix-Crawler ${ packageFileJSON [ "version" ] } (with warcio.js ${ warcioPackageJson } pywb ${ pywb_version } )` ,
494+ "format" : "WARC File Format 1.1"
495+ } ;
496+
497+ const record = await warcio . WARCRecord . createWARCInfo ( { filename, type, warcVersion} , info ) ;
498+ const buffer = await warcio . WARCSerializer . serialize ( record , { gzip : true } ) ;
499+ return buffer ;
500+ }
501+
502+ getFileSize ( filename ) {
503+ var stats = fs . statSync ( filename ) ;
504+ return stats . size ;
505+ }
470506
471507 async crawl ( ) {
472508 try {
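Aside (not from the patch): because createWARCInfo() serializes with gzip: true, the returned buffer is a self-contained gzip member, which is why it can later be concatenated with the crawler's gzipped WARC data. A rough sanity-check sketch using the same warcio calls shown above plus Node's built-in zlib; the filename and info values here are placeholders.

const warcio = require("warcio");
const zlib = require("zlib");

async function inspectWarcinfo() {
  // Same calls as createWARCInfo() above, with placeholder metadata
  const record = await warcio.WARCRecord.createWARCInfo(
    {filename: "example_0.warc", type: "warcinfo", warcVersion: "WARC/1.1"},
    {"software": "Browsertrix-Crawler (example)", "format": "WARC File Format 1.1"}
  );
  const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});

  // The buffer is one gzip member; decompressing it reveals the WARC headers and body
  console.log(zlib.gunzipSync(Buffer.from(buffer)).toString("utf8"));
}

inspectWarcinfo();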
@@ -505,6 +541,10 @@
       // extra wait for all resources to land into WARCs
       console.log("Waiting 5s to ensure WARCs are finished");
       await this.sleep(5000);
+
+      if (this.params.combineWARC) {
+        await this.combineWARC();
+      }
 
       if (this.params.generateCDX) {
         console.log("Generate CDX");
@@ -594,16 +634,16 @@
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
         if (this.params.text) {
           console.log("creating pages with full text");
-          header["hasText"] = true
+          header["hasText"] = true;
         }
         else {
           console.log("creating pages without full text");
-          header["hasText"] = false
+          header["hasText"] = false;
         }
-        const header_formatted = JSON.stringify(header).concat("\n")
+        const header_formatted = JSON.stringify(header).concat("\n");
         fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch (err) {
@@ -616,7 +656,7 @@
     const row = {"id": id, "url": url, "title": title};
 
     if (text == true) {
-      row['text'] = text_content
+      row["text"] = text_content;
     }
 
     const processedRow = JSON.stringify(row).concat("\n");
@@ -746,6 +786,76 @@
       console.log(e);
     }
   }
+
+  async combineWARC() {
+    console.log("Combining the warcs");
+
+    // Get the list of created Warcs
+    const warcLists = fs.readdirSync(path.join(this.collDir, "archive"));
+
+    const fileSizeObjects = []; // Used to sort the created warc by fileSize
+
+    // Go through a list of the created warcs and create an array sorted by their filesize, with the largest file first.
+    for (let i = 0; i < warcLists.length; i++) {
+      let fileName = path.join(this.collDir, "archive", warcLists[i]);
+      let fileSize = this.getFileSize(fileName);
+      fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
+      fileSizeObjects.sort(function(a, b) {
+        return b.fileSize - a.fileSize;
+      });
+    }
+
+    const generatedCombinedWarcs = [];
+
+    // Used to name combined warcs, default to -1 for first increment
+    let combinedWarcNumber = -1;
+
+    // write combine WARC to collection root
+    let combinedWarcFullPath = "";
+
+    // Iterate through the sorted file size array.
+    for (let j = 0; j < fileSizeObjects.length; j++) {
+
+      // if need to rollover to new warc
+      let doRollover = false;
+
+      // set to true for first warc
+      if (combinedWarcNumber < 0) {
+        doRollover = true;
+      } else {
+        // Check the size of the existing combined warc.
+        const currentCombinedWarcSize = this.getFileSize(combinedWarcFullPath);
+
+        // If adding the current warc to the existing combined file creates a file smaller than the rollover size, add the data to the combinedWarc
+        const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;
+
+        doRollover = (proposedWarcSize >= this.params.rolloverSize);
+      }
+
+      if (doRollover) {
+        // If adding the current warc to the existing combined file creates a file larger than the rollover size, do the following:
+        // 1. increment the combinedWarcNumber
+        // 2. create the name of the new combinedWarcFile
+        // 3. Write the header out to the new file
+        // 4. Write out the current warc data to the combinedFile
+        combinedWarcNumber = combinedWarcNumber + 1;
+
+        const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc`;
+
+        // write combined warcs to root collection dir as they're output of a collection (like wacz)
+        combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
+
+        generatedCombinedWarcs.push(combinedWarcName);
+
+        const warcBuffer = await this.createWARCInfo(combinedWarcName);
+        fs.writeFileSync(combinedWarcFullPath, warcBuffer);
+      }
+
+      fs.appendFileSync(combinedWarcFullPath, fs.readFileSync(fileSizeObjects[j].fileName));
+    }
+
+    console.log(`Combined warcs saved as: ${generatedCombinedWarcs}`);
+  }
 }
 
 module.exports.Crawler = Crawler;
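A worked example (not part of the commit) of how the rollover grouping above plays out, with invented file sizes and the default rolloverSize; it mirrors only the doRollover bookkeeping, not the actual file I/O.

// Group made-up WARC sizes (bytes) the same way combineWARC() decides when to roll over.
const rolloverSize = 1000000000; // default from the new CLI option
const sizes = [600e6, 500e6, 400e6].sort((a, b) => b - a); // largest first, as in combineWARC()

const groups = [];
let currentSize = 0;

for (const size of sizes) {
  // Roll over for the first file, or when adding this file would reach the rollover size
  if (groups.length === 0 || currentSize + size >= rolloverSize) {
    groups.push([]); // corresponds to starting e.g. mycoll_0.warc, mycoll_1.warc, ...
    currentSize = 0;
  }
  groups[groups.length - 1].push(size);
  currentSize += size;
}

console.log(groups); // [ [ 600000000 ], [ 500000000, 400000000 ] ]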