Skip to content

Commit 22f20e6

Browse files
committed
support linking files in place instead of copying
This allows users to manage collections of large WARC files without duplicating disk space. Hardlinks are used instead of symlinks to preserve the semantics of the original mechanism, where the file is copied (so the source can be safely removed afterwards). If we used symlinks, we would break that expectation, which could lead to data loss. Conversely, hardlinks can lead to data loss as well: for example, if pywb somehow edited the file, the original would be modified too. We assume here that pywb does not modify the file, and each side of the hardlink can have its own permissions to ensure this (or not) as well. Closes: webrecorder#408
1 parent 1b151b7 commit 22f20e6

File tree

1 file changed

+13
-4
lines changed

1 file changed

+13
-4
lines changed

pywb/manager/manager.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,17 +108,24 @@ def _assert_coll_exists(self):
108108
'To create a new collection, run\n\n{1} init {0}')
109109
raise IOError(msg.format(self.coll_name, sys.argv[0]))
110110

111-
def add_warcs(self, warcs):
111+
def add_warcs(self, warcs, hardlink=False):
112112
if not os.path.isdir(self.archive_dir):
113113
raise IOError('Directory {0} does not exist'.
114114
format(self.archive_dir))
115115

116116
full_paths = []
117117
for filename in warcs:
118118
filename = os.path.abspath(filename)
119-
shutil.copy2(filename, self.archive_dir)
119+
if hardlink:
120+
os.link(filename, os.path.join(self.archive_dir,
121+
os.path.basename(filename)))
122+
else:
123+
shutil.copy2(filename, self.archive_dir)
120124
full_paths.append(os.path.join(self.archive_dir, filename))
121-
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
125+
logging.info('%s %s to %s',
126+
hardlink and 'Linked' or 'Copied',
127+
filename,
128+
self.archive_dir)
122129

123130
self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
124131

@@ -357,12 +364,14 @@ def do_list(r):
357364
# Add Warcs
358365
def do_add(r):
359366
m = CollectionsManager(r.coll_name)
360-
m.add_warcs(r.files)
367+
m.add_warcs(r.files, r.hardlink)
361368

362369
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
363370
addwarc = subparsers.add_parser('add', help=addwarc_help)
364371
addwarc.add_argument('coll_name')
365372
addwarc.add_argument('files', nargs='+')
373+
addwarc.add_argument('--hardlink', '-l', action='store_true',
374+
help='hardlink files into storage instead of copying')
366375
addwarc.set_defaults(func=do_add)
367376

368377
# Reindex All

0 commit comments

Comments
 (0)