Commit b9980a9

Support cleaning up extra backup sites
This adds a new setting, `extra_backup_sites_prefixes`, which, when set, makes the cleanup process look into those extra sites as well. This is particularly useful when old backups remain from a previous backup site after a PG major version upgrade.
1 parent a1da446 commit b9980a9

4 files changed: +243 −48 lines

.pylintrc

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ disable=
 
 [FORMAT]
 max-line-length=125
-max-module-lines=1100
+max-module-lines=1200
 
 [REPORTS]
 output-format=text

README.rst

Lines changed: 5 additions & 0 deletions

@@ -723,6 +723,11 @@ How many threads to use for tar, compress and encrypt tasks. Only applies for
 this, with higher thread count speed improvement is negligible and CPU time is
 lost switching between threads.
 
+``extra_backup_sites_prefixes`` (default undefined)
+
+Prefixes for extra backup sites to look into during cleanup. This can be useful
+if you want to remove old backups from previous backup sites after a PG upgrade.
+
 ``encryption_key_id`` (no default)
 
 Specifies the encryption key used when storing encrypted backups. If this
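
To make the new setting concrete, here is a minimal sketch of what a parsed pghoard configuration using it might look like. The site name and prefix values are hypothetical; only the `extra_backup_sites_prefixes` key itself comes from this commit:

    # Minimal sketch of a parsed pghoard configuration (normally loaded from a
    # JSON config file). All names and prefixes below are hypothetical.
    config = {
        "backup_sites": {
            "example-site": {
                "active": True,
                "prefix": "pghoard/example-site-pg15",  # current backup site
            },
        },
        # Cleanup also scans these prefixes for leftover backups, e.g. from
        # sites used before a PG major version upgrade.
        "extra_backup_sites_prefixes": [
            "pghoard/example-site-pg13",
            "pghoard/example-site-pg14",
        ],
    }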

pghoard/pghoard.py

Lines changed: 70 additions & 27 deletions
@@ -323,7 +323,7 @@ def create_backup_site_paths(self, site: str) -> BackupSitePaths:
 
         return backup_site_paths
 
-    def delete_remote_wal_before(self, wal_segment, site, pg_version):
+    def delete_remote_wal_before(self, wal_segment, site, site_prefix, pg_version):
         self.log.info("Starting WAL deletion from: %r before: %r, pg_version: %r", site, wal_segment, pg_version)
         storage = self.site_transfers.get(site)
         valid_timeline = True
@@ -334,7 +334,7 @@ def delete_remote_wal_before(self, wal_segment, site, pg_version):
                 lsn = lsn.previous_walfile_start_lsn
                 if lsn is None:
                     break
-            wal_path = os.path.join(self._get_site_prefix(site), "xlog", lsn.walfile_name)
+            wal_path = os.path.join(site_prefix, "xlog", lsn.walfile_name)
             self.log.debug("Deleting wal_file: %r", wal_path)
             try:
                 storage.delete_key(wal_path)
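
The point of threading `site_prefix` through `delete_remote_wal_before` is that WAL deletion paths now follow the backup's own site rather than always the current one. A toy illustration of the path construction, with hypothetical values:

    import os

    # Hypothetical prefix of an old backup site and a WAL segment name.
    site_prefix = "pghoard/example-site-pg13"
    walfile_name = "000000010000000000000002"

    # Same join as in delete_remote_wal_before: the caller-supplied prefix
    # decides which site's "xlog" directory the key is deleted from.
    wal_path = os.path.join(site_prefix, "xlog", walfile_name)
    print(wal_path)  # pghoard/example-site-pg13/xlog/000000010000000000000002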
@@ -360,8 +360,8 @@ def delete_remote_wal_before(self, wal_segment, site, pg_version):
     def _get_delta_basebackup_files(self, site, storage, metadata, basebackup_name_to_delete, backups_to_keep) -> List:
         delta_formats = (BaseBackupFormat.delta_v1, BaseBackupFormat.delta_v2)
         assert metadata["format"] in delta_formats
-        all_hexdigests = set()
-        keep_hexdigests = set()
+        all_hexdigests: Dict[str, str] = {}
+        keep_hexdigests: Dict[str, str] = {}
 
         basebackup_data_files = list()
         delta_backup_names = {
@@ -372,7 +372,7 @@ def _get_delta_basebackup_files(self, site, storage, metadata, basebackup_name_t
         delta_backup_names[basebackup_name_to_delete] = metadata
 
         for backup_name, backup_metadata in delta_backup_names.items():
-            delta_backup_key = os.path.join(self._get_site_prefix(site), "basebackup", backup_name)
+            delta_backup_key = os.path.join(backup_metadata["site-prefix"], "basebackup", backup_name)
             meta, _ = download_backup_meta_file(
                 storage=storage,
                 basebackup_path=delta_backup_key,
@@ -385,18 +385,22 @@ def _get_delta_basebackup_files(self, site, storage, metadata, basebackup_name_t
             backup_state = snapshot_result["state"]
             files = backup_state["files"]
 
-            backup_hexdigests = set(delta_file["hexdigest"] for delta_file in files if delta_file["hexdigest"])
-            all_hexdigests |= backup_hexdigests
+            backup_hexdigests = {
+                delta_file["hexdigest"]: backup_metadata["site-prefix"]
+                for delta_file in files
+                if delta_file["hexdigest"]
+            }
+            all_hexdigests.update(backup_hexdigests)
 
             if backup_name != basebackup_name_to_delete:
                 # Keep data file in case there is still a reference from other backups
-                keep_hexdigests |= backup_hexdigests
+                keep_hexdigests.update(backup_hexdigests)
             else:
                 # Add bundles to remove
                 for chunk in meta.get("chunks", []):
                     basebackup_data_files.append(
                         os.path.join(
-                            self._get_site_prefix(site),
+                            backup_metadata["site-prefix"],
                             FileTypePrefixes[FileType.Basebackup_delta_chunk],
                             chunk["chunk_filename"],
                         )
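
Turning `all_hexdigests` and `keep_hexdigests` into dicts records, for each delta-file hexdigest, which site prefix stores it, so unreferenced files can be deleted from the right site in the next hunk. A self-contained sketch of that bookkeeping, with invented digests and prefixes:

    import os
    from typing import Dict

    # Invented digests mapped to the (invented) site prefixes storing them.
    all_hexdigests: Dict[str, str] = {
        "aaa111": "pghoard/example-site-pg13",
        "bbb222": "pghoard/example-site-pg15",
    }
    # Digests still referenced by backups that are being kept.
    keep_hexdigests: Dict[str, str] = {"bbb222": "pghoard/example-site-pg15"}

    # Unreferenced digests are removed from the site they actually live in.
    for hexdigest in set(all_hexdigests).difference(keep_hexdigests):
        print(os.path.join(all_hexdigests[hexdigest], "basebackup_delta", hexdigest))
    # -> pghoard/example-site-pg13/basebackup_delta/aaa111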
@@ -405,16 +405,21 @@ def _get_delta_basebackup_files(self, site, storage, metadata, basebackup_name_t
         # Remove unreferenced files
         extra_hexdigests = set(all_hexdigests).difference(keep_hexdigests)
         for hexdigest in extra_hexdigests:
-            basebackup_data_files.append(os.path.join(self._get_site_prefix(site), "basebackup_delta", hexdigest))
+            basebackup_data_files.append(os.path.join(all_hexdigests[hexdigest], "basebackup_delta", hexdigest))
 
         return basebackup_data_files
 
     def delete_remote_basebackup(self, site, basebackup, metadata, basebackups):
         start_time = time.monotonic()
         storage = self.site_transfers.get(site)
-        main_backup_key = os.path.join(self._get_site_prefix(site), "basebackup", basebackup)
+        main_backup_key = os.path.join(metadata["site-prefix"], "basebackup", basebackup)
         basebackup_data_files = [main_backup_key]
 
+        # When we delete a basebackup, let's also delete any WAL segments before its `start-wal-segment`.
+        pg_version_str = metadata.get("pg-version")
+        pg_version = None if pg_version_str is None else int(pg_version_str)
+        self.delete_remote_wal_before(metadata["start-wal-segment"], site, metadata["site-prefix"], pg_version)
+
         if metadata.get("format") == BaseBackupFormat.v2:
             bmeta_compressed = storage.get_contents_to_string(main_backup_key)[0]
             with rohmufile.file_reader(
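
`delete_remote_basebackup` now takes everything site-specific from the backup's own metadata, whose `site-prefix` key is stamped on by `refresh_backup_list_and_delete_old` further down. A sketch of the metadata shape this code expects; the concrete values are invented:

    # Invented example of the metadata dict consumed by delete_remote_basebackup.
    # "site-prefix" is injected during the backup list refresh; the other keys
    # come from the stored backup metadata.
    metadata = {
        "site-prefix": "pghoard/example-site-pg13",
        "start-wal-segment": "000000010000000000000002",
        "pg-version": "130011",
    }

    pg_version_str = metadata.get("pg-version")
    pg_version = None if pg_version_str is None else int(pg_version_str)  # 130011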
@@ -427,7 +436,7 @@ def delete_remote_basebackup(self, site, basebackup, metadata, basebackups):
             for chunk in bmeta["chunks"]:
                 basebackup_data_files.append(
                     os.path.join(
-                        self._get_site_prefix(site),
+                        metadata["site-prefix"],
                         "basebackup_chunk",
                         chunk["chunk_filename"],
                     )
@@ -457,14 +466,15 @@ def get_or_create_site_storage(self, site):
         self.site_transfers[site] = storage
         return storage
 
-    def get_remote_basebackups_info(self, site):
+    def get_remote_basebackups_info(self, site, site_prefix=None):
         storage = self.get_or_create_site_storage(site=site)
         site_config = self.config["backup_sites"][site]
-        results = storage.list_path(os.path.join(site_config["prefix"], "basebackup"))
+        site_prefix = site_prefix or site_config["prefix"]
+        results = storage.list_path(os.path.join(site_prefix, "basebackup"))
         for entry in results:
             self.patch_basebackup_info(entry=entry, site_config=site_config)
 
-        preservation_requests = storage.list_path(os.path.join(site_config["prefix"], "preservation_request"))
+        preservation_requests = storage.list_path(os.path.join(site_prefix, "preservation_request"))
         backups_to_preserve = parse_preservation_requests(preservation_requests)
         for entry in results:
             patch_basebackup_metadata_with_preservation(entry, backups_to_preserve)
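
With the optional `site_prefix` argument, the same listing code can be pointed at an old site's prefix while reusing the current site's storage connection and configuration. Hypothetical call sites, assuming a running PGHoard instance named `pghoard`:

    # Default call: lists basebackups under the current site's own prefix.
    current = pghoard.get_remote_basebackups_info("example-site")

    # Explicit prefix: lists basebackups left behind under an old site's prefix.
    old = pghoard.get_remote_basebackups_info(
        "example-site", site_prefix="pghoard/example-site-pg13"
    )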
@@ -517,7 +527,7 @@ def determine_backups_to_delete(self, *, basebackups, site_config):
         if max_age_days and min_backups > 0:
             while basebackups and len(basebackups) > min_backups:
                 if is_basebackup_preserved(basebackups[0], now):
-                    self.log.info("Not deleting more backups because %r still needs to preserved", basebackups[0]["name"])
+                    self.log.info("Not deleting more backups because %r still needs to be preserved", basebackups[0]["name"])
                     break
                 # For age checks we treat the age as current_time - (backup_start_time + backup_interval). So when
                 # backup interval is set to 24 hours a backup started 2.5 days ago would be considered to be 1.5 days old.
@@ -539,31 +549,62 @@ def determine_backups_to_delete(self, *, basebackups, site_config):
 
     def refresh_backup_list_and_delete_old(self, site):
         """Look up basebackups from the object store, prune any extra
-        backups and return the datetime of the latest backup."""
-        basebackups = self.get_remote_basebackups_info(site)
-        self.log.debug("Found %r basebackups", basebackups)
+        backups from the current and the extra backup sites and update
+        the state with the up-to-date backups list."""
+        current_basebackups = self.get_remote_basebackups_info(site)
+        current_site_prefix = self._get_site_prefix(site)
+        for basebackup in current_basebackups:
+            basebackup["metadata"]["site-prefix"] = current_site_prefix
+
+        # If `extra_backup_sites_prefixes` is set, let's also check those sites for backups that are due for cleanup.
+        extra_basebackups = []
+        if self.config.get("extra_backup_sites_prefixes"):
+            new_extra_backup_sites_prefixes = []
+            extra_backup_sites_prefixes = self.state["extra_backup_sites_prefixes"]
+
+            for site_prefix in extra_backup_sites_prefixes:
+                extra_site_basebackups = self.get_remote_basebackups_info(site, site_prefix=site_prefix)
+                # If we did not find any backup in this extra site, we do not include it in the next round of checks.
+                if not extra_site_basebackups:
+                    continue
+                for basebackup in extra_site_basebackups:
+                    basebackup["metadata"]["site-prefix"] = site_prefix
+                extra_basebackups.extend(extra_site_basebackups)
+                new_extra_backup_sites_prefixes.append(site_prefix)
+
+            self.state["extra_backup_sites_prefixes"] = new_extra_backup_sites_prefixes
+
+        extra_basebackups.sort(key=lambda entry: entry["metadata"]["start-time"])
+        all_basebackups = extra_basebackups + current_basebackups
+
+        self.log.debug("Found %r basebackups", all_basebackups)
 
         site_config = self.config["backup_sites"][site]
         # Never delete backups from a recovery site. This check is already done elsewhere as well,
         # but still check explicitly here to ensure we certainly won't delete anything unexpectedly.
         if site_config["active"]:
-            basebackups_to_delete = self.determine_backups_to_delete(basebackups=basebackups, site_config=site_config)
+            basebackups_to_delete = self.determine_backups_to_delete(basebackups=all_basebackups, site_config=site_config)
 
             for basebackup_to_be_deleted in basebackups_to_delete:
                 pg_version_str = basebackup_to_be_deleted["metadata"].get("pg-version")
                 pg_version = None if pg_version_str is None else int(pg_version_str)
-                last_wal_segment_still_needed = 0
-                if basebackups:
-                    last_wal_segment_still_needed = basebackups[0]["metadata"]["start-wal-segment"]
+                oldest_wal_segment_to_keep = ""
+                if all_basebackups:
+                    oldest_wal_segment_to_keep = all_basebackups[0]["metadata"]["start-wal-segment"]
 
-                if last_wal_segment_still_needed:
+                if oldest_wal_segment_to_keep:
                     # Deleting WAL here would break concurrent PITR starting from the *previous* backup.
                     # That's why once a backup is preserved, we keep that backup and all the next ones.
-                    self.delete_remote_wal_before(last_wal_segment_still_needed, site, pg_version)
+                    self.delete_remote_wal_before(
+                        oldest_wal_segment_to_keep, site, basebackup_to_be_deleted["metadata"]["site-prefix"], pg_version
+                    )
                 self.delete_remote_basebackup(
-                    site, basebackup_to_be_deleted["name"], basebackup_to_be_deleted["metadata"], basebackups=basebackups
+                    site,
+                    basebackup_to_be_deleted["name"],
+                    basebackup_to_be_deleted["metadata"],
+                    basebackups=all_basebackups
                 )
-        self.state["backup_sites"][site]["basebackups"] = basebackups
+        self.state["backup_sites"][site]["basebackups"] = current_basebackups
 
     def get_normalized_backup_time(self, site_config, *, now=None):
         """Returns the closest historical backup time that current time matches to (or current time if it matches).
@@ -589,6 +630,8 @@ def get_normalized_backup_time(self, site_config, *, now=None):
     def set_state_defaults(self, site):
         if site not in self.state["backup_sites"]:
             self.state["backup_sites"][site] = {"basebackups": []}
+        if "extra_backup_sites_prefixes" not in self.state:
+            self.state["extra_backup_sites_prefixes"] = self.config.get("extra_backup_sites_prefixes", [])
 
     def startup_walk_for_missed_files(self):
         """Check xlog and xlog_incoming directories for files that receivexlog has received but not yet
