From a6f3b12b24d0c7bff195d7fa3def26d54e2dcddf Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Wed, 16 Jun 2021 05:59:26 +0300 Subject: [PATCH 1/5] [Issue #313] restore pg_control last --- src/restore.c | 264 +++++++++++++++++++++++++++------------------ src/utils/parray.c | 7 ++ src/utils/parray.h | 1 + 3 files changed, 170 insertions(+), 102 deletions(-) diff --git a/src/restore.c b/src/restore.c index 7f5df1a00..4f90f90b9 100644 --- a/src/restore.c +++ b/src/restore.c @@ -3,7 +3,7 @@ * restore.c: restore DB cluster and archived WAL. * * Portions Copyright (c) 2009-2013, NIPPON TELEGRAPH AND TELEPHONE CORPORATION - * Portions Copyright (c) 2015-2019, Postgres Professional + * Portions Copyright (c) 2015-2021, Postgres Professional * *------------------------------------------------------------------------- */ @@ -61,6 +61,8 @@ static void create_recovery_conf(InstanceState *instanceState, time_t backup_id, pgBackup *backup, pgRestoreParams *params); static void *restore_files(void *arg); +static size_t restore_file(pgFile *dest_file, const char *to_fullpath, bool already_exists, char *out_buf, + pgBackup *dest_backup, parray *parent_chain, bool use_bitmap, IncrRestoreMode incremental_mode, XLogRecPtr shift_lsn); static void set_orphan_status(parray *backups, pgBackup *parent_backup); static void restore_chain(pgBackup *dest_backup, parray *parent_chain, @@ -710,6 +712,8 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, parray *pgdata_files = NULL; parray *dest_files = NULL; parray *external_dirs = NULL; + pgFile *dest_pg_control_file = NULL; + char dest_pg_control_fullpath[MAXPGPATH]; /* arrays with meta info for multi threaded backup */ pthread_t *threads; restore_files_arg *threads_args; @@ -965,6 +969,30 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, dest_bytes = dest_backup->pgdata_bytes; pretty_size(dest_bytes, pretty_dest_bytes, lengthof(pretty_dest_bytes)); + + /* + * [Issue #313] + * find pg_control file (in already sorted earlier dest_files, see parray_qsort(backup->files...)) + * and exclude it from list for future special processing + */ + { + int control_file_elem_index; + pgFile search_key; + MemSet(&search_key, 0, sizeof(pgFile)); + /* pgFileCompareRelPathWithExternal uses only .rel_path and .external_dir_num for comparision */ + search_key.rel_path = XLOG_CONTROL_FILE; + search_key.external_dir_num = 0; + control_file_elem_index = parray_bsearch_index(dest_files, &search_key, pgFileCompareRelPathWithExternal); + if(control_file_elem_index < 0) + elog(ERROR, "\"%s\" not found in backup %s", XLOG_CONTROL_FILE, base36enc(dest_backup->start_time)); + dest_pg_control_file = parray_remove(dest_files, control_file_elem_index); + + join_path_components(dest_pg_control_fullpath, pgdata_path, dest_pg_control_file->rel_path); + /* remove dest control file before restoring */ + if (params->incremental_mode != INCR_NONE) + fio_unlink(dest_pg_control_fullpath, FIO_DB_HOST); + } + elog(INFO, "Start restoring backup files. 
PGDATA size: %s", pretty_dest_bytes); time(&start_time); thread_interrupted = false; @@ -972,30 +1000,30 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, /* Restore files into target directory */ for (i = 0; i < num_threads; i++) { - restore_files_arg *arg = &(threads_args[i]); - - arg->dest_files = dest_files; - arg->pgdata_files = pgdata_files; - arg->dest_backup = dest_backup; - arg->dest_external_dirs = external_dirs; - arg->parent_chain = parent_chain; - arg->dbOid_exclude_list = dbOid_exclude_list; - arg->skip_external_dirs = params->skip_external_dirs; - arg->to_root = pgdata_path; - arg->use_bitmap = use_bitmap; - arg->incremental_mode = params->incremental_mode; - arg->shift_lsn = params->shift_lsn; - threads_args[i].restored_bytes = 0; - /* By default there are some error */ - threads_args[i].ret = 1; + threads_args[i] = (restore_files_arg){ + .dest_files = dest_files, + .pgdata_files = pgdata_files, + .dest_backup = dest_backup, + .dest_external_dirs = external_dirs, + .parent_chain = parent_chain, + .dbOid_exclude_list = dbOid_exclude_list, + .skip_external_dirs = params->skip_external_dirs, + .to_root = pgdata_path, + .use_bitmap = use_bitmap, + .incremental_mode = params->incremental_mode, + .shift_lsn = params->shift_lsn, + .restored_bytes = 0, + /* By default there are some error */ + .ret = 1, + }; /* Useless message TODO: rewrite */ elog(LOG, "Start thread %i", i + 1); - pthread_create(&threads[i], NULL, restore_files, arg); + pthread_create(&threads[i], NULL, restore_files, &(threads_args[i])); } - /* Wait theads */ + /* Wait threads */ for (i = 0; i < num_threads; i++) { pthread_join(threads[i], NULL); @@ -1005,6 +1033,15 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, total_bytes += threads_args[i].restored_bytes; } + /* [Issue #313] copy pg_control at very end */ + if (restore_isok) + { + fio_is_remote(FIO_DB_HOST); /* reopen already closed ssh connection */ + total_bytes += restore_file(dest_pg_control_file, dest_pg_control_fullpath, false, NULL, + dest_backup, parent_chain, use_bitmap, params->incremental_mode, params->shift_lsn); + fio_disconnect(); + } + time(&end_time); pretty_time_interval(difftime(end_time, start_time), pretty_time, lengthof(pretty_time)); @@ -1066,10 +1103,15 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, } /* TODO: write test for case: file to be synced is missing */ + /* MKulagin question: where is fio connection reopened? 
*/ if (fio_sync(to_fullpath, FIO_DB_HOST) != 0) elog(ERROR, "Failed to sync file \"%s\": %s", to_fullpath, strerror(errno)); } + /* sync control file */ + if (fio_sync(dest_pg_control_fullpath, FIO_DB_HOST) != 0) + elog(ERROR, "Failed to sync file \"%s\": %s", dest_pg_control_fullpath, strerror(errno)); + time(&end_time); pretty_time_interval(difftime(end_time, start_time), pretty_time, lengthof(pretty_time)); @@ -1089,6 +1131,8 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, parray_free(pgdata_files); } + pgFileFree(dest_pg_control_file); + for (i = parray_num(parent_chain) - 1; i >= 0; i--) { pgBackup *backup = (pgBackup *) parray_get(parent_chain, i); @@ -1107,7 +1151,6 @@ restore_files(void *arg) int i; uint64 n_files; char to_fullpath[MAXPGPATH]; - FILE *out = NULL; char *out_buf = pgut_malloc(STDIO_BUFSIZE); restore_files_arg *arguments = (restore_files_arg *) arg; @@ -1117,9 +1160,6 @@ restore_files(void *arg) for (i = 0; i < parray_num(arguments->dest_files); i++) { bool already_exists = false; - PageState *checksum_map = NULL; /* it should take ~1.5MB at most */ - datapagemap_t *lsn_map = NULL; /* it should take 16kB at most */ - char *errmsg = NULL; /* remote agent error message */ pgFile *dest_file = (pgFile *) parray_get(arguments->dest_files, i); /* Directories were created before */ @@ -1193,73 +1233,101 @@ restore_files(void *arg) already_exists = true; } - /* - * Handle incremental restore case for data files. - * If file is already exists in pgdata, then - * we scan it block by block and get - * array of checksums for every page. - */ - if (already_exists && - dest_file->is_datafile && !dest_file->is_cfs && - dest_file->n_blocks > 0) + arguments->restored_bytes += restore_file(dest_file, to_fullpath, already_exists, out_buf, + arguments->dest_backup, arguments->parent_chain, arguments->use_bitmap, + arguments->incremental_mode, arguments->shift_lsn); + } + + free(out_buf); + + /* ssh connection to longer needed */ + fio_disconnect(); + + /* Data files restoring is successful */ + arguments->ret = 0; + + return NULL; +} + +/* + * Restore one file into $PGDATA. + */ +static size_t +restore_file(pgFile *dest_file, const char *to_fullpath, bool already_exists, char *out_buf, + pgBackup *dest_backup, parray *parent_chain, bool use_bitmap, IncrRestoreMode incremental_mode, XLogRecPtr shift_lsn) +{ + FILE *out = NULL; + size_t restored_bytes = 0; + PageState *checksum_map = NULL; /* it should take ~1.5MB at most */ + datapagemap_t *lsn_map = NULL; /* it should take 16kB at most */ + char *errmsg = NULL; /* remote agent error message */ + + /* + * Handle incremental restore case for data files. + * If file is already exists in pgdata, then + * we scan it block by block and get + * array of checksums for every page. 
+ */ + if (already_exists && + dest_file->is_datafile && !dest_file->is_cfs && + dest_file->n_blocks > 0) + { + if (incremental_mode == INCR_LSN) { - if (arguments->incremental_mode == INCR_LSN) - { - lsn_map = fio_get_lsn_map(to_fullpath, arguments->dest_backup->checksum_version, - dest_file->n_blocks, arguments->shift_lsn, - dest_file->segno * RELSEG_SIZE, FIO_DB_HOST); - } - else if (arguments->incremental_mode == INCR_CHECKSUM) - { - checksum_map = fio_get_checksum_map(to_fullpath, arguments->dest_backup->checksum_version, - dest_file->n_blocks, arguments->dest_backup->stop_lsn, - dest_file->segno * RELSEG_SIZE, FIO_DB_HOST); - } + lsn_map = fio_get_lsn_map(to_fullpath, dest_backup->checksum_version, + dest_file->n_blocks, shift_lsn, + dest_file->segno * RELSEG_SIZE, FIO_DB_HOST); } + else if (incremental_mode == INCR_CHECKSUM) + { + checksum_map = fio_get_checksum_map(to_fullpath, dest_backup->checksum_version, + dest_file->n_blocks, dest_backup->stop_lsn, + dest_file->segno * RELSEG_SIZE, FIO_DB_HOST); + } + } - /* - * Open dest file and truncate it to zero, if destination - * file already exists and dest file size is zero, or - * if file do not exist - */ - if ((already_exists && dest_file->write_size == 0) || !already_exists) - out = fio_fopen(to_fullpath, PG_BINARY_W, FIO_DB_HOST); - /* - * If file already exists and dest size is not zero, - * then open it for reading and writing. - */ - else - out = fio_fopen(to_fullpath, PG_BINARY_R "+", FIO_DB_HOST); - - if (out == NULL) - elog(ERROR, "Cannot open restore target file \"%s\": %s", - to_fullpath, strerror(errno)); + /* + * Open dest file and truncate it to zero, if destination + * file already exists and dest file size is zero, or + * if file do not exist + */ + if ((already_exists && dest_file->write_size == 0) || !already_exists) + out = fio_fopen(to_fullpath, PG_BINARY_W, FIO_DB_HOST); + /* + * If file already exists and dest size is not zero, + * then open it for reading and writing. 
+ */ + else + out = fio_fopen(to_fullpath, PG_BINARY_R "+", FIO_DB_HOST); - /* update file permission */ - if (fio_chmod(to_fullpath, dest_file->mode, FIO_DB_HOST) == -1) - elog(ERROR, "Cannot change mode of \"%s\": %s", to_fullpath, - strerror(errno)); + if (out == NULL) + elog(ERROR, "Cannot open restore target file \"%s\": %s", + to_fullpath, strerror(errno)); - if (!dest_file->is_datafile || dest_file->is_cfs) - elog(VERBOSE, "Restoring nonedata file: \"%s\"", to_fullpath); - else - elog(VERBOSE, "Restoring data file: \"%s\"", to_fullpath); + /* update file permission */ + if (fio_chmod(to_fullpath, dest_file->mode, FIO_DB_HOST) == -1) + elog(ERROR, "Cannot change mode of \"%s\": %s", to_fullpath, + strerror(errno)); - // If destination file is 0 sized, then just close it and go for the next - if (dest_file->write_size == 0) - goto done; + if (!dest_file->is_datafile || dest_file->is_cfs) + elog(VERBOSE, "Restoring nonedata file: \"%s\"", to_fullpath); + else + elog(VERBOSE, "Restoring data file: \"%s\"", to_fullpath); + // If destination file is 0 sized, then just close it and go for the next + if (dest_file->write_size != 0) + { /* Restore destination file */ if (dest_file->is_datafile && !dest_file->is_cfs) { /* enable stdio buffering for local destination data file */ - if (!fio_is_remote_file(out)) + if (!fio_is_remote_file(out) && out_buf != NULL) setvbuf(out, out_buf, _IOFBF, STDIO_BUFSIZE); /* Destination file is data file */ - arguments->restored_bytes += restore_data_file(arguments->parent_chain, + restored_bytes += restore_data_file(parent_chain, dest_file, out, to_fullpath, - arguments->use_bitmap, checksum_map, - arguments->shift_lsn, lsn_map, true); + use_bitmap, checksum_map, + shift_lsn, lsn_map, true); } else { @@ -1267,40 +1335,32 @@ restore_files(void *arg) if (!fio_is_remote_file(out)) setvbuf(out, NULL, _IONBF, BUFSIZ); /* Destination file is nonedata file */ - arguments->restored_bytes += restore_non_data_file(arguments->parent_chain, - arguments->dest_backup, dest_file, out, to_fullpath, + restored_bytes += restore_non_data_file(parent_chain, + dest_backup, dest_file, out, to_fullpath, already_exists); } + } -done: - /* Writing is asynchronous in case of restore in remote mode, so check the agent status */ - if (fio_check_error_file(out, &errmsg)) - elog(ERROR, "Cannot write to the remote file \"%s\": %s", to_fullpath, errmsg); - - /* close file */ - if (fio_fclose(out) != 0) - elog(ERROR, "Cannot close file \"%s\": %s", to_fullpath, - strerror(errno)); - - /* free pagemap used for restore optimization */ - pg_free(dest_file->pagemap.bitmap); - - if (lsn_map) - pg_free(lsn_map->bitmap); + /* Writing is asynchronous in case of restore in remote mode, so check the agent status */ + if (fio_check_error_file(out, &errmsg)) + elog(ERROR, "Cannot write to the remote file \"%s\": %s", to_fullpath, errmsg); - pg_free(lsn_map); - pg_free(checksum_map); - } + /* close file */ + if (fio_fclose(out) != 0) + elog(ERROR, "Cannot close file \"%s\": %s", to_fullpath, + strerror(errno)); - free(out_buf); + if (lsn_map) + pg_free(lsn_map->bitmap); - /* ssh connection to longer needed */ - fio_disconnect(); + pg_free(lsn_map); + pg_free(checksum_map); - /* Data files restoring is successful */ - arguments->ret = 0; + /* free pagemap used for restore optimization */ + pg_free(dest_file->pagemap.bitmap); + dest_file->pagemap.bitmap = NULL; - return NULL; + return restored_bytes; } /* diff --git a/src/utils/parray.c b/src/utils/parray.c index 95b83365d..792e26907 100644 --- 
a/src/utils/parray.c +++ b/src/utils/parray.c @@ -198,6 +198,13 @@ parray_bsearch(parray *array, const void *key, int(*compare)(const void *, const return bsearch(&key, array->data, array->used, sizeof(void *), compare); } +int +parray_bsearch_index(parray *array, const void *key, int(*compare)(const void *, const void *)) +{ + void **elem = parray_bsearch(array, key, compare); + return elem != NULL ? elem - array->data : -1; +} + /* checks that parray contains element */ bool parray_contains(parray *array, void *elem) { diff --git a/src/utils/parray.h b/src/utils/parray.h index 85d7383f3..e92ad728c 100644 --- a/src/utils/parray.h +++ b/src/utils/parray.h @@ -29,6 +29,7 @@ extern bool parray_rm(parray *array, const void *key, int(*compare)(const void * extern size_t parray_num(const parray *array); extern void parray_qsort(parray *array, int(*compare)(const void *, const void *)); extern void *parray_bsearch(parray *array, const void *key, int(*compare)(const void *, const void *)); +extern int parray_bsearch_index(parray *array, const void *key, int(*compare)(const void *, const void *)); extern void parray_walk(parray *array, void (*action)(void *)); extern bool parray_contains(parray *array, void *elem); From 0ef23391b4d473e85d76befae8b9215381dcc9d3 Mon Sep 17 00:00:00 2001 From: Grigory Smolkin Date: Wed, 16 Jun 2021 23:35:53 +0300 Subject: [PATCH 2/5] [Issue #313] review feedback --- src/restore.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/restore.c b/src/restore.c index 4f90f90b9..619d8a486 100644 --- a/src/restore.c +++ b/src/restore.c @@ -983,14 +983,18 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, search_key.rel_path = XLOG_CONTROL_FILE; search_key.external_dir_num = 0; control_file_elem_index = parray_bsearch_index(dest_files, &search_key, pgFileCompareRelPathWithExternal); - if(control_file_elem_index < 0) - elog(ERROR, "\"%s\" not found in backup %s", XLOG_CONTROL_FILE, base36enc(dest_backup->start_time)); + + if (control_file_elem_index < 0) + elog(ERROR, "File \"%s\" not found in backup %s", XLOG_CONTROL_FILE, base36enc(dest_backup->start_time)); dest_pg_control_file = parray_remove(dest_files, control_file_elem_index); join_path_components(dest_pg_control_fullpath, pgdata_path, dest_pg_control_file->rel_path); /* remove dest control file before restoring */ if (params->incremental_mode != INCR_NONE) fio_unlink(dest_pg_control_fullpath, FIO_DB_HOST); + + // TODO: maybe we should rename "pg_control" into something like "pg_control.pbk" to + // keep the ability to rerun failed incremental restore ? } elog(INFO, "Start restoring backup files. PGDATA size: %s", pretty_dest_bytes); @@ -1036,7 +1040,6 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, /* [Issue #313] copy pg_control at very end */ if (restore_isok) { - fio_is_remote(FIO_DB_HOST); /* reopen already closed ssh connection */ total_bytes += restore_file(dest_pg_control_file, dest_pg_control_fullpath, false, NULL, dest_backup, parent_chain, use_bitmap, params->incremental_mode, params->shift_lsn); fio_disconnect(); @@ -1103,7 +1106,6 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, } /* TODO: write test for case: file to be synced is missing */ - /* MKulagin question: where is fio connection reopened? 
*/ if (fio_sync(to_fullpath, FIO_DB_HOST) != 0) elog(ERROR, "Failed to sync file \"%s\": %s", to_fullpath, strerror(errno)); } From 447f0a632d28b68ce7775d9cb1ce2e388dabec8a Mon Sep 17 00:00:00 2001 From: Grigory Smolkin Date: Thu, 17 Jun 2021 02:39:02 +0300 Subject: [PATCH 3/5] [Issue #313] improve test coverage --- tests/restore.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/restore.py b/tests/restore.py index 8ccffa44c..cf150a5f2 100644 --- a/tests/restore.py +++ b/tests/restore.py @@ -3918,5 +3918,10 @@ def test_restore_issue_313(self): '\n Unexpected Error Message: {0}\n CMD: {1}'.format( repr(e.message), self.cmd)) + with open(os.path.join(node.logs_dir, 'postgresql.log'), 'r') as f: + self.assertIn( + "PANIC: could not read from control file: read 0 bytes", + f.read()) + # Clean after yourself self.del_test_dir(module_name, fname) From 3f06d21dee6b389437ef55a0a602f6c2c11962c5 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Fri, 18 Jun 2021 15:34:26 +0300 Subject: [PATCH 4/5] allow a failed incremental restore to be restarted --- src/pg_probackup.h | 3 ++- src/restore.c | 48 +++++++++++++++++++++++++++++++--------------- src/util.c | 7 +++++-- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/pg_probackup.h b/src/pg_probackup.h index f5fd0f672..49348c3bd 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -84,6 +84,7 @@ extern const char *PROGRAM_EMAIL; #define DATABASE_MAP "database_map" #define HEADER_MAP "page_header_map" #define HEADER_MAP_TMP "page_header_map_tmp" +#define XLOG_CONTROL_BAK_FILE XLOG_CONTROL_FILE".pbk.bak" /* Timeout defaults */ #define ARCHIVE_TIMEOUT_DEFAULT 300 @@ -1143,7 +1144,7 @@ extern uint64 get_remote_system_identifier(PGconn *conn); extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); extern uint32 get_xlog_seg_size(char *pgdata_path); -extern void get_redo(const char *pgdata_path, RedoParams *redo); +extern void get_redo(const char *pgdata_path, const char *pg_control_filename, RedoParams *redo); extern void set_min_recovery_point(pgFile *file, const char *backup_path, XLogRecPtr stop_backup_lsn); extern void copy_pgcontrol_file(const char *from_fullpath, fio_location from_location, diff --git a/src/restore.c b/src/restore.c index 619d8a486..cb644d799 100644 --- a/src/restore.c +++ b/src/restore.c @@ -489,7 +489,20 @@ do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, pg { RedoParams redo; parray *timelines = NULL; - get_redo(instance_config.pgdata, &redo); + + /* [Issue #313] check for previous failed incremental restore */ + { + char filename[MAXPGPATH]; + + join_path_components(filename, instance_config.pgdata, XLOG_CONTROL_BAK_FILE); + if (fio_access(filename, F_OK, FIO_DB_HOST) == 0) + { + elog(WARNING, "\"%s\" found, using incremental restore parameters from it", filename); + get_redo(instance_config.pgdata, XLOG_CONTROL_BAK_FILE, &redo); + } + else + get_redo(instance_config.pgdata, XLOG_CONTROL_FILE, &redo); + } if (redo.checksum_version == 0) elog(ERROR, "Incremental restore in 'lsn' mode require " @@ -714,6 +727,7 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, parray *external_dirs = NULL; pgFile *dest_pg_control_file = NULL; char dest_pg_control_fullpath[MAXPGPATH]; + char dest_pg_control_bak_fullpath[MAXPGPATH]; /* arrays with meta info for multi threaded backup */ pthread_t *threads; restore_files_arg *threads_args; @@ -794,12 +808,7 @@ restore_chain(pgBackup *dest_backup, parray 
*parent_chain, * unless we are running incremental-lsn restore, then bitmap is mandatory. */ if (use_bitmap && parray_num(parent_chain) == 1) - { - if (params->incremental_mode == INCR_NONE) - use_bitmap = false; - else - use_bitmap = true; - } + use_bitmap = params->incremental_mode != INCR_NONE; /* * Restore dest_backup internal directories. @@ -917,6 +926,11 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, pg_strcasecmp(file->name, RELMAPPER_FILENAME) == 0) redundant = true; + /* global/pg_control.pbk.bak are always keeped, because it's needed for restart failed incremental restore */ + if (file->external_dir_num == 0 && + pg_strcasecmp(file->rel_path, XLOG_CONTROL_BAK_FILE) == 0) + redundant = false; + /* do not delete the useful internal directories */ if (S_ISDIR(file->mode) && !redundant) continue; @@ -988,13 +1002,16 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, elog(ERROR, "File \"%s\" not found in backup %s", XLOG_CONTROL_FILE, base36enc(dest_backup->start_time)); dest_pg_control_file = parray_remove(dest_files, control_file_elem_index); - join_path_components(dest_pg_control_fullpath, pgdata_path, dest_pg_control_file->rel_path); - /* remove dest control file before restoring */ - if (params->incremental_mode != INCR_NONE) - fio_unlink(dest_pg_control_fullpath, FIO_DB_HOST); - - // TODO: maybe we should rename "pg_control" into something like "pg_control.pbk" to - // keep the ability to rerun failed incremental restore ? + join_path_components(dest_pg_control_fullpath, pgdata_path, XLOG_CONTROL_FILE); + join_path_components(dest_pg_control_bak_fullpath, pgdata_path, XLOG_CONTROL_BAK_FILE); + /* + * rename (if it exist) dest control file before restoring + * if it doesn't exist, that mean, that we already restoring in a previously failed + * pgdata, where XLOG_CONTROL_BAK_FILE exist + */ + if (params->incremental_mode != INCR_NONE + && fio_access(dest_pg_control_fullpath, F_OK, FIO_DB_HOST) == 0) + fio_rename(dest_pg_control_fullpath, dest_pg_control_bak_fullpath, FIO_DB_HOST); } elog(INFO, "Start restoring backup files. PGDATA size: %s", pretty_dest_bytes); @@ -1042,7 +1059,8 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, { total_bytes += restore_file(dest_pg_control_file, dest_pg_control_fullpath, false, NULL, dest_backup, parent_chain, use_bitmap, params->incremental_mode, params->shift_lsn); - fio_disconnect(); + if (params->incremental_mode != INCR_NONE) + fio_unlink(dest_pg_control_bak_fullpath, FIO_DB_HOST); } time(&end_time); diff --git a/src/util.c b/src/util.c index c0a1dc9e4..8d5b3f534 100644 --- a/src/util.c +++ b/src/util.c @@ -351,15 +351,18 @@ get_pgcontrol_checksum(const char *pgdata_path) return ControlFile.crc; } +/* + * you can use XLOG_CONTROL_FILE or XLOG_CONTROL_BAK_FILE for value of control_file_name parameter + */ void -get_redo(const char *pgdata_path, RedoParams *redo) +get_redo(const char *pgdata_path, const char *pg_control_filename, RedoParams *redo) { ControlFileData ControlFile; char *buffer; size_t size; /* First fetch file... */ - buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, FIO_DB_HOST); + buffer = slurpFile(pgdata_path, pg_control_filename, &size, false, FIO_DB_HOST); digestControlFile(&ControlFile, buffer, size); pg_free(buffer); From 11ef0286fcac64f61c898005b26f785275f8d31a Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Fri, 18 Jun 2021 18:19:15 +0300 Subject: [PATCH 5/5] [Issue #313] add test incr_restore.IncrRestoreTest.test_incr_restore_issue_313 --- src/archive.c | 2 +- src/backup.c | 2 +- src/init.c | 2 +- src/pg_probackup.h | 2 +- src/restore.c | 13 ++++-- src/util.c | 4 +- tests/helpers/ptrack_helpers.py | 10 ++-- tests/incr_restore.py | 81 ++++++++++++++++++++++++++++++++- tests/restore.py | 2 +- 9 files changed, 101 insertions(+), 17 deletions(-) diff --git a/src/archive.c b/src/archive.c index 4058cd0d4..44ddc08c0 100644 --- a/src/archive.c +++ b/src/archive.c @@ -148,7 +148,7 @@ do_archive_push(InstanceState *instanceState, InstanceConfig *instance, char *wa elog(ERROR, "getcwd() error"); /* verify that archive-push --instance parameter is valid */ - system_id = get_system_identifier(current_dir); + system_id = get_system_identifier(current_dir, XLOG_CONTROL_FILE, FIO_DB_HOST); if (instance->pgdata == NULL) elog(ERROR, "Cannot read pg_probackup.conf for this instance"); diff --git a/src/backup.c b/src/backup.c index 46e4f1ea7..cb190cd20 100644 --- a/src/backup.c +++ b/src/backup.c @@ -983,7 +983,7 @@ check_system_identifiers(PGconn *conn, char *pgdata) uint64 system_id_conn; uint64 system_id_pgdata; - system_id_pgdata = get_system_identifier(pgdata); + system_id_pgdata = get_system_identifier(pgdata, XLOG_CONTROL_FILE, FIO_DB_HOST); system_id_conn = get_remote_system_identifier(conn); /* for checkdb check only system_id_pgdata and system_id_conn */ diff --git a/src/init.c b/src/init.c index dc821325a..622310bc2 100644 --- a/src/init.c +++ b/src/init.c @@ -57,7 +57,7 @@ do_add_instance(InstanceState *instanceState, InstanceConfig *instance) "(-D, --pgdata)"); /* Read system_identifier from PGDATA */ - instance->system_identifier = get_system_identifier(instance->pgdata); + instance->system_identifier = get_system_identifier(instance->pgdata, XLOG_CONTROL_FILE, FIO_DB_HOST); /* Starting from PostgreSQL 11 read WAL segment size from PGDATA */ instance->xlog_seg_size = get_xlog_seg_size(instance->pgdata); diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 49348c3bd..08757b67e 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1139,7 +1139,7 @@ extern XLogRecPtr get_next_record_lsn(const char *archivedir, XLogSegNo segno, T extern TimeLineID get_current_timeline(PGconn *conn); extern TimeLineID get_current_timeline_from_control(bool safe); extern XLogRecPtr get_checkpoint_location(PGconn *conn); -extern uint64 get_system_identifier(const char *pgdata_path); +extern uint64 get_system_identifier(const char *pgdata_path, const char *pg_control_filename, fio_location location); extern uint64 get_remote_system_identifier(PGconn *conn); extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); diff --git a/src/restore.c b/src/restore.c index cb644d799..067abd349 100644 --- a/src/restore.c +++ b/src/restore.c @@ -2233,7 +2233,7 @@ check_incremental_compatibility(const char *pgdata, uint64 system_identifier, bool postmaster_is_up = false; bool backup_label_exists = false; pid_t pid; - char backup_label[MAXPGPATH]; + char filename[MAXPGPATH]; /* check postmaster pid */ pid = fio_check_postmaster(pgdata, FIO_DB_HOST); @@ -2268,7 +2268,12 @@ check_incremental_compatibility(const char *pgdata, uint64 system_identifier, */ elog(INFO, "Trying to read pg_control file in destination direstory"); - system_id_pgdata = get_system_identifier(pgdata); + /* [Issue #313] check for previous failed incremental restore */ + 
join_path_components(filename, instance_config.pgdata, XLOG_CONTROL_BAK_FILE); + if (fio_access(filename, F_OK, FIO_DB_HOST) == 0) + system_id_pgdata = get_system_identifier(pgdata, XLOG_CONTROL_BAK_FILE, FIO_DB_HOST); + else + system_id_pgdata = get_system_identifier(pgdata, XLOG_CONTROL_FILE, FIO_DB_HOST); if (system_id_pgdata == instance_config.system_identifier) system_id_match = true; @@ -2283,8 +2288,8 @@ check_incremental_compatibility(const char *pgdata, uint64 system_identifier, */ if (incremental_mode == INCR_LSN) { - join_path_components(backup_label, pgdata, "backup_label"); - if (fio_access(backup_label, F_OK, FIO_DB_HOST) == 0) + join_path_components(filename, pgdata, PG_BACKUP_LABEL_FILE); + if (fio_access(filename, F_OK, FIO_DB_HOST) == 0) { elog(WARNING, "Destination directory contains \"backup_control\" file. " "This does NOT mean that you should delete this file and retry, only that " diff --git a/src/util.c b/src/util.c index 8d5b3f534..cdb64948d 100644 --- a/src/util.c +++ b/src/util.c @@ -249,14 +249,14 @@ get_checkpoint_location(PGconn *conn) } uint64 -get_system_identifier(const char *pgdata_path) +get_system_identifier(const char *pgdata_path, const char *pg_control_filename, fio_location location) { ControlFileData ControlFile; char *buffer; size_t size; /* First fetch file... */ - buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, FIO_DB_HOST); + buffer = slurpFile(pgdata_path, pg_control_filename, &size, false, location); if (buffer == NULL) return 0; digestControlFile(&ControlFile, buffer, size); diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 0a87eb1b6..e3b0aa6c8 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -1602,11 +1602,11 @@ def pgdata_content(self, pgdata, ignore_ptrack=True, exclude_dirs=None): files_to_ignore = [ 'postmaster.pid', 'postmaster.opts', 'pg_internal.init', 'postgresql.auto.conf', - 'backup_label', 'tablespace_map', 'recovery.conf', - 'ptrack_control', 'ptrack_init', 'pg_control', - 'probackup_recovery.conf', 'recovery.signal', - 'standby.signal', 'ptrack.map', 'ptrack.map.mmap', - 'ptrack.map.tmp' + 'backup_label', 'backup_label.old', 'tablespace_map', + 'recovery.conf', 'recovery.done', 'ptrack_control', + 'ptrack_init', 'pg_control', 'probackup_recovery.conf', + 'recovery.signal', 'standby.signal', + 'ptrack.map', 'ptrack.map.mmap', 'ptrack.map.tmp' ] if exclude_dirs: diff --git a/tests/incr_restore.py b/tests/incr_restore.py index cb684a23a..d7dec183c 100644 --- a/tests/incr_restore.py +++ b/tests/incr_restore.py @@ -9,7 +9,7 @@ import hashlib import shutil import json -from testgres import QueryException +from testgres import QueryException, StartNodeException module_name = 'incr_restore' @@ -2436,3 +2436,82 @@ def test_incremental_pg_filenode_map(self): self.del_test_dir(module_name, fname) # check that MinRecPoint and BackupStartLsn are correctly used in case of --incrementa-lsn + + + # @unittest.skip("skip") + def test_incr_restore_issue_313(self): + """ + Check that failed incremental restore can be restarted + """ + fname = self.id().split('.')[3] + + node = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'node'), + set_replication = True, + initdb_params = ['--data-checksums']) + + backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup') + self.init_pb(backup_dir) + self.add_instance(backup_dir, 'node', node) + self.set_archiving(backup_dir, 'node', node) + node.slow_start() + + node.pgbench_init(scale = 
50) + + full_backup_id = self.backup_node(backup_dir, 'node', node, backup_type='full') + + pgbench = node.pgbench( + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + options=['-T', '10', '-c', '1', '--no-vacuum']) + pgbench.wait() + pgbench.stdout.close() + + last_backup_id = self.backup_node(backup_dir, 'node', node, backup_type='delta') + + pgdata = self.pgdata_content(node.data_dir) + node.cleanup() + + self.restore_node(backup_dir, 'node', node, backup_id=full_backup_id) + + count = 0 + filelist = self.get_backup_filelist(backup_dir, 'node', last_backup_id) + for file in filelist: + # count only nondata files + if int(filelist[file]['is_datafile']) == 0 and int(filelist[file]['size']) > 0: + count += 1 + + gdb = self.restore_node(backup_dir, 'node', node, gdb=True, + backup_id=last_backup_id, options=['--progress', '--incremental-mode=checksum']) + gdb.verbose = False + gdb.set_breakpoint('restore_non_data_file') + gdb.run_until_break() + gdb.continue_execution_until_break(count - 2) + gdb.quit() + + try: + node.slow_start() + # we should die here because exception is what we expect to happen + self.assertEqual( + 1, 0, + "Expecting Error because backup is not fully restored") + except StartNodeException as e: + self.assertIn( + 'Cannot start node', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format( + repr(e.message), self.cmd)) + + with open(os.path.join(node.logs_dir, 'postgresql.log'), 'r') as f: + self.assertIn( + "postgres: could not find the database system", + f.read()) + + self.restore_node(backup_dir, 'node', node, + backup_id=last_backup_id, options=['--progress', '--incremental-mode=checksum']) + node.slow_start() + + self.compare_pgdata(pgdata, self.pgdata_content(node.data_dir)) + + # Clean after yourself + node.stop() + self.del_test_dir(module_name, fname) diff --git a/tests/restore.py b/tests/restore.py index cf150a5f2..363aae1ec 100644 --- a/tests/restore.py +++ b/tests/restore.py @@ -3880,7 +3880,7 @@ def test_restore_issue_313(self): node.cleanup() count = 0 - filelist = self.get_backup_filelist(backup_dir, 'node', backup_id) + filelist = self.get_backup_filelist(backup_dir, 'node', backup_id) for file in filelist: # count only nondata files if int(filelist[file]['is_datafile']) == 0 and int(filelist[file]['size']) > 0:
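
The index-lookup helper that patch 1 adds to src/utils/parray.c (parray_bsearch_index(), declared in src/utils/parray.h) is what lets restore_chain() pull global/pg_control out of the already-sorted dest_files list before the worker threads start. The standalone program below is a minimal sketch of the same idiom -- recovering an element index from the pointer returned by bsearch(), or -1 when the key is absent -- using plain C strings in place of pgFile entries; the file names and the bsearch_index() wrapper here are illustrative only and are not part of pg_probackup.

/*
 * Minimal sketch of the idiom behind parray_bsearch_index(): bsearch()
 * returns a pointer into the underlying array, so the element index is
 * the pointer difference, and -1 signals "not found".
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
compare_str(const void *a, const void *b)
{
	return strcmp(*(const char *const *) a, *(const char *const *) b);
}

static int
bsearch_index(const char **array, size_t nmemb, const char *key)
{
	const char **elem = bsearch(&key, array, nmemb, sizeof(const char *), compare_str);

	return elem != NULL ? (int) (elem - array) : -1;
}

int
main(void)
{
	/* sorted with the same comparator, as dest_files is sorted before the lookup */
	const char *files[] = {"base/1/112", "base/1/113", "global/pg_control", "pg_xact/0000"};
	size_t		n = sizeof(files) / sizeof(files[0]);

	printf("global/pg_control -> index %d\n", bsearch_index(files, n, "global/pg_control"));
	printf("backup_label      -> index %d\n", bsearch_index(files, n, "backup_label"));
	return 0;
}

In the patch itself the helper wraps parray_bsearch() and array->data; restore_chain() calls it with a zeroed pgFile search key (rel_path = XLOG_CONTROL_FILE, external_dir_num = 0) and pgFileCompareRelPathWithExternal as the comparator, then removes the hit with parray_remove() so that pg_control is written only after every other file has been restored.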