Skip to content

Commit ed81e96

Browse files
authored
Merge pull request #453 from aiven/rdunklau-workaround_tar_concurrency
Workaround tar concurrency issue #453
2 parents d0d8c55 + 7f043ec commit ed81e96

File tree

7 files changed

+119
-5
lines changed

7 files changed

+119
-5
lines changed

pghoard/restore.py

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
from pghoard.common import BaseBackupFormat, StrEnum
3535
from pghoard.rohmu import compat, dates, get_transfer, rohmufile
36-
from pghoard.rohmu.errors import Error, InvalidConfigurationError
36+
from pghoard.rohmu.errors import (Error, InvalidConfigurationError, MaybeRecoverableError)
3737

3838
from . import common, config, logutil, version
3939
from .postgres_command import PGHOARD_HOST, PGHOARD_PORT
@@ -640,6 +640,7 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files:
640640
self.manager_class = multiprocessing.Manager if self._process_count() > 1 else ThreadingManager
641641
self.max_stale_seconds = 120
642642
self.pending_jobs = set()
643+
self.jobs_to_retry = set()
643644
self.pgdata = pgdata
644645
# There's no point in spawning child processes if process count is 1
645646
self.pool_class = multiprocessing.Pool if self._process_count() > 1 else multiprocessing.pool.ThreadPool
@@ -648,6 +649,7 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files:
648649
self.sleep_fn = time.sleep
649650
self.tablespaces = tablespaces
650651
self.total_download_size = 0
652+
self.retry_per_file = {}
651653

652654
def fetch_all(self):
653655
for retry in range(3):
@@ -656,7 +658,7 @@ def fetch_all(self):
656658
self._setup_progress_tracking(manager)
657659
with self.pool_class(processes=self._process_count()) as pool:
658660
self._queue_jobs(pool)
659-
self._wait_for_jobs_to_complete()
661+
self._wait_for_jobs_to_complete(pool)
660662
# Context manager does not seem to properly wait for the subprocesses to exit, let's join
661663
# the pool manually (close need to be called before joining)
662664
pool.close()
@@ -738,15 +740,33 @@ def job_completed(self, key):
738740
if key in self.pending_jobs:
739741
self.pending_jobs.remove(key)
740742
self.completed_jobs.add(key)
743+
self.retry_per_file.pop(key, None)
741744

742745
def job_failed(self, key, exception):
    """Record the failure of one chunk-download job.

    A ``MaybeRecoverableError`` (e.g. the tar intermediate-directory race,
    see issue #452) is re-queued for up to two retries via
    ``self.jobs_to_retry`` before being counted as a hard error.  Any other
    exception immediately increments ``self.errors`` and marks the job
    completed.
    """
    # Refresh the staleness timestamp for *both* paths: a recoverable
    # failure that is about to be retried is still forward progress, and
    # without this the max_stale_seconds watchdog could fire while a retry
    # is pending.  (Previously only the non-recoverable path updated it.)
    self.last_progress_ts = time.monotonic()
    if isinstance(exception, MaybeRecoverableError):
        self.log.warning("Got error which can be recoverable from chunk download %s", exception)
        with self.lock:
            if key in self.pending_jobs:
                retries = self.retry_per_file.get(key, 0) + 1
                self.retry_per_file[key] = retries
                self.pending_jobs.remove(key)
                if retries <= 2:
                    # Hand the job back to _wait_for_jobs_to_complete,
                    # which re-queues everything in jobs_to_retry.
                    self.jobs_to_retry.add(key)
                    return
                # Retry budget exhausted: account it as a permanent failure.
                self.errors += 1
                self.completed_jobs.add(key)
                self.retry_per_file.pop(key, None)
                self.log.error("Giving up on recoverable error: %s", exception)
        return
    self.log.error("Got error from chunk download: %s", exception)
    with self.lock:
        if key in self.pending_jobs:
            self.errors += 1
            self.pending_jobs.remove(key)
            self.completed_jobs.add(key)
            # Drop retry bookkeeping so the map does not grow unboundedly.
            self.retry_per_file.pop(key, None)
750770

751771
def jobs_in_progress(self):
752772
with self.lock:
@@ -788,8 +808,18 @@ def _write_status_output_to_file(self, output_file):
788808
}
789809
)
790810

791-
def _wait_for_jobs_to_complete(self):
811+
def _wait_for_jobs_to_complete(self, pool):
792812
while self.jobs_in_progress():
813+
to_queue = []
814+
with self.lock:
815+
if self.jobs_to_retry:
816+
for item in self.data_files:
817+
if item.id in self.jobs_to_retry:
818+
self.pending_jobs.add(item.id)
819+
self.jobs_to_retry.remove(item.id)
820+
to_queue.append(item)
821+
for item in to_queue:
822+
self._queue_job(pool, item)
793823
self._print_download_progress()
794824
if self.status_output_file:
795825
self._write_status_output_to_file(self.status_output_file)
@@ -910,12 +940,16 @@ def _fetch_delta_file(self, metadata, fetch_fn):
910940
)
911941

912942
def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
943+
# Force tar to use the C locale to match errors in stderr
944+
tar_env = os.environ.copy()
945+
tar_env["LANG"] = "C"
913946
with subprocess.Popen(
914947
self._build_tar_args(metadata),
915948
bufsize=0,
916949
stdin=subprocess.PIPE,
917950
stdout=subprocess.DEVNULL,
918-
stderr=subprocess.PIPE
951+
stderr=subprocess.PIPE,
952+
env=tar_env
919953
) as tar:
920954
common.increase_pipe_capacity(tar.stdin, tar.stderr)
921955
sink = rohmufile.create_sink_pipeline(
@@ -939,7 +973,18 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
939973
exit_code = tar.wait()
940974
file_name = "<mem_bytes>" if isinstance(self.file_info, FileDataInfo) else self.file_info.name
941975
if exit_code != 0:
942-
raise Exception("tar exited with code {!r} for file {!r}, output: {!r}".format(exit_code, file_name, output))
976+
ex_message = "tar exited with code {!r} for file {!r}, output: {!r}".format(exit_code, file_name, output)
977+
# Running multiple tar commands in parallel in the same
978+
# directory can lead to race conditions while creating the
979+
# intermediate directories.
980+
# In that case, try to recover from it.
981+
# See issue #452 and https://savannah.gnu.org/bugs/index.php?61015
982+
if exit_code == 2 and b"Cannot open: No such file or directory" in output:
983+
raise MaybeRecoverableError(ex_message)
984+
else:
985+
raise Exception(
986+
"tar exited with code {!r} for file {!r}, output: {!r}".format(exit_code, file_name, output)
987+
)
943988
self.log.info("Processing of %r completed successfully", file_name)
944989

945990

pghoard/rohmu/errors.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,7 @@ class LocalFileIsRemoteFileError(StorageError):
2828

2929
class MissingLibraryError(Exception):
3030
"""Missing dependency library"""
31+
32+
33+
class MaybeRecoverableError(Error):
    """Raised for a failure that might succeed if the operation is retried."""
1.93 KB
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"compression-algorithm": "snappy", "encryption-key-id": "5ba999de817c49a682ffed124abf9a2e", "format": "pghoard-bb-v2", "original-file-size": "20480"}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"backup_sites": {
3+
"f73f56ee-6b9f-4ce0-b7aa-a170d58da833": {
4+
"encryption_key_id": "5ba999de817c49a682ffed124abf9a2e",
5+
"encryption_keys": {
6+
"5ba999de817c49a682ffed124abf9a2e": {
7+
"private": "-----BEGIN PRIVATE KEY-----\nMIIG/AIBADANBgkqhkiG9w0BAQEFAASCBuYwggbiAgEAAoIBgQC/e0jNVBCB8pxK\nwPmJUlus6q+mKQ9QD0esP/TzBZ6TwKiMlMukwh0FCah88UTf/9VNDEvgzFhrcbEc\n0O0ZKywSHNFOEq6onm3QWRqnMvXZLlTyhIBZRuLB8Vt3WH3Atv6BwbLRgFuT6Rfx\nopOGptmNQsOMT4z9lB2n2JiTBJsg7+iAfw6ZltuBSIjC8/5flcmYkkTQFEHUJ3RY\nOTjlqTY7y8J464qvXgQIUE/kCx7np4pdvWc3Zf9l1hgv5Ol/escpr8Mo2Cg5Qhjg\ntLzB4emx8dfnoV1oc77F4XEFj5SzBZqgKy/pV5yy8UtJ6NDBEwG4cK5kXXFyu31q\nc+XYzgD2SdmG3Fqvy5Ikwj8Sx82zrwBWEM8etEy7CniyZi7+Sr9G4NfFvBG8OOQ2\nXGMfnl8doEFYRtVdjU0o3VSn37ASCW8XsHgd/Zobu6k9sMKdU1iNrEjkiSGQtlbD\nsq8GMuq8saSbPyRUNQO7mFQC2F6K49KGgxpn4d65G0wQMMnbq4sCAwEAAQKCAYAK\nYYOr5g/TC7UfdGDS6g0gTcTiDD3RSFLJato7xqU3O22n2XVE5GUwXbqts2LZhgQp\nXi5K7KkqggppFoaUI7wK61cJlYe0iopHjl0cjW24rYNbdoWC0Y3/l7cuvDRtGz6n\nCDpKk1vjo/JxXjADT85hkyoI1FM/eCU3cU2sQsaqPXdsZ/cBqqUR2D3Z2+KBihxY\n0i063q5G8zCii8+i286d5UkQxyxIn582WCxMn7G4O2QL+vW6kiQLgFTlW9Kw35YO\nfbM04zUmpvZpjCJuqLDw/2x7/sJn17vzQ+LsdBz/JXaLEidkI4Tr4caWqSjJZVBK\nb5LzeOrqCBcsppXUhN+4yvqCAtK0iUCLbFwPuqqmv3Ly2GpWQBPEgozdaXZOOGzc\ne7M5xCDH5bQRR/3e6Md9fHl/ATKD5eSun+7dyjNq8FJHeUAEKS2okd5oFmTD4mkZ\nxV1GTHTLzUy0qeyiUoGCxnCnS8q9rx2Os4j4/Y+aQI6xzUWYK+Zb57o9w7kIirEC\ngcEA8BHiDzTM4HgmKJ5kPkB38JYkoI0hFzofesvvKg8Nx4hEg6KBL3t7kJPqdJgm\ntZHwhg3EwzKIRCY+wHCpYy0ouX/MAfKjWBI9uAYhTB5nbJEqxFNo18uK40PoWRMt\nNSralCrRcIb5z/Kl5WTsXN6DHJFTDwsGHF41LWrnCXRNM/XfzJvE7X0VlcjbL/A9\n0cxfEuTb7k2xhJNBbizCvdw92mmDemFxq9PO63966nnOIjwaX3m4G1yg3nsUGeTY\nVmpZAoHBAMwwBaj9x29Cxjn1Dy4rF5GiNErO5JY9mZnroa4I43zbYuXww5mWTpHs\nORntM1XtKt2E5KeW9fBUdJPWh4epnh2cyFmodSxaEk35wfnEGnMPlwHTgZvUE7kz\n6VXqDEGK4mYmzof6edbpg+tejx8SsK6Pwrt1Moj9evGs+I38oK2IRJchIS7Ur+qj\njLS13Z1mxLmAquDsmzAVgEL13rgUsBsiV+ghEEpvPS8gb9yZmpTiKD07JKml18VK\n06FsF9VAgwKBwBT/O2phD9pCJ/Q9hj77nIHqX+G69j3103MGCzD+iBH/lR5+RBZH\nEpOenE7+T8Rps0PGSINaFBkBz1M9h1MpS/qNduZktmypi8RgpODnd9xDBh6NvQnC\nv68I7XV8++M+kEeNRyw0Yf0SF/hsT1AAFi+VdlJGgI5SnwwN4Y8uIOJ+ish2h07O\nNekX3DPhK0cCPP6GDcZV/US/LGXafF3muXI08E7v3uVMbTijubhwVtsfrp7TIosi\nGt/am/N31IQaYQKBwAmUmz9hoPDsfiaMBAlThkiUBsYXz
QvrmgBp2O00h4/9LzfA\nwzy6m7cnEUrRIV5/wUohiST/5UxAejPRlgxcfgm/qHrkd5L8Ku2zsVFJzT/m1FwG\nk4c/PSmscN9SGv8cSCEo4vnoW70kucbaafa4Rsf6ANYQ2q0oz5L1XbgzyUo7IZTB\nvi/XVOW6hMiZ2+sdvk9B5UKmd2WbLKh3ptqWRekQBHXkz0He1E0YxYbhQiqILgEp\nfD/lgylDqIhjbP7ZhwKBwFYIBmEYEs/+ixHkHGbzE4tP+VhgzQ7we1+xyqN7Z69e\nf7StEuWWIZ5Os9JpdhZmN/9qb4kzL+Pb0vELuNHLwv08MnJ792wA9dPeCuYLKKiK\nUBuBwnslBy69tCRSXQ8ltH3kX67lxCu28hDRg+oN83bEczPJ2I/x7BKDrWNYZvrq\nJw7Ijp5l5gLqtdb7eOI8mSs5F7yaUKR6yVhl3ifc6bQUmmGmmElobpCpcGqAmXSj\noAeJCZug3MWS2fxhluthxQ==\n-----END PRIVATE KEY-----\n",
8+
"public": "-----BEGIN PUBLIC KEY-----\nMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAv3tIzVQQgfKcSsD5iVJb\nrOqvpikPUA9HrD/08wWek8CojJTLpMIdBQmofPFE3//VTQxL4MxYa3GxHNDtGSss\nEhzRThKuqJ5t0FkapzL12S5U8oSAWUbiwfFbd1h9wLb+gcGy0YBbk+kX8aKThqbZ\njULDjE+M/ZQdp9iYkwSbIO/ogH8OmZbbgUiIwvP+X5XJmJJE0BRB1Cd0WDk45ak2\nO8vCeOuKr14ECFBP5Ase56eKXb1nN2X/ZdYYL+Tpf3rHKa/DKNgoOUIY4LS8weHp\nsfHX56FdaHO+xeFxBY+UswWaoCsv6VecsvFLSejQwRMBuHCuZF1xcrt9anPl2M4A\n9knZhtxar8uSJMI/EsfNs68AVhDPHrRMuwp4smYu/kq/RuDXxbwRvDjkNlxjH55f\nHaBBWEbVXY1NKN1Up9+wEglvF7B4Hf2aG7upPbDCnVNYjaxI5IkhkLZWw7KvBjLq\nvLGkmz8kVDUDu5hUAtheiuPShoMaZ+HeuRtMEDDJ26uLAgMBAAE=\n-----END PUBLIC KEY-----\n"
9+
}
10+
},
11+
"object_storage": {
12+
"directory": "test/basebackup/chunks",
13+
"storage_type": "local"
14+
},
15+
"prefix": "1052a492-1a01-459d-a126-9db8518724c0/f73f56ee-6b9f-4ce0-b7aa-a170d58da833"
16+
}
17+
},
18+
"compression": {
19+
"algorithm": "snappy"
20+
},
21+
"log_level": "INFO",
22+
"restore_process_count": 2
23+
}

test/tar_failer

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
# Mock tar replacement used by the restore tests: it consumes its stdin,
# then reproduces the failure signature of the GNU tar concurrency race
# (see issue #452) — the characteristic message on stderr and exit code 2.
cat - > /dev/null
printf '%s\n' "Cannot open: No such file or directory" >&2
exit 2

test/test_restore.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,42 @@ def test_real_processing(self):
235235
for tar in ["tar", "pghoard/gnutaremu.py"]:
236236
self.run_restore_test("basebackup", tar, self.real_processing)
237237

238+
# Simulate a tar failure similar to what we have when running into
# concurrency issues.
# A basebackup with a single chunk is used so that every retry deals
# with the same file.
def test_tar_recovery(self):
    succeed_from_attempt = 2

    def simulate_tar_failure(fetcher, restore_dir):
        # Patch "_build_tar_args" so we can count invocations and swap
        # the tar executable back once enough failures have happened.
        attempt = 0
        real_tar = fetcher.config["tar_executable"]
        fetcher.config["tar_executable"] = "test/tar_failer"
        real_build_tar_args = ChunkFetcher._build_tar_args  # pylint: disable=protected-access

        def _build_tar_args(self, metadata):
            nonlocal attempt, succeed_from_attempt
            attempt += 1
            if attempt >= succeed_from_attempt:
                self.config["tar_executable"] = real_tar
            return real_build_tar_args(self, metadata)

        with patch("pghoard.restore.ChunkFetcher._build_tar_args", new=_build_tar_args):
            # Run the fetch and verify the restored file's contents.
            fetcher.fetch_all()
            self.check_sha256(
                os.path.join(restore_dir, "base", "1", "3608"),
                "cd461a152a9259c2d311ee348a4fa6722c119c1ff9a5b3147a86058d76f9bba8"
            )

    # One failure followed by a success: the restore must complete.
    self.run_restore_test("basebackup_one_chunk", "tar", simulate_tar_failure)
    succeed_from_attempt = 10
    # Three failures in a row: the restore must give up.
    with pytest.raises(RestoreError):
        self.run_restore_test("basebackup_one_chunk", "tar", simulate_tar_failure)
238274
def real_processing(self, fetcher, restore_dir):
239275
assert fetcher.pool_class == multiprocessing.Pool # pylint: disable=comparison-with-callable
240276
fetcher.fetch_all()

0 commit comments

Comments
 (0)