diff --git a/src/osmium/replication/server.py b/src/osmium/replication/server.py index 4f7d89c..22e84a4 100644 --- a/src/osmium/replication/server.py +++ b/src/osmium/replication/server.py @@ -144,8 +144,14 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None, contains the MergeInputReader with the data and `newest` is a sequence id of the most recent diff available. - Returns None if there was an error during download or no new - data was available. + Returns None if there was no new data was available. + + If there is an error during the download, then the function will + simply return the already downloaded data. If the reported + error is a client error (HTTP 4xx) and happens during the download + of the first diff, then a ::request.HTTPError:: is raised: this + condition is likely to be permanent and the caller should not + simply retry without investigating the cause. """ # must not read data newer than the published sequence id # or we might end up reading partial data @@ -168,8 +174,19 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None, and current_id <= newest.sequence: try: diffdata = self.get_diff_block(current_id) - except: # noqa: E722 - LOG.error("Error during diff download. Bailing out.") + except requests.RequestException as ex: + if start_id == current_id \ + and ex.response is not None \ + and (ex.response.status_code % 100 == 4): + # If server directly responds with a client error, + # reraise the exception to signal a potentially permanent + # error. + LOG.error("Permanent server error: %s", ex.response) + raise ex + # In all other cases, process whatever diffs we have and + # encourage a retry. + LOG.error("Error during diff download: %s", ex) + LOG.error("Bailing out.") diffdata = '' if len(diffdata) == 0: if start_id == current_id: @@ -305,7 +322,8 @@ def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int, return (diffs.id, diffs.newest) def timestamp_to_sequence(self, timestamp: dt.datetime, - balanced_search: bool = False) -> Optional[int]: + balanced_search: bool = False, + limit_by_oldest_available: bool = False) -> Optional[int]: """ Get the sequence number of the replication file that contains the given timestamp. The search algorithm is optimised for replication servers that publish updates in regular intervals. For servers @@ -313,8 +331,15 @@ def timestamp_to_sequence(self, timestamp: dt.datetime, should be set to true so that a standard binary search for the sequence will be used. The default is good for all known OSM replication services. - """ + When `limit_by_oldest_available` is set, then the function will + return None when the server replication does not start at 0 and + the given timestamp is older than the oldest available timestamp + on the server. Some replication servers do not keep the full + history and this flag avoids accidentally trying to download older + data. The downside is that the function will never return the + oldest available sequence ID when the flag is set. + """ # get the current timestamp from the server upper = self.get_state_info() @@ -331,8 +356,10 @@ def timestamp_to_sequence(self, timestamp: dt.datetime, lower = self.get_state_info(lowerid) if lower is not None and lower.timestamp >= timestamp: - if lower.sequence == 0 or lower.sequence + 1 >= upper.sequence: - return lower.sequence + if lower.sequence == 0: + return 0 + if lower.sequence + 1 >= upper.sequence: + return None if limit_by_oldest_available else lower.sequence upper = lower lower = None lowerid = 0 diff --git a/src/osmium/tools/common.py b/src/osmium/tools/common.py index aba1346..f805305 100644 --- a/src/osmium/tools/common.py +++ b/src/osmium/tools/common.py @@ -29,11 +29,19 @@ class ReplicationStart: def get_sequence(self, svr: ReplicationServer) -> Optional[int]: if self.seq_id is not None: log.debug("Using given sequence ID %d" % self.seq_id) + if self.seq_id > 0: + start_state = svr.get_state_info(seq=self.seq_id) + if start_state is None: + log.error( + f"Cannot download state information for ID {self.seq_id}." + " Server may not have this diff anymore.") + return None + self.date = start_state.timestamp return self.seq_id + 1 assert self.date is not None log.debug("Looking up sequence ID for timestamp %s" % self.date) - return svr.timestamp_to_sequence(self.date) + return svr.timestamp_to_sequence(self.date, limit_by_oldest_available=True) def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]: if self.seq_id is not None: diff --git a/src/osmium/tools/pyosmium_get_changes.py b/src/osmium/tools/pyosmium_get_changes.py index 2e4f7dc..7f9d874 100644 --- a/src/osmium/tools/pyosmium_get_changes.py +++ b/src/osmium/tools/pyosmium_get_changes.py @@ -162,9 +162,16 @@ def pyosmium_get_changes(args: List[str]) -> int: cookie_jar.load(options.cookie) svr.set_request_parameter('cookies', cookie_jar) + # Sanity check if server URL is correct and server is responding. + current = svr.get_state_info() + if current is None: + log.error("Cannot download state information. Is the replication URL correct?") + return 3 + log.debug(f"Server is at sequence {current.sequence} ({current.timestamp}).") + startseq = options.start.get_sequence(svr) if startseq is None: - log.error("Cannot read state file from server. Is the URL correct?") + log.error(f"No starting point found for time {options.start.date} on server {url}") return 1 if options.outfile is None: diff --git a/src/osmium/tools/pyosmium_up_to_date.py b/src/osmium/tools/pyosmium_up_to_date.py index f3e7c15..ec0c8ca 100644 --- a/src/osmium/tools/pyosmium_up_to_date.py +++ b/src/osmium/tools/pyosmium_up_to_date.py @@ -94,18 +94,10 @@ def update_from_custom_server(start: ReplicationStart, options: Any) -> int: log.error(f"No starting point found for time {start.date} on server {start.source}") return 3 - if start.date is None: - start_state = svr.get_state_info(seq=startseq) - if start_state is None: - log.error(f"Cannot download state information for ID {startseq}. " - 'Is the URL correct?') - return 3 - start.date = start_state.timestamp - if not options.force_update: cmpdate = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=90) cmpdate = cmpdate.replace(tzinfo=dt.timezone.utc) - if start.date < cmpdate: + if start.date is None or start.date < cmpdate: log.error( """The OSM file is more than 3 months old. You should download a more recent file instead of updating. If you really want to diff --git a/test/test_pyosmium_get_changes.py b/test/test_pyosmium_get_changes.py index ca2044b..c0d2739 100644 --- a/test/test_pyosmium_get_changes.py +++ b/test/test_pyosmium_get_changes.py @@ -8,6 +8,7 @@ """ from textwrap import dedent import uuid +import datetime as dt import pytest @@ -22,115 +23,140 @@ import cookielib as cookiejarlib -class TestPyosmiumGetChanges: +REPLICATION_BASE_TIME = dt.datetime(year=2017, month=8, day=26, hour=11, tzinfo=dt.timezone.utc) +REPLICATION_BASE_SEQ = 100 +REPLICATION_CURRENT = 140 - def main(self, httpserver, *args): - return pyosmium_get_changes(['--server', httpserver.url_for('')] + list(args)) - def test_init_id(self, capsys, httpserver): - assert 0 == self.main(httpserver, '-I', '453') +@pytest.fixture +def replication_server(httpserver): + def _state(seq): + seqtime = REPLICATION_BASE_TIME + dt.timedelta(hours=seq - REPLICATION_CURRENT) + timestamp = seqtime.strftime('%Y-%m-%dT%H\\:%M\\:%SZ') + return f"sequenceNumber={seq}\ntimestamp={timestamp}\n" - output = capsys.readouterr().out.strip() + httpserver.no_handler_status_code = 404 + httpserver.expect_request('/state.txt').respond_with_data(_state(REPLICATION_CURRENT)) + for i in range(REPLICATION_BASE_SEQ, REPLICATION_CURRENT + 1): + httpserver.expect_request(f'/000/000/{i}.opl')\ + .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000))) + httpserver.expect_request(f'/000/000/{i}.state.txt').respond_with_data(_state(i)) - assert output == '453' + return httpserver.url_for('') - def test_init_date(self, capsys, httpserver): - httpserver.expect_request('/state.txt').respond_with_data(dedent("""\ - sequenceNumber=100 - timestamp=2017-08-26T11\\:04\\:02Z - """)) - httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\ - sequenceNumber=0 - timestamp=2016-08-26T11\\:04\\:02Z - """)) - assert 0 == self.main(httpserver, '-D', '2015-12-24T08:08:08Z') - output = capsys.readouterr().out.strip() +@pytest.fixture +def runner(httpserver): + def _run(*args): + return pyosmium_get_changes( + ['--server', httpserver.url_for(''), '--diff-type', 'opl'] + list(map(str, args))) - assert output == '-1' + return _run - def test_init_to_file(self, tmp_path, httpserver): - fname = tmp_path / f"{uuid.uuid4()}.seq" - assert 0 == self.main(httpserver, '-I', '453', '-f', str(fname)) - assert fname.read_text() == '453' +def test_init_id(runner, capsys, replication_server): + assert 0 == runner('-I', '100') - def test_init_from_seq_file(self, tmp_path, httpserver): - fname = tmp_path / f"{uuid.uuid4()}.seq" - fname.write_text('453') + output = capsys.readouterr().out.strip() - assert 0 == self.main(httpserver, '-f', str(fname)) - assert fname.read_text() == '453' + assert output == '100' - def test_init_date_with_cookie(self, capsys, tmp_path, httpserver): - httpserver.expect_request('/state.txt').respond_with_data(dedent("""\ - sequenceNumber=100 - timestamp=2017-08-26T11\\:04\\:02Z - """)) - httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\ - sequenceNumber=0 - timestamp=2016-08-26T11\\:04\\:02Z - """)) - fname = tmp_path / 'my.cookie' - cookie_jar = cookiejarlib.MozillaCookieJar(str(fname)) - cookie_jar.save() +def test_init_date(runner, capsys, httpserver): + httpserver.expect_request('/state.txt').respond_with_data(dedent("""\ + sequenceNumber=100 + timestamp=2017-08-26T11\\:04\\:02Z + """)) + httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\ + sequenceNumber=0 + timestamp=2016-08-26T11\\:04\\:02Z + """)) + assert 0 == runner('-D', '2015-12-24T08:08:08Z') - assert 0 == self.main(httpserver, '--cookie', str(fname), - '-D', '2015-12-24T08:08:08Z') + output = capsys.readouterr().out.strip() - output = capsys.readouterr().out.strip() + assert output == '-1' - assert output == '-1' - def test_get_simple_update(self, tmp_path, httpserver): - outfile = tmp_path / f"{uuid.uuid4()}.opl" +def test_init_to_file(runner, tmp_path, replication_server): + fname = tmp_path / f"{uuid.uuid4()}.seq" - httpserver.expect_request('/state.txt').respond_with_data(dedent("""\ - sequenceNumber=454 - timestamp=2017-08-26T11\\:04\\:02Z - """)) - httpserver.expect_request('/000/000/454.state.txt').respond_with_data(dedent("""\ - sequenceNumber=454 - timestamp=2016-08-26T11\\:04\\:02Z - """)) - httpserver.expect_request('/000/000/454.opl').respond_with_data( - "n12 v1 x4 y6\nn13 v1 x9 y-6\nw2 v2 Nn1,n2") + assert 0 == runner('-I', '130', '-f', fname) + assert fname.read_text() == '130' - assert 0 == self.main(httpserver, '--diff-type', 'opl', - '-I', '453', '-o', str(outfile)) - ids = IDCollector() - osmium.apply(str(outfile), ids) +def test_init_from_seq_file(runner, tmp_path, replication_server): + fname = tmp_path / f"{uuid.uuid4()}.seq" + fname.write_text('140') - assert ids.nodes == [12, 13] - assert ids.ways == [2] - assert ids.relations == [] + assert 0 == runner('-f', fname) + assert fname.read_text() == '140' - @pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107), - (None, 1, 108), - (105, 1, 105), - (110, 1, 108)]) - def test_apply_diffs_endid(self, tmp_path, httpserver, end_id, max_size, actual_end): - outfile = tmp_path / f"{uuid.uuid4()}.opl" - httpserver.expect_request('/state.txt').respond_with_data("""\ - sequenceNumber=140 - timestamp=2017-08-26T11\\:04\\:02Z - """) - for i in range(100, 141): - httpserver.expect_request(f'/000/000/{i}.opl')\ - .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000))) +def test_init_date_with_cookie(runner, capsys, tmp_path, httpserver): + httpserver.expect_request('/state.txt').respond_with_data(dedent("""\ + sequenceNumber=100 + timestamp=2017-08-26T11\\:04\\:02Z + """)) + httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\ + sequenceNumber=0 + timestamp=2016-08-26T11\\:04\\:02Z + """)) - params = [httpserver, '--diff-type', 'opl', '-I', '100', '-o', str(outfile)] - if end_id is not None: - params.extend(('--end-id', str(end_id))) - if max_size is not None: - params.extend(('-s', str(max_size))) - - assert 0 == self.main(*params) - - ids = IDCollector() - osmium.apply(str(outfile), ids) - - assert ids.relations == list(range(101, actual_end + 1)) + fname = tmp_path / 'my.cookie' + cookie_jar = cookiejarlib.MozillaCookieJar(str(fname)) + cookie_jar.save() + + assert 0 == runner('--cookie', fname, '-D', '2015-12-24T08:08:08Z') + + output = capsys.readouterr().out.strip() + + assert output == '-1' + + +def test_get_simple_update(runner, tmp_path, replication_server): + outfile = tmp_path / f"{uuid.uuid4()}.opl" + + assert 0 == runner('-I', '139', '-o', outfile) + + ids = IDCollector() + osmium.apply(outfile, ids) + + assert ids.nodes == [] + assert ids.ways == [] + assert ids.relations == [140] + + +@pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107), + (None, 1, 108), + (105, 1, 105), + (110, 1, 108)]) +def test_apply_diffs_endid(runner, tmp_path, replication_server, end_id, max_size, actual_end): + outfile = tmp_path / f"{uuid.uuid4()}.opl" + + params = ['-I', '100', '-o', outfile] + if end_id is not None: + params.extend(('--end-id', end_id)) + if max_size is not None: + params.extend(('-s', max_size)) + + assert 0 == runner(*params) + + ids = IDCollector() + osmium.apply(str(outfile), ids) + + assert ids.relations == list(range(101, actual_end + 1)) + + +def test_change_id_too_old_for_replication_source(runner, tmp_path, replication_server, caplog): + outfile = tmp_path / f"{uuid.uuid4()}.opl" + + assert 1 == runner('-I', 98, '-o', outfile) + assert 'Cannot download state information for ID 98.' in caplog.text + + +def test_change_date_too_old_for_replication_source(runner, tmp_path, replication_server, caplog): + outfile = tmp_path / f"{uuid.uuid4()}.opl" + + assert 1 == runner('-D', '2015-12-24T08:08:08Z', '-o', outfile) + assert 'No starting point found' in caplog.text diff --git a/test/test_pyosmium_up-to-date.py b/test/test_pyosmium_up-to-date.py index 4c7706a..0f46c49 100644 --- a/test/test_pyosmium_up-to-date.py +++ b/test/test_pyosmium_up-to-date.py @@ -168,3 +168,23 @@ def test_update_with_enddate(test_data, runner, tmp_path): osmium.apply(newfile, ids) assert ids.relations == list(range(101, 106)) + + +def test_change_date_too_old_for_replication_source(test_data, runner, caplog): + outfile = test_data("n1 v1 t2070-04-05T06:30:00Z") + + assert 3 == runner(outfile) + assert 'No starting point found' in caplog.text + + +def test_change_id_too_old_for_replication_source(caplog, tmp_path, runner, replication_server): + outfile = tmp_path / f"{uuid.uuid4()}.pbf" + h = osmium.io.Header() + h.set('osmosis_replication_base_url', replication_server) + h.set('osmosis_replication_sequence_number', '98') + + with osmium.SimpleWriter(outfile, 4000, h) as w: + w.add_node({'id': 1}) + + assert 3 == runner(outfile) + assert 'Cannot download state information for ID 98' in caplog.text diff --git a/test/test_replication.py b/test/test_replication.py index 538a4cb..67cdc00 100644 --- a/test/test_replication.py +++ b/test/test_replication.py @@ -10,6 +10,7 @@ import uuid import pytest +import requests from werkzeug.wrappers import Response @@ -391,6 +392,23 @@ def test_apply_diffs_permanent_error(httpserver, caplog): httpserver.expect_ordered_request('/000/000/100.opl')\ .respond_with_data('not a file', status=404) + with caplog.at_level(logging.ERROR): + with rserv.ReplicationServer(httpserver.url_for(''), "opl") as svr: + h = CountingHandler() + with pytest.raises(requests.HTTPError, match='404'): + svr.apply_diffs(h, 100, 10000) + + assert 'Permanent server error' in caplog.text + + +def test_apply_diffs_transient_error_first_diff(httpserver, caplog): + httpserver.expect_ordered_request('/state.txt').respond_with_data("""\ + sequenceNumber=100 + timestamp=2017-08-26T11\\:04\\:02Z + """) + httpserver.expect_request('/000/000/100.opl')\ + .respond_with_data('not a file', status=503) + with caplog.at_level(logging.ERROR): with rserv.ReplicationServer(httpserver.url_for(''), "opl") as svr: h = CountingHandler() @@ -421,7 +439,7 @@ def test_apply_diffs_permanent_error_later_diff(httpserver, caplog): assert 'Error during diff download' in caplog.text -def test_apply_diffs_transient_error(httpserver, caplog): +def test_apply_diffs_transient_error_later_diff(httpserver, caplog): httpserver.expect_ordered_request('/state.txt').respond_with_data("""\ sequenceNumber=101 timestamp=2017-08-26T11\\:04\\:02Z