Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 35 additions & 8 deletions src/osmium/replication/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,14 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
contains the MergeInputReader with the data and `newest` is a
sequence id of the most recent diff available.

Returns None if there was an error during download or no new
data was available.
Returns None if no new data was available.

If there is an error during the download, then the function will
simply return the already downloaded data. If the reported
error is a client error (HTTP 4xx) and happens during the download
of the first diff, then a `requests.HTTPError` is raised: this
condition is likely to be permanent and the caller should not
simply retry without investigating the cause.
"""
# must not read data newer than the published sequence id
# or we might end up reading partial data
Expand All @@ -168,8 +174,19 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
and current_id <= newest.sequence:
try:
diffdata = self.get_diff_block(current_id)
except: # noqa: E722
LOG.error("Error during diff download. Bailing out.")
except requests.RequestException as ex:
if start_id == current_id \
and ex.response is not None \
and (ex.response.status_code % 100 == 4):
# If server directly responds with a client error,
# reraise the exception to signal a potentially permanent
# error.
LOG.error("Permanent server error: %s", ex.response)
raise ex
# In all other cases, process whatever diffs we have and
# encourage a retry.
LOG.error("Error during diff download: %s", ex)
LOG.error("Bailing out.")
diffdata = ''
if len(diffdata) == 0:
if start_id == current_id:
Expand Down Expand Up @@ -305,16 +322,24 @@ def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int,
return (diffs.id, diffs.newest)

def timestamp_to_sequence(self, timestamp: dt.datetime,
balanced_search: bool = False) -> Optional[int]:
balanced_search: bool = False,
limit_by_oldest_available: bool = False) -> Optional[int]:
""" Get the sequence number of the replication file that contains the
given timestamp. The search algorithm is optimised for replication
servers that publish updates in regular intervals. For servers
with irregular change file publication dates `balanced_search`
should be set to true so that a standard binary search for the
sequence will be used. The default is good for all known
OSM replication services.
"""

When `limit_by_oldest_available` is set, then the function will
return None when the server replication does not start at 0 and
the given timestamp is older than the oldest available timestamp
on the server. Some replication servers do not keep the full
history and this flag avoids accidentally trying to download older
data. The downside is that the function will never return the
oldest available sequence ID when the flag is set.
"""
# get the current timestamp from the server
upper = self.get_state_info()

Expand All @@ -331,8 +356,10 @@ def timestamp_to_sequence(self, timestamp: dt.datetime,
lower = self.get_state_info(lowerid)

if lower is not None and lower.timestamp >= timestamp:
if lower.sequence == 0 or lower.sequence + 1 >= upper.sequence:
return lower.sequence
if lower.sequence == 0:
return 0
if lower.sequence + 1 >= upper.sequence:
return None if limit_by_oldest_available else lower.sequence
upper = lower
lower = None
lowerid = 0
Expand Down
10 changes: 9 additions & 1 deletion src/osmium/tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,19 @@ class ReplicationStart:
def get_sequence(self, svr: ReplicationServer) -> Optional[int]:
if self.seq_id is not None:
log.debug("Using given sequence ID %d" % self.seq_id)
if self.seq_id > 0:
start_state = svr.get_state_info(seq=self.seq_id)
if start_state is None:
log.error(
f"Cannot download state information for ID {self.seq_id}."
" Server may not have this diff anymore.")
return None
self.date = start_state.timestamp
return self.seq_id + 1

assert self.date is not None
log.debug("Looking up sequence ID for timestamp %s" % self.date)
return svr.timestamp_to_sequence(self.date)
return svr.timestamp_to_sequence(self.date, limit_by_oldest_available=True)

def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]:
if self.seq_id is not None:
Expand Down
9 changes: 8 additions & 1 deletion src/osmium/tools/pyosmium_get_changes.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,16 @@ def pyosmium_get_changes(args: List[str]) -> int:
cookie_jar.load(options.cookie)
svr.set_request_parameter('cookies', cookie_jar)

# Sanity check if server URL is correct and server is responding.
current = svr.get_state_info()
if current is None:
log.error("Cannot download state information. Is the replication URL correct?")
return 3
log.debug(f"Server is at sequence {current.sequence} ({current.timestamp}).")

startseq = options.start.get_sequence(svr)
if startseq is None:
log.error("Cannot read state file from server. Is the URL correct?")
log.error(f"No starting point found for time {options.start.date} on server {url}")
return 1

if options.outfile is None:
Expand Down
10 changes: 1 addition & 9 deletions src/osmium/tools/pyosmium_up_to_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,10 @@ def update_from_custom_server(start: ReplicationStart, options: Any) -> int:
log.error(f"No starting point found for time {start.date} on server {start.source}")
return 3

if start.date is None:
start_state = svr.get_state_info(seq=startseq)
if start_state is None:
log.error(f"Cannot download state information for ID {startseq}. "
'Is the URL correct?')
return 3
start.date = start_state.timestamp

if not options.force_update:
cmpdate = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=90)
cmpdate = cmpdate.replace(tzinfo=dt.timezone.utc)
if start.date < cmpdate:
if start.date is None or start.date < cmpdate:
log.error(
"""The OSM file is more than 3 months old. You should download a
more recent file instead of updating. If you really want to
Expand Down
202 changes: 114 additions & 88 deletions test/test_pyosmium_get_changes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""
from textwrap import dedent
import uuid
import datetime as dt

import pytest

Expand All @@ -22,115 +23,140 @@
import cookielib as cookiejarlib


class TestPyosmiumGetChanges:
REPLICATION_BASE_TIME = dt.datetime(year=2017, month=8, day=26, hour=11, tzinfo=dt.timezone.utc)
REPLICATION_BASE_SEQ = 100
REPLICATION_CURRENT = 140

def main(self, httpserver, *args):
return pyosmium_get_changes(['--server', httpserver.url_for('')] + list(args))

def test_init_id(self, capsys, httpserver):
assert 0 == self.main(httpserver, '-I', '453')
@pytest.fixture
def replication_server(httpserver):
def _state(seq):
seqtime = REPLICATION_BASE_TIME + dt.timedelta(hours=seq - REPLICATION_CURRENT)
timestamp = seqtime.strftime('%Y-%m-%dT%H\\:%M\\:%SZ')
return f"sequenceNumber={seq}\ntimestamp={timestamp}\n"

output = capsys.readouterr().out.strip()
httpserver.no_handler_status_code = 404
httpserver.expect_request('/state.txt').respond_with_data(_state(REPLICATION_CURRENT))
for i in range(REPLICATION_BASE_SEQ, REPLICATION_CURRENT + 1):
httpserver.expect_request(f'/000/000/{i}.opl')\
.respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000)))
httpserver.expect_request(f'/000/000/{i}.state.txt').respond_with_data(_state(i))

assert output == '453'
return httpserver.url_for('')

def test_init_date(self, capsys, httpserver):
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
sequenceNumber=100
timestamp=2017-08-26T11\\:04\\:02Z
"""))
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
sequenceNumber=0
timestamp=2016-08-26T11\\:04\\:02Z
"""))
assert 0 == self.main(httpserver, '-D', '2015-12-24T08:08:08Z')

output = capsys.readouterr().out.strip()
@pytest.fixture
def runner(httpserver):
def _run(*args):
return pyosmium_get_changes(
['--server', httpserver.url_for(''), '--diff-type', 'opl'] + list(map(str, args)))

assert output == '-1'
return _run

def test_init_to_file(self, tmp_path, httpserver):
fname = tmp_path / f"{uuid.uuid4()}.seq"

assert 0 == self.main(httpserver, '-I', '453', '-f', str(fname))
assert fname.read_text() == '453'
def test_init_id(runner, capsys, replication_server):
assert 0 == runner('-I', '100')

def test_init_from_seq_file(self, tmp_path, httpserver):
fname = tmp_path / f"{uuid.uuid4()}.seq"
fname.write_text('453')
output = capsys.readouterr().out.strip()

assert 0 == self.main(httpserver, '-f', str(fname))
assert fname.read_text() == '453'
assert output == '100'

def test_init_date_with_cookie(self, capsys, tmp_path, httpserver):
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
sequenceNumber=100
timestamp=2017-08-26T11\\:04\\:02Z
"""))
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
sequenceNumber=0
timestamp=2016-08-26T11\\:04\\:02Z
"""))

fname = tmp_path / 'my.cookie'
cookie_jar = cookiejarlib.MozillaCookieJar(str(fname))
cookie_jar.save()
def test_init_date(runner, capsys, httpserver):
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
sequenceNumber=100
timestamp=2017-08-26T11\\:04\\:02Z
"""))
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
sequenceNumber=0
timestamp=2016-08-26T11\\:04\\:02Z
"""))
assert 0 == runner('-D', '2015-12-24T08:08:08Z')

assert 0 == self.main(httpserver, '--cookie', str(fname),
'-D', '2015-12-24T08:08:08Z')
output = capsys.readouterr().out.strip()

output = capsys.readouterr().out.strip()
assert output == '-1'

assert output == '-1'

def test_get_simple_update(self, tmp_path, httpserver):
outfile = tmp_path / f"{uuid.uuid4()}.opl"
def test_init_to_file(runner, tmp_path, replication_server):
fname = tmp_path / f"{uuid.uuid4()}.seq"

httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
sequenceNumber=454
timestamp=2017-08-26T11\\:04\\:02Z
"""))
httpserver.expect_request('/000/000/454.state.txt').respond_with_data(dedent("""\
sequenceNumber=454
timestamp=2016-08-26T11\\:04\\:02Z
"""))
httpserver.expect_request('/000/000/454.opl').respond_with_data(
"n12 v1 x4 y6\nn13 v1 x9 y-6\nw2 v2 Nn1,n2")
assert 0 == runner('-I', '130', '-f', fname)
assert fname.read_text() == '130'

assert 0 == self.main(httpserver, '--diff-type', 'opl',
'-I', '453', '-o', str(outfile))

ids = IDCollector()
osmium.apply(str(outfile), ids)
def test_init_from_seq_file(runner, tmp_path, replication_server):
fname = tmp_path / f"{uuid.uuid4()}.seq"
fname.write_text('140')

assert ids.nodes == [12, 13]
assert ids.ways == [2]
assert ids.relations == []
assert 0 == runner('-f', fname)
assert fname.read_text() == '140'

@pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107),
(None, 1, 108),
(105, 1, 105),
(110, 1, 108)])
def test_apply_diffs_endid(self, tmp_path, httpserver, end_id, max_size, actual_end):
outfile = tmp_path / f"{uuid.uuid4()}.opl"

httpserver.expect_request('/state.txt').respond_with_data("""\
sequenceNumber=140
timestamp=2017-08-26T11\\:04\\:02Z
""")
for i in range(100, 141):
httpserver.expect_request(f'/000/000/{i}.opl')\
.respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000)))
def test_init_date_with_cookie(runner, capsys, tmp_path, httpserver):
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
sequenceNumber=100
timestamp=2017-08-26T11\\:04\\:02Z
"""))
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
sequenceNumber=0
timestamp=2016-08-26T11\\:04\\:02Z
"""))

params = [httpserver, '--diff-type', 'opl', '-I', '100', '-o', str(outfile)]
if end_id is not None:
params.extend(('--end-id', str(end_id)))
if max_size is not None:
params.extend(('-s', str(max_size)))

assert 0 == self.main(*params)

ids = IDCollector()
osmium.apply(str(outfile), ids)

assert ids.relations == list(range(101, actual_end + 1))
fname = tmp_path / 'my.cookie'
cookie_jar = cookiejarlib.MozillaCookieJar(str(fname))
cookie_jar.save()

assert 0 == runner('--cookie', fname, '-D', '2015-12-24T08:08:08Z')

output = capsys.readouterr().out.strip()

assert output == '-1'


def test_get_simple_update(runner, tmp_path, replication_server):
    """ Download the changes following sequence 139 into an OPL file
        and check that the file contains relation 140 and nothing else.
    """
    target = tmp_path / f"{uuid.uuid4()}.opl"

    exit_code = runner('-I', '139', '-o', target)
    assert 0 == exit_code

    collector = IDCollector()
    osmium.apply(target, collector)

    # No nodes or ways are expected, only the single relation.
    found = (collector.nodes, collector.ways, collector.relations)
    assert found == ([], [], [140])


@pytest.mark.parametrize('end_id,max_size,actual_end', [
    (107, None, 107),
    (None, 1, 108),
    (105, 1, 105),
    (110, 1, 108),
])
def test_apply_diffs_endid(runner, tmp_path, replication_server, end_id, max_size, actual_end):
    """ Download changes starting after sequence 100 with an optional
        end ID and/or size limit and check which relations end up in
        the output file.
    """
    outfile = tmp_path / f"{uuid.uuid4()}.opl"

    cli_args = ['-I', '100', '-o', outfile]
    # Optional limits are only passed on when the test case sets them.
    for flag, value in (('--end-id', end_id), ('-s', max_size)):
        if value is not None:
            cli_args += [flag, value]

    assert 0 == runner(*cli_args)

    collector = IDCollector()
    osmium.apply(str(outfile), collector)

    expected = list(range(101, actual_end + 1))
    assert collector.relations == expected


def test_change_id_too_old_for_replication_source(runner, tmp_path, replication_server, caplog):
    """ Starting from sequence ID 98 must fail with exit code 1 and
        log that the state information for that ID cannot be downloaded.
    """
    target = tmp_path / f"{uuid.uuid4()}.opl"

    exit_code = runner('-I', 98, '-o', target)

    assert 1 == exit_code
    assert 'Cannot download state information for ID 98.' in caplog.text


def test_change_date_too_old_for_replication_source(runner, tmp_path, replication_server, caplog):
    """ Starting from a date in 2015 must fail with exit code 1 and
        log that no starting point was found.
    """
    target = tmp_path / f"{uuid.uuid4()}.opl"

    exit_code = runner('-D', '2015-12-24T08:08:08Z', '-o', target)

    assert 1 == exit_code
    assert 'No starting point found' in caplog.text
20 changes: 20 additions & 0 deletions test/test_pyosmium_up-to-date.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,23 @@ def test_update_with_enddate(test_data, runner, tmp_path):
osmium.apply(newfile, ids)

assert ids.relations == list(range(101, 106))


def test_change_date_too_old_for_replication_source(test_data, runner, caplog):
    """ Updating the generated test file must fail with exit code 3 and
        log that no starting point was found.
    """
    infile = test_data("n1 v1 t2070-04-05T06:30:00Z")

    exit_code = runner(infile)

    assert 3 == exit_code
    assert 'No starting point found' in caplog.text


def test_change_id_too_old_for_replication_source(caplog, tmp_path, runner, replication_server):
    """ Updating a file whose header points at replication sequence 98
        must fail with exit code 3 and log that the state information
        for that ID cannot be downloaded.
    """
    infile = tmp_path / f"{uuid.uuid4()}.pbf"

    # Store the replication source and sequence number in the file header.
    header = osmium.io.Header()
    for key, value in (('osmosis_replication_base_url', replication_server),
                       ('osmosis_replication_sequence_number', '98')):
        header.set(key, value)

    with osmium.SimpleWriter(infile, 4000, header) as writer:
        writer.add_node({'id': 1})

    exit_code = runner(infile)

    assert 3 == exit_code
    assert 'Cannot download state information for ID 98' in caplog.text
Loading