Skip to content

Commit 785aae6

Browse files
authored
Merge pull request #305 from lonvia/past-replication
Improve working with replication servers which only keep part of the history
2 parents b2d2560 + ae0954c commit 785aae6

File tree

7 files changed

+206
-108
lines changed

7 files changed

+206
-108
lines changed

src/osmium/replication/server.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,14 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
144144
contains the MergeInputReader with the data and `newest` is a
145145
sequence id of the most recent diff available.
146146
147-
Returns None if there was an error during download or no new
148-
data was available.
147+
Returns None if there was no new data was available.
148+
149+
If there is an error during the download, then the function will
150+
simply return the already downloaded data. If the reported
151+
error is a client error (HTTP 4xx) and happens during the download
152+
of the first diff, then a ::request.HTTPError:: is raised: this
153+
condition is likely to be permanent and the caller should not
154+
simply retry without investigating the cause.
149155
"""
150156
# must not read data newer than the published sequence id
151157
# or we might end up reading partial data
@@ -168,8 +174,19 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
168174
and current_id <= newest.sequence:
169175
try:
170176
diffdata = self.get_diff_block(current_id)
171-
except: # noqa: E722
172-
LOG.error("Error during diff download. Bailing out.")
177+
except requests.RequestException as ex:
178+
if start_id == current_id \
179+
and ex.response is not None \
180+
and (ex.response.status_code % 100 == 4):
181+
# If server directly responds with a client error,
182+
# reraise the exception to signal a potentially permanent
183+
# error.
184+
LOG.error("Permanent server error: %s", ex.response)
185+
raise ex
186+
# In all other cases, process whatever diffs we have and
187+
# encourage a retry.
188+
LOG.error("Error during diff download: %s", ex)
189+
LOG.error("Bailing out.")
173190
diffdata = ''
174191
if len(diffdata) == 0:
175192
if start_id == current_id:
@@ -305,16 +322,24 @@ def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int,
305322
return (diffs.id, diffs.newest)
306323

307324
def timestamp_to_sequence(self, timestamp: dt.datetime,
308-
balanced_search: bool = False) -> Optional[int]:
325+
balanced_search: bool = False,
326+
limit_by_oldest_available: bool = False) -> Optional[int]:
309327
""" Get the sequence number of the replication file that contains the
310328
given timestamp. The search algorithm is optimised for replication
311329
servers that publish updates in regular intervals. For servers
312330
with irregular change file publication dates 'balanced_search`
313331
should be set to true so that a standard binary search for the
314332
sequence will be used. The default is good for all known
315333
OSM replication services.
316-
"""
317334
335+
When `limit_by_oldest_available` is set, then the function will
336+
return None when the server replication does not start at 0 and
337+
the given timestamp is older than the oldest available timestamp
338+
on the server. Some replication servers do not keep the full
339+
history and this flag avoids accidentally trying to download older
340+
data. The downside is that the function will never return the
341+
oldest available sequence ID when the flag is set.
342+
"""
318343
# get the current timestamp from the server
319344
upper = self.get_state_info()
320345

@@ -331,8 +356,10 @@ def timestamp_to_sequence(self, timestamp: dt.datetime,
331356
lower = self.get_state_info(lowerid)
332357

333358
if lower is not None and lower.timestamp >= timestamp:
334-
if lower.sequence == 0 or lower.sequence + 1 >= upper.sequence:
335-
return lower.sequence
359+
if lower.sequence == 0:
360+
return 0
361+
if lower.sequence + 1 >= upper.sequence:
362+
return None if limit_by_oldest_available else lower.sequence
336363
upper = lower
337364
lower = None
338365
lowerid = 0

src/osmium/tools/common.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,19 @@ class ReplicationStart:
2929
def get_sequence(self, svr: ReplicationServer) -> Optional[int]:
3030
if self.seq_id is not None:
3131
log.debug("Using given sequence ID %d" % self.seq_id)
32+
if self.seq_id > 0:
33+
start_state = svr.get_state_info(seq=self.seq_id)
34+
if start_state is None:
35+
log.error(
36+
f"Cannot download state information for ID {self.seq_id}."
37+
" Server may not have this diff anymore.")
38+
return None
39+
self.date = start_state.timestamp
3240
return self.seq_id + 1
3341

3442
assert self.date is not None
3543
log.debug("Looking up sequence ID for timestamp %s" % self.date)
36-
return svr.timestamp_to_sequence(self.date)
44+
return svr.timestamp_to_sequence(self.date, limit_by_oldest_available=True)
3745

3846
def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]:
3947
if self.seq_id is not None:

src/osmium/tools/pyosmium_get_changes.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,16 @@ def pyosmium_get_changes(args: List[str]) -> int:
162162
cookie_jar.load(options.cookie)
163163
svr.set_request_parameter('cookies', cookie_jar)
164164

165+
# Sanity check if server URL is correct and server is responding.
166+
current = svr.get_state_info()
167+
if current is None:
168+
log.error("Cannot download state information. Is the replication URL correct?")
169+
return 3
170+
log.debug(f"Server is at sequence {current.sequence} ({current.timestamp}).")
171+
165172
startseq = options.start.get_sequence(svr)
166173
if startseq is None:
167-
log.error("Cannot read state file from server. Is the URL correct?")
174+
log.error(f"No starting point found for time {options.start.date} on server {url}")
168175
return 1
169176

170177
if options.outfile is None:

src/osmium/tools/pyosmium_up_to_date.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,18 +94,10 @@ def update_from_custom_server(start: ReplicationStart, options: Any) -> int:
9494
log.error(f"No starting point found for time {start.date} on server {start.source}")
9595
return 3
9696

97-
if start.date is None:
98-
start_state = svr.get_state_info(seq=startseq)
99-
if start_state is None:
100-
log.error(f"Cannot download state information for ID {startseq}. "
101-
'Is the URL correct?')
102-
return 3
103-
start.date = start_state.timestamp
104-
10597
if not options.force_update:
10698
cmpdate = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=90)
10799
cmpdate = cmpdate.replace(tzinfo=dt.timezone.utc)
108-
if start.date < cmpdate:
100+
if start.date is None or start.date < cmpdate:
109101
log.error(
110102
"""The OSM file is more than 3 months old. You should download a
111103
more recent file instead of updating. If you really want to

test/test_pyosmium_get_changes.py

Lines changed: 114 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"""
99
from textwrap import dedent
1010
import uuid
11+
import datetime as dt
1112

1213
import pytest
1314

@@ -22,115 +23,140 @@
2223
import cookielib as cookiejarlib
2324

2425

25-
class TestPyosmiumGetChanges:
26+
REPLICATION_BASE_TIME = dt.datetime(year=2017, month=8, day=26, hour=11, tzinfo=dt.timezone.utc)
27+
REPLICATION_BASE_SEQ = 100
28+
REPLICATION_CURRENT = 140
2629

27-
def main(self, httpserver, *args):
28-
return pyosmium_get_changes(['--server', httpserver.url_for('')] + list(args))
2930

30-
def test_init_id(self, capsys, httpserver):
31-
assert 0 == self.main(httpserver, '-I', '453')
31+
@pytest.fixture
32+
def replication_server(httpserver):
33+
def _state(seq):
34+
seqtime = REPLICATION_BASE_TIME + dt.timedelta(hours=seq - REPLICATION_CURRENT)
35+
timestamp = seqtime.strftime('%Y-%m-%dT%H\\:%M\\:%SZ')
36+
return f"sequenceNumber={seq}\ntimestamp={timestamp}\n"
3237

33-
output = capsys.readouterr().out.strip()
38+
httpserver.no_handler_status_code = 404
39+
httpserver.expect_request('/state.txt').respond_with_data(_state(REPLICATION_CURRENT))
40+
for i in range(REPLICATION_BASE_SEQ, REPLICATION_CURRENT + 1):
41+
httpserver.expect_request(f'/000/000/{i}.opl')\
42+
.respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000)))
43+
httpserver.expect_request(f'/000/000/{i}.state.txt').respond_with_data(_state(i))
3444

35-
assert output == '453'
45+
return httpserver.url_for('')
3646

37-
def test_init_date(self, capsys, httpserver):
38-
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
39-
sequenceNumber=100
40-
timestamp=2017-08-26T11\\:04\\:02Z
41-
"""))
42-
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
43-
sequenceNumber=0
44-
timestamp=2016-08-26T11\\:04\\:02Z
45-
"""))
46-
assert 0 == self.main(httpserver, '-D', '2015-12-24T08:08:08Z')
4747

48-
output = capsys.readouterr().out.strip()
48+
@pytest.fixture
49+
def runner(httpserver):
50+
def _run(*args):
51+
return pyosmium_get_changes(
52+
['--server', httpserver.url_for(''), '--diff-type', 'opl'] + list(map(str, args)))
4953

50-
assert output == '-1'
54+
return _run
5155

52-
def test_init_to_file(self, tmp_path, httpserver):
53-
fname = tmp_path / f"{uuid.uuid4()}.seq"
5456

55-
assert 0 == self.main(httpserver, '-I', '453', '-f', str(fname))
56-
assert fname.read_text() == '453'
57+
def test_init_id(runner, capsys, replication_server):
58+
assert 0 == runner('-I', '100')
5759

58-
def test_init_from_seq_file(self, tmp_path, httpserver):
59-
fname = tmp_path / f"{uuid.uuid4()}.seq"
60-
fname.write_text('453')
60+
output = capsys.readouterr().out.strip()
6161

62-
assert 0 == self.main(httpserver, '-f', str(fname))
63-
assert fname.read_text() == '453'
62+
assert output == '100'
6463

65-
def test_init_date_with_cookie(self, capsys, tmp_path, httpserver):
66-
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
67-
sequenceNumber=100
68-
timestamp=2017-08-26T11\\:04\\:02Z
69-
"""))
70-
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
71-
sequenceNumber=0
72-
timestamp=2016-08-26T11\\:04\\:02Z
73-
"""))
7464

75-
fname = tmp_path / 'my.cookie'
76-
cookie_jar = cookiejarlib.MozillaCookieJar(str(fname))
77-
cookie_jar.save()
65+
def test_init_date(runner, capsys, httpserver):
66+
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
67+
sequenceNumber=100
68+
timestamp=2017-08-26T11\\:04\\:02Z
69+
"""))
70+
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
71+
sequenceNumber=0
72+
timestamp=2016-08-26T11\\:04\\:02Z
73+
"""))
74+
assert 0 == runner('-D', '2015-12-24T08:08:08Z')
7875

79-
assert 0 == self.main(httpserver, '--cookie', str(fname),
80-
'-D', '2015-12-24T08:08:08Z')
76+
output = capsys.readouterr().out.strip()
8177

82-
output = capsys.readouterr().out.strip()
78+
assert output == '-1'
8379

84-
assert output == '-1'
8580

86-
def test_get_simple_update(self, tmp_path, httpserver):
87-
outfile = tmp_path / f"{uuid.uuid4()}.opl"
81+
def test_init_to_file(runner, tmp_path, replication_server):
82+
fname = tmp_path / f"{uuid.uuid4()}.seq"
8883

89-
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
90-
sequenceNumber=454
91-
timestamp=2017-08-26T11\\:04\\:02Z
92-
"""))
93-
httpserver.expect_request('/000/000/454.state.txt').respond_with_data(dedent("""\
94-
sequenceNumber=454
95-
timestamp=2016-08-26T11\\:04\\:02Z
96-
"""))
97-
httpserver.expect_request('/000/000/454.opl').respond_with_data(
98-
"n12 v1 x4 y6\nn13 v1 x9 y-6\nw2 v2 Nn1,n2")
84+
assert 0 == runner('-I', '130', '-f', fname)
85+
assert fname.read_text() == '130'
9986

100-
assert 0 == self.main(httpserver, '--diff-type', 'opl',
101-
'-I', '453', '-o', str(outfile))
10287

103-
ids = IDCollector()
104-
osmium.apply(str(outfile), ids)
88+
def test_init_from_seq_file(runner, tmp_path, replication_server):
89+
fname = tmp_path / f"{uuid.uuid4()}.seq"
90+
fname.write_text('140')
10591

106-
assert ids.nodes == [12, 13]
107-
assert ids.ways == [2]
108-
assert ids.relations == []
92+
assert 0 == runner('-f', fname)
93+
assert fname.read_text() == '140'
10994

110-
@pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107),
111-
(None, 1, 108),
112-
(105, 1, 105),
113-
(110, 1, 108)])
114-
def test_apply_diffs_endid(self, tmp_path, httpserver, end_id, max_size, actual_end):
115-
outfile = tmp_path / f"{uuid.uuid4()}.opl"
11695

117-
httpserver.expect_request('/state.txt').respond_with_data("""\
118-
sequenceNumber=140
119-
timestamp=2017-08-26T11\\:04\\:02Z
120-
""")
121-
for i in range(100, 141):
122-
httpserver.expect_request(f'/000/000/{i}.opl')\
123-
.respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000)))
96+
def test_init_date_with_cookie(runner, capsys, tmp_path, httpserver):
97+
httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
98+
sequenceNumber=100
99+
timestamp=2017-08-26T11\\:04\\:02Z
100+
"""))
101+
httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
102+
sequenceNumber=0
103+
timestamp=2016-08-26T11\\:04\\:02Z
104+
"""))
124105

125-
params = [httpserver, '--diff-type', 'opl', '-I', '100', '-o', str(outfile)]
126-
if end_id is not None:
127-
params.extend(('--end-id', str(end_id)))
128-
if max_size is not None:
129-
params.extend(('-s', str(max_size)))
130-
131-
assert 0 == self.main(*params)
132-
133-
ids = IDCollector()
134-
osmium.apply(str(outfile), ids)
135-
136-
assert ids.relations == list(range(101, actual_end + 1))
106+
fname = tmp_path / 'my.cookie'
107+
cookie_jar = cookiejarlib.MozillaCookieJar(str(fname))
108+
cookie_jar.save()
109+
110+
assert 0 == runner('--cookie', fname, '-D', '2015-12-24T08:08:08Z')
111+
112+
output = capsys.readouterr().out.strip()
113+
114+
assert output == '-1'
115+
116+
117+
def test_get_simple_update(runner, tmp_path, replication_server):
118+
outfile = tmp_path / f"{uuid.uuid4()}.opl"
119+
120+
assert 0 == runner('-I', '139', '-o', outfile)
121+
122+
ids = IDCollector()
123+
osmium.apply(outfile, ids)
124+
125+
assert ids.nodes == []
126+
assert ids.ways == []
127+
assert ids.relations == [140]
128+
129+
130+
@pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107),
131+
(None, 1, 108),
132+
(105, 1, 105),
133+
(110, 1, 108)])
134+
def test_apply_diffs_endid(runner, tmp_path, replication_server, end_id, max_size, actual_end):
135+
outfile = tmp_path / f"{uuid.uuid4()}.opl"
136+
137+
params = ['-I', '100', '-o', outfile]
138+
if end_id is not None:
139+
params.extend(('--end-id', end_id))
140+
if max_size is not None:
141+
params.extend(('-s', max_size))
142+
143+
assert 0 == runner(*params)
144+
145+
ids = IDCollector()
146+
osmium.apply(str(outfile), ids)
147+
148+
assert ids.relations == list(range(101, actual_end + 1))
149+
150+
151+
def test_change_id_too_old_for_replication_source(runner, tmp_path, replication_server, caplog):
152+
outfile = tmp_path / f"{uuid.uuid4()}.opl"
153+
154+
assert 1 == runner('-I', 98, '-o', outfile)
155+
assert 'Cannot download state information for ID 98.' in caplog.text
156+
157+
158+
def test_change_date_too_old_for_replication_source(runner, tmp_path, replication_server, caplog):
159+
outfile = tmp_path / f"{uuid.uuid4()}.opl"
160+
161+
assert 1 == runner('-D', '2015-12-24T08:08:08Z', '-o', outfile)
162+
assert 'No starting point found' in caplog.text

test/test_pyosmium_up-to-date.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,23 @@ def test_update_with_enddate(test_data, runner, tmp_path):
168168
osmium.apply(newfile, ids)
169169

170170
assert ids.relations == list(range(101, 106))
171+
172+
173+
def test_change_date_too_old_for_replication_source(test_data, runner, caplog):
174+
outfile = test_data("n1 v1 t2070-04-05T06:30:00Z")
175+
176+
assert 3 == runner(outfile)
177+
assert 'No starting point found' in caplog.text
178+
179+
180+
def test_change_id_too_old_for_replication_source(caplog, tmp_path, runner, replication_server):
181+
outfile = tmp_path / f"{uuid.uuid4()}.pbf"
182+
h = osmium.io.Header()
183+
h.set('osmosis_replication_base_url', replication_server)
184+
h.set('osmosis_replication_sequence_number', '98')
185+
186+
with osmium.SimpleWriter(outfile, 4000, h) as w:
187+
w.add_node({'id': 1})
188+
189+
assert 3 == runner(outfile)
190+
assert 'Cannot download state information for ID 98' in caplog.text

0 commit comments

Comments
 (0)