osmcode · lonvia · Oct 10, 2025 · Oct 5, 2025 · Oct 5, 2025 · Oct 9, 2025
diff --git a/src/osmium/replication/server.py b/src/osmium/replication/server.py
@@ -144,8 +144,14 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
             contains the MergeInputReader with the data and `newest` is a
             sequence id of the most recent diff available.
 
-            Returns None if there was an error during download or no new
-            data was available.
+            Returns None if there was no new data was available.
+
+            If there is an error during the download, then the function will
+            simply return the already downloaded data. If the reported
+            error is a client error (HTTP 4xx) and happens during the download
+            of the first diff, then a ::request.HTTPError:: is raised: this
+            condition is likely to be permanent and the caller should not
+            simply retry without investigating the cause.
         """
         # must not read data newer than the published sequence id
         # or we might end up reading partial data
@@ -168,8 +174,19 @@ def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
                 and current_id <= newest.sequence:
             try:
                 diffdata = self.get_diff_block(current_id)
-            except:  # noqa: E722
-                LOG.error("Error during diff download. Bailing out.")
+            except requests.RequestException as ex:
+                if start_id == current_id \
+                        and ex.response is not None \
+                        and (ex.response.status_code % 100 == 4):
+                    # If server directly responds with a client error,
+                    # reraise the exception to signal a potentially permanent
+                    # error.
+                    LOG.error("Permanent server error: %s", ex.response)
+                    raise ex
+                # In all other cases, process whatever diffs we have and
+                # encourage a retry.
+                LOG.error("Error during diff download: %s", ex)
+                LOG.error("Bailing out.")
                 diffdata = ''
             if len(diffdata) == 0:
                 if start_id == current_id:
@@ -305,16 +322,24 @@ def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int,
         return (diffs.id, diffs.newest)
 
     def timestamp_to_sequence(self, timestamp: dt.datetime,
-                              balanced_search: bool = False) -> Optional[int]:
+                              balanced_search: bool = False,
+                              limit_by_oldest_available: bool = False) -> Optional[int]:
         """ Get the sequence number of the replication file that contains the
             given timestamp. The search algorithm is optimised for replication
             servers that publish updates in regular intervals. For servers
             with irregular change file publication dates 'balanced_search`
             should be set to true so that a standard binary search for the
             sequence will be used. The default is good for all known
             OSM replication services.
-        """
 
+            When `limit_by_oldest_available` is set, then the function will
+            return None when the server replication does not start at 0 and
+            the given timestamp is older than the oldest available timestamp
+            on the server. Some replication servers do not keep the full
+            history and this flag avoids accidentally trying to download older
+            data. The downside is that the function will never return the
+            oldest available sequence ID when the flag is set.
+        """
         # get the current timestamp from the server
         upper = self.get_state_info()
 
@@ -331,8 +356,10 @@ def timestamp_to_sequence(self, timestamp: dt.datetime,
             lower = self.get_state_info(lowerid)
 
             if lower is not None and lower.timestamp >= timestamp:
-                if lower.sequence == 0 or lower.sequence + 1 >= upper.sequence:
-                    return lower.sequence
+                if lower.sequence == 0:
+                    return 0
+                if lower.sequence + 1 >= upper.sequence:
+                    return None if limit_by_oldest_available else lower.sequence
                 upper = lower
                 lower = None
                 lowerid = 0

diff --git a/src/osmium/tools/common.py b/src/osmium/tools/common.py
@@ -29,11 +29,19 @@ class ReplicationStart:
     def get_sequence(self, svr: ReplicationServer) -> Optional[int]:
         if self.seq_id is not None:
             log.debug("Using given sequence ID %d" % self.seq_id)
+            if self.seq_id > 0:
+                start_state = svr.get_state_info(seq=self.seq_id)
+                if start_state is None:
+                    log.error(
+                        f"Cannot download state information for ID {self.seq_id}."
+                        " Server may not have this diff anymore.")
+                    return None
+                self.date = start_state.timestamp
             return self.seq_id + 1
 
         assert self.date is not None
         log.debug("Looking up sequence ID for timestamp %s" % self.date)
-        return svr.timestamp_to_sequence(self.date)
+        return svr.timestamp_to_sequence(self.date, limit_by_oldest_available=True)
 
     def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]:
         if self.seq_id is not None:

diff --git a/src/osmium/tools/pyosmium_get_changes.py b/src/osmium/tools/pyosmium_get_changes.py
@@ -162,9 +162,16 @@ def pyosmium_get_changes(args: List[str]) -> int:
             cookie_jar.load(options.cookie)
             svr.set_request_parameter('cookies', cookie_jar)
 
+        # Sanity check if server URL is correct and server is responding.
+        current = svr.get_state_info()
+        if current is None:
+            log.error("Cannot download state information. Is the replication URL correct?")
+            return 3
+        log.debug(f"Server is at sequence {current.sequence} ({current.timestamp}).")
+
         startseq = options.start.get_sequence(svr)
         if startseq is None:
-            log.error("Cannot read state file from server. Is the URL correct?")
+            log.error(f"No starting point found for time {options.start.date} on server {url}")
             return 1
 
         if options.outfile is None:

diff --git a/src/osmium/tools/pyosmium_up_to_date.py b/src/osmium/tools/pyosmium_up_to_date.py
@@ -94,18 +94,10 @@ def update_from_custom_server(start: ReplicationStart, options: Any) -> int:
             log.error(f"No starting point found for time {start.date} on server {start.source}")
             return 3
 
-        if start.date is None:
-            start_state = svr.get_state_info(seq=startseq)
-            if start_state is None:
-                log.error(f"Cannot download state information for ID {startseq}. "
-                          'Is the URL correct?')
-                return 3
-            start.date = start_state.timestamp
-
         if not options.force_update:
             cmpdate = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=90)
             cmpdate = cmpdate.replace(tzinfo=dt.timezone.utc)
-            if start.date < cmpdate:
+            if start.date is None or start.date < cmpdate:
                 log.error(
                   """The OSM file is more than 3 months old. You should download a
                      more recent file instead of updating. If you really want to

diff --git a/test/test_pyosmium_get_changes.py b/test/test_pyosmium_get_changes.py
@@ -8,6 +8,7 @@
 """
 from textwrap import dedent
 import uuid
+import datetime as dt
 
 import pytest
 
@@ -22,115 +23,140 @@
     import cookielib as cookiejarlib
 
 
-class TestPyosmiumGetChanges:
+REPLICATION_BASE_TIME = dt.datetime(year=2017, month=8, day=26, hour=11, tzinfo=dt.timezone.utc)
+REPLICATION_BASE_SEQ = 100
+REPLICATION_CURRENT = 140
 
-    def main(self, httpserver, *args):
-        return pyosmium_get_changes(['--server', httpserver.url_for('')] + list(args))
 
-    def test_init_id(self, capsys, httpserver):
-        assert 0 == self.main(httpserver, '-I', '453')
+@pytest.fixture
+def replication_server(httpserver):
+    def _state(seq):
+        seqtime = REPLICATION_BASE_TIME + dt.timedelta(hours=seq - REPLICATION_CURRENT)
+        timestamp = seqtime.strftime('%Y-%m-%dT%H\\:%M\\:%SZ')
+        return f"sequenceNumber={seq}\ntimestamp={timestamp}\n"
 
-        output = capsys.readouterr().out.strip()
+    httpserver.no_handler_status_code = 404
+    httpserver.expect_request('/state.txt').respond_with_data(_state(REPLICATION_CURRENT))
+    for i in range(REPLICATION_BASE_SEQ, REPLICATION_CURRENT + 1):
+        httpserver.expect_request(f'/000/000/{i}.opl')\
+                  .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000)))
+        httpserver.expect_request(f'/000/000/{i}.state.txt').respond_with_data(_state(i))
 
-        assert output == '453'
+    return httpserver.url_for('')
 
-    def test_init_date(self, capsys, httpserver):
-        httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
-                    sequenceNumber=100
-                    timestamp=2017-08-26T11\\:04\\:02Z
-                 """))
-        httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
-                    sequenceNumber=0
-                    timestamp=2016-08-26T11\\:04\\:02Z
-                 """))
-        assert 0 == self.main(httpserver, '-D', '2015-12-24T08:08:08Z')
 
-        output = capsys.readouterr().out.strip()
+@pytest.fixture
+def runner(httpserver):
+    def _run(*args):
+        return pyosmium_get_changes(
+            ['--server', httpserver.url_for(''), '--diff-type', 'opl'] + list(map(str, args)))
 
-        assert output == '-1'
+    return _run
 
-    def test_init_to_file(self, tmp_path, httpserver):
-        fname = tmp_path / f"{uuid.uuid4()}.seq"
 
-        assert 0 == self.main(httpserver, '-I', '453', '-f', str(fname))
-        assert fname.read_text() == '453'
+def test_init_id(runner, capsys, replication_server):
+    assert 0 == runner('-I', '100')
 
-    def test_init_from_seq_file(self, tmp_path, httpserver):
-        fname = tmp_path / f"{uuid.uuid4()}.seq"
-        fname.write_text('453')
+    output = capsys.readouterr().out.strip()
 
-        assert 0 == self.main(httpserver, '-f', str(fname))
-        assert fname.read_text() == '453'
+    assert output == '100'
 
-    def test_init_date_with_cookie(self, capsys, tmp_path, httpserver):
-        httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
-                    sequenceNumber=100
-                    timestamp=2017-08-26T11\\:04\\:02Z
-                 """))
-        httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
-                    sequenceNumber=0
-                    timestamp=2016-08-26T11\\:04\\:02Z
-                 """))
 
-        fname = tmp_path / 'my.cookie'
-        cookie_jar = cookiejarlib.MozillaCookieJar(str(fname))
-        cookie_jar.save()
+def test_init_date(runner, capsys, httpserver):
+    httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
+                sequenceNumber=100
+                timestamp=2017-08-26T11\\:04\\:02Z
+             """))
+    httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
+                sequenceNumber=0
+                timestamp=2016-08-26T11\\:04\\:02Z
+             """))
+    assert 0 == runner('-D', '2015-12-24T08:08:08Z')
 
-        assert 0 == self.main(httpserver, '--cookie', str(fname),
-                              '-D', '2015-12-24T08:08:08Z')
+    output = capsys.readouterr().out.strip()
 
-        output = capsys.readouterr().out.strip()
+    assert output == '-1'
 
-        assert output == '-1'
 
-    def test_get_simple_update(self, tmp_path, httpserver):
-        outfile = tmp_path / f"{uuid.uuid4()}.opl"
+def test_init_to_file(runner, tmp_path, replication_server):
+    fname = tmp_path / f"{uuid.uuid4()}.seq"
 
-        httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
-                    sequenceNumber=454
-                    timestamp=2017-08-26T11\\:04\\:02Z
-                 """))
-        httpserver.expect_request('/000/000/454.state.txt').respond_with_data(dedent("""\
-                    sequenceNumber=454
-                    timestamp=2016-08-26T11\\:04\\:02Z
-                 """))
-        httpserver.expect_request('/000/000/454.opl').respond_with_data(
-                 "n12 v1 x4 y6\nn13 v1 x9 y-6\nw2 v2 Nn1,n2")
+    assert 0 == runner('-I', '130', '-f', fname)
+    assert fname.read_text() == '130'
 
-        assert 0 == self.main(httpserver, '--diff-type', 'opl',
-                              '-I', '453', '-o', str(outfile))
 
-        ids = IDCollector()
-        osmium.apply(str(outfile), ids)
+def test_init_from_seq_file(runner, tmp_path, replication_server):
+    fname = tmp_path / f"{uuid.uuid4()}.seq"
+    fname.write_text('140')
 
-        assert ids.nodes == [12, 13]
-        assert ids.ways == [2]
-        assert ids.relations == []
+    assert 0 == runner('-f', fname)
+    assert fname.read_text() == '140'
 
-    @pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107),
-                                                            (None, 1, 108),
-                                                            (105, 1, 105),
-                                                            (110, 1, 108)])
-    def test_apply_diffs_endid(self, tmp_path, httpserver, end_id, max_size, actual_end):
-        outfile = tmp_path / f"{uuid.uuid4()}.opl"
 
-        httpserver.expect_request('/state.txt').respond_with_data("""\
-            sequenceNumber=140
-            timestamp=2017-08-26T11\\:04\\:02Z
-        """)
-        for i in range(100, 141):
-            httpserver.expect_request(f'/000/000/{i}.opl')\
-                      .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000)))
+def test_init_date_with_cookie(runner, capsys, tmp_path, httpserver):
+    httpserver.expect_request('/state.txt').respond_with_data(dedent("""\
+                sequenceNumber=100
+                timestamp=2017-08-26T11\\:04\\:02Z
+             """))
+    httpserver.expect_request('/000/000/000.state.txt').respond_with_data(dedent("""\
+                sequenceNumber=0
+                timestamp=2016-08-26T11\\:04\\:02Z
+             """))
 
-        params = [httpserver, '--diff-type', 'opl', '-I', '100', '-o', str(outfile)]
-        if end_id is not None:
-            params.extend(('--end-id', str(end_id)))
-        if max_size is not None:
-            params.extend(('-s', str(max_size)))
-
-        assert 0 == self.main(*params)
-
-        ids = IDCollector()
-        osmium.apply(str(outfile), ids)
-
-        assert ids.relations == list(range(101, actual_end + 1))
+    fname = tmp_path / 'my.cookie'
+    cookie_jar = cookiejarlib.MozillaCookieJar(str(fname))
+    cookie_jar.save()
+
+    assert 0 == runner('--cookie', fname, '-D', '2015-12-24T08:08:08Z')
+
+    output = capsys.readouterr().out.strip()
+
+    assert output == '-1'
+
+
+def test_get_simple_update(runner, tmp_path, replication_server):
+    outfile = tmp_path / f"{uuid.uuid4()}.opl"
+
+    assert 0 == runner('-I', '139', '-o', outfile)
+
+    ids = IDCollector()
+    osmium.apply(outfile, ids)
+
+    assert ids.nodes == []
+    assert ids.ways == []
+    assert ids.relations == [140]
+
+
+@pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107),
+                                                        (None, 1, 108),
+                                                        (105, 1, 105),
+                                                        (110, 1, 108)])
+def test_apply_diffs_endid(runner, tmp_path, replication_server, end_id, max_size, actual_end):
+    outfile = tmp_path / f"{uuid.uuid4()}.opl"
+
+    params = ['-I', '100', '-o', outfile]
+    if end_id is not None:
+        params.extend(('--end-id', end_id))
+    if max_size is not None:
+        params.extend(('-s', max_size))
+
+    assert 0 == runner(*params)
+
+    ids = IDCollector()
+    osmium.apply(str(outfile), ids)
+
+    assert ids.relations == list(range(101, actual_end + 1))
+
+
+def test_change_id_too_old_for_replication_source(runner, tmp_path, replication_server, caplog):
+    outfile = tmp_path / f"{uuid.uuid4()}.opl"
+
+    assert 1 == runner('-I', 98, '-o', outfile)
+    assert 'Cannot download state information for ID 98.' in caplog.text
+
+
+def test_change_date_too_old_for_replication_source(runner, tmp_path, replication_server, caplog):
+    outfile = tmp_path / f"{uuid.uuid4()}.opl"
+
+    assert 1 == runner('-D', '2015-12-24T08:08:08Z', '-o', outfile)
+    assert 'No starting point found' in caplog.text
diff --git a/test/test_pyosmium_up-to-date.py b/test/test_pyosmium_up-to-date.py
@@ -168,3 +168,23 @@ def test_update_with_enddate(test_data, runner, tmp_path):
     osmium.apply(newfile, ids)
 
     assert ids.relations == list(range(101, 106))
+
+
+def test_change_date_too_old_for_replication_source(test_data, runner, caplog):
+    outfile = test_data("n1 v1 t2070-04-05T06:30:00Z")
+
+    assert 3 == runner(outfile)
+    assert 'No starting point found' in caplog.text
+
+
+def test_change_id_too_old_for_replication_source(caplog, tmp_path, runner, replication_server):
+    outfile = tmp_path / f"{uuid.uuid4()}.pbf"
+    h = osmium.io.Header()
+    h.set('osmosis_replication_base_url', replication_server)
+    h.set('osmosis_replication_sequence_number', '98')
+
+    with osmium.SimpleWriter(outfile, 4000, h) as w:
+        w.add_node({'id': 1})
+
+    assert 3 == runner(outfile)
+    assert 'Cannot download state information for ID 98' in caplog.text