From 2227d5d3e699e9848c02dfaf7f6c2288d1ef0434 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 27 Nov 2025 21:48:32 +0100 Subject: [PATCH 1/8] BUG: fix Time type columns being skipped with arrow=False --- pyogrio/_io.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index e8f5c4fd..e9c35335 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -59,7 +59,7 @@ FIELD_TYPES = [ None, # OFTWideStringList, deprecated, not supported "object", # OFTBinary, Raw Binary data "datetime64[D]", # OFTDate, Date - None, # OFTTime, Time, NOTE: not directly supported in numpy + "object", # OFTTime, Time, NOTE: not directly supported in numpy "datetime64[ms]", # OFTDateTime, Date and Time "int64", # OFTInteger64, Single 64bit integer "list(int64)" # OFTInteger64List, List of 64bit integers, not supported @@ -976,7 +976,7 @@ cdef process_fields( bin_value = OGR_F_GetFieldAsBinary(ogr_feature, field_index, &ret_length) data[i] = bin_value[:ret_length] - elif field_type == OFTDateTime or field_type == OFTDate: + elif field_type == OFTDateTime or field_type == OFTDate or field_type == OFTTime: if datetime_as_string: # defer datetime parsing to user/ pandas layer @@ -1018,6 +1018,9 @@ cdef process_fields( data[i] = datetime.datetime( year, month, day, hour, minute, second, microsecond ).isoformat() + + elif field_type == OFTTime: + data[i] = datetime.time(hour, minute, second, microsecond) elif field_type == OFTIntegerList: # According to GDAL doc, this can return NULL for an empty list, which is a From 27736c642d651383f645c1a9fa41322be3c301d4 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 27 Nov 2025 22:07:53 +0100 Subject: [PATCH 2/8] Add tests --- pyogrio/_compat.py | 4 ++ pyogrio/_io.pyx | 2 +- pyogrio/tests/conftest.py | 36 +++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 65 ++++++++++++++++++++++++++++-- 4 files changed, 103 insertions(+), 4 deletions(-) diff --git a/pyogrio/_compat.py b/pyogrio/_compat.py index 3f8515c1..d04984f8 100644 --- a/pyogrio/_compat.py +++ b/pyogrio/_compat.py @@ -1,5 +1,7 @@ from packaging.version import Version +import numpy as np + from pyogrio.core import __gdal_geos_version__, __gdal_version__ # detect optional dependencies @@ -38,6 +40,8 @@ HAS_GEOPANDAS = geopandas is not None +NUMPY_GE_20 = Version(np.__version__) >= Version("2.0.0") + PANDAS_GE_15 = pandas is not None and Version(pandas.__version__) >= Version("1.5.0") PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0") PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0") diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index e9c35335..6d14bd1b 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -978,7 +978,7 @@ cdef process_fields( elif field_type == OFTDateTime or field_type == OFTDate or field_type == OFTTime: - if datetime_as_string: + if field_type == OFTDateTime and datetime_as_string: # defer datetime parsing to user/ pandas layer IF CTE_GDAL_VERSION >= (3, 7, 0): data[i] = get_string( diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index f6a1bbe0..dacd08f4 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -336,6 +336,42 @@ def list_field_values_files(tmp_path, request): return list_field_values_parquet_file() +@pytest.fixture(scope="function") +def many_data_types_geojson_file(tmp_path): + # create GeoJSON file with properties of many data types + many_types_geojson = """{ + "type": "FeatureCollection", + "features": [ 
+ { + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [0, 0] + }, + "properties": { + "int_col": 1, + "float_col": 1.5, + "str_col": "string", + "bool_col": true, + "null_col": null, + "date_col": "2020-01-01", + "time_col": "12:00:00", + "datetime_col": "2020-01-01T12:00:00", + "list_int_col": [1, 2, 3], + "list_str_col": ["a", "b", "c"], + "list_mixed_col": [1, "a", null, true] + } + } + ] + }""" + + filename = tmp_path / "test_many_data_types.geojson" + with open(filename, "w") as f: + _ = f.write(many_types_geojson) + + return filename + + @pytest.fixture(scope="function") def nested_geojson_file(tmp_path): # create GeoJSON file with nested properties diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index c00cb588..daf393fb 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -3,7 +3,7 @@ import os import re import warnings -from datetime import datetime +from datetime import datetime, time from io import BytesIO from zipfile import ZipFile @@ -23,6 +23,7 @@ GDAL_GE_311, HAS_ARROW_WRITE_API, HAS_PYPROJ, + NUMPY_GE_20, PANDAS_GE_15, PANDAS_GE_23, PANDAS_GE_30, @@ -51,7 +52,14 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype, is_object_dtype, is_string_dtype + from pandas.api.types import ( + is_bool_dtype, + is_datetime64_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, + ) import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -515,6 +523,57 @@ def test_read_list_nested_struct_parquet_file( assert result["col_struct"][2] == {"a": 1, "b": 2} +def test_read_many_data_types_geojson_file(many_data_types_geojson_file, use_arrow): + """Test reading a GeoJSON file containing many data types.""" + result = read_dataframe(many_data_types_geojson_file, use_arrow=use_arrow) + + assert "int_col" in result.columns + assert is_integer_dtype(result["int_col"].dtype) + assert result["int_col"].to_list() == [1] + + assert "float_col" in result.columns + assert is_float_dtype(result["float_col"].dtype) + assert result["float_col"].to_list() == [1.5] + + assert "str_col" in result.columns + assert is_string_dtype(result["str_col"].dtype) + assert result["str_col"].to_list() == ["string"] + + assert "bool_col" in result.columns + assert is_bool_dtype(result["bool_col"].dtype) + assert result["bool_col"].to_list() == [True] + + if use_arrow: + assert "date_col" in result.columns + assert is_datetime64_dtype(result["date_col"].dtype) + if NUMPY_GE_20: + assert result["date_col"].to_list() == [np.datetime64("2020-01-01")] + else: + assert result["date_col"].to_list() == [pd.Timestamp("2020-01-01")] + + # Time columns are ignored without arrow: + # Reported in https://github.com/geopandas/pyogrio/issues/615 + assert "time_col" in result.columns + assert is_object_dtype(result["time_col"].dtype) + assert result["time_col"].to_list() == [time(12, 0, 0)] + + assert "datetime_col" in result.columns + assert is_datetime64_dtype(result["datetime_col"].dtype) + assert result["datetime_col"].to_list() == [pd.Timestamp("2020-01-01T12:00:00")] + + assert "list_int_col" in result.columns + assert is_object_dtype(result["list_int_col"].dtype) + assert result["list_int_col"][0].tolist() == [1, 2, 3] + + assert "list_str_col" in result.columns + assert is_object_dtype(result["list_str_col"].dtype) + assert result["list_str_col"][0].tolist() == ["a", 
"b", "c"] + + assert "list_mixed_col" in result.columns + assert is_object_dtype(result["list_mixed_col"].dtype) + assert result["list_mixed_col"][0] == [1, "a", None, True] + + @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -3200,7 +3259,7 @@ def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow): assert np.array_equal(gdf_in_appended.geometry.values, points + points_append) -@pytest.mark.requires_arrow_api +@pytest.mark.requires_arrow_write_api @pytest.mark.skipif( not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available" ) From c09b16fc1873e9c11d6c497308cfce0e4e3b835f Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 27 Nov 2025 22:10:09 +0100 Subject: [PATCH 3/8] Update CHANGES.md --- CHANGES.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index a5b2c4b6..88e73daa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,11 @@ # CHANGELOG +## 0.12.1 (????-??-??) + +## Bug fixes + +- Fix Time type columns being skipped when reading with `arrow=False` (#617) + ## 0.12.0 (2025-11-26) ### Potentially breaking changes From 2d04679b5e7c3c3c3c2bc43b1976ffc83300a687 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 27 Nov 2025 22:24:15 +0100 Subject: [PATCH 4/8] Fix linter --- pyogrio/_io.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 6d14bd1b..0add9fe9 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -976,8 +976,7 @@ cdef process_fields( bin_value = OGR_F_GetFieldAsBinary(ogr_feature, field_index, &ret_length) data[i] = bin_value[:ret_length] - elif field_type == OFTDateTime or field_type == OFTDate or field_type == OFTTime: - + elif field_type in (OFTDateTime, OFTDate, OFTTime): if field_type == OFTDateTime and datetime_as_string: # defer datetime parsing to user/ pandas layer IF CTE_GDAL_VERSION >= (3, 7, 0): @@ -1018,7 +1017,7 @@ cdef process_fields( data[i] = datetime.datetime( year, month, day, hour, minute, second, microsecond ).isoformat() - + elif field_type == OFTTime: data[i] = datetime.time(hour, minute, second, microsecond) From 1f59fd8877fce1d85645dc5efb69b996096774e1 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 27 Nov 2025 22:26:47 +0100 Subject: [PATCH 5/8] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index daf393fb..54e028a4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -551,8 +551,6 @@ def test_read_many_data_types_geojson_file(many_data_types_geojson_file, use_arr else: assert result["date_col"].to_list() == [pd.Timestamp("2020-01-01")] - # Time columns are ignored without arrow: - # Reported in https://github.com/geopandas/pyogrio/issues/615 assert "time_col" in result.columns assert is_object_dtype(result["time_col"].dtype) assert result["time_col"].to_list() == [time(12, 0, 0)] From 2aa769bcc8d3fd106434d2b2805a594242649dc4 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 28 Nov 2025 00:49:43 +0100 Subject: [PATCH 6/8] Remove numpy check --- pyogrio/_compat.py | 4 ---- pyogrio/tests/test_geopandas_io.py | 6 +----- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/pyogrio/_compat.py b/pyogrio/_compat.py index d04984f8..3f8515c1 100644 --- a/pyogrio/_compat.py +++ b/pyogrio/_compat.py @@ -1,7 +1,5 @@ from packaging.version import Version -import numpy as np - from 
pyogrio.core import __gdal_geos_version__, __gdal_version__ # detect optional dependencies @@ -40,8 +38,6 @@ HAS_GEOPANDAS = geopandas is not None -NUMPY_GE_20 = Version(np.__version__) >= Version("2.0.0") - PANDAS_GE_15 = pandas is not None and Version(pandas.__version__) >= Version("1.5.0") PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0") PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 54e028a4..0959203d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -23,7 +23,6 @@ GDAL_GE_311, HAS_ARROW_WRITE_API, HAS_PYPROJ, - NUMPY_GE_20, PANDAS_GE_15, PANDAS_GE_23, PANDAS_GE_30, @@ -546,10 +545,7 @@ def test_read_many_data_types_geojson_file(many_data_types_geojson_file, use_arr if use_arrow: assert "date_col" in result.columns assert is_datetime64_dtype(result["date_col"].dtype) - if NUMPY_GE_20: - assert result["date_col"].to_list() == [np.datetime64("2020-01-01")] - else: - assert result["date_col"].to_list() == [pd.Timestamp("2020-01-01")] + assert result["date_col"].to_list() == [pd.Timestamp("2020-01-01")] assert "time_col" in result.columns assert is_object_dtype(result["time_col"].dtype) From ed6002a798ab5fef96be5738ee184b42bab3861a Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 28 Nov 2025 01:04:16 +0100 Subject: [PATCH 7/8] Make the test roundtrip --- pyogrio/tests/test_geopandas_io.py | 85 ++++++++++++++++++------------ 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0959203d..e9ebf432 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -522,50 +522,67 @@ def test_read_list_nested_struct_parquet_file( assert result["col_struct"][2] == {"a": 1, "b": 2} -def test_read_many_data_types_geojson_file(many_data_types_geojson_file, use_arrow): - """Test reading a GeoJSON file containing many data types.""" - result = read_dataframe(many_data_types_geojson_file, use_arrow=use_arrow) +def test_read_many_data_types_geojson_file( + request, tmp_path, many_data_types_geojson_file, use_arrow +): + """Test roundtripping a GeoJSON file containing many data types.""" - assert "int_col" in result.columns - assert is_integer_dtype(result["int_col"].dtype) - assert result["int_col"].to_list() == [1] + def validate_result(df: pd.DataFrame, use_arrow: bool): + """Validate the dataframe read from the many data types geojson file.""" + assert "int_col" in df.columns + assert is_integer_dtype(df["int_col"].dtype) + assert df["int_col"].to_list() == [1] - assert "float_col" in result.columns - assert is_float_dtype(result["float_col"].dtype) - assert result["float_col"].to_list() == [1.5] + assert "float_col" in df.columns + assert is_float_dtype(df["float_col"].dtype) + assert df["float_col"].to_list() == [1.5] - assert "str_col" in result.columns - assert is_string_dtype(result["str_col"].dtype) - assert result["str_col"].to_list() == ["string"] + assert "str_col" in df.columns + assert is_string_dtype(df["str_col"].dtype) + assert df["str_col"].to_list() == ["string"] - assert "bool_col" in result.columns - assert is_bool_dtype(result["bool_col"].dtype) - assert result["bool_col"].to_list() == [True] + assert "bool_col" in df.columns + assert is_bool_dtype(df["bool_col"].dtype) + assert df["bool_col"].to_list() == [True] - if use_arrow: - assert "date_col" in 
result.columns - assert is_datetime64_dtype(result["date_col"].dtype) - assert result["date_col"].to_list() == [pd.Timestamp("2020-01-01")] + if use_arrow: + assert "date_col" in df.columns + assert is_datetime64_dtype(df["date_col"].dtype) + assert df["date_col"].to_list() == [pd.Timestamp("2020-01-01")] + + assert "time_col" in df.columns + assert is_object_dtype(df["time_col"].dtype) + assert df["time_col"].to_list() == [time(12, 0, 0)] - assert "time_col" in result.columns - assert is_object_dtype(result["time_col"].dtype) - assert result["time_col"].to_list() == [time(12, 0, 0)] + assert "datetime_col" in df.columns + assert is_datetime64_dtype(df["datetime_col"].dtype) + assert df["datetime_col"].to_list() == [pd.Timestamp("2020-01-01T12:00:00")] - assert "datetime_col" in result.columns - assert is_datetime64_dtype(result["datetime_col"].dtype) - assert result["datetime_col"].to_list() == [pd.Timestamp("2020-01-01T12:00:00")] + assert "list_int_col" in df.columns + assert is_object_dtype(df["list_int_col"].dtype) + assert df["list_int_col"][0].tolist() == [1, 2, 3] - assert "list_int_col" in result.columns - assert is_object_dtype(result["list_int_col"].dtype) - assert result["list_int_col"][0].tolist() == [1, 2, 3] + assert "list_str_col" in df.columns + assert is_object_dtype(df["list_str_col"].dtype) + assert df["list_str_col"][0].tolist() == ["a", "b", "c"] - assert "list_str_col" in result.columns - assert is_object_dtype(result["list_str_col"].dtype) - assert result["list_str_col"][0].tolist() == ["a", "b", "c"] + assert "list_mixed_col" in df.columns + assert is_object_dtype(df["list_mixed_col"].dtype) + assert df["list_mixed_col"][0] == [1, "a", None, True] - assert "list_mixed_col" in result.columns - assert is_object_dtype(result["list_mixed_col"].dtype) - assert result["list_mixed_col"][0] == [1, "a", None, True] + read_gdf = read_dataframe(many_data_types_geojson_file, use_arrow=use_arrow) + validate_result(read_gdf, use_arrow) + + # Roundtrip gives issues, to be further checked + request.node.add_marker( + pytest.mark.xfail( + reason="writing and reading many_data_types_geojson_file again fails" + ) + ) + write_path = tmp_path / "many_data_types_copy.geojson" + write_dataframe(read_gdf, write_path, use_arrow=use_arrow) + read_back_gdf = read_dataframe(write_path, use_arrow=use_arrow) + validate_result(read_back_gdf, use_arrow) @pytest.mark.filterwarnings( From f5bbcf90400593b6564d2eb7c4d54bff516bf38b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 28 Nov 2025 19:39:25 +0100 Subject: [PATCH 8/8] Fix test for older GDAL versions --- pyogrio/tests/test_geopandas_io.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 48620cc0..3b383f9a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -566,7 +566,7 @@ def test_roundtrip_many_data_types_geojson_file( ): """Test roundtripping a GeoJSON file containing many data types.""" - def validate_result(df: pd.DataFrame, use_arrow: bool, ignore_mixed_list_col=False): + def validate_result(df: pd.DataFrame, use_arrow: bool, after_write=False): """Function to validate the data of many_data_types_geojson_file. Depending on arrow being used or not there are small differences. 
@@ -597,9 +597,12 @@ def validate_result(df: pd.DataFrame, use_arrow: bool, ignore_mixed_list_col=Fal assert is_datetime64_dtype(df["date_col"].dtype) assert df["date_col"].to_list() == [pd.Timestamp("2020-01-01")] - assert "time_col" in df.columns - assert is_object_dtype(df["time_col"].dtype) - assert df["time_col"].to_list() == [time(12, 0, 0)] + if not (after_write and use_arrow and not GDAL_GE_311): + # Before GDAL 3.11, if time columns were written with arrow they were + # not actually written. + assert "time_col" in df.columns + assert is_object_dtype(df["time_col"].dtype) + assert df["time_col"].to_list() == [time(12, 0, 0)] assert "datetime_col" in df.columns assert is_datetime64_dtype(df["datetime_col"].dtype) @@ -613,22 +616,21 @@ def validate_result(df: pd.DataFrame, use_arrow: bool, ignore_mixed_list_col=Fal assert is_object_dtype(df["list_str_col"].dtype) assert df["list_str_col"][0].tolist() == ["a", "b", "c"] - if not ignore_mixed_list_col: + if not (after_write and use_arrow): + # Writing a column with mixed types in a list is not supported with Arrow. assert "list_mixed_col" in df.columns assert is_object_dtype(df["list_mixed_col"].dtype) assert df["list_mixed_col"][0] == [1, "a", None, True] # Read and validate result of reading read_gdf = read_dataframe(many_data_types_geojson_file, use_arrow=use_arrow) - validate_result(read_gdf, use_arrow) + validate_result(read_gdf, use_arrow, after_write=False) # Write the data read, read it back, and validate again if use_arrow: # Writing a column with mixed types in a list is not supported with Arrow. - ignore_mixed_list_col = True read_gdf = read_gdf.drop(columns=["list_mixed_col"]) else: - ignore_mixed_list_col = False request.node.add_marker( pytest.mark.xfail( reason="roundtripping list types fails with use_arrow=False" @@ -640,9 +642,7 @@ def validate_result(df: pd.DataFrame, use_arrow: bool, ignore_mixed_list_col=Fal # Validate data written read_back_gdf = read_dataframe(tmp_file, use_arrow=use_arrow) - validate_result( - read_back_gdf, use_arrow, ignore_mixed_list_col=ignore_mixed_list_col - ) + validate_result(read_back_gdf, use_arrow, after_write=True) @pytest.mark.filterwarnings(
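
A minimal sketch of the behaviour fixed by this series (not part of the patches themselves; the file name is hypothetical, and the GeoJSON content mirrors the "time_col" property used in the new test fixture):

    from pyogrio import read_dataframe

    # A single feature with a property that the GeoJSON driver detects as an
    # OGR Time field, as in the many_data_types_geojson_file fixture above.
    geojson = """{
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "geometry": {"type": "Point", "coordinates": [0, 0]},
                "properties": {"time_col": "12:00:00"}
            }
        ]
    }"""

    with open("time_col.geojson", "w") as f:
        f.write(geojson)

    df = read_dataframe("time_col.geojson", use_arrow=False)
    # Before the fix, "time_col" was skipped entirely when reading without
    # arrow; with the fix it is read as an object column of datetime.time.
    print(df["time_col"].to_list())  # expected: [datetime.time(12, 0)]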