ENH: improve support for datetime columns #486

Open · theroggy wants to merge 31 commits into main
Changes from all commits · 31 commits
aaf8818
ENH: deal properly with naive datetimes with arrow
theroggy Oct 17, 2024
3e463a1
Add more testcases, also for tz datetimes
theroggy Oct 18, 2024
afdd0c1
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 16, 2025
c18ab22
Use datetime_as_string for reading with arrow
theroggy Jan 17, 2025
597855f
Update _io.pyx
theroggy Jan 17, 2025
fa4b86e
Skip tests where appropriate
theroggy Jan 17, 2025
0e41ae4
Improve support for mixed and naive datetimes
theroggy Jan 17, 2025
1378ace
Skip use_arrow tests with old gdal versions
theroggy Jan 17, 2025
0f1ab27
Take in account pandas version
theroggy Jan 17, 2025
6f78c68
Update test_geopandas_io.py
theroggy Jan 17, 2025
336d0d8
Also support columns with datetime objects
theroggy Jan 18, 2025
3035a11
Rename some test functions for consistency
theroggy Jan 18, 2025
9efdc09
Avoid warning in test
theroggy Jan 18, 2025
eb80e08
Improve inline comment
theroggy Jan 18, 2025
d50b2d0
Update CHANGES.md
theroggy Jan 18, 2025
47aa298
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 19, 2025
1efa5bf
Symplify code
theroggy Jan 20, 2025
0032839
Don't cast UTC data to string when writing
theroggy Jan 20, 2025
9d2bfce
Various improvements to tests
theroggy Jan 20, 2025
ca9a8ae
Smal fixes to tests
theroggy Jan 20, 2025
deb862c
Xfail some tests where needed
theroggy Jan 20, 2025
e35c356
Make UTC assert more specific
theroggy Jan 22, 2025
593b282
Revert "Make UTC assert more specific"
theroggy Jan 22, 2025
35d8d87
Update test_geopandas_io.py
theroggy Jan 22, 2025
41c9da6
Use astype("string") instead of apply
theroggy Jan 23, 2025
f53af87
Improve tests
theroggy Jan 23, 2025
a8c85b7
Fix tests for older versions
theroggy Jan 23, 2025
40ca1a5
Update test_geopandas_io.py
theroggy Jan 23, 2025
fc53d44
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 30, 2025
458d75b
Merge
theroggy Apr 23, 2025
8a38961
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy May 13, 2025
1 change: 1 addition & 0 deletions CHANGES.md
@@ -5,6 +5,7 @@
### Improvements

- Capture all errors logged by gdal when opening a file fails (#495).
- Improve support for datetime columns (#486).
- Add support to read and write ".gpkg.zip" (GDAL >= 3.7), ".shp.zip", and ".shz"
files (#527).
- Compatibility with the string dtype in the upcoming pandas 3.0 release (#493).
8 changes: 8 additions & 0 deletions pyogrio/_io.pyx
@@ -1482,6 +1482,7 @@ def ogr_open_arrow(
int return_fids=False,
int batch_size=0,
use_pyarrow=False,
datetime_as_string=False,
):

cdef int err = 0
@@ -1695,6 +1696,12 @@ def ogr_open_arrow(
"GEOARROW".encode("UTF-8")
)

# Read DateTime fields as strings, as the Arrow DateTime column type has
# only limited support for mixed timezones, among other limitations.
IF CTE_GDAL_VERSION >= (3, 11, 0):
if datetime_as_string:
options = CSLSetNameValue(options, "DATETIME_AS_STRING", "YES")

# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)
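
For context on the comment above: Arrow stores the timezone on the column type rather than on individual values, which is why mixed offsets cannot be represented natively. A minimal pyarrow sketch of that limitation:

```python
import pyarrow as pa

# The timezone lives on the column type, not on individual values, so a
# single timestamp column cannot hold rows with different UTC offsets.
print(pa.timestamp("ms", tz="UTC"))  # timestamp[ms, tz=UTC]
print(pa.timestamp("ms"))            # timestamp[ms] (naive)
```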

@@ -1720,6 +1727,7 @@
"crs": crs,
"encoding": encoding,
"fields": fields[:, 2],
"dtypes": fields[:, 3],
"geometry_type": geometry_type,
"geometry_name": geometry_name,
"fid_column": fid_column,
114 changes: 102 additions & 12 deletions pyogrio/geopandas.py
@@ -2,6 +2,7 @@

import os
import warnings
from datetime import datetime

import numpy as np

@@ -46,6 +47,7 @@ def _try_parse_datetime(ser):
datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
else:
datetime_kwargs = {"yearfirst": True}

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
@@ -58,12 +60,6 @@
res = pd.to_datetime(ser, **datetime_kwargs)
except Exception:
res = ser
# if object dtype, try parse as utc instead
if res.dtype in ("object", "string"):
try:
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
except Exception:
pass

if res.dtype.kind == "M": # any datetime64
# GDAL only supports ms precision, convert outputs to match.
@@ -73,6 +69,7 @@
res = res.dt.as_unit("ms")
else:
res = res.dt.round(freq="ms")

return res
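
A minimal sketch of the parsing strategy above, assuming pandas >= 2.0 (the ISO8601 fast path) and GDAL's millisecond precision:

```python
import pandas as pd

# ISO strings are parsed in one pass, then rounded to milliseconds to match
# GDAL's ms precision; older pandas falls back to yearfirst parsing.
ser = pd.Series(["2024-01-01T10:00:00.123", "2024-01-02T11:30:00"])
res = pd.to_datetime(ser, format="ISO8601").dt.round(freq="ms")
print(res)
```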


@@ -267,11 +264,10 @@ def read_dataframe(

read_func = read_arrow if use_arrow else read
gdal_force_2d = False if use_arrow else force_2d
if not use_arrow:
# For arrow, datetimes are read as is.
# For numpy IO, datetimes are read as string values to preserve timezone info
# as numpy does not directly support timezones.
kwargs["datetime_as_string"] = True

# Always read datetimes as string values to preserve (mixed) timezone info,
# as numpy does not directly support timezones and arrow datetime columns
# don't support mixed timezones.
result = read_func(
path_or_buffer,
layer=layer,
@@ -288,6 +284,7 @@
sql=sql,
sql_dialect=sql_dialect,
return_fids=fid_as_index,
datetime_as_string=True,
**kwargs,
)

@@ -330,6 +327,11 @@

del table

# convert datetime columns that were read as string to datetime
for dtype, column in zip(meta["dtypes"], meta["fields"]):
if dtype is not None and dtype.startswith("datetime"):
df[column] = _try_parse_datetime(df[column])

if fid_as_index:
df = df.set_index(meta["fid_column"])
df.index.names = ["fid"]
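
A usage sketch of the new read path, assuming a hypothetical data.gpkg with a datetime field: values arrive from GDAL as ISO strings and are parsed back to datetimes here, driven by the dtypes reported in the metadata:

```python
import pyogrio

# Hypothetical file; any layer with a datetime field behaves the same way.
df = pyogrio.read_dataframe("data.gpkg")
print(df.dtypes)  # datetime columns come back parsed, not as strings
```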
@@ -619,8 +621,33 @@
df = pd.DataFrame(df, copy=False)
df[geometry_column] = geometry

# Convert all datetime columns to isoformat strings, to avoid mixed timezone
# information getting lost.
datetime_cols = []
for name, dtype in df.dtypes.items():
col = df[name]
if dtype == "object":
# When all non-NA values are Timestamps, treat as datetime column
col_na = df[col.notna()][name]
if len(col_na) and all(
isinstance(x, (pd.Timestamp, datetime)) for x in col_na
):
df[name] = col.astype("string")
datetime_cols.append(name)
elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
# When it is a datetime column with a timezone other than UTC, it
# needs to be converted to string, otherwise the timezone info is lost.
df[name] = col.astype("string")
datetime_cols.append(name)

table = pa.Table.from_pandas(df, preserve_index=False)

# Add metadata to datetime columns so GDAL knows they are datetimes.
for datetime_col in datetime_cols:
table = _add_column_metadata(
table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}}
)

# Null arrow columns are not supported by GDAL, so convert to string
for field_index, field in enumerate(table.schema):
if field.type == pa.null():
@@ -678,6 +705,8 @@
gdal_tz_offsets = {}
for name in fields:
col = df[name]
values = None

if isinstance(col.dtype, pd.DatetimeTZDtype):
# Deal with datetimes with timezones by passing down timezone separately
# pass down naive datetime
@@ -692,8 +721,24 @@
# Convert each row offset to a signed multiple of 15m and add to GMT value
gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
gdal_tz_offsets[name] = gdal_offset_representation.values
else:

elif col.dtype == "object":
# Column of Timestamp/datetime objects: split into naive datetimes and
# timezone offsets.
col_na = df[col.notna()][name]
if len(col_na) and all(
isinstance(x, (pd.Timestamp, datetime)) for x in col_na
):
tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset())
gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
gdal_tz_offsets[name] = gdal_offset_repr.values
naive = col.apply(
lambda x: None if pd.isna(x) else x.replace(tzinfo=None)
)
values = naive.values

if values is None:
values = col.values

if isinstance(values, pd.api.extensions.ExtensionArray):
from pandas.arrays import BooleanArray, FloatingArray, IntegerArray

@@ -729,3 +774,48 @@
gdal_tz_offsets=gdal_tz_offsets,
**kwargs,
)
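
A worked example of the timezone encoding used above: GDAL expects the UTC offset as a signed number of 15-minute blocks added to 100, so GMT is 100, UTC+02:00 is 108, and UTC-05:00 is 80:

```python
import pandas as pd

# Offsets as 15-minute steps, plus 100 (100 == GMT).
offsets = pd.Series([pd.Timedelta("2h"), pd.Timedelta("-5h"), pd.Timedelta(0)])
print(offsets // pd.Timedelta("15m") + 100)  # 108, 80, 100
```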


def _add_column_metadata(table, column_metadata: dict = {}):
"""Add or update column-level metadata to an arrow table.

Parameters
----------
table : pyarrow.Table
The table to add the column metadata to.
column_metadata : dict
A dictionary with column metadata in the form
{
"column_1": {"some": "data"},
"column_2": {"more": "stuff"},
}

Returns
-------
pyarrow.Table: table with the updated column metadata.
"""
import pyarrow as pa

if not column_metadata:
return table

# Create updated column fields with new metadata
fields = []
for col in table.schema.names:
if col in column_metadata:
# Add/update column metadata
metadata = table.field(col).metadata or {}
for key, value in column_metadata[col].items():
metadata[key] = value
# Update field with updated metadata
fields.append(table.field(col).with_metadata(metadata))
else:
fields.append(table.field(col))

# Create new schema with the updated field metadata
schema = pa.schema(fields, metadata=table.schema.metadata)

# Build new table with updated schema (shouldn't copy data)
table = table.cast(schema)

return table
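
A usage sketch for this helper; the column name is hypothetical, but the metadata key mirrors what write_dataframe attaches for datetime columns:

```python
import pyarrow as pa

table = pa.table({"when": ["2024-01-01T10:00:00+01:00"]})
table = _add_column_metadata(table, {"when": {"GDAL:OGR:type": "DateTime"}})
print(table.field("when").metadata)  # {b'GDAL:OGR:type': b'DateTime'}
```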
8 changes: 8 additions & 0 deletions pyogrio/raw.py
@@ -233,6 +233,7 @@ def read_arrow(
sql=None,
sql_dialect=None,
return_fids=False,
datetime_as_string=False,
**kwargs,
):
"""Read OGR data source into a pyarrow Table.
@@ -303,6 +304,7 @@
skip_features=gdal_skip_features,
batch_size=batch_size,
use_pyarrow=True,
datetime_as_string=datetime_as_string,
**kwargs,
) as source:
meta, reader = source
@@ -358,6 +360,7 @@ def open_arrow(
return_fids=False,
batch_size=65_536,
use_pyarrow=False,
datetime_as_string=False,
**kwargs,
):
"""Open OGR data source as a stream of Arrow record batches.
@@ -386,6 +389,9 @@
ArrowStream object. In the default case, this stream object needs
to be passed to another library supporting the Arrow PyCapsule
Protocol to consume the stream of data.
datetime_as_string : bool, optional (default: False)
If True, fields detected by GDAL as datetime will be returned as strings,
as Arrow datetime columns don't support e.g. mixed timezones.

Examples
--------
@@ -423,6 +429,7 @@
Meta is: {
"crs": "<crs>",
"fields": <ndarray of field names>,
"dtypes": <ndarray of numpy dtypes corresponding to fields>
"encoding": "<encoding>",
"geometry_type": "<geometry_type>",
"geometry_name": "<name of geometry column in arrow table>",
@@ -453,6 +460,7 @@
dataset_kwargs=dataset_kwargs,
batch_size=batch_size,
use_pyarrow=use_pyarrow,
datetime_as_string=datetime_as_string,
)
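
A sketch of consuming the stream with the new flag, assuming a hypothetical example.gpkg and a GDAL build that honors DATETIME_AS_STRING (>= 3.11); meta["dtypes"] tells the caller which fields to parse afterwards:

```python
from pyogrio.raw import open_arrow

with open_arrow("example.gpkg", datetime_as_string=True, use_pyarrow=True) as source:
    meta, reader = source
    table = reader.read_all()
    datetime_fields = [
        field
        for field, dtype in zip(meta["fields"], meta["dtypes"])
        if dtype is not None and dtype.startswith("datetime")
    ]
    print(datetime_fields)  # these columns contain ISO strings to parse
```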

