diff --git a/docs/get_started.qmd b/docs/get_started.qmd
index 4b8096f..7bcd175 100644
--- a/docs/get_started.qmd
+++ b/docs/get_started.qmd
@@ -77,6 +77,7 @@ Above, we saved the data as a CSV, but you can choose another option depending o
 - `type = "arrow"` uses `to_feather()` from pandas to create an Arrow/Feather file.
 - `type = "joblib"` uses `joblib.dump()` to create a binary Python data file, such as for storing a trained model. See the [joblib docs](https://joblib.readthedocs.io/en/latest/) for more information.
 - `type = "json"` uses `json.dump()` to create a JSON file. Pretty much every programming language can read JSON files, but they only work well for nested lists.
+- `type = "geoparquet"` uses `to_parquet()` from [geopandas](https://github.com/geopandas/geopandas) to create a [GeoParquet](https://github.com/opengeospatial/geoparquet) file, which is a specialized Parquet format for geospatial data.
 
 Note that when the data lives elsewhere, pins takes care of downloading and caching so that it's only re-downloaded when needed. That said, most boards transmit pins over HTTP, and this is going to be slow and possibly unreliable for very large pins.
 
diff --git a/pins/_adaptors.py b/pins/_adaptors.py
index 80fb9f6..50f8bfa 100644
--- a/pins/_adaptors.py
+++ b/pins/_adaptors.py
@@ -8,17 +8,23 @@ from typing_extensions import TypeAlias
 
 if TYPE_CHECKING:
+    import geopandas as gpd
     import pandas as pd
 
     PandasDataFrame: TypeAlias = pd.DataFrame
-    DataFrame: TypeAlias = PandasDataFrame
+    GeoPandasGeoDataFrame: TypeAlias = gpd.GeoDataFrame
+    DataFrame: TypeAlias = PandasDataFrame | GeoPandasGeoDataFrame
 
 
 class AbstractPandasFrame(AbstractBackend):
     _backends = [("pandas", "DataFrame")]
 
 
-AbstractDF: TypeAlias = AbstractPandasFrame
+class AbstractGeoPandasFrame(AbstractPandasFrame):
+    _backends = [("geopandas", "GeoDataFrame")]
+
+
+AbstractDF: TypeAlias = AbstractPandasFrame | AbstractGeoPandasFrame
 
 
 class Adaptor:
@@ -142,12 +148,29 @@ def write_feather(self, file: str) -> None:
         self._d.to_feather(file)
 
 
+class GeoPandasAdaptor(PandasAdaptor):
+    _d: ClassVar[GeoPandasGeoDataFrame]  # type: ignore[reportIncompatibleVariableOverride]
+
+    def __init__(self, data: AbstractGeoPandasFrame) -> None:
+        super().__init__(data)
+
+    @property
+    def df_type(self) -> str:
+        # Consider overriding this for specialized dataframes
+        return "GeoDataFrame"
+
+    def head(self, n: int) -> GeoPandasAdaptor:
+        return GeoPandasAdaptor(self._d.head(n))
+
+
 @overload
 def create_adaptor(obj: DataFrame) -> DFAdaptor: ...
 @overload
 def create_adaptor(obj: Any) -> Adaptor: ...
 def create_adaptor(obj: Any | DataFrame) -> Adaptor | DFAdaptor:
-    if isinstance(obj, AbstractPandasFrame):
+    if isinstance(obj, AbstractGeoPandasFrame):
+        return GeoPandasAdaptor(obj)
+    elif isinstance(obj, AbstractPandasFrame):
         return PandasAdaptor(obj)
     elif isinstance(obj, Adaptor):
         return obj
diff --git a/pins/boards.py b/pins/boards.py
index c1eeb2f..557fa55 100644
--- a/pins/boards.py
+++ b/pins/boards.py
@@ -358,7 +358,7 @@ def pin_write(
             Pin name.
         type:
             File type used to save `x` to disk. May be "csv", "arrow", "parquet",
-            "joblib", or "json".
+            "joblib", "json", or "geoparquet".
         title:
             A title for the pin; most important for shared boards so that others
             can understand what the pin contains. If omitted, a brief description
diff --git a/pins/drivers.py b/pins/drivers.py
index 2b5a004..d5a6b65 100644
--- a/pins/drivers.py
+++ b/pins/drivers.py
@@ -92,6 +92,16 @@ def load_data(
 
         return pd.read_csv(f)
 
+    elif meta.type == "geoparquet":
+        try:
+            import geopandas as gpd
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                'The "geopandas" package is required to read "geoparquet" type files.'
+            ) from None
+
+        return gpd.read_parquet(f)
+
     elif meta.type == "joblib":
         import joblib
 
@@ -139,6 +149,8 @@ def save_data(
     if apply_suffix:
         if pin_type == "file":
             suffix = "".join(Path(obj).suffixes)
+        elif pin_type == "geoparquet":
+            suffix = ".parquet"
         else:
             suffix = f".{pin_type}"
     else:
@@ -162,6 +174,8 @@
         raise NotImplementedError(msg)
     elif pin_type == "parquet":
         adaptor.write_parquet(final_name)
+    elif pin_type == "geoparquet":
+        adaptor.write_parquet(final_name)
     elif pin_type == "joblib":
         adaptor.write_joblib(final_name)
     elif pin_type == "json":
diff --git a/pins/tests/test_drivers.py b/pins/tests/test_drivers.py
index 5959e02..85414d7 100644
--- a/pins/tests/test_drivers.py
+++ b/pins/tests/test_drivers.py
@@ -3,6 +3,7 @@ from pathlib import Path
 
 import fsspec
+import geopandas as gpd
 import pandas as pd
 import pytest
 
@@ -37,6 +38,10 @@ class D:
     [
         (pd.DataFrame({"x": [1, 2]}), "somename: a pinned 2 x 1 DataFrame"),
         (pd.DataFrame({"x": [1], "y": [2]}), "somename: a pinned 1 x 2 DataFrame"),
+        (
+            gpd.GeoDataFrame({"x": [1], "geometry": [None]}),
+            "somename: a pinned 1 x 2 GeoDataFrame",
+        ),
         (ExC(), "somename: a pinned ExC object"),
         (ExC().D(), "somename: a pinned ExC.D object"),
         ([1, 2, 3], "somename: a pinned list object"),
@@ -79,6 +84,27 @@ def test_driver_roundtrip(tmp_path: Path, type_):
     assert df.equals(obj)
 
 
+def test_driver_geoparquet_roundtrip(tmp_path):
+    import geopandas as gpd
+
+    gdf = gpd.GeoDataFrame(
+        {"x": [1, 2, 3], "geometry": gpd.points_from_xy([1, 2, 3], [1, 2, 3])}
+    )
+
+    fname = "some_gdf"
+    full_file = f"{fname}.parquet"
+
+    p_obj = tmp_path / fname
+    res_fname = save_data(gdf, p_obj, "geoparquet")
+
+    assert Path(res_fname).name == full_file
+
+    meta = MetaRaw(full_file, "geoparquet", "my_pin")
+    obj = load_data(meta, fsspec.filesystem("file"), tmp_path, allow_pickle_read=True)
+
+    assert gdf.equals(obj)
+
+
 @pytest.mark.parametrize(
     "type_",
     [
diff --git a/pyproject.toml b/pyproject.toml
index 1deb7bb..e746dcc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,8 @@ check = [
     "pyright==1.1.372",  # Pinned; manually sync with .github/workflows/code-checks.yml
     "ruff==0.5.4",  # Pinned; manually sync with pre-commit-config.yaml
     "types-appdirs",
-    "databricks-sdk"
+    "databricks-sdk",
+    "geopandas",
 ]
 databricks = ["databricks-sdk"]
 doc = [
@@ -65,6 +66,7 @@ test = [
     "pytest-dotenv",
     "pytest-parallel",
     "s3fs",
+    "geopandas>=0.8.0",  # At 0.8.0, the GeoParquet format was introduced.
     "rdata",
     "databricks-sdk",
 ]
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 0f8fd75..fde0c23 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -62,6 +62,8 @@ cachetools==5.5.2
     # via google-auth
 certifi==2025.4.26
     # via
+    #   pyogrio
+    #   pyproj
     #   requests
     #   sphobjinv
 cffi==1.17.1
@@ -124,6 +126,8 @@ fsspec==2025.5.1
     #   s3fs
 gcsfs==2025.5.1
     # via pins (pyproject.toml)
+geopandas==1.0.1
+    # via pins (setup.cfg)
 google-api-core==2.25.0
     # via
     #   google-cloud-core
@@ -247,8 +251,12 @@ nodeenv==1.9.1
 numpy==2.2.6
     # via
     #   fastparquet
+    #   geopandas
     #   pandas
+    #   pyarrow
+    #   pyogrio
     #   rdata
+    #   shapely
     #   xarray
 oauthlib==3.2.2
     # via requests-oauthlib
@@ -257,13 +265,16 @@ packaging==25.0
     # via
     #   black
     #   build
     #   fastparquet
+    #   geopandas
     #   ipykernel
+    #   pyogrio
     #   pytest
     #   pytest-cases
     #   xarray
 pandas==2.2.3
     # via
     #   fastparquet
+    #   geopandas
     #   pins (pyproject.toml)
     #   rdata
     #   xarray
@@ -331,6 +342,10 @@ pyjwt==2.10.1
     # via
     #   msal
     #   pyjwt
+pyogrio==0.9.0
+    # via geopandas
+pyproj==3.6.1
+    # via geopandas
 pyproject-hooks==1.2.0
     # via
     #   build
@@ -401,6 +416,8 @@ ruff==0.5.4
     # via pins (pyproject.toml)
 s3fs==2025.5.1
     # via pins (pyproject.toml)
+shapely==2.0.5
+    # via geopandas
 six==1.17.0
     # via
     #   azure-core