diff --git a/README.md b/README.md index ed241a1..c58e61e 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,13 @@ _Google Earth Engine Feature Collections via Dask DataFrames._ ## How to use Install with pip: + ```shell pip install dask-ee ``` Then, authenticate Earth Engine: + ```shell earthengine authenticate ``` @@ -26,36 +28,41 @@ import dask_ee ``` You'll need to initialize Earth Engine before working with data: + ```python ee.Initialize() ``` From here, you can read Earth Engine FeatureCollections like they are DataFrames: + ```python df = dask_ee.read_ee("WRI/GPPD/power_plants") df.head() ``` + These work like Pandas DataFrames, but they are lazily evaluated via [Dask](https://dask.org/). Feel free to do any analysis you wish. For example: + ```python # Thanks @aazuspan, https://www.aazuspan.dev/blog/dask_featurecollection ( - df[df.comm_year.gt(1940) & df.country.eq("USA") & df.fuel1.isin(["Coal", "Wind"])] - .astype({"comm_year": int}) - .drop(columns=["geo"]) - .groupby(["comm_year", "fuel1"]) - .agg({"capacitymw": "sum"}) - .reset_index() - .sort_values(by=["comm_year"]) - .compute(scheduler="threads") - .pivot_table(index="comm_year", columns="fuel1", values="capacitymw", fill_value=0) - .plot() + df[df.comm_year.gt(1940) & df.country.eq("USA") & df.fuel1.isin(["Coal", "Wind"])] + .astype({"comm_year": int}) + .drop(columns=["geometry"]) + .groupby(["comm_year", "fuel1"]) + .agg({"capacitymw": "sum"}) + .reset_index() + .sort_values(by=["comm_year"]) + .compute(scheduler="threads") + .pivot_table(index="comm_year", columns="fuel1", values="capacitymw", fill_value=0) + .plot() ) ``` + ![Coal vs Wind in the US since 1940](https://raw.githubusercontent.com/alxmrs/dask-ee/main/demo.png) -There are a few other useful things you can do. +There are a few other useful things you can do. For one, you may pass in a pre-processed `ee.FeatureCollection`. This allows full utilization of the Earth Engine API. 
@@ -71,6 +78,7 @@ df = dask_ee.read_ee(fc) In addition, you may change the `chunksize`, which controls how many rows are included in each Dask partition. + ```python df = dask_ee.read_ee("WRI/GPPD/power_plants", chunksize=7_000) df.head() @@ -81,12 +89,33 @@ df.head() Contributions are welcome. A good way to start is to check out open [issues](https://github.com/alxmrs/dask-ee/issues) or file a new one. We're happy to review pull requests, too. -Before writing code, please install the development dependencies (after cloning the repo): +Before writing code, please install the development dependencies (after cloning the repo): + ```shell pip install -e ".[dev]" ``` +
+**Help!** On install, I hit the error: `NotADirectoryError: [Errno 20] Not a directory: 'gdal-config'`.
+
+You may need to install `gdal` on your system to properly test `dask-geopandas`. On a Mac
+with [Homebrew](https://brew.sh), you can run:
+
+```shell
+brew install gdal
+```
+
+Or, if you're using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html), you can just
+directly install `dask-geopandas`:
+
+```shell
+conda install dask-geopandas -c conda-forge
+```
+
+ ## License + ``` Copyright 2024 Alexander S Merose diff --git a/dask_ee/read.py b/dask_ee/read.py index 6077353..48cd8e0 100644 --- a/dask_ee/read.py +++ b/dask_ee/read.py @@ -75,9 +75,10 @@ def to_df(page: ee.FeatureCollection) -> pd.DataFrame: meta = {k: _BUILTIN_DTYPES[v] for k, v in columns.items()} + # We rename to geometry because that is the default column that geopandas looks for. return dd.from_map( to_df, pages, meta=meta, divisions=divisions, - ) + ).rename(columns={'geo': 'geometry'}) diff --git a/dask_ee/read_integrationtest.py b/dask_ee/read_integrationtest.py index d580f73..f2f2054 100644 --- a/dask_ee/read_integrationtest.py +++ b/dask_ee/read_integrationtest.py @@ -11,7 +11,9 @@ import unittest import dask.dataframe as dd +import dask_geopandas as dgp import ee +import shapely import dask_ee @@ -37,6 +39,21 @@ def test_reads_dask_dataframe(self): print(columns) print(head) + def test_read_different_dataframe(self): + fc = ee.FeatureCollection('TIGER/2016/Roads').limit(10_001) + df = dask_ee.read_ee(fc) + + head = df.head() + columns = df.columns + + self.assertIsNotNone(df) + self.assertIsNotNone(head) + self.assertIsInstance(df, dd.DataFrame) + self.assertEqual(df.compute().shape, (10_001, 5)) + + print(columns) + print(head) + def test_works_with_defined_features(self): # Make a list of Features. features = [ @@ -52,14 +69,14 @@ def test_works_with_defined_features(self): df = dask_ee.read_ee(fc) - self.assertEqual(list(df.columns), ['geo', 'name']) + self.assertEqual(list(df.columns), ['geometry', 'name']) def test_works_with_a_single_feature_in_fc(self): from_geom = ee.FeatureCollection(ee.Geometry.Point(16.37, 48.225)) df = dask_ee.read_ee(from_geom) - self.assertEqual(list(df.columns), ['geo']) + self.assertEqual(list(df.columns), ['geometry']) self.assertEqual(df.compute().shape, (1, 1)) def test_can_create_random_points(self): @@ -72,7 +89,7 @@ def test_can_create_random_points(self): # Note: these random points have no system:index! 
     df = dask_ee.read_ee(random_points)
 
-    self.assertEqual(list(df.columns), ['geo'])
+    self.assertEqual(list(df.columns), ['geometry'])
     self.assertEqual(df.compute().shape, (1000, 1))
 
   def test_prof__read_ee(self):
@@ -83,6 +100,16 @@
     # Modified version of `pr.print_stats()`.
     pstats.Stats(pr).sort_stats('cumtime').print_stats()
 
+  def test_integrates_with_geopandas_dask(self):
+    fc = ee.FeatureCollection('WRI/GPPD/power_plants')
+    df = dask_ee.read_ee(fc)
+    gdf = dgp.from_dask_dataframe(df)
+
+    self.assertIsNotNone(gdf.compute().geometry)
+    self.assertIsInstance(gdf.compute().geometry.iloc[0], shapely.Geometry)
+
+    print(gdf.head())
+
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/pyproject.toml b/pyproject.toml
index 2b239b2..bedad22 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ tests = [
     "absl-py",
     "pytest",
     "pyink",
+    "dask-geopandas",
 ]
 dev = [
     "dask-ee[tests]",