Skip to content

Commit

Permalink
Initial attempt to get dask-geopandas working.
Browse files Browse the repository at this point in the history
  • Loading branch information
alxmrs committed Jun 24, 2024
1 parent 6c1cad7 commit 7b71b4c
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 16 deletions.
53 changes: 41 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ _Google Earth Engine Feature Collections via Dask DataFrames._
## How to use

Install with pip:

```shell
pip install dask-ee
```

Then, authenticate Earth Engine:

```shell
earthengine authenticate
```
Expand All @@ -26,36 +28,41 @@ import dask_ee
```

You'll need to initialize Earth Engine before working with data:

```python
ee.Initialize()
```

From here, you can read Earth Engine FeatureCollections like they are DataFrames:

```python
df = dask_ee.read_ee("WRI/GPPD/power_plants")
df.head()
```

These work like Pandas DataFrames, but they are lazily evaluated via [Dask](https://dask.org/).

Feel free to do any analysis you wish. For example:

```python
# Thanks @aazuspan, https://www.aazuspan.dev/blog/dask_featurecollection
(
df[df.comm_year.gt(1940) & df.country.eq("USA") & df.fuel1.isin(["Coal", "Wind"])]
.astype({"comm_year": int})
.drop(columns=["geo"])
.groupby(["comm_year", "fuel1"])
.agg({"capacitymw": "sum"})
.reset_index()
.sort_values(by=["comm_year"])
.compute(scheduler="threads")
.pivot_table(index="comm_year", columns="fuel1", values="capacitymw", fill_value=0)
.plot()
df[df.comm_year.gt(1940) & df.country.eq("USA") & df.fuel1.isin(["Coal", "Wind"])]
.astype({"comm_year": int})
.drop(columns=["geometry"])
.groupby(["comm_year", "fuel1"])
.agg({"capacitymw": "sum"})
.reset_index()
.sort_values(by=["comm_year"])
.compute(scheduler="threads")
.pivot_table(index="comm_year", columns="fuel1", values="capacitymw", fill_value=0)
.plot()
)
```

![Coal vs Wind in the US since 1940](https://raw.githubusercontent.com/alxmrs/dask-ee/main/demo.png)

There are a few other useful things you can do.
There are a few other useful things you can do.

For one, you may pass in a pre-processed `ee.FeatureCollection`. This allows full utilization
of the Earth Engine API.
Expand All @@ -71,6 +78,7 @@ df = dask_ee.read_ee(fc)

In addition, you may change the `chunksize`, which controls how many rows are included in each
Dask partition.

```python
df = dask_ee.read_ee("WRI/GPPD/power_plants", chunksize=7_000)
df.head()
Expand All @@ -81,12 +89,33 @@ df.head()
Contributions are welcome. A good way to start is to check out open [issues](https://github.com/alxmrs/dask-ee/issues)
or file a new one. We're happy to review pull requests, too.

Before writing code, please install the development dependencies (after cloning the repo):
Before writing code, please install the development dependencies (after cloning the repo):

```shell
pip install -e ".[dev]"
```

<details>
<summary>Help! On install, I hit the error: <code>NotADirectoryError: [Errno 20] Not a directory: 'gdal-config'</code>.</summary>

You may need to install `gdal` on your system to properly test `dask-geopandas`. On a Mac
with [Homebrew](https://brew.sh), you can run:

```shell
brew install gdal
```

Or, if you're using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html), you can just
directly install `dask-geopandas`:

```shell
conda install dask-geopandas -c conda-forge
```

</details>

## License

```
Copyright 2024 Alexander S Merose
Expand Down
3 changes: 2 additions & 1 deletion dask_ee/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,10 @@ def to_df(page: ee.FeatureCollection) -> pd.DataFrame:

meta = {k: _BUILTIN_DTYPES[v] for k, v in columns.items()}

# We rename to geometry because that is the default column that geopandas looks for.
return dd.from_map(
to_df,
pages,
meta=meta,
divisions=divisions,
)
).rename(columns={'geo': 'geometry'})
33 changes: 30 additions & 3 deletions dask_ee/read_integrationtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import unittest

import dask.dataframe as dd
import dask_geopandas as dgp
import ee
import shapely

import dask_ee

Expand All @@ -37,6 +39,21 @@ def test_reads_dask_dataframe(self):
print(columns)
print(head)

def test_read_different_dataframe(self):
fc = ee.FeatureCollection('TIGER/2016/Roads').limit(10_001)
df = dask_ee.read_ee(fc)

head = df.head()
columns = df.columns

self.assertIsNotNone(df)
self.assertIsNotNone(head)
self.assertIsInstance(df, dd.DataFrame)
self.assertEqual(df.compute().shape, (10_001, 5))

print(columns)
print(head)

def test_works_with_defined_features(self):
# Make a list of Features.
features = [
Expand All @@ -52,14 +69,14 @@ def test_works_with_defined_features(self):

df = dask_ee.read_ee(fc)

self.assertEqual(list(df.columns), ['geo', 'name'])
self.assertEqual(list(df.columns), ['geometry', 'name'])

def test_works_with_a_single_feature_in_fc(self):
from_geom = ee.FeatureCollection(ee.Geometry.Point(16.37, 48.225))

df = dask_ee.read_ee(from_geom)

self.assertEqual(list(df.columns), ['geo'])
self.assertEqual(list(df.columns), ['geometry'])
self.assertEqual(df.compute().shape, (1, 1))

def test_can_create_random_points(self):
Expand All @@ -72,7 +89,7 @@ def test_can_create_random_points(self):
# Note: these random points have no system:index!
df = dask_ee.read_ee(random_points)

self.assertEqual(list(df.columns), ['geo'])
self.assertEqual(list(df.columns), ['geometry'])
self.assertEqual(df.compute().shape, (1000, 1))

def test_prof__read_ee(self):
Expand All @@ -83,6 +100,16 @@ def test_prof__read_ee(self):
# Modified version of `pr.print_stats()`.
pstats.Stats(pr).sort_stats('cumtime').print_stats()

def test_integrates_with_geopandas_dask(self):
fc = ee.FeatureCollection('WRI/GPPD/power_plants')
df = dask_ee.read_ee(fc)
gdf = dgp.from_dask_dataframe(df)

self.assertIsNotNone(df.compute().geometry)
self.assertIsInstance(df.geometry.dtype, shapely.GeometryType)

print(df.head())


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ tests = [
"absl-py",
"pytest",
"pyink",
"dask-geopandas",
]
dev = [
"dask-ee[tests]",
Expand Down

0 comments on commit 7b71b4c

Please sign in to comment.