diff --git a/README.md b/README.md index 1773ba2..7bc718c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,78 @@ # dask-ee -Earth Engine FeatureCollections via Dask Dataframes +Google Earth Engine `FeatureCollection`s via Dask DataFrames + +## How to use + +Install with pip: +```shell +pip install --upgrade dask-ee +``` + +Then, authenticate Earth Engine: +```shell +earthengine authenticate --quiet +``` + +In your Python environment, you may now import the library: + +```python +import ee +import dask_ee +``` + +You'll need to initialize Earth Engine before working with data: +```python +ee.Initialize() +``` + +From here, you can read Earth Engine FeatureCollections like they are DataFrames: +```python +ddf = dask_ee.read_ee("WRI/GPPD/power_plants") +ddf.head() +``` +These work like Pandas DataFrames, but they are lazily evaluated via [Dask](https://dask.org/). + +Feel free to do any analysis you wish. For example: +```python +# Thanks @aazuspan, https://www.aazuspan.dev/blog/dask_featurecollection +( + ddf[ddf.comm_year.gt(1940) & ddf.country.eq("USA") & ddf.fuel1.isin(["Coal", "Wind"])] + .astype({"comm_year": int}) + .drop(columns=["geo"]) + .groupby(["comm_year", "fuel1"]) + .agg({"capacitymw": "sum"}) + .reset_index() + .sort_values(by=["comm_year"]) + .compute(scheduler="threads") + .pivot_table(index="comm_year", columns="fuel1", values="capacitymw", fill_value=0) + .plot() +) +``` +![Coal vs Wind in the US since 1940](demo.png) + +There are a few other useful things you can do. + +For one, you may pass in a pre-processed `ee.FeatureCollection`. This allows full utilization +of the Earth Engine API. + +```python +import dask_ee + +fc = ( + ee.FeatureCollection("WRI/GPPD/power_plants") + .filter(ee.Filter.gt("comm_year", 1940)) + .filter(ee.Filter.eq("country", "USA")) +) +ddf = dask_ee.read_ee(fc) +``` + +In addition, you may change the `chunksize`, which controls how many rows are included in each +Dask partition. +```python +ddf = dask_ee.read_ee("WRI/GPPD/power_plants", chunksize=7_000) +ddf.head() +``` ## License ``` diff --git a/dask_ee/read.py b/dask_ee/read.py index fd5c21f..d9f7ebf 100644 --- a/dask_ee/read.py +++ b/dask_ee/read.py @@ -29,11 +29,24 @@ # TODO(#4): Support 'auto' chunks, where we calculate the maximum allowed page size given the number of # bytes in each row. def read_ee( - fc: ee.FeatureCollection, io_chunks: t.Union[int, t.Literal['auto']] = 5_000 + fc: t.Union[ee.FeatureCollection, str], + chunksize: t.Union[int, t.Literal['auto']] = 5_000, ) -> dd.DataFrame: + """Read Google Earth Engine FeatureCollections into a Dask Dataframe. - if io_chunks == 'auto': - raise NotImplementedError('Auto `io_chunks` are not implemented yet!') + Args: + fc: A Google Earth Engine FeatureCollection or valid string path to a FeatureCollection. + chunksize: The number of rows per partition to use. + + Returns: + A dask DataFrame with paged Google Earth Engine data. + """ + + if isinstance(fc, str): + fc = ee.FeatureCollection(fc) + + if chunksize == 'auto': + raise NotImplementedError('Auto chunksize is not implemented yet!') # Make all the getInfo() calls at once, up front. fc_size, all_info = ee.List([fc.size(), fc.limit(0)]).getInfo() @@ -42,12 +55,12 @@ def read_ee( columns.update(all_info['columns']) del columns['system:index'] - divisions = tuple(range(0, fc_size, io_chunks)) + divisions = tuple(range(0, fc_size, chunksize)) # TODO(#5): Compare `toList()` to other range operations, like getting all index IDs via `getInfo()`. - pages = [ee.FeatureCollection(fc.toList(io_chunks, i)) for i in divisions] + pages = [ee.FeatureCollection(fc.toList(chunksize, i)) for i in divisions] # Get the remainder, if it exists. `io_chunks` are not likely to evenly partition the data. - d, r = divmod(fc_size, io_chunks) + d, r = divmod(fc_size, chunksize) if r != 0: pages.append(ee.FeatureCollection(fc.toList(r, d))) divisions += (fc_size,) diff --git a/demo.png b/demo.png new file mode 100644 index 0000000..dad54cd Binary files /dev/null and b/demo.png differ diff --git a/pyproject.toml b/pyproject.toml index 118fa77..a0b19f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,13 @@ [project] name = "dask-ee" dynamic = ["version"] -description = "Google Earth Engine FeatureCollections via Dask Dataframes." +description = "Google Earth Engine FeatureCollections via Dask DataFrames." readme = "README.md" requires-python = ">=3.8" license = {text = "Apache-2.0"} authors = [ {name = "Alexander Merose", email = "al@merose.com"}, + {name = "Aaron Zuspan"} ] classifiers = [ "Development Status :: 4 - Beta", @@ -22,11 +23,13 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Atmospheric Science", + "Topic :: Scientific/Engineering :: GIS", + "Topic :: Scientific/Engineering :: Hydrology", + "Topic :: Scientific/Engineering :: Oceanography", ] dependencies = [ "earthengine-api>=0.1.374", - "pandas", - "dask", + "dask[dataframe]", ] [project.optional-dependencies]