Added README docs. Also, changed API a bit. (#7)
- Using `chunksize`, which is a dask convention
- Accepts strings or fcs
- Added docstring to main function.
- Aaron is an author.
- rm pandas dep.
alxmrs authored Jun 23, 2024
1 parent c859cf0 commit ea03bb4
Showing 4 changed files with 98 additions and 10 deletions.
74 changes: 73 additions & 1 deletion README.md
@@ -1,6 +1,78 @@
# dask-ee

-Earth Engine FeatureCollections via Dask Dataframes
+Google Earth Engine `FeatureCollection`s via Dask DataFrames

## How to use

Install with pip:
```shell
pip install --upgrade dask-ee
```

Then, authenticate Earth Engine:
```shell
earthengine authenticate --quiet
```

In your Python environment, you may now import the library:

```python
import ee
import dask_ee
```

You'll need to initialize Earth Engine before working with data:
```python
ee.Initialize()
```

From here, you can read Earth Engine FeatureCollections as if they were DataFrames:
```python
ddf = dask_ee.read_ee("WRI/GPPD/power_plants")
ddf.head()
```
These work like Pandas DataFrames, but they are lazily evaluated via [Dask](https://dask.org/).

Feel free to do any analysis you wish. For example:
```python
# Thanks @aazuspan, https://www.aazuspan.dev/blog/dask_featurecollection
(
ddf[ddf.comm_year.gt(1940) & ddf.country.eq("USA") & ddf.fuel1.isin(["Coal", "Wind"])]
.astype({"comm_year": int})
.drop(columns=["geo"])
.groupby(["comm_year", "fuel1"])
.agg({"capacitymw": "sum"})
.reset_index()
.sort_values(by=["comm_year"])
.compute(scheduler="threads")
.pivot_table(index="comm_year", columns="fuel1", values="capacitymw", fill_value=0)
.plot()
)
```
![Coal vs Wind in the US since 1940](demo.png)

There are a few other useful things you can do.

For one, you may pass in a pre-processed `ee.FeatureCollection`. This allows full utilization
of the Earth Engine API.

```python
import ee
import dask_ee

fc = (
ee.FeatureCollection("WRI/GPPD/power_plants")
.filter(ee.Filter.gt("comm_year", 1940))
.filter(ee.Filter.eq("country", "USA"))
)
ddf = dask_ee.read_ee(fc)
```

In addition, you may change the `chunksize`, which controls how many rows are included in each
Dask partition.
```python
ddf = dask_ee.read_ee("WRI/GPPD/power_plants", chunksize=7_000)
ddf.head()
```

## License
25 changes: 19 additions & 6 deletions dask_ee/read.py
@@ -29,11 +29,24 @@
# TODO(#4): Support 'auto' chunks, where we calculate the maximum allowed page size given the number of
# bytes in each row.
def read_ee(
-    fc: ee.FeatureCollection, io_chunks: t.Union[int, t.Literal['auto']] = 5_000
+    fc: t.Union[ee.FeatureCollection, str],
+    chunksize: t.Union[int, t.Literal['auto']] = 5_000,
) -> dd.DataFrame:
-  if io_chunks == 'auto':
-    raise NotImplementedError('Auto `io_chunks` are not implemented yet!')
+  """Read Google Earth Engine FeatureCollections into a Dask Dataframe.
+
+  Args:
+    fc: A Google Earth Engine FeatureCollection or valid string path to a FeatureCollection.
+    chunksize: The number of rows per partition to use.
+
+  Returns:
+    A dask DataFrame with paged Google Earth Engine data.
+  """
+
+  if isinstance(fc, str):
+    fc = ee.FeatureCollection(fc)
+
+  if chunksize == 'auto':
+    raise NotImplementedError('Auto chunksize is not implemented yet!')

  # Make all the getInfo() calls at once, up front.
  fc_size, all_info = ee.List([fc.size(), fc.limit(0)]).getInfo()
@@ -42,12 +55,12 @@ def read_ee(
  columns.update(all_info['columns'])
  del columns['system:index']

-  divisions = tuple(range(0, fc_size, io_chunks))
+  divisions = tuple(range(0, fc_size, chunksize))

  # TODO(#5): Compare `toList()` to other range operations, like getting all index IDs via `getInfo()`.
-  pages = [ee.FeatureCollection(fc.toList(io_chunks, i)) for i in divisions]
+  pages = [ee.FeatureCollection(fc.toList(chunksize, i)) for i in divisions]
  # Get the remainder, if it exists. `io_chunks` are not likely to evenly partition the data.
-  d, r = divmod(fc_size, io_chunks)
+  d, r = divmod(fc_size, chunksize)
  if r != 0:
    pages.append(ee.FeatureCollection(fc.toList(r, d)))
    divisions += (fc_size,)
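The paging arithmetic in this function can be sketched in isolation (pure Python, no Earth Engine; `fc_size` is a made-up example value):

```python
# Mirror of the partitioning logic in `read_ee`: page start offsets every
# `chunksize` rows, plus the collection size as the final division
# boundary when the last page is a short remainder.
fc_size = 16_758   # hypothetical number of features
chunksize = 7_000

divisions = list(range(0, fc_size, chunksize))  # page start offsets
d, r = divmod(fc_size, chunksize)               # d full pages, remainder r
if r != 0:
    divisions.append(fc_size)  # boundary of the final short partition

print(divisions)  # [0, 7000, 14000, 16758]
```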
Binary file added demo.png
9 changes: 6 additions & 3 deletions pyproject.toml
@@ -1,12 +1,13 @@
[project]
name = "dask-ee"
dynamic = ["version"]
-description = "Google Earth Engine FeatureCollections via Dask Dataframes."
+description = "Google Earth Engine FeatureCollections via Dask DataFrames."
readme = "README.md"
requires-python = ">=3.8"
license = {text = "Apache-2.0"}
authors = [
  {name = "Alexander Merose", email = "[email protected]"},
+  {name = "Aaron Zuspan"}
]
classifiers = [
  "Development Status :: 4 - Beta",
@@ -22,11 +23,13 @@ classifiers = [
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Topic :: Scientific/Engineering :: Atmospheric Science",
+  "Topic :: Scientific/Engineering :: GIS",
+  "Topic :: Scientific/Engineering :: Hydrology",
+  "Topic :: Scientific/Engineering :: Oceanography",
]
dependencies = [
  "earthengine-api>=0.1.374",
-  "pandas",
-  "dask",
+  "dask[dataframe]",
]

[project.optional-dependencies]
