Commit 19191ba

Merge pull request #514 from s22s/feature/ipy-display

DataFrame.display injection

2 parents: 8bbed00 + 88f9af5

File tree: 29 files changed, +423 −709 lines

.circleci/config.yml (+3 −1)

@@ -42,7 +42,8 @@ orbs:
   steps:
     - run:
         name: Install requirements
-        command: python -m pip install --progress-bar=off --user -r pyrasterframes/src/main/python/requirements.txt
+        command: /opt/conda/bin/conda install -c conda-forge --yes --file pyrasterframes/src/main/python/requirements-condaforge.txt
+
 
 rasterframes:
   commands:
@@ -117,6 +118,7 @@ jobs:
   - checkout
   - sbt/setup
   - python/setup
+  - python/requirements
   - rasterframes/setup
   - rasterframes/restore-cache
   - sbt/compile

.gitignore (+1)

@@ -27,3 +27,4 @@ tour/*.tiff
 scoverage-report*
 
 zz-*
+rf-notebook/src/main/notebooks/.ipython

README.md (+1 −1)

@@ -62,6 +62,6 @@ Additional, Python sepcific build instruction may be found at [pyrasterframes/sr
 
 ## Copyright and License
 
-RasterFrames is released under the Apache 2.0 License, copyright Astraea, Inc. 2017-2019.
+RasterFrames is released under the Apache 2.0 License, copyright Astraea, Inc. 2017-2020.
 
 

core/src/main/resources/reference.conf (+1 −1)

@@ -1,6 +1,6 @@
 rasterframes {
   nominal-tile-size = 256
-  prefer-gdal = true
+  prefer-gdal = false
   showable-tiles = false
   showable-max-cells = 20
   max-truncate-row-element-length = 40
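Since `prefer-gdal` is a Typesafe Config key, a JVM system property takes precedence over `reference.conf`, so users who still want GDAL-first reads can presumably flip it back. A sketch only, from the Python side; the `PYSPARK_SUBMIT_ARGS` plumbing must run before the JVM starts, and the exact mechanism will vary with your deployment:

```python
# Sketch: restore GDAL-preferred reads by overriding the config key with a
# JVM system property (standard Typesafe Config precedence).
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--driver-java-options "-Drasterframes.prefer-gdal=true" pyspark-shell'
)
from pyrasterframes.utils import create_rf_spark_session
spark = create_rf_spark_session()
```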

core/src/main/scala/org/locationtech/rasterframes/util/debug/package.scala (+36)

@@ -21,10 +21,46 @@
 
 package org.locationtech.rasterframes.util
 
+import java.lang.reflect.{AccessibleObject, Modifier}
+
+import org.apache.spark.Partition
+import org.apache.spark.rdd.RDD
+
+import scala.util.Try
+
 /**
  * Additional debugging routines. No guarantees these are or will remain stable.
  *
  * @since 4/6/18
  */
 package object debug {
+
+  implicit class DescribeablePartition(val p: Partition) extends AnyVal {
+    def describe: String = Try {
+      def acc[A <: AccessibleObject](a: A): A = {
+        a.setAccessible(true); a
+      }
+
+      val getters = p.getClass.getDeclaredMethods
+        .filter(_.getParameterCount == 0)
+        .filter(m => (m.getModifiers & Modifier.PUBLIC) > 0)
+        .filterNot(_.getName == "hashCode")
+        .map(acc)
+        .map(m => m.getName + "=" + String.valueOf(m.invoke(p)))
+
+      val fields = p.getClass.getDeclaredFields
+        .filter(f => (f.getModifiers & Modifier.PUBLIC) > 0)
+        .map(acc)
+        .map(m => m.getName + "=" + String.valueOf(m.get(p)))
+
+      p.getClass.getSimpleName + "(" + (fields ++ getters).mkString(", ") + ")"
+
+    }.getOrElse(p.toString)
+  }
+
+  implicit class RDDWithPartitionDescribe(val r: RDD[_]) extends AnyVal {
+    def describePartitions: String = r.partitions.map(p => ("Partition " + p.index) -> p.describe).mkString("\n")
+  }
+
 }
+

datasource/src/it/scala/org/locationtech/rasterframes/datasource/raster/RaterSourceDataSourceIT.scala (+1 −1)

@@ -31,7 +31,7 @@ class RaterSourceDataSourceIT extends TestEnvironment with TestData {
   // A regression test.
   val rf = spark.read.raster
     .withSpatialIndex()
-    .load("https://s22s-test-geotiffs.s3.amazonaws.com/water_class/seasonality_90W_50N.tif")
+    .load("https://rasterframes.s3.amazonaws.com/samples/water_class/seasonality_90W_50N.tif")
 
   val target_rf =
     rf.select(rf_extent($"proj_raster").alias("extent"), rf_crs($"proj_raster").alias("crs"), rf_tile($"proj_raster").alias("target"))

datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala (+1 −1)

@@ -153,6 +153,6 @@ case class RasterSourceRelation(
         .repartitionByRange(numParts,$"spatial_index")
       indexed.rdd
     }
-    else df.rdd
+    else df.repartition(numParts).rdd
   }
 }
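For context, this code path is exercised from Python roughly as in the sketch below. `spatial_index_partitions` is an assumption about the reader option name, mirroring the Scala `withSpatialIndex()` seen in the integration test above:

```python
# Sketch: with a spatial index requested, rows are range-partitioned by
# `spatial_index`; without one, the change above now still repartitions to
# the requested partition count instead of leaving the RDD as-is.
df = spark.read.raster(
    'https://rasterframes.s3.amazonaws.com/samples/water_class/seasonality_90W_50N.tif',
    spatial_index_partitions=200  # assumed option name for the raster reader
)
```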
(new file, +50)

@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+
+log4j.logger.org.apache=ERROR
+log4j.logger.com.amazonaws=WARN
+log4j.logger.geotrellis=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark_project.jetty=WARN
+log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.locationtech.rasterframes=DEBUG
+log4j.logger.org.locationtech.rasterframes.ref=DEBUG
+log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=OFF
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator=ERROR
+log4j.logger.org.apache.spark.sql.execution.WholeStageCodegenExec=ERROR
+log4j.logger.geotrellis.raster.gdal=ERROR

docs/src/main/paradox/index.md (+13 −11)

@@ -29,18 +29,20 @@ The source code can be found on GitHub at [locationtech/rasterframes](https://gi
 
 ## Detailed Contents
 
-@@ toc { depth=4 }
+@@ toc { depth=3 }
 
 @@@ index
-* [Overview](description.md)
-* [Getting Started](getting-started.md)
-* [Concepts](concepts.md)
-* [Raster Data I/O](raster-io.md)
-* [Vector Data](vector-data.md)
-* [Raster Processing](raster-processing.md)
-* [Numpy and Pandas](numpy-pandas.md)
-* [Scala and SQL](languages.md)
-* [Function Reference](reference.md)
-* [Release Notes](release-notes.md)
+* @ref:[Overview](description.md)
+* @ref:[Getting Started](getting-started.md)
+* @ref:[Concepts](concepts.md)
+* @ref:[Raster Data I/O](raster-io.md)
+* @ref:[Vector Data](vector-data.md)
+* @ref:[Raster Processing](raster-processing.md)
+* @ref:[Machine Learning](machine-learning.md)
+* @ref:[Numpy and Pandas](numpy-pandas.md)
+* @ref:[IPython Extensions](ipython.md)
+* @ref:[Scala and SQL](languages.md)
+* @ref:[Function Reference](reference.md)
+* @ref:[Release Notes](release-notes.md)
 @@@
 

docs/src/main/paradox/raster-processing.md (−1)

@@ -9,7 +9,6 @@
 * @ref:[Aggregation](aggregation.md)
 * @ref:[Time Series](time-series.md)
 * @ref:[Raster Join](raster-join.md)
-* @ref:[Machine Learning](machine-learning.md)
 
 @@@
 

docs/src/main/paradox/release-notes.md (+5 −1)

@@ -5,10 +5,14 @@
 ### 0.9.1
 
 * Upgraded to Spark 2.4.7
+* Added `pyspark.sql.DataFrame.display(num_rows:int, truncate:bool)` extension method, available when `rf_ipython` is imported.
+* Added users' manual section on IPython display enhancements.
 * Added `method_name` parameter to the `rf_resample` method.
 * __BREAKING__: In SQL, the function `rf_resample` now takes 3 arguments. You can use `rf_resample_nearest` with two arguments, or refactor to `rf_resample(t, v, "nearest")`.
 * Added resample method parameter to SQL and Python APIs. @ref:[See updated docs](raster-join.md).
-
+* Upgraded many of the pyrasterframes dependencies, including:
+  `descartes`, `fiona`, `folium`, `geopandas`, `matplotlib`, `numpy`, `pandas`, `rasterio`, `shapely`
+* Changed the `rasterframes.prefer-gdal` configuration parameter to default to `False`, as the JVM GeoTIFF reader performs just as well on COGs as the GDAL one.
 
 ### 0.9.0
 
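The __BREAKING__ note above implies a mechanical migration for SQL users; a sketch, where the table `tiles` and columns `t` (tile) and `v` (the resampling factor) are hypothetical:

```python
# 0.9.0 form, which now fails because rf_resample requires 3 arguments:
#   SELECT rf_resample(t, v) FROM tiles
# 0.9.1 equivalents:
spark.sql("SELECT rf_resample(t, v, 'nearest') FROM tiles")  # explicit method name
spark.sql("SELECT rf_resample_nearest(t, v) FROM tiles")     # two-argument variant
```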

pyrasterframes/build.sbt (+1 −1)

@@ -9,7 +9,7 @@ Python / doc := (Python / doc / target).toTask.dependsOn(
   Def.sequential(
     assembly,
     Test / compile,
-    pySetup.toTask(" pweave")
+    pySetup.toTask(" pweave --quick True")
   )
 ).value
 

pyrasterframes/src/main/python/README.md (+9)

@@ -38,6 +38,15 @@ Issue tracking is through [github](https://github.com/locationtech/rasterframes/
 
 Community contributions are always welcome. To get started, please review our [contribution guidelines](https://github.com/locationtech/rasterframes/blob/develop/CONTRIBUTING.md), [code of conduct](https://github.com/locationtech/rasterframes/blob/develop/CODE_OF_CONDUCT.md), and [developer's guide](../../../README.md). Reach out to us on [gitter][gitter] so the community can help you get started!
 
+## Development environment setup
+
+For best results, we suggest using `conda` and the `conda-forge` channel to install the compiled dependencies before installing the packages in `setup.py`. Assuming you're in the same directory as this file:
+
+    conda create -n rasterframes python==3.7
+    conda install --file ./requirements-condaforge.txt
+
+Then you can install the source dependencies:
+
+    pip install -e .
 
 [gitter]: https://gitter.im/locationtech/rasterframes
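A quick way to confirm the editable install works is the sketch below; it assumes the JVM-side dependencies (Java, the assembly jar) resolve on your machine:

```python
# Start a RasterFrames-enabled Spark session and read back the Spark version.
from pyrasterframes.utils import create_rf_spark_session
spark = create_rf_spark_session()
print(spark.version)
spark.stop()
```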

pyrasterframes/src/main/python/docs/aggregation.pymd (+3 −3)

@@ -71,7 +71,7 @@ rf.agg(rf_agg_local_mean('tile')) \
 We can also count the total number of data and NoData cells over all the _tiles_ in a DataFrame using @ref:[`rf_agg_data_cells`](reference.md#rf-agg-data-cells) and @ref:[`rf_agg_no_data_cells`](reference.md#rf-agg-no-data-cells). There are ~3.8 million data cells and ~1.9 million NoData cells in this DataFrame. See the section on @ref:["NoData" handling](nodata-handling.md) for additional discussion on handling missing data.
 
 ```python, cell_counts
-rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
+rf = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 stats = rf.agg(rf_agg_data_cells('proj_raster'), rf_agg_no_data_cells('proj_raster'))
 stats
 ```
@@ -83,7 +83,7 @@ The statistical summary functions return a summary of cell values: number of dat
 The @ref:[`rf_tile_stats`](reference.md#rf-tile-stats) function computes summary statistics separately for each row in a _tile_ column as shown below.
 
 ```python, tile_stats
-rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif')
+rf = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/luray_snp/B02.tif')
 stats = rf.select(rf_tile_stats('proj_raster').alias('stats'))
 
 stats.printSchema()
@@ -125,7 +125,7 @@ The @ref:[`rf_tile_histogram`](reference.md#rf-tile-histogram) function computes
 ```python, tile_histogram
 import matplotlib.pyplot as plt
 
-rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
+rf = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 
 hist_df = rf.select(rf_tile_histogram('proj_raster')['bins'].alias('bins'))
 hist_df.printSchema()
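Given the `bins` column above, one might render the histogram locally as in this sketch; it assumes each bin is a struct with `value` and `count` fields, per the schema that `printSchema()` reveals:

```python
# Pull one row's bins to the driver and plot value vs. count.
bins = hist_df.first()['bins']
values = [b['value'] for b in bins]
counts = [b['count'] for b in bins]
plt.bar(values, counts)
plt.xlabel('cell value')
plt.ylabel('count')
plt.show()
```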
pyrasterframes/src/main/python/docs/ipython.pymd (new file, +94)

@@ -0,0 +1,94 @@
+# IPython/Jupyter Extensions
+
+The `pyrasterframes.rf_ipython` module injects a number of visualization extensions into the IPython environment, enhancing visualization of `DataFrame`s and `Tile`s.
+
+By default, the result of the last expression in an IPython cell is passed to the `IPython.display.display` function. This function in turn looks for a [`DisplayFormatter`](https://ipython.readthedocs.io/en/stable/api/generated/IPython.core.formatters.html#IPython.core.formatters.DisplayFormatter) associated with the type, which converts the instance to a display-appropriate representation, based on MIME type. For example, a `DisplayFormatter` may emit a `plain/text` version for the IPython shell, and a `text/html` version for a Jupyter Notebook.
+
+This will be our setup for the following examples:
+
+```python setup
+from pyrasterframes import *
+from pyrasterframes.rasterfunctions import *
+from pyrasterframes.utils import create_rf_spark_session
+import pyrasterframes.rf_ipython
+from IPython.display import display
+import os.path
+spark = create_rf_spark_session()
+def scene(band):
+    b = str(band).zfill(2) # converts int 2 to '02'
+    return 'https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/' \
+           'MCD43A4.A2019059.h11v08.006.2019072203257_B{}.TIF'.format(b)
+rf = spark.read.raster(scene(2), tile_dimensions=(256, 256))
+```
+
+## Tile Samples
+
+We have some convenience methods to quickly visualize tiles (see discussion of the RasterFrame @ref:[schema](raster-read.md#single-raster) for orientation to the concept) when inspecting a subset of the data in a Notebook.
+
+In an IPython or Jupyter interpreter, a `Tile` object will be displayed as an image with limited metadata.
+
+```python, sample_tile
+sample_tile = rf.select(rf_tile('proj_raster').alias('tile')).first()['tile']
+sample_tile # or `display(sample_tile)`
+```
+
+## DataFrame Samples
+
+Within an IPython or Jupyter interpreter, Spark and Pandas DataFrames containing a column of _tiles_ will be rendered as the samples discussed above. Simply import the `rf_ipython` submodule to enable enhanced HTML rendering of these DataFrame types.
+
+```python display_samples
+rf # or `display(rf)`, or `rf.display()`
+```
+
+### Changing Number of Rows
+
+By default the RasterFrame sample display renders 5 rows. Because the `IPython.display.display` function doesn't pass parameters to the underlying rendering functions, we have to provide a different means of passing parameters to the rendering code. Pandas' approach to this is to use global settings via `set_option`/`get_option`. We take a more functional approach and have the user invoke an explicit `display` method:
+
+```python custom_display, evaluate=False
+rf.display(num_rows=1, truncate=True)
+```
+
+```python custom_display_mime, echo=False
+rf.display(num_rows=1, truncate=True, mimetype='text/markdown')
+```
+
+### Pandas
+
+The `rf_ipython` module injects similar rendering support into Pandas, for Pandas DataFrames containing tiles:
+
+```python pandas_dataframe
+# Limit copy of data from Spark to a few tiles.
+pandas_df = rf.select(rf_tile('proj_raster'), rf_extent('proj_raster')).limit(4).toPandas()
+pandas_df # or `display(pandas_df)`
+```
+
+## Sample Colorization
+
+RasterFrames uses the "Viridis" color ramp as the default color profile for tile columns. There are other options for reasoning about how color should be applied in the results.
+
+### Color Composite
+
+As shown in the @ref:[Writing Raster Data](raster-write.md) section, composites can be constructed for visualization:
+
+```python, png_color_composite
+from IPython.display import Image # For telling IPython how to interpret the PNG byte array
+# Select red, green, and blue, respectively
+three_band_rf = spark.read.raster(source=[[scene(1), scene(4), scene(3)]])
+composite_rf = three_band_rf.withColumn('png',
+    rf_render_png('proj_raster_0', 'proj_raster_1', 'proj_raster_2'))
+png_bytes = composite_rf.select('png').first()['png']
+Image(png_bytes)
+```
+
+```python, png_render, echo=False
+from IPython.display import display_markdown
+display_markdown(pyrasterframes.rf_ipython.binary_to_html(png_bytes), raw=True)
+```
+
+### Custom Color Ramp
+
+You can also apply a different color ramp to a single-channel Tile using the @ref:[`rf_render_color_ramp_png`](reference.md#rf-render-color-ramp-png) function. See the function documentation for information about the available color maps.
+
+```python, color_map
+rf.select(rf_render_color_ramp_png('proj_raster', 'Magma'))
+```
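The `DisplayFormatter` mechanism this new page describes can be seen in miniature with plain IPython. A sketch, where `Point` and `point_to_html` are hypothetical stand-ins for what `rf_ipython` registers for `Tile` and `DataFrame`:

```python
# Register an HTML formatter for a custom type, as rf_ipython does for its types.
from IPython import get_ipython

class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

def point_to_html(p):
    return '<b>Point</b>({}, {})'.format(p.x, p.y)

html_formatter = get_ipython().display_formatter.formatters['text/html']
html_formatter.for_type(Point, point_to_html)

Point(1, 2)  # a notebook cell ending in this now renders via point_to_html
```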

pyrasterframes/src/main/python/docs/local-algebra.pymd (+1 −1)

@@ -35,7 +35,7 @@ This form of `(x - y) / (x + y)` is common in remote sensing and is called a nor
 
 ```python, read_rasters
 from pyspark.sql import Row
-uri_pattern = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B0{}.tif'
+uri_pattern = 'https://rasterframes.s3.amazonaws.com/samples/luray_snp/B0{}.tif'
 catalog_df = spark.createDataFrame([
     Row(red=uri_pattern.format(4), nir=uri_pattern.format(8))
 ])
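To close the loop on the `(x - y) / (x + y)` discussion, a sketch of reading this catalog and computing the normalized difference; it assumes `rf_normalized_difference` is in scope via `rasterfunctions`, as elsewhere in these docs:

```python
# Reader columns take the catalog column names: `red` and `nir`.
df = spark.read.raster(catalog_df, catalog_col_names=['red', 'nir'])
ndvi = df.select(rf_normalized_difference('nir', 'red').alias('ndvi'))
ndvi.printSchema()
```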

pyrasterframes/src/main/python/docs/masking.pymd (+3 −3)

@@ -30,9 +30,9 @@ The first step is to create a catalog with our band of interest and the SCL band
 ```python, blue_scl_cat
 from pyspark.sql import Row
 
-blue_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif'
-green_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B03.tif'
-scl_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/SCL.tif'
+blue_uri = 'https://rasterframes.s3.amazonaws.com/samples/luray_snp/B02.tif'
+green_uri = 'https://rasterframes.s3.amazonaws.com/samples/luray_snp/B03.tif'
+scl_uri = 'https://rasterframes.s3.amazonaws.com/samples/luray_snp/SCL.tif'
 cat = spark.createDataFrame([Row(blue=blue_uri, green=green_uri, scl=scl_uri),])
 unmasked = spark.read.raster(cat, catalog_col_names=['blue', 'green', 'scl'])
 unmasked.printSchema()
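The step that would follow (outside this hunk) applies the SCL band as a mask. A sketch using `rf_mask_by_values`, with illustrative, not authoritative, class values:

```python
# Mask the blue band wherever the SCL tile holds one of the listed values.
masked = unmasked.withColumn(
    'blue_masked',
    rf_mask_by_values('blue', 'scl', [0, 1, 3, 8, 9, 10])  # hypothetical class list
)
```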

pyrasterframes/src/main/python/docs/nodata-handling.pymd (+1 −1)

@@ -40,7 +40,7 @@ CellType.float64()
 We can also inspect the cell type of a given _tile_ or `proj_raster` column.
 
 ```python, ct_from_sen
-cell_types = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif') \
+cell_types = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/luray_snp/B02.tif') \
     .select(rf_cell_type('proj_raster')).distinct()
 cell_types
 ```
