12 changes: 6 additions & 6 deletions .github/workflows/java.yml
@@ -62,17 +62,17 @@ jobs:
fail-fast: true
matrix:
include:
- spark: 4.0.0
scala: 2.13.8
- spark: 4.1.1
scala: 2.13.17
jdk: '17'
- spark: 3.5.4
scala: 2.12.18
- spark: 4.0.2
scala: 2.13.17
jdk: '17'
- spark: 3.5.0
- spark: 3.5.8
scala: 2.13.8
jdk: '11'
skipTests: ''
- spark: 3.5.0
- spark: 3.5.8
scala: 2.12.15
jdk: '11'
skipTests: ''
39 changes: 3 additions & 36 deletions .github/workflows/python.yml
@@ -60,7 +60,7 @@ jobs:
strategy:
matrix:
include:
- spark: '4.0.0'
- spark: '4.1.1'
scala: '2.13.8'
java: '17'
python: '3.11'
@@ -69,42 +69,9 @@
java: '17'
python: '3.10'
- spark: '3.5.0'
scala: '2.12.8'
java: '11'
python: '3.11'
- spark: '3.5.0'
scala: '2.12.8'
java: '11'
python: '3.10'
shapely: '1'
- spark: '3.5.0'
scala: '2.12.8'
java: '11'
python: '3.10'
- spark: '3.5.0'
scala: '2.12.8'
java: '11'
python: '3.9'
- spark: '3.5.0'
scala: '2.12.8'
java: '11'
python: '3.8'
- spark: '3.4.0'
scala: '2.12.8'
java: '11'
python: '3.11'
- spark: '3.4.0'
scala: '2.12.8'
java: '11'
python: '3.10'
- spark: '3.4.0'
scala: '2.12.8'
java: '11'
python: '3.9'
- spark: '3.4.0'
scala: '2.12.8'
java: '11'
python: '3.8'
- spark: '3.4.0'
scala: '2.12.8'
java: '11'
@@ -149,9 +116,9 @@ jobs:
fi

if [ "${SPARK_VERSION:0:1}" == "4" ]; then
# Spark 4.0 requires Python 3.9+, and we remove flink since it conflicts with pyspark 4.0
# Spark 4.x requires Python 3.10+, and we remove flink since it conflicts with pyspark 4.x
uv remove apache-flink --optional flink
uv add "pyspark==4.0.0; python_version >= '3.9'"
uv add "pyspark==${SPARK_VERSION}; python_version >= '3.10'"
else
# Install specific pyspark version matching matrix
uv add pyspark==${SPARK_VERSION}
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -288,7 +288,7 @@ repos:
- id: clang-format
name: run clang-format
description: format C files with clang-format
args: [--style=file:.github/linters/.clang-format]
args: ['--style=file:.github/linters/.clang-format']
types_or: [c]
- repo: https://github.com/PyCQA/bandit
rev: 1.9.3
19 changes: 14 additions & 5 deletions docs/community/publish.md
@@ -119,13 +119,13 @@ rm -rf $LOCAL_DIR && git clone --depth 1 --branch $TAG $REPO_URL $LOCAL_DIR && c
MAVEN_PLUGIN_VERSION="2.3.2"

# Define Spark and Scala versions
declare -a SPARK_VERSIONS=("3.4" "3.5" "4.0")
declare -a SPARK_VERSIONS=("3.4" "3.5" "4.0" "4.1")
declare -a SCALA_VERSIONS=("2.12" "2.13")

# Function to get Java version for Spark version
get_java_version() {
local spark_version=$1
if [[ "$spark_version" == "4.0" ]]; then
if [[ "$spark_version" == "4."* ]]; then
echo "17"
else
echo "11"
@@ -217,8 +217,8 @@ verify_java_version() {
# Iterate through Spark and Scala versions
for SPARK in "${SPARK_VERSIONS[@]}"; do
for SCALA in "${SCALA_VERSIONS[@]}"; do
# Skip Spark 4.0 + Scala 2.12 combination as it's not supported
if [[ "$SPARK" == "4.0" && "$SCALA" == "2.12" ]]; then
# Skip Spark 4.0+ + Scala 2.12 combination as it's not supported
if [[ "$SPARK" == "4."* && "$SCALA" == "2.12" ]]; then
echo "Skipping Spark $SPARK with Scala $SCALA (not supported)"
continue
fi
@@ -286,7 +286,7 @@ mkdir apache-sedona-${SEDONA_VERSION}-bin
# Function to get Java version for Spark version
get_java_version() {
local spark_version=$1
if [[ "$spark_version" == "4.0" ]]; then
if [[ "$spark_version" == "4."* ]]; then
echo "17"
else
echo "11"
@@ -410,6 +410,15 @@ echo "Compiling for Spark 4.0 with Scala 2.13 using Java $JAVA_VERSION..."
cd apache-sedona-${SEDONA_VERSION}-src && $MVN_WRAPPER clean && $MVN_WRAPPER install -DskipTests -Dspark=4.0 -Dscala=2.13 && cd ..
cp apache-sedona-${SEDONA_VERSION}-src/spark-shaded/target/sedona-*${SEDONA_VERSION}.jar apache-sedona-${SEDONA_VERSION}-bin/

# Compile for Spark 4.1 with Java 17
JAVA_VERSION=$(get_java_version "4.1")
MVN_WRAPPER=$(create_mvn_wrapper $JAVA_VERSION)
verify_java_version $MVN_WRAPPER $JAVA_VERSION

echo "Compiling for Spark 4.1 with Scala 2.13 using Java $JAVA_VERSION..."
cd apache-sedona-${SEDONA_VERSION}-src && $MVN_WRAPPER clean && $MVN_WRAPPER install -DskipTests -Dspark=4.1 -Dscala=2.13 && cd ..
cp apache-sedona-${SEDONA_VERSION}-src/spark-shaded/target/sedona-*${SEDONA_VERSION}.jar apache-sedona-${SEDONA_VERSION}-bin/

# Clean up Maven wrappers
rm -f /tmp/mvn-java11 /tmp/mvn-java17

29 changes: 29 additions & 0 deletions docs/setup/maven-coordinates.md
@@ -133,6 +133,22 @@ The optional GeoTools library is required if you want to use raster operators. V
</dependency>
```

=== "Spark 4.1 and Scala 2.13"

```xml
<dependency>
<groupId>org.apache.sedona</groupId>
<artifactId>sedona-spark-shaded-4.1_2.13</artifactId>
<version>{{ sedona.current_version }}</version>
</dependency>
<!-- Optional: https://mvnrepository.com/artifact/org.datasyslab/geotools-wrapper -->
<dependency>
<groupId>org.datasyslab</groupId>
<artifactId>geotools-wrapper</artifactId>
<version>{{ sedona.current_geotools }}</version>
</dependency>
```

!!! abstract "Sedona with Apache Flink"

=== "Flink 1.12+ and Scala 2.12"
@@ -265,6 +281,19 @@ The optional GeoTools library is required if you want to use raster operators. V
<version>{{ sedona.current_geotools }}</version>
</dependency>
```
=== "Spark 4.1 and Scala 2.13"
```xml
<dependency>
<groupId>org.apache.sedona</groupId>
<artifactId>sedona-spark-4.1_2.13</artifactId>
<version>{{ sedona.current_version }}</version>
</dependency>
<dependency>
<groupId>org.datasyslab</groupId>
<artifactId>geotools-wrapper</artifactId>
<version>{{ sedona.current_geotools }}</version>
</dependency>
```

!!! abstract "Sedona with Apache Flink"

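As a quick usage sketch for the new Spark 4.1 coordinates above (the `SedonaContext` entry point shown here follows current Sedona docs; treat the class path and method names as assumptions to verify against your release):

```java
import org.apache.sedona.spark.SedonaContext;
import org.apache.spark.sql.SparkSession;

public class Sedona41Smoke {
  public static void main(String[] args) {
    // Assumes sedona-spark-shaded-4.1_2.13 (and optionally geotools-wrapper)
    // is already on the classpath, per the Maven coordinates above.
    SparkSession config = SedonaContext.builder()
        .master("local[*]")
        .appName("sedona-spark-4.1-smoke")
        .getOrCreate();

    // Registers Sedona's geometry types, SQL functions, and optimizer rules
    // on the plain SparkSession.
    SparkSession sedona = SedonaContext.create(config);

    sedona.sql("SELECT ST_Point(1.0, 2.0) AS geom").show();
  }
}
```

The shaded artifact is the fat jar typically used with `spark-submit`; the unshaded `sedona-spark-4.1_2.13` coordinate suits builds that manage transitive dependencies themselves.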
28 changes: 14 additions & 14 deletions docs/setup/platform.md
@@ -22,28 +22,28 @@ Sedona binary releases are compiled by Java 11/17 and Scala 2.12/2.13 and tested
**Java Requirements:**

- Spark 3.4 & 3.5: Java 11
- Spark 4.0: Java 17
- Spark 4.0 & 4.1: Java 17

**Note:** Java 8 support is dropped since Sedona 1.8.0. Spark 3.3 support is dropped since Sedona 1.8.0.

=== "Sedona Scala/Java"

| | Spark 3.4| Spark 3.5 | Spark 4.0 |
|:---------:|:---------:|:---------:|:---------:|
| Scala 2.12 |✅ |✅ |✅ |
| Scala 2.13 |✅ |✅ |✅ |
| | Spark 3.4| Spark 3.5 | Spark 4.0 | Spark 4.1 |
|:---------:|:---------:|:---------:|:---------:|:---------:|
| Scala 2.12 |✅ |✅ |✅ | |
| Scala 2.13 |✅ |✅ |✅ |✅ |

=== "Sedona Python"

| | Spark 3.4 (Scala 2.12)|Spark 3.5 (Scala 2.12)| Spark 4.0 (Scala 2.12)|
|:---------:|:---------:|:---------:|:---------:|
| Python 3.7 | ✅ | ✅ | |
| Python 3.8 | ✅ | ✅ | |
| Python 3.9 | ✅ | ✅ | ✅ |
| Python 3.10 | ✅ | ✅ | ✅ |
| | Spark 3.4 (Scala 2.12)|Spark 3.5 (Scala 2.12)| Spark 4.0 (Scala 2.13)| Spark 4.1 (Scala 2.13)|
|:---------:|:---------:|:---------:|:---------:|:---------:|
| Python 3.7 | ✅ | ✅ | | |
| Python 3.8 | ✅ | ✅ | | |
| Python 3.9 | ✅ | ✅ | ✅ | ✅ |
| Python 3.10 | ✅ | ✅ | ✅ | ✅ |

=== "Sedona R"

| | Spark 3.4 | Spark 3.5 | Spark 4.0 |
|:---------:|:---------:|:---------:|:---------:|
| Scala 2.12 | ✅ | ✅ | ✅ |
| | Spark 3.4 | Spark 3.5 | Spark 4.0 | Spark 4.1 |
|:---------:|:---------:|:---------:|:---------:|:---------:|
| Scala 2.12 | ✅ | ✅ | ✅ | |
27 changes: 25 additions & 2 deletions pom.xml
@@ -758,7 +758,30 @@
<log4j.version>2.24.3</log4j.version>
<slf4j.version>2.0.16</slf4j.version>

<scala.version>2.13.12</scala.version>
<scala.version>2.13.17</scala.version>
<scala.compat.version>2.13</scala.compat.version>

<!-- Skip deploying parent module. it will be deployed with sedona-spark-3.4 -->
<skip.deploy.common.modules>true</skip.deploy.common.modules>
</properties>
</profile>
<profile>
<id>sedona-spark-4.1</id>
<activation>
<property>
<name>spark</name>
<value>4.1</value>
</property>
</activation>
<properties>
<spark.version>4.1.1</spark.version>
<spark.compat.version>4.1</spark.compat.version>
<spark.major.version>4</spark.major.version>
<hadoop.version>3.4.1</hadoop.version>
<log4j.version>2.24.3</log4j.version>
<slf4j.version>2.0.16</slf4j.version>

<scala.version>2.13.17</scala.version>
<scala.compat.version>2.13</scala.compat.version>

<!-- Skip deploying parent module. it will be deployed with sedona-spark-3.4 -->
@@ -775,7 +798,7 @@
<activeByDefault>false</activeByDefault>
</activation>
<properties>
<scala.version>2.13.12</scala.version>
<scala.version>2.13.17</scala.version>
<scala.compat.version>2.13</scala.compat.version>
<scaladoc.arg>-no-java-comments</scaladoc.arg>
<!-- Skip deploying parent module for Scala 2.13 profile, it will be deployed with 2.12 -->
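The new `sedona-spark-4.1` profile is activated the same way as the existing ones, via the `spark` property, e.g. `mvn clean install -DskipTests -Dspark=4.1 -Dscala=2.13`, matching the build commands added to the release script above.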
11 changes: 8 additions & 3 deletions python/pyproject.toml
@@ -37,13 +37,17 @@ dependencies = [
]

[project.optional-dependencies]
spark = ["pyspark>=3.4.0,<4.1.0"]
spark = [
"pyspark>=3.4.0,<4.1.0; python_version < '3.10'",
"pyspark>=3.4.0,<4.2.0; python_version >= '3.10'",
]
pydeck-map = ["geopandas", "pydeck==0.8.0"]
kepler-map = ["geopandas", "keplergl==0.3.2"]
flink = ["apache-flink>=1.19.0"]
db = ["sedonadb[geopandas]; python_version >= '3.9'"]
all = [
"pyspark>=3.4.0,<4.1.0",
"pyspark>=3.4.0,<4.1.0; python_version < '3.10'",
"pyspark>=3.4.0,<4.2.0; python_version >= '3.10'",
"geopandas",
"pydeck==0.8.0",
"keplergl==0.3.2",
@@ -71,7 +75,8 @@ dev = [
# cannot set geopandas>=0.14.4 since it doesn't support python 3.8, so we pin fiona to <1.10.0
"fiona<1.10.0",
"pyarrow",
"pyspark>=3.4.0,<4.1.0",
"pyspark>=3.4.0,<4.1.0; python_version < '3.10'",
"pyspark>=3.4.0,<4.2.0; python_version >= '3.10'",
"keplergl==0.3.2",
"pydeck==0.8.0",
"pystac==1.5.0",
2 changes: 1 addition & 1 deletion python/tests/sql/test_dataframe_api.py
@@ -1790,7 +1790,7 @@ def test_dataframe_function(
elif isinstance(actual_result, Geography):
# self.assert_geometry_almost_equal(expected_result, actual_result.geometry)
return
elif isinstance(actual_result, bytearray):
elif isinstance(actual_result, (bytes, bytearray)):
actual_result = actual_result.hex()
elif isinstance(actual_result, Row):
actual_result = {
17 changes: 17 additions & 0 deletions spark/common/pom.xml
@@ -355,5 +355,22 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>sedona-spark-4.1</id>
<activation>
<property>
<name>spark</name>
<value>4.1</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-api_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>
@@ -20,7 +20,9 @@

import java.util.Iterator;
import java.util.List;
import org.apache.commons.collections.iterators.FilterIterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -57,11 +59,10 @@ public Iterator<Pair<U, T>> call(Integer partitionId, Iterator<Pair<U, T>> geome
final List<Envelope> partitionExtents = dedupParamsBroadcast.getValue().getPartitionExtents();
if (partitionId < partitionExtents.size()) {
HalfOpenRectangle extent = new HalfOpenRectangle(partitionExtents.get(partitionId));
return new FilterIterator(
geometryPair,
p ->
!GeomUtils.isDuplicate(
((Pair<U, T>) p).getLeft(), ((Pair<U, T>) p).getRight(), extent));
return StreamSupport.stream(
Spliterators.spliteratorUnknownSize(geometryPair, Spliterator.ORDERED), false)
.filter(p -> !GeomUtils.isDuplicate(p.getLeft(), p.getRight(), extent))
.iterator();
} else {
log.warn("Didn't find partition extent for this partition: " + partitionId);
return geometryPair;
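The change above swaps the raw-typed commons-collections `FilterIterator` for a typed `java.util.stream` pipeline, which is what lets the diff drop the `((Pair<U, T>) p)` casts. A self-contained sketch of the same bridging pattern (class and variable names here are illustrative, not Sedona's):

```java
import java.util.Arrays;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.StreamSupport;

public class IteratorFilterSketch {
  public static void main(String[] args) {
    Iterator<String> source = Arrays.asList("keep-a", "drop-b", "keep-c").iterator();

    // Wrap the Iterator in a Spliterator of unknown size, expose it as a
    // sequential Stream (parallel = false), filter lazily, and hand back an
    // Iterator; no intermediate collection is materialized.
    Iterator<String> filtered =
        StreamSupport.stream(
                Spliterators.spliteratorUnknownSize(source, Spliterator.ORDERED), false)
            .filter(s -> s.startsWith("keep"))
            .iterator();

    filtered.forEachRemaining(System.out::println); // prints keep-a, keep-c
  }
}
```

Because the stream preserves the element type `Pair<U, T>` in the real code, the `getLeft()`/`getRight()` calls in the predicate need no casts, and the commons-collections import is no longer needed here.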