diff --git a/common/src/main/java/org/apache/sedona/common/FunctionsProj4.java b/common/src/main/java/org/apache/sedona/common/FunctionsProj4.java
index b5a5c1c43e0..8b40d93b6a0 100644
--- a/common/src/main/java/org/apache/sedona/common/FunctionsProj4.java
+++ b/common/src/main/java/org/apache/sedona/common/FunctionsProj4.java
@@ -18,9 +18,14 @@
  */
 package org.apache.sedona.common;
 
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.datasyslab.proj4sedona.core.Proj;
+import org.datasyslab.proj4sedona.defs.CRSResult;
+import org.datasyslab.proj4sedona.defs.Defs;
+import org.datasyslab.proj4sedona.defs.UrlCRSProvider;
 import org.datasyslab.proj4sedona.jts.JTSGeometryTransformer;
 import org.datasyslab.proj4sedona.parser.CRSSerializer;
 import org.locationtech.jts.geom.Geometry;
@@ -62,6 +67,111 @@ public class FunctionsProj4 {
   private static final Pattern EPSG_PATTERN =
       Pattern.compile("^EPSG:(\\d+)$", Pattern.CASE_INSENSITIVE);
 
+  /** Name used for the registered URL CRS provider. */
+  private static final String URL_CRS_PROVIDER_NAME = "sedona-url-crs";
+
+  /**
+   * Tracks the currently registered URL CRS provider config (baseUrl + "|" + pathTemplate + "|" +
+   * format). Null means no provider registered yet. Uses AtomicReference for thread-safe lazy
+   * initialization on executors.
+   */
+  private static final AtomicReference<String> registeredUrlCrsConfig = new AtomicReference<>(null);
+
+  /**
+   * Reset the URL CRS provider state. Package-private for testing only. Removes the provider from
+   * Defs and clears the cached config key.
+   */
+  static void resetUrlCrsProviderForTest() {
+    Defs.removeProvider(URL_CRS_PROVIDER_NAME);
+    registeredUrlCrsConfig.set(null);
+  }
+
+  /**
+   * Register a URL-based CRS provider with proj4sedona's Defs registry. This provider will be
+   * consulted before the built-in provider when resolving EPSG codes.
+   *
+   * <p>This method is safe to call concurrently from multiple threads — it uses double-checked
+   * locking so the fast path (already registered with the same config) is lock-free, and the
+   * synchronized slow path executes at most once per JVM (or once per config change).
+   *
+   * <p>A null or empty {@code baseUrl} is a no-op. A null, empty, or unrecognized {@code format}
+   * resolves to PROJJSON.
+   *
+   * @param baseUrl The base URL of the CRS definition server
+   * @param pathTemplate The URL path template (e.g., "/{authority}/{code}.json")
+   * @param format The expected response format: "projjson", "proj", "wkt1", or "wkt2"
+   */
+  public static void registerUrlCrsProvider(String baseUrl, String pathTemplate, String format) {
+    if (baseUrl == null || baseUrl.isEmpty()) {
+      return;
+    }
+
+    // Parse the format once; the canonical (lowercased enum) name goes into the config key so
+    // equivalent spellings such as "PROJJSON" and "projjson" do not trigger re-registration.
+    CRSResult.Format crsFormat = parseCrsFormat(format);
+    String canonicalFormat = crsFormat.name().toLowerCase(Locale.ROOT);
+    String configKey = baseUrl + "|" + pathTemplate + "|" + canonicalFormat;
+
+    // Fast path (lock-free): already registered with the same config.
+    // This handles 99.999%+ of calls with just a volatile read + String.equals().
+    if (configKey.equals(registeredUrlCrsConfig.get())) {
+      return;
+    }
+
+    // Slow path: synchronize to make the remove-register-set sequence atomic.
+    // Only the first thread per JVM (or per config change) enters this block.
+    synchronized (registeredUrlCrsConfig) {
+      // Re-check after acquiring lock — another thread may have registered already
+      String current = registeredUrlCrsConfig.get();
+      if (configKey.equals(current)) {
+        return;
+      }
+
+      // Build the replacement first: if build() throws, the previously registered provider
+      // (if any) is left untouched.
+      UrlCRSProvider provider =
+          UrlCRSProvider.builder(URL_CRS_PROVIDER_NAME)
+              .baseUrl(baseUrl)
+              .pathTemplate(pathTemplate)
+              .format(crsFormat)
+              .build();
+
+      // Remove existing provider if config changed. Clear the cached key at the same time so
+      // it never claims a registration that no longer exists if registerProvider fails below.
+      if (current != null) {
+        Defs.removeProvider(URL_CRS_PROVIDER_NAME);
+        registeredUrlCrsConfig.set(null);
+      }
+
+      // Priority 50: before built-in (100) and spatialreference.org (101)
+      Defs.registerProvider(provider, 50);
+      registeredUrlCrsConfig.set(configKey);
+    }
+  }
+
+  /**
+   * Parse the CRS format string from config to the CRSResult.Format enum.
+   *
+   * @param format Format string: "projjson", "proj", "wkt1", or "wkt2" (case-insensitive)
+   * @return The corresponding CRSResult.Format; PROJJSON when null, empty, or unrecognized
+   */
+  private static CRSResult.Format parseCrsFormat(String format) {
+    if (format == null || format.isEmpty()) {
+      return CRSResult.Format.PROJJSON;
+    }
+    switch (format.toLowerCase(Locale.ROOT)) {
+      case "proj":
+        return CRSResult.Format.PROJ4;
+      case "wkt1":
+        return CRSResult.Format.WKT1;
+      case "wkt2":
+        return CRSResult.Format.WKT2;
+      case "projjson":
+      default:
+        return CRSResult.Format.PROJJSON;
+    }
+  }
+
   /**
    * Transform a geometry from the source CRS specified by the geometry's SRID to the target CRS.
* diff --git a/common/src/test/java/org/apache/sedona/common/FunctionsProj4Test.java b/common/src/test/java/org/apache/sedona/common/FunctionsProj4Test.java index 903bf2f9d30..2584f99ccb0 100644 --- a/common/src/test/java/org/apache/sedona/common/FunctionsProj4Test.java +++ b/common/src/test/java/org/apache/sedona/common/FunctionsProj4Test.java @@ -21,8 +21,18 @@ import static org.junit.Assert.*; import static org.junit.Assume.assumeTrue; +import com.sun.net.httpserver.HttpServer; +import java.net.InetSocketAddress; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; import org.junit.Test; import org.locationtech.jts.geom.*; import org.locationtech.jts.io.WKTReader; @@ -573,4 +583,213 @@ public void testRoundTrip() { assertEquals(original.getCoordinate().x, backToWgs84.getCoordinate().x, 1e-9); assertEquals(original.getCoordinate().y, backToWgs84.getCoordinate().y, 1e-9); } + + // ==================== URL CRS Provider Registration Tests ==================== + + @Test + public void testRegisterUrlCrsProviderNoOpOnNullOrEmpty() { + // null and empty baseUrl should be no-ops, not throw + FunctionsProj4.registerUrlCrsProvider(null, "/epsg/{code}.json", "projjson"); + FunctionsProj4.registerUrlCrsProvider("", "/epsg/{code}.json", "projjson"); + // No provider should have been registered + assertNull("No provider should be registered for null/empty baseUrl", findUrlCrsProvider()); + } + + @Test + public void testRegisterUrlCrsProviderRegistersAndIsIdempotent() { + String testUrl = "https://test-crs-server.example.com"; + try { + FunctionsProj4.registerUrlCrsProvider(testUrl, "/epsg/{code}.json", "projjson"); + assertNotNull("sedona-url-crs provider should be 
registered", findUrlCrsProvider()); + int countBefore = countProvidersByName("sedona-url-crs"); + + // Second call with same config — should not add a duplicate + FunctionsProj4.registerUrlCrsProvider(testUrl, "/epsg/{code}.json", "projjson"); + assertEquals( + "Provider should not be duplicated", countBefore, countProvidersByName("sedona-url-crs")); + } finally { + FunctionsProj4.resetUrlCrsProviderForTest(); + } + } + + @Test + public void testRegisterUrlCrsProviderReRegistersOnConfigChange() { + try { + FunctionsProj4.registerUrlCrsProvider( + "https://server-a.example.com", "/epsg/{code}.json", "projjson"); + assertEquals( + org.datasyslab.proj4sedona.defs.CRSResult.Format.PROJJSON, + findUrlCrsProvider().getFormat()); + + // Change config — should re-register with new settings + FunctionsProj4.registerUrlCrsProvider( + "https://server-b.example.com", "/epsg/{code}.json", "wkt2"); + assertEquals( + org.datasyslab.proj4sedona.defs.CRSResult.Format.WKT2, findUrlCrsProvider().getFormat()); + } finally { + FunctionsProj4.resetUrlCrsProviderForTest(); + } + } + + @Test + public void testParseCrsFormatAllMappings() { + // Verify all valid format strings map to the correct enum + Object[][] cases = { + {"projjson", org.datasyslab.proj4sedona.defs.CRSResult.Format.PROJJSON}, + {"proj", org.datasyslab.proj4sedona.defs.CRSResult.Format.PROJ4}, + {"wkt1", org.datasyslab.proj4sedona.defs.CRSResult.Format.WKT1}, + {"wkt2", org.datasyslab.proj4sedona.defs.CRSResult.Format.WKT2}, + }; + for (Object[] c : cases) { + try { + FunctionsProj4.registerUrlCrsProvider( + "https://test.example.com", "/epsg/{code}", (String) c[0]); + assertEquals("Format '" + c[0] + "'", c[1], findUrlCrsProvider().getFormat()); + } finally { + FunctionsProj4.resetUrlCrsProviderForTest(); + } + } + } + + @Test + public void testParseCrsFormatDefaultsAndCaseInsensitive() { + // null, empty, unknown, and uppercase should all default to / map to PROJJSON + String[] inputs = {null, "", "unknown-format", 
"PROJJSON", "ProjJson"}; + for (String input : inputs) { + try { + FunctionsProj4.registerUrlCrsProvider("https://test.example.com", "/epsg/{code}", input); + assertEquals( + "Format input '" + input + "' should resolve to PROJJSON", + org.datasyslab.proj4sedona.defs.CRSResult.Format.PROJJSON, + findUrlCrsProvider().getFormat()); + } finally { + // Use the test reset so registeredUrlCrsConfig is also cleared + FunctionsProj4.resetUrlCrsProviderForTest(); + } + } + } + + @Test + public void testTransformWithLocalUrlCrsProvider() throws Exception { + // Serve a deliberately wrong CRS definition for a fake EPSG code (990001) + // that no built-in provider knows. The definition is a Mercator projection + // with absurd false easting/northing (+x_0=10000000 +y_0=20000000). + // If the transform succeeds with these shifted coordinates, the URL provider + // resolved the CRS. If it didn't work, the transform would fail entirely + // because no built-in provider knows EPSG:990001. + AtomicInteger requestCount = new AtomicInteger(0); + HttpServer server = HttpServer.create(new InetSocketAddress(0), 0); + int port = server.getAddress().getPort(); + + // Web Mercator with intentional 10M/20M false easting/northing + String weirdMercator = + "+proj=merc +a=6378137 +b=6378137 +lat_ts=0 +lon_0=0" + + " +x_0=10000000 +y_0=20000000 +k=1 +units=m +no_defs"; + + server.createContext( + "/epsg/", + exchange -> { + String path = exchange.getRequestURI().getPath(); + if (path.contains("990001")) { + requestCount.incrementAndGet(); + byte[] body = weirdMercator.getBytes(StandardCharsets.UTF_8); + exchange.sendResponseHeaders(200, body.length); + exchange.getResponseBody().write(body); + exchange.getResponseBody().close(); + } else { + // 404 for everything else — built-in providers handle known codes + exchange.sendResponseHeaders(404, -1); + exchange.getResponseBody().close(); + } + }); + server.start(); + + try { + FunctionsProj4.registerUrlCrsProvider( + "http://localhost:" + port, 
"/epsg/{code}.json", "proj"); + + Point point = GEOMETRY_FACTORY.createPoint(new Coordinate(-122.4194, 37.7749)); + Geometry result = FunctionsProj4.transform(point, "EPSG:4326", "EPSG:990001"); + + assertNotNull("Transform to fake EPSG:990001 should succeed via URL provider", result); + assertEquals(990001, result.getSRID()); + // Standard Web Mercator: x = -13627665.27, y = 4547675.35 + // Our weird definition adds +x_0=10000000, +y_0=20000000 + assertEquals(-3627665.27, result.getCoordinate().x, 1.0); + assertEquals(24547675.35, result.getCoordinate().y, 1.0); + assertTrue("Local HTTP server should have been hit", requestCount.get() > 0); + } finally { + server.stop(0); + FunctionsProj4.resetUrlCrsProviderForTest(); + } + } + + @Test + public void testRegisterUrlCrsProviderConcurrentThreadSafety() throws Exception { + // Verify that concurrent calls to registerUrlCrsProvider do not produce + // duplicate providers or corrupt the registry. This exercises the + // synchronized double-checked locking path. 
+ final int threadCount = 16; + final String testUrl = "https://concurrent-test.example.com"; + final String pathTemplate = "/epsg/{code}.json"; + final String format = "projjson"; + + ExecutorService pool = Executors.newFixedThreadPool(threadCount); + CyclicBarrier barrier = new CyclicBarrier(threadCount); + + try { + List> futures = new ArrayList<>(); + for (int i = 0; i < threadCount; i++) { + futures.add( + pool.submit( + () -> { + try { + // All threads wait at the barrier then race into registration + barrier.await(); + FunctionsProj4.registerUrlCrsProvider(testUrl, pathTemplate, format); + } catch (Exception e) { + throw new RuntimeException(e); + } + })); + } + + // Wait for all threads to complete and propagate any exceptions + for (Future f : futures) { + f.get(); + } + + // After all concurrent registrations, there should be exactly 1 provider + assertEquals( + "Concurrent registration must produce exactly 1 provider", + 1, + countProvidersByName("sedona-url-crs")); + } finally { + pool.shutdown(); + FunctionsProj4.resetUrlCrsProviderForTest(); + } + } + + // Helper: count providers with a given name + private int countProvidersByName(String name) { + int count = 0; + for (org.datasyslab.proj4sedona.defs.CRSProvider p : + org.datasyslab.proj4sedona.defs.Defs.getProviders()) { + if (name.equals(p.getName())) { + count++; + } + } + return count; + } + + // Helper: find the registered UrlCRSProvider + private org.datasyslab.proj4sedona.defs.UrlCRSProvider findUrlCrsProvider() { + for (org.datasyslab.proj4sedona.defs.CRSProvider p : + org.datasyslab.proj4sedona.defs.Defs.getProviders()) { + if ("sedona-url-crs".equals(p.getName()) + && p instanceof org.datasyslab.proj4sedona.defs.UrlCRSProvider) { + return (org.datasyslab.proj4sedona.defs.UrlCRSProvider) p; + } + } + return null; + } } diff --git a/docs/api/sql/CRS-Transformation.md b/docs/api/sql/CRS-Transformation.md index a61533aab5c..24a15c98dcc 100644 --- a/docs/api/sql/CRS-Transformation.md +++ 
b/docs/api/sql/CRS-Transformation.md @@ -200,6 +200,172 @@ SELECT ST_Transform( ) AS transformed_point ``` +## URL CRS Provider + +Since v1.9.0, Sedona supports resolving CRS definitions from a remote HTTP server. This is useful when you need custom or internal CRS definitions that are not included in the built-in database, or when you want to use your own CRS definition service. + +When configured, the URL provider is consulted **before** the built-in CRS database. If the URL provider returns a valid CRS definition, it is used directly. If the URL returns a 404 or an error, Sedona falls back to the built-in definitions. + +### Hosting CRS definitions + +You can host your custom CRS definitions on any HTTP-accessible location. Two common approaches: + +- **GitHub repository**: Store CRS definition files in a public GitHub repo and use the raw content URL. This is the easiest way to get started — no server infrastructure required. +- **Public S3 bucket**: Upload CRS definition files to an Amazon S3 bucket with public read access and use the S3 static website URL or CloudFront distribution. + +Each file should contain a single CRS definition in the format you specify via `spark.sedona.crs.url.format` (PROJJSON, PROJ string, WKT1, or WKT2). + +### Configuration + +Set the following Spark configuration properties when creating your Sedona session: + +```python +config = ( + SedonaContext.builder() + .config("spark.sedona.crs.url.base", "https://crs.example.com") + .config("spark.sedona.crs.url.pathTemplate", "/{authority}/{code}.json") + .config("spark.sedona.crs.url.format", "projjson") + .getOrCreate() +) +sedona = SedonaContext.create(config) +``` + +With the default path template, resolving `EPSG:4326` will fetch: + +``` +https://crs.example.com/epsg/4326.json +``` + +Only `spark.sedona.crs.url.base` is required. The other two properties have sensible defaults (`/{authority}/{code}.json` and `projjson`). 
+ +### Supported response formats + +| Format value | Description | Content example | +|-------------|-------------|----------------| +| `projjson` | PROJJSON (default) | `{"type": "GeographicCRS", ...}` | +| `proj` | PROJ string | `+proj=longlat +datum=WGS84 +no_defs` | +| `wkt1` | OGC WKT1 | `GEOGCS["WGS 84", ...]` | +| `wkt2` | ISO 19162 WKT2 | `GEOGCRS["WGS 84", ...]` | + +### Example: GitHub repository + +Suppose you have a GitHub repo `myorg/crs-definitions` with the following structure: + +``` +crs-definitions/ + epsg/ + 990001.proj + 990002.proj +``` + +where `epsg/990001.proj` contains a PROJ string like: + +``` ++proj=merc +a=6378137 +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +no_defs +``` + +Point Sedona to the raw GitHub content URL: + +```python +config = ( + SedonaContext.builder() + .config( + "spark.sedona.crs.url.base", + "https://raw.githubusercontent.com/myorg/crs-definitions/main", + ) + .config("spark.sedona.crs.url.pathTemplate", "/epsg/{code}.proj") + .config("spark.sedona.crs.url.format", "proj") + .getOrCreate() +) +sedona = SedonaContext.create(config) + +# Resolves EPSG:990001 from: +# https://raw.githubusercontent.com/myorg/crs-definitions/main/epsg/990001.proj +sedona.sql(""" + SELECT ST_Transform( + ST_GeomFromText('POINT(-122.4194 37.7749)'), + 'EPSG:4326', + 'EPSG:990001' + ) AS transformed_point +""").show() +``` + +### Example: self-hosted CRS server + +```python +config = ( + SedonaContext.builder() + .config("spark.sedona.crs.url.base", "https://crs.mycompany.com") + .config("spark.sedona.crs.url.pathTemplate", "/epsg/{code}.proj") + .config("spark.sedona.crs.url.format", "proj") + .getOrCreate() +) +sedona = SedonaContext.create(config) + +# Now ST_Transform will try https://crs.mycompany.com/epsg/3857.proj +# before falling back to built-in definitions +sedona.sql(""" + SELECT ST_Transform( + ST_GeomFromText('POINT(-122.4194 37.7749)'), + 'EPSG:4326', + 'EPSG:3857' + ) AS transformed_point +""").show() +``` + 
+### Example: custom authority codes + +The URL provider is especially useful for custom or internal authority codes that are not in any public database. With the default path template `/{authority}/{code}.json`, the `{authority}` placeholder is replaced by the authority name from the CRS string (lowercased): + +```python +config = ( + SedonaContext.builder() + .config("spark.sedona.crs.url.base", "https://crs.mycompany.com") + .config("spark.sedona.crs.url.format", "proj") + .getOrCreate() +) +sedona = SedonaContext.create(config) + +# Resolves MYORG:1001 from: +# https://crs.mycompany.com/myorg/1001.json +sedona.sql(""" + SELECT ST_Transform( + ST_GeomFromText('POINT(-122.4194 37.7749)'), + 'EPSG:4326', + 'MYORG:1001' + ) AS transformed_point +""").show() +``` + +### Example: using geometry SRID with URL provider + +If the geometry already has an SRID set (e.g., via `ST_SetSRID`), you can omit the source CRS parameter. The source CRS is derived from the geometry's SRID as an EPSG code: + +```python +config = ( + SedonaContext.builder() + .config("spark.sedona.crs.url.base", "https://crs.mycompany.com") + .config("spark.sedona.crs.url.format", "proj") + .getOrCreate() +) +sedona = SedonaContext.create(config) + +# The source CRS is taken from the geometry's SRID (4326 → EPSG:4326). +# Only the target CRS string is needed. +sedona.sql(""" + SELECT ST_Transform( + ST_SetSRID(ST_GeomFromText('POINT(-122.4194 37.7749)'), 4326), + 'EPSG:3857' + ) AS transformed_point +""").show() +``` + +### Disabling the URL provider + +To avoid enabling the URL provider, omit `spark.sedona.crs.url.base` or leave it as an empty string (the default). Note that once a URL provider has been registered in an executor JVM, it remains active for the lifetime of that JVM. + +See also: [Configuration parameters](Parameter.md#crs-transformation) for the full list of URL CRS provider settings. 
+ ## Grid File Support Grid files enable high-accuracy datum transformations, such as NAD27 to NAD83 or OSGB36 to ETRS89. Sedona supports loading grid files from multiple sources. diff --git a/docs/api/sql/Parameter.md b/docs/api/sql/Parameter.md index b0888211b88..eac2bd29754 100644 --- a/docs/api/sql/Parameter.md +++ b/docs/api/sql/Parameter.md @@ -111,3 +111,22 @@ If you set the same parameter through both `sedona` and `spark.sedona` prefixes, * raster: Use proj4sedona for vector transformations, GeoTools for raster transformations * all: Use GeoTools for all transformations (legacy behavior) * Since: v1.9.0 +* spark.sedona.crs.url.base + * Base URL of a CRS definition server for resolving authority codes (e.g., EPSG) via HTTP. When set, ST_Transform will consult this URL provider before the built-in definitions. + * Default: (empty string — URL provider disabled) + * Example: `https://crs.example.com` + * Since: v1.9.0 +* spark.sedona.crs.url.pathTemplate + * URL path template appended to `spark.sedona.crs.url.base`. The placeholders `{authority}` and `{code}` are replaced with the authority name (e.g., `epsg`) and numeric code (e.g., `4326`) at runtime. + * Default: `/{authority}/{code}.json` + * Example: `/epsg/{code}.json` (for a server that only serves EPSG codes) + * Since: v1.9.0 +* spark.sedona.crs.url.format + * The CRS definition format returned by the URL provider. 
+ * Default: projjson + * Possible values: + * projjson: PROJJSON format + * proj: PROJ string format + * wkt1: OGC WKT1 format + * wkt2: ISO 19162 WKT2 format + * Since: v1.9.0 diff --git a/pom.xml b/pom.xml index 05ca1cde9c0..b8025399e64 100644 --- a/pom.xml +++ b/pom.xml @@ -96,7 +96,7 @@ 2.5.0 1.52 2.9.2 - 0.0.3 + 0.0.4 provided diff --git a/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java b/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java index 44b28858156..1b15914f671 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java +++ b/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java @@ -119,6 +119,11 @@ public boolean useGeoToolsForRaster() { // Parameter for CRS transformation mode private CRSTransformMode crsTransformMode; + // Parameters for URL-based CRS provider + private String crsUrlBase; + private String crsUrlPathTemplate; + private String crsUrlFormat; + public static SedonaConf fromActiveSession() { return new SedonaConf(SparkSession.active().conf()); } @@ -234,6 +239,14 @@ private SedonaConf(ConfGetter confGetter) { // - "all": Use GeoTools for all transformations (legacy behavior) this.crsTransformMode = CRSTransformMode.fromString(confGetter.get("spark.sedona.crs.geotools", "raster")); + + // URL-based CRS provider configuration + // When spark.sedona.crs.url.base is set, a UrlCRSProvider is registered to resolve + // SRID definitions from the given HTTP(S) endpoint before falling back to built-in defs. 
+ this.crsUrlBase = confGetter.get("spark.sedona.crs.url.base", ""); + this.crsUrlPathTemplate = + confGetter.get("spark.sedona.crs.url.pathTemplate", "/{authority}/{code}.json"); + this.crsUrlFormat = confGetter.get("spark.sedona.crs.url.format", "projjson"); } // Helper method to prioritize `sedona.*` over `spark.sedona.*` @@ -342,4 +355,36 @@ public Boolean getLibPostalUseSenzing() { public CRSTransformMode getCRSTransformMode() { return crsTransformMode; } + + /** + * Get the base URL for the URL-based CRS provider. When non-empty, a {@code UrlCRSProvider} is + * registered to resolve SRID definitions from this HTTP(S) endpoint. + * + * @return The base URL, or empty string if disabled + * @since 1.9.0 + */ + public String getCrsUrlBase() { + return crsUrlBase; + } + + /** + * Get the path template for the URL-based CRS provider. Supports placeholders: {@code + * {authority}} and {@code {code}}. + * + * @return The path template (default: "/{authority}/{code}.json") + * @since 1.9.0 + */ + public String getCrsUrlPathTemplate() { + return crsUrlPathTemplate; + } + + /** + * Get the expected response format for the URL-based CRS provider. 
+ * + * @return The format string: "projjson", "proj", "wkt1", or "wkt2" (default: "projjson") + * @since 1.9.0 + */ + public String getCrsUrlFormat() { + return crsUrlFormat; + } } diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala index b5f85b89682..da470ef6ff1 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala @@ -307,18 +307,28 @@ private[apache] case class ST_Centroid(inputExpressions: Seq[Expression]) * @param inputExpressions * @param useGeoTools */ -private[apache] case class ST_Transform(inputExpressions: Seq[Expression], useGeoTools: Boolean) +private[apache] case class ST_Transform( + inputExpressions: Seq[Expression], + useGeoTools: Boolean, + crsUrlBase: String, + crsUrlPathTemplate: String, + crsUrlFormat: String) extends InferredExpression( inferrableFunction4(FunctionsProj4.transform), inferrableFunction3(FunctionsProj4.transform), inferrableFunction2(FunctionsProj4.transform)) { - def this(inputExpressions: Seq[Expression]) { - // We decide whether to use GeoTools based on active session config. - // SparkSession may not be available on executors, so we need to - // construct ST_Transform on driver. useGeoTools will be passed down - // to executors through object serialization/deserialization. - this(inputExpressions, ST_Transform.useGeoTools()) + private def this( + inputExpressions: Seq[Expression], + config: (Boolean, String, String, String)) = { + this(inputExpressions, config._1, config._2, config._3, config._4) + } + + def this(inputExpressions: Seq[Expression]) = { + // Read all config from SedonaConf on the driver and pass to primary constructor. 
+ // SparkSession may not be available on executors, so config is captured here + // and serialized to executors along with the expression node. + this(inputExpressions, ST_Transform.readConfig()) } // Define proj4sedona function overloads (2, 3, 4-arg versions) @@ -335,6 +345,13 @@ private[apache] case class ST_Transform(inputExpressions: Seq[Expression], useGe inferrableFunction2(FunctionsGeoTools.transform)) override lazy val f: InferrableFunction = { + // Register URL CRS provider on executor if configured (lazy, once per JVM). + // This runs inside lazy val f so it only executes on executors during row + // evaluation, never on the driver during query planning. + if (crsUrlBase.nonEmpty) { + FunctionsProj4.registerUrlCrsProvider(crsUrlBase, crsUrlPathTemplate, crsUrlFormat) + } + // Check config to decide between proj4sedona and GeoTools // Note: 4-arg lenient parameter is ignored by proj4sedona val candidateFunctions = if (useGeoTools) geoToolsFunctions else proj4Functions @@ -347,13 +364,23 @@ private[apache] case class ST_Transform(inputExpressions: Seq[Expression], useGe } object ST_Transform { - private def useGeoTools(): Boolean = { + + /** + * Read all ST_Transform config from SedonaConf in one call. Defaults are handled by SedonaConf + * itself. Returns safe fallbacks (proj4sedona, no URL provider) when no active session exists. 
+ */ + private def readConfig(): (Boolean, String, String, String) = { try { - SedonaConf.fromActiveSession().getCRSTransformMode.useGeoToolsForVector() + val conf = SedonaConf.fromActiveSession() + ( + conf.getCRSTransformMode.useGeoToolsForVector(), + conf.getCrsUrlBase, + conf.getCrsUrlPathTemplate, + conf.getCrsUrlFormat) } catch { case _: Exception => - // If no active session, fall back to default (proj4sedona) - false + // No active session (e.g., during constant folding) — use safe defaults + (false, "", "", "") } } } diff --git a/spark/common/src/test/java/org/apache/sedona/core/utils/SedonaConfTest.java b/spark/common/src/test/java/org/apache/sedona/core/utils/SedonaConfTest.java index f90641f76a5..0fb24e02359 100644 --- a/spark/common/src/test/java/org/apache/sedona/core/utils/SedonaConfTest.java +++ b/spark/common/src/test/java/org/apache/sedona/core/utils/SedonaConfTest.java @@ -60,4 +60,57 @@ public void testBytesFromString() { // fromSparkEnv means we don't have access to default values so sometimes we get null as input assertEquals(0, SedonaConf.bytesFromString(null)); } + + // ==================== URL CRS Provider Config Tests ==================== + + @Test + public void testCrsUrlBaseDefault() { + // Default should be empty string (disabled) + assertEquals("", SedonaConf.fromActiveSession().getCrsUrlBase()); + } + + @Test + public void testCrsUrlPathTemplateDefault() { + // Default should be "/{authority}/{code}.json" + assertEquals( + "/{authority}/{code}.json", SedonaConf.fromActiveSession().getCrsUrlPathTemplate()); + } + + @Test + public void testCrsUrlFormatDefault() { + // Default should be "projjson" + assertEquals("projjson", SedonaConf.fromActiveSession().getCrsUrlFormat()); + } + + @Test + public void testCrsUrlBaseCustom() { + SparkSession.active().conf().set("spark.sedona.crs.url.base", "https://cdn.proj.org"); + try { + assertEquals("https://cdn.proj.org", SedonaConf.fromActiveSession().getCrsUrlBase()); + } finally { + 
SparkSession.active().conf().set("spark.sedona.crs.url.base", ""); + } + } + + @Test + public void testCrsUrlPathTemplateCustom() { + SparkSession.active().conf().set("spark.sedona.crs.url.pathTemplate", "/{authority}/{code}"); + try { + assertEquals("/{authority}/{code}", SedonaConf.fromActiveSession().getCrsUrlPathTemplate()); + } finally { + SparkSession.active() + .conf() + .set("spark.sedona.crs.url.pathTemplate", "/{authority}/{code}.json"); + } + } + + @Test + public void testCrsUrlFormatCustom() { + SparkSession.active().conf().set("spark.sedona.crs.url.format", "wkt2"); + try { + assertEquals("wkt2", SedonaConf.fromActiveSession().getCrsUrlFormat()); + } finally { + SparkSession.active().conf().set("spark.sedona.crs.url.format", "projjson"); + } + } } diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/CRSTransformProj4Test.scala b/spark/common/src/test/scala/org/apache/sedona/sql/CRSTransformProj4Test.scala index 73b0ae55bc3..1159ebc6adc 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/CRSTransformProj4Test.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/CRSTransformProj4Test.scala @@ -18,6 +18,10 @@ */ package org.apache.sedona.sql +import com.sun.net.httpserver.HttpServer +import java.net.InetSocketAddress +import java.nio.charset.StandardCharsets +import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.sql.functions.lit import org.apache.spark.sql.sedona_sql.expressions.st_functions._ import org.junit.Assert.{assertEquals, assertNotNull, assertTrue} @@ -855,4 +859,121 @@ class CRSTransformProj4Test extends TestBaseScala { assertEquals("All 40 points should transform successfully", 40, successCount) } } + + describe("URL CRS Provider config integration") { + + it("should still transform correctly when URL provider is not configured") { + // Verify default behavior (no URL provider) still works + sparkSession.conf.set("spark.sedona.crs.url.base", "") + val result = sparkSession + .sql("SELECT 
ST_Transform(ST_SetSRID(ST_GeomFromWKT('POINT (-122.4194 37.7749)'), 4326), 'EPSG:4326', 'EPSG:3857')") + .first() + .getAs[Geometry](0) + + assertNotNull(result) + assertEquals(3857, result.getSRID) + assertEquals(-13627665.27, result.getCoordinate.x, COORD_TOLERANCE) + assertEquals(4547675.35, result.getCoordinate.y, COORD_TOLERANCE) + } + + it("should fall back to built-in when URL provider returns nothing") { + // Point to a non-existent server — provider will fail, should fall back to built-in + sparkSession.conf.set("spark.sedona.crs.url.base", "http://127.0.0.1:1") + sparkSession.conf.set("spark.sedona.crs.url.pathTemplate", "/epsg/{code}.json") + sparkSession.conf.set("spark.sedona.crs.url.format", "projjson") + try { + val result = sparkSession + .sql("SELECT ST_Transform(ST_SetSRID(ST_GeomFromWKT('POINT (-122.4194 37.7749)'), 4326), 'EPSG:4326', 'EPSG:3857')") + .first() + .getAs[Geometry](0) + + // Should succeed via built-in fallback + assertNotNull(result) + assertEquals(3857, result.getSRID) + assertEquals(-13627665.27, result.getCoordinate.x, COORD_TOLERANCE) + assertEquals(4547675.35, result.getCoordinate.y, COORD_TOLERANCE) + } finally { + sparkSession.conf.set("spark.sedona.crs.url.base", "") + org.datasyslab.proj4sedona.defs.Defs.removeProvider("sedona-url-crs") + } + } + + it("should register URL CRS provider when config is set") { + sparkSession.conf.set("spark.sedona.crs.url.base", "https://test.example.com") + sparkSession.conf.set("spark.sedona.crs.url.pathTemplate", "/epsg/{code}.json") + sparkSession.conf.set("spark.sedona.crs.url.format", "projjson") + try { + // Force a transform to trigger provider registration + val result = sparkSession + .sql("SELECT ST_Transform(ST_SetSRID(ST_GeomFromWKT('POINT (-122.4194 37.7749)'), 4326), 'EPSG:4326', 'EPSG:3857')") + .first() + .getAs[Geometry](0) + + assertNotNull(result) + + // Verify provider was registered + val providers = org.datasyslab.proj4sedona.defs.Defs.getProviders + val found = 
providers.stream().anyMatch(p => p.getName == "sedona-url-crs") + assertTrue("sedona-url-crs provider should be registered", found) + } finally { + sparkSession.conf.set("spark.sedona.crs.url.base", "") + org.datasyslab.proj4sedona.defs.Defs.removeProvider("sedona-url-crs") + } + } + + it("should transform using local HTTP URL CRS provider with custom CRS") { + // Serve a deliberately wrong CRS definition for fake EPSG:990001 that no + // built-in provider knows. Uses Mercator with absurd false easting/northing. + // If the transform succeeds with shifted coordinates, the URL provider was used. + // If the URL provider didn't work, the transform would fail entirely. + val requestCount = new AtomicInteger(0) + val server = HttpServer.create(new InetSocketAddress(0), 0) + val port = server.getAddress.getPort + + // Web Mercator with intentional 10M/20M false easting/northing + val weirdMercator = + "+proj=merc +a=6378137 +b=6378137 +lat_ts=0 +lon_0=0" + + " +x_0=10000000 +y_0=20000000 +k=1 +units=m +no_defs" + + server.createContext( + "/epsg/", + exchange => { + val path = exchange.getRequestURI.getPath + if (path.contains("990001")) { + requestCount.incrementAndGet() + val body = weirdMercator.getBytes(StandardCharsets.UTF_8) + exchange.sendResponseHeaders(200, body.length) + exchange.getResponseBody.write(body) + exchange.getResponseBody.close() + } else { + // 404 for everything else — built-in providers handle known codes + exchange.sendResponseHeaders(404, -1) + exchange.getResponseBody.close() + } + }) + server.start() + + sparkSession.conf.set("spark.sedona.crs.url.base", s"http://localhost:$port") + sparkSession.conf.set("spark.sedona.crs.url.pathTemplate", "/epsg/{code}.json") + sparkSession.conf.set("spark.sedona.crs.url.format", "proj") + try { + val result = sparkSession + .sql("SELECT ST_Transform(ST_SetSRID(ST_GeomFromWKT('POINT (-122.4194 37.7749)'), 4326), 'EPSG:4326', 'EPSG:990001')") + .first() + .getAs[Geometry](0) + + assertNotNull("Transform to 
fake EPSG:990001 should succeed via URL provider", result) + assertEquals(990001, result.getSRID) + // Standard Web Mercator: x = -13627665.27, y = 4547675.35 + // Our weird definition adds +x_0=10000000, +y_0=20000000 + assertEquals(-3627665.27, result.getCoordinate.x, COORD_TOLERANCE) + assertEquals(24547675.35, result.getCoordinate.y, COORD_TOLERANCE) + assertTrue("Local HTTP server should have been hit", requestCount.get() > 0) + } finally { + server.stop(0) + sparkSession.conf.set("spark.sedona.crs.url.base", "") + org.datasyslab.proj4sedona.defs.Defs.removeProvider("sedona-url-crs") + } + } + } }