Skip to content

Commit 48659ed

Browse files
committed
feat(scalardb-analytics-spark-sample): update sample to support ScalarDB Analytics 3.16
- Add ScalarDB Analytics Server and CLI services to docker-compose
- Configure Analytics Server with catalog database
- Create data source definitions for ScalarDB and PostgreSQL
- Reorganize configuration files into config directory
- Add platform specification for ARM64 compatibility
- Update Dockerfiles to use Gradle application plugin instead of shadowJar
- Add volume caching for Spark JAR dependencies
- Include setup instructions in README

This update enables the sample to work with ScalarDB Analytics 3.16, providing a complete example of federated queries across ScalarDB and PostgreSQL data sources.
1 parent d316464 commit 48659ed

File tree

13 files changed

+266
-76
lines changed

13 files changed

+266
-76
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# ScalarDB Analytics Spark Sample
2+
3+
## Setup
4+
5+
### 1. Start services
6+
7+
```bash
8+
docker compose up -d
9+
```
10+
11+
### 2. Load sample data
12+
13+
```bash
14+
docker compose run --rm sample-data-loader
15+
```
16+
17+
### 3. Create catalog
18+
19+
```bash
20+
docker compose run --rm scalardb-analytics-cli catalog create --catalog sample_catalog
21+
```
22+
23+
### 4. Register data sources
24+
25+
```bash
26+
# Register ScalarDB data source
27+
docker compose run --rm scalardb-analytics-cli data-source register --data-source-json /config/data-sources/scalardb.json
28+
29+
# Register PostgreSQL data source
30+
docker compose run --rm scalardb-analytics-cli data-source register --data-source-json /config/data-sources/postgres.json
31+
```
32+
33+
### 5. Run Spark SQL
34+
35+
```bash
36+
docker compose run --rm spark-sql
37+
```
38+
39+
## Query examples
40+
41+
```sql
42+
-- List catalogs
43+
SHOW CATALOGS;
44+
45+
-- Use ScalarDB catalog
46+
USE sample_catalog;
47+
48+
-- Query ScalarDB tables
49+
SELECT * FROM scalardb.mysqlns.orders LIMIT 10;
50+
SELECT * FROM scalardb.cassandrans.lineitem LIMIT 10;
51+
52+
-- Query PostgreSQL tables
53+
SELECT * FROM postgres.public.customer LIMIT 10;
54+
```
55+
56+
## Stop services
57+
58+
```bash
59+
docker compose down
60+
```
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# ScalarDB Analytics CLI configuration
2+
scalar.db.analytics.client.server.host=scalardb-analytics-server
3+
scalar.db.analytics.client.server.catalog.port=11051
4+
scalar.db.analytics.client.server.metering.port=11052
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# ScalarDB Analytics Server configuration
2+
3+
# Server ports
4+
scalar.db.analytics.server.catalog.port=11051
5+
scalar.db.analytics.server.metering.port=11052
6+
7+
# Server database configuration (for catalog metadata)
8+
scalar.db.analytics.server.db.url=jdbc:postgresql://analytics-catalog-postgres:5432/catalogdb
9+
scalar.db.analytics.server.db.username=analytics
10+
scalar.db.analytics.server.db.password=analytics
11+
12+
# Server database connection pool configuration
13+
scalar.db.analytics.server.db.pool.size=10
14+
scalar.db.analytics.server.db.pool.max-lifetime=1800000
15+
scalar.db.analytics.server.db.pool.connection-timeout=30000
16+
scalar.db.analytics.server.db.pool.minimum-idle=5
17+
scalar.db.analytics.server.db.pool.idle-timeout=600000
18+
19+
# Metering storage configuration (filesystem for development)
20+
scalar.db.analytics.server.metering.storage.provider=filesystem
21+
scalar.db.analytics.server.metering.storage.path=/tmp/metering
22+
23+
# License configuration (required for production)
24+
# scalar.db.analytics.server.licensing.license-key=<YOUR_LICENSE_KEY>
25+
# scalar.db.analytics.server.licensing.license-check-cert-pem=<YOUR_LICENSE_CERT_PEM>
26+
27+
# Logging configuration
28+
logging.level.root=INFO
29+
logging.level.com.scalar.db.analytics=INFO
30+
31+
# Graceful shutdown configuration
32+
scalar.db.analytics.server.graceful_shutdown_delay_millis=100
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"catalog": "sample_catalog",
3+
"name": "postgres",
4+
"type": "postgres",
5+
"provider": {
6+
"host": "postgres",
7+
"port": 5432,
8+
"username": "postgres",
9+
"password": "postgres",
10+
"database": "sampledb"
11+
}
12+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"catalog": "sample_catalog",
3+
"name": "scalardb",
4+
"type": "scalardb",
5+
"provider": {
6+
"configPath": "/etc/scalardb.properties"
7+
}
8+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
spark.jars.packages com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.16.2
2+
spark.extraListeners com.scalar.db.analytics.spark.metering.ScalarDbAnalyticsListener
3+
4+
# Use the ScalarDB Analytics catalog as `sample_catalog`
5+
spark.sql.catalog.sample_catalog com.scalar.db.analytics.spark.ScalarDbAnalyticsCatalog
6+
spark.sql.catalog.sample_catalog.server.host scalardb-analytics-server
7+
spark.sql.catalog.sample_catalog.server.catalog.port 11051
8+
spark.sql.catalog.sample_catalog.server.metering.port 11052
9+
10+
spark.sql.defaultCatalog sample_catalog
Lines changed: 118 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,79 @@
11
services:
2-
spark-sql:
3-
build:
4-
context: ./docker
5-
dockerfile: Dockerfile.spark
2+
# ===========================================
3+
# ScalarDB Analytics Services
4+
# ===========================================
5+
6+
# Catalog database for Analytics Server metadata
7+
analytics-catalog-postgres:
8+
image: postgres:17
9+
expose:
10+
- 5432
611
volumes:
7-
- ./scalardb.properties:/etc/scalardb.properties
8-
- ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
9-
- .scala_history:/root/.scala_history
12+
- analytics-catalog-data:/var/lib/postgresql/data
13+
environment:
14+
- POSTGRES_USER=analytics
15+
- POSTGRES_PASSWORD=analytics
16+
- POSTGRES_DB=catalogdb
17+
networks:
18+
- scalar-network
19+
healthcheck:
20+
test:
21+
["CMD", "psql", "-U", "analytics", "-d", "catalogdb", "-c", "select 1"]
22+
interval: 1s
23+
timeout: 1s
24+
retries: 10
25+
start_period: 5s
26+
27+
# ScalarDB Analytics Server
28+
scalardb-analytics-server:
29+
image: ghcr.io/scalar-labs/scalardb-analytics-server-without-licensing:3.16.2
30+
platform: linux/amd64
31+
expose:
32+
- 11051 # Catalog service port
33+
- 11052 # Metering service port
34+
volumes:
35+
- ./config/analytics-server.properties:/scalardb-analytics-server/server.properties:ro
36+
- ./config/scalardb.properties:/etc/scalardb.properties:ro
1037
networks:
1138
- scalar-network
12-
profiles:
13-
- dev
1439
depends_on:
15-
- scalardb-cassandra
16-
- scalardb-mysql
17-
- postgres
18-
command:
19-
- "/opt/spark/bin/spark-sql"
20-
- "--packages"
21-
- "com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.14.0"
40+
- analytics-catalog-postgres
41+
healthcheck:
42+
test: ["CMD", "/usr/local/bin/grpc_health_probe", "-addr=:11051"]
43+
interval: 5s
44+
timeout: 3s
45+
retries: 3
46+
start_period: 10s
2247

23-
sample-data-loader:
24-
build:
25-
context: sample-data-loader
26-
dockerfile: Dockerfile
48+
# ScalarDB Analytics CLI
49+
scalardb-analytics-cli:
50+
image: ghcr.io/scalar-labs/scalardb-analytics-cli:3.16.2
2751
volumes:
28-
- ./scalardb.properties:/etc/scalardb.properties
29-
- ./schema.json:/etc/schema.json
30-
- ./data:/data
31-
working_dir: /sample-data-loader
52+
- ./config/analytics-cli-config.properties:/config/client.properties:ro
53+
- ./config/data-sources:/config/data-sources:ro
3254
networks:
3355
- scalar-network
3456
profiles:
3557
- dev
36-
depends_on:
37-
- scalardb-cassandra
38-
- scalardb-mysql
39-
- postgres
40-
command: ["java", "-jar", "/app.jar"]
58+
entrypoint:
59+
[
60+
"java",
61+
"-jar",
62+
"/scalardb-analytics-cli/scalardb-analytics-cli.jar",
63+
"-c",
64+
"/config/client.properties",
65+
]
66+
command: ["--help"] # Default command, will be overridden when running specific commands
67+
68+
# ===========================================
69+
# Data Storage Services (Sample Data)
70+
# ===========================================
4171

72+
# ScalarDB managed storage - Cassandra
4273
scalardb-cassandra:
4374
image: cassandra:3.11
44-
ports:
45-
- 9042
75+
expose:
76+
- 9042 # CQL native transport
4677
volumes:
4778
- scalardb-cassandra-data:/var/lib/cassandra
4879
environment:
@@ -57,9 +88,10 @@ services:
5788
retries: 10
5889
start_period: 10s
5990

91+
# ScalarDB managed storage - MySQL
6092
scalardb-mysql:
61-
image: mysql:8.0
62-
ports:
93+
image: mysql:9
94+
expose:
6395
- 3306
6496
volumes:
6597
- scalardb-mysql-data:/var/lib/mysql
@@ -75,9 +107,10 @@ services:
75107
retries: 10
76108
start_period: 5s
77109

110+
# Direct access storage - PostgreSQL (for federated queries)
78111
postgres:
79-
image: postgres:15.1
80-
ports:
112+
image: postgres:17
113+
expose:
81114
- 5432
82115
volumes:
83116
- postgres-data:/var/lib/postgresql/data
@@ -96,11 +129,60 @@ services:
96129
retries: 10
97130
start_period: 5s
98131

132+
# ===========================================
133+
# Data Loading Services
134+
# ===========================================
135+
136+
# Sample data loader for initial data setup
137+
sample-data-loader:
138+
build:
139+
context: sample-data-loader
140+
dockerfile: Dockerfile
141+
volumes:
142+
- ./config/scalardb.properties:/etc/scalardb.properties
143+
- ./schema.json:/etc/schema.json
144+
- ./data:/data
145+
networks:
146+
- scalar-network
147+
profiles:
148+
- dev
149+
depends_on:
150+
- scalardb-cassandra
151+
- scalardb-mysql
152+
- postgres
153+
command: ["/app/bin/sample-data-loader"]
154+
155+
# ===========================================
156+
# Query Execution Services
157+
# ===========================================
158+
159+
# Spark SQL interactive shell
160+
spark-sql:
161+
build:
162+
context: ./docker
163+
dockerfile: Dockerfile.spark
164+
volumes:
165+
- ./config/scalardb.properties:/etc/scalardb.properties
166+
- ./config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
167+
- .scala_history:/root/.scala_history
168+
- spark-ivy-cache:/root/.ivy2
169+
- spark-m2-cache:/root/.m2
170+
networks:
171+
- scalar-network
172+
profiles:
173+
- dev
174+
depends_on:
175+
- scalardb-analytics-server
176+
command:
177+
- "/opt/spark/bin/spark-sql"
178+
99179
volumes:
100-
analytics-data: {}
101180
scalardb-cassandra-data: {}
102181
scalardb-mysql-data: {}
103182
postgres-data: {}
183+
analytics-catalog-data: {}
184+
spark-ivy-cache: {}
185+
spark-m2-cache: {}
104186

105187
networks:
106188
scalar-network: {}
Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,28 @@
11
FROM eclipse-temurin:17-jre-jammy
22

3+
ENV SPARK_VERSION=3.5.6 \
4+
HADOOP_VERSION=3 \
5+
SPARK_HOME=/opt/spark \
6+
PATH="/opt/spark/bin:/opt/spark/sbin:${PATH}" \
7+
SPARK_NO_DAEMONIZE=true
38

4-
WORKDIR /work
5-
6-
ENV SPARK_VERSION 3.5.3
9+
WORKDIR /tmp
710

11+
# Install dependencies
812
RUN apt-get update && \
913
apt-get install -y --no-install-recommends \
1014
procps \
11-
curl && \
15+
curl \
16+
ca-certificates && \
1217
apt-get clean && \
1318
rm -rf /var/lib/apt/lists/*
1419

15-
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
16-
RUN curl -SL "https://dlcdn.apache.org/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz" | tar -xzC /opt
17-
18-
RUN mv "/opt/spark-$SPARK_VERSION-bin-hadoop3" /opt/spark
20+
# Download and verify Spark
21+
RUN curl -fsSL -o spark.tgz "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
22+
curl -fsSL -o spark.tgz.sha512 "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512" && \
23+
sha512sum -c spark.tgz.sha512 && \
24+
tar -xzf spark.tgz -C /opt && \
25+
mv "/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}" && \
26+
rm -rf spark.tgz spark.tgz.sha512
1927

2028
WORKDIR /opt/spark

scalardb-analytics-spark-sample/sample-data-loader/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ FROM eclipse-temurin:17-jdk-jammy AS builder
33
COPY . /app
44
WORKDIR /app
55

6-
RUN ./gradlew shadowJar
6+
RUN chmod +x gradlew && ./gradlew installDist
77

88
FROM eclipse-temurin:17-jre-jammy
99

10-
COPY --from=builder /app/build/libs/sample-data-loader-all.jar /app.jar
10+
COPY --from=builder /app/build/install/sample-data-loader /app

0 commit comments

Comments
 (0)