60 changes: 60 additions & 0 deletions scalardb-analytics-spark-sample/README.md
@@ -0,0 +1,60 @@
# ScalarDB Analytics Spark Sample
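
This sample runs federated analytical queries with ScalarDB Analytics on Spark, querying ScalarDB-managed storage (Cassandra and MySQL) together with a native PostgreSQL database through a single Spark SQL catalog.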

## Setup

### 1. Start services

```bash
docker compose up -d
```

### 2. Load sample data

```bash
docker compose run --rm sample-data-loader
```

### 3. Create catalog

```bash
docker compose run --rm scalardb-analytics-cli catalog create --catalog sample_catalog
```

### 4. Register data sources

```bash
# Register ScalarDB data source
docker compose run --rm scalardb-analytics-cli data-source register --data-source-json /config/data-sources/scalardb.json

# Register PostgreSQL data source
docker compose run --rm scalardb-analytics-cli data-source register --data-source-json /config/data-sources/postgres.json
```

### 5. Run Spark SQL

```bash
docker compose run --rm spark-sql
```

## Query examples

```sql
-- List catalogs
SHOW CATALOGS;

-- Use ScalarDB catalog
USE sample_catalog;

-- Query ScalarDB tables
SELECT * FROM scalardb.mysqlns.orders LIMIT 10;
SELECT * FROM scalardb.cassandrans.lineitem LIMIT 10;

-- Query PostgreSQL tables
SELECT * FROM postgres.public.customer LIMIT 10;
```
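
Since all data sources are registered in the same catalog, tables from different backends can be joined in a single query. A minimal sketch, assuming the sample data uses TPC-H-style key columns (`o_custkey`, `c_custkey`), which this diff does not show:

```sql
-- Federated join across ScalarDB-managed MySQL and native PostgreSQL
-- (column names are assumptions; adjust to the actual sample schema)
SELECT c.c_name, o.o_orderdate, o.o_totalprice
FROM scalardb.mysqlns.orders AS o
JOIN postgres.public.customer AS c
  ON o.o_custkey = c.c_custkey
LIMIT 10;
```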

## Stop services

```bash
docker compose down
```
4 changes: 4 additions & 0 deletions scalardb-analytics-spark-sample/config/analytics-cli-config.properties
@@ -0,0 +1,4 @@
# ScalarDB Analytics CLI configuration
scalar.db.analytics.client.server.host=scalardb-analytics-server
scalar.db.analytics.client.server.catalog.port=11051
scalar.db.analytics.client.server.metering.port=11052
32 changes: 32 additions & 0 deletions scalardb-analytics-spark-sample/config/analytics-server.properties
@@ -0,0 +1,32 @@
# ScalarDB Analytics Server configuration

# Server ports
scalar.db.analytics.server.catalog.port=11051
scalar.db.analytics.server.metering.port=11052

# Server database configuration (for catalog metadata)
scalar.db.analytics.server.db.url=jdbc:postgresql://analytics-catalog-postgres:5432/catalogdb
scalar.db.analytics.server.db.username=analytics
scalar.db.analytics.server.db.password=analytics

# Server database connection pool configuration
scalar.db.analytics.server.db.pool.size=10
scalar.db.analytics.server.db.pool.max-lifetime=1800000
scalar.db.analytics.server.db.pool.connection-timeout=30000
scalar.db.analytics.server.db.pool.minimum-idle=5
scalar.db.analytics.server.db.pool.idle-timeout=600000

# Metering storage configuration (filesystem for development)
scalar.db.analytics.server.metering.storage.provider=filesystem
scalar.db.analytics.server.metering.storage.path=/tmp/metering

# License configuration (required for production)
# scalar.db.analytics.server.licensing.license-key=<YOUR_LICENSE_KEY>
# scalar.db.analytics.server.licensing.license-check-cert-pem=<YOUR_LICENSE_CERT_PEM>

# Logging configuration
logging.level.root=INFO
logging.level.com.scalar.db.analytics=INFO

# Graceful shutdown configuration
scalar.db.analytics.server.graceful_shutdown_delay_millis=100
12 changes: 12 additions & 0 deletions scalardb-analytics-spark-sample/config/data-sources/postgres.json
@@ -0,0 +1,12 @@
{
"catalog": "sample_catalog",
"name": "postgres",
"type": "postgres",
"provider": {
"host": "postgres",
"port": 5432,
"username": "postgres",
"password": "postgres",

> **Review comment (severity: high):** Hardcoding credentials in configuration files is a security risk. It's better to inject them at runtime, for example through environment variables. This is good practice even in sample applications, since it avoids accidentally leaking credentials.

"database": "sampledb"
}
}
8 changes: 8 additions & 0 deletions scalardb-analytics-spark-sample/config/data-sources/scalardb.json
@@ -0,0 +1,8 @@
{
"catalog": "sample_catalog",
"name": "scalardb",
"type": "scalardb",
"provider": {
"configPath": "/etc/scalardb.properties"
}
}
10 changes: 10 additions & 0 deletions scalardb-analytics-spark-sample/config/spark-defaults.conf
@@ -0,0 +1,10 @@
spark.jars.packages com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.16.2
spark.extraListeners com.scalar.db.analytics.spark.metering.ScalarDbAnalyticsListener

# Use the ScalarDB Analytics catalog as `sample_catalog`
spark.sql.catalog.sample_catalog com.scalar.db.analytics.spark.ScalarDbAnalyticsCatalog
spark.sql.catalog.sample_catalog.server.host scalardb-analytics-server
spark.sql.catalog.sample_catalog.server.catalog.port 11051
spark.sql.catalog.sample_catalog.server.metering.port 11052

spark.sql.defaultCatalog sample_catalog
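
With `spark.sql.defaultCatalog` set to `sample_catalog`, table references in the Spark SQL shell resolve through the Analytics catalog without an explicit `USE`. A small illustration (the table name assumes the sample schema above):

```sql
-- These are equivalent under spark.sql.defaultCatalog=sample_catalog:
SELECT COUNT(*) FROM sample_catalog.scalardb.mysqlns.orders;
SELECT COUNT(*) FROM scalardb.mysqlns.orders;
```
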
168 changes: 125 additions & 43 deletions scalardb-analytics-spark-sample/docker-compose.yml
@@ -1,48 +1,79 @@
services:
spark-sql:
build:
context: ./docker
dockerfile: Dockerfile.spark
# ===========================================
# ScalarDB Analytics Services
# ===========================================

# Catalog database for Analytics Server metadata
analytics-catalog-postgres:
image: postgres:17
expose:
- 5432
volumes:
- ./scalardb.properties:/etc/scalardb.properties
- ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
- .scala_history:/root/.scala_history
- analytics-catalog-data:/var/lib/postgresql/data
environment:
- POSTGRES_USER=analytics
- POSTGRES_PASSWORD=analytics
- POSTGRES_DB=catalogdb
networks:
- scalar-network
healthcheck:
test:
["CMD", "psql", "-U", "analytics", "-d", "catalogdb", "-c", "select 1"]
interval: 5s
timeout: 3s
retries: 10
start_period: 5s

# ScalarDB Analytics Server
scalardb-analytics-server:
image: ghcr.io/scalar-labs/scalardb-analytics-server-without-licensing:3.16.2
platform: linux/amd64
expose:
- 11051 # Catalog service port
- 11052 # Metering service port
volumes:
- ./config/analytics-server.properties:/scalardb-analytics-server/server.properties:ro
- ./config/scalardb.properties:/etc/scalardb.properties:ro
networks:
- scalar-network
profiles:
- dev
depends_on:
- scalardb-cassandra
- scalardb-mysql
- postgres
command:
- "/opt/spark/bin/spark-sql"
- "--packages"
- "com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.14.0"
- analytics-catalog-postgres
healthcheck:
test: ["CMD", "/usr/local/bin/grpc_health_probe", "-addr=:11051"]
interval: 5s
timeout: 3s
retries: 10
start_period: 5s

sample-data-loader:
build:
context: sample-data-loader
dockerfile: Dockerfile
# ScalarDB Analytics CLI
scalardb-analytics-cli:
image: ghcr.io/scalar-labs/scalardb-analytics-cli:3.16.2
volumes:
- ./scalardb.properties:/etc/scalardb.properties
- ./schema.json:/etc/schema.json
- ./data:/data
working_dir: /sample-data-loader
- ./config/analytics-cli-config.properties:/config/client.properties:ro
- ./config/data-sources:/config/data-sources:ro
networks:
- scalar-network
profiles:
- dev
depends_on:
- scalardb-cassandra
- scalardb-mysql
- postgres
command: ["java", "-jar", "/app.jar"]
entrypoint:
[
"java",
"-jar",
"/scalardb-analytics-cli/scalardb-analytics-cli.jar",
"-c",
"/config/client.properties",
]
command: ["--help"] # Default command, will be overridden when running specific commands

# ===========================================
# Data Storage Services (Sample Data)
# ===========================================

# ScalarDB managed storage - Cassandra
scalardb-cassandra:
image: cassandra:3.11
ports:
- 9042
expose:
- 9042 # CQL native transport
volumes:
- scalardb-cassandra-data:/var/lib/cassandra
environment:
@@ -52,14 +83,15 @@ services:
- scalar-network
healthcheck:
test: ["CMD", "cqlsh", "-e", "exit"]
interval: 1s
timeout: 1s
interval: 5s
timeout: 3s
retries: 10
start_period: 10s
start_period: 5s

# ScalarDB managed storage - MySQL
scalardb-mysql:
image: mysql:8.0
ports:
image: mysql:9
expose:
- 3306
volumes:
- scalardb-mysql-data:/var/lib/mysql
@@ -70,14 +102,15 @@ services:
- scalar-network
healthcheck:
test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root"]
interval: 1s
timeout: 1s
interval: 5s
timeout: 3s
retries: 10
start_period: 5s

# Direct access storage - PostgreSQL (for federated queries)
postgres:
image: postgres:15.1
ports:
image: postgres:17
expose:
- 5432
volumes:
- postgres-data:/var/lib/postgresql/data
@@ -91,16 +124,65 @@ services:
- scalar-network
healthcheck:
test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
interval: 1s
timeout: 1s
interval: 5s
timeout: 3s
retries: 10
start_period: 5s

# ===========================================
# Data Loading Services
# ===========================================

# Sample data loader for initial data setup
sample-data-loader:
build:
context: sample-data-loader
dockerfile: Dockerfile
volumes:
- ./config/scalardb.properties:/etc/scalardb.properties
- ./schema.json:/etc/schema.json
- ./data:/data
networks:
- scalar-network
profiles:
- dev
depends_on:
- scalardb-cassandra
- scalardb-mysql
- postgres
command: ["/app/bin/sample-data-loader"]

# ===========================================
# Query Execution Services
# ===========================================

# Spark SQL interactive shell
spark-sql:
build:
context: ./docker
dockerfile: Dockerfile.spark
volumes:
- ./config/scalardb.properties:/etc/scalardb.properties
- ./config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
- .scala_history:/root/.scala_history
- spark-ivy-cache:/root/.ivy2
- spark-m2-cache:/root/.m2
networks:
- scalar-network
profiles:
- dev
depends_on:
- scalardb-analytics-server
command:
- "/opt/spark/bin/spark-sql"

volumes:
analytics-data: {}
scalardb-cassandra-data: {}
scalardb-mysql-data: {}
postgres-data: {}
analytics-catalog-data: {}
spark-ivy-cache: {}
spark-m2-cache: {}

networks:
scalar-network: {}
24 changes: 16 additions & 8 deletions scalardb-analytics-spark-sample/docker/Dockerfile.spark
@@ -1,20 +1,28 @@
FROM eclipse-temurin:17-jre-jammy

ENV SPARK_VERSION=3.5.6 \
HADOOP_VERSION=3 \
SPARK_HOME=/opt/spark \
PATH="/opt/spark/bin:/opt/spark/sbin:${PATH}" \
SPARK_NO_DAEMONIZE=true

WORKDIR /work

ENV SPARK_VERSION 3.5.3
WORKDIR /tmp

# Install dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
procps \
curl && \
curl \
ca-certificates && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN curl -SL "https://dlcdn.apache.org/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz" | tar -xzC /opt

RUN mv "/opt/spark-$SPARK_VERSION-bin-hadoop3" /opt/spark
# Download and verify Spark
RUN curl -fsSL -o spark.tgz "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
curl -fsSL -o spark.tgz.sha512 "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512" && \
sha512sum -c spark.tgz.sha512 && \
tar -xzf spark.tgz -C /opt && \
mv "/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}" && \
rm -rf spark.tgz spark.tgz.sha512

WORKDIR /opt/spark