Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions scalardb-analytics-sample/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# ScalarDB Analytics Sample

## Setup

### 1. Start services

```bash
docker compose up -d --wait
```

This command will start all services and automatically load sample data on the first run.

### 2. Create catalog

```bash
docker compose run --rm scalardb-analytics-cli catalog create --catalog sample_catalog
```

### 3. Register data sources

```bash
# Register ScalarDB data source
docker compose run --rm scalardb-analytics-cli data-source register --data-source-json /config/data-sources/scalardb.json

# Register PostgreSQL data source
docker compose run --rm scalardb-analytics-cli data-source register --data-source-json /config/data-sources/postgres.json
```

### 4. Run Spark SQL

```bash
docker compose run --rm spark-sql
```

## Query examples

```sql
-- List catalogs
SHOW CATALOGS;

-- Use ScalarDB catalog
USE sample_catalog;

-- Query ScalarDB tables
SELECT * FROM scalardb.mysqlns.orders LIMIT 10;
SELECT * FROM scalardb.cassandrans.lineitem LIMIT 10;

-- Query PostgreSQL tables
SELECT * FROM postgres.public.customer LIMIT 10;
```

## Stop services

```bash
docker compose down
```
12 changes: 12 additions & 0 deletions scalardb-analytics-sample/config/data-sources/postgres.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"catalog": "sample_catalog",
"name": "postgres",
"type": "postgres",
"provider": {
"host": "postgres",
"port": 5432,
"username": "postgres",
"password": "postgres",
"database": "sampledb"
}
}
8 changes: 8 additions & 0 deletions scalardb-analytics-sample/config/data-sources/scalardb.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"catalog": "sample_catalog",
"name": "scalardb",
"type": "scalardb",
"provider": {
"configPath": "/etc/scalardb.properties"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ScalarDB Analytics CLI configuration
#
# Client-side connection settings the CLI uses to reach the Analytics Server.
# NOTE: Java .properties comments must be on their own line — a trailing "#"
# on a value line would become part of the value.

# Hostname of the Analytics Server (the docker-compose service name).
scalar.db.analytics.client.server.host=scalardb-analytics-server
# Port of the catalog gRPC service on the server.
scalar.db.analytics.client.server.catalog.port=11051
# Port of the metering gRPC service on the server.
scalar.db.analytics.client.server.metering.port=11052
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# ScalarDB Analytics Server configuration

# Server ports
# These must match the ports configured on the client side
# (scalardb-analytics-cli.properties) and in spark-defaults.conf.
scalar.db.analytics.server.catalog.port=11051
scalar.db.analytics.server.metering.port=11052

# Server database configuration (for catalog metadata)
# Points at the docker-compose service `analytics-catalog-postgres`;
# credentials/database match that service's POSTGRES_* environment.
scalar.db.analytics.server.db.url=jdbc:postgresql://analytics-catalog-postgres:5432/catalogdb
scalar.db.analytics.server.db.username=analytics
scalar.db.analytics.server.db.password=analytics

# Server database connection pool configuration
# NOTE(review): timeout/lifetime values appear to be milliseconds
# (1800000 = 30 min, 30000 = 30 s, 600000 = 10 min) — confirm against the
# server's pool documentation.
scalar.db.analytics.server.db.pool.size=10
scalar.db.analytics.server.db.pool.max_lifetime=1800000
scalar.db.analytics.server.db.pool.connection_timeout=30000
scalar.db.analytics.server.db.pool.minimum_idle=5
scalar.db.analytics.server.db.pool.idle_timeout=600000

# Metering storage configuration (filesystem for development)
# Metering records are written under this container-local path.
scalar.db.analytics.server.metering.storage.provider=filesystem
scalar.db.analytics.server.metering.storage.path=/tmp/metering

# License configuration (required for production)
# scalar.db.analytics.server.licensing.license_key=<YOUR_LICENSE_KEY>
# scalar.db.analytics.server.licensing.license_check_cert_pem=<YOUR_LICENSE_CERT_PEM>

# Logging configuration
logging.level.root=INFO
logging.level.com.scalar.db.analytics=INFO

# Graceful shutdown configuration
# Delay (millis) before shutdown proceeds, letting in-flight requests drain.
scalar.db.analytics.server.graceful_shutdown_delay_millis=100
10 changes: 10 additions & 0 deletions scalardb-analytics-sample/config/spark-defaults.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Spark defaults for the interactive spark-sql service.

# Analytics Spark connector, resolved from Maven Central at first startup
# (cached in the spark-ivy-cache volume by docker-compose).
spark.jars.packages com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.16.2
# Listener that reports metering data to the Analytics Server.
spark.extraListeners com.scalar.db.analytics.spark.metering.ScalarDbAnalyticsListener

# Use the ScalarDB Analytics catalog as `sample_catalog`
# Host and ports must match scalardb-analytics-server.properties.
spark.sql.catalog.sample_catalog com.scalar.db.analytics.spark.ScalarDbAnalyticsCatalog
spark.sql.catalog.sample_catalog.server.host scalardb-analytics-server
spark.sql.catalog.sample_catalog.server.catalog.port 11051
spark.sql.catalog.sample_catalog.server.metering.port 11052

# Make `sample_catalog` the default so unqualified queries resolve against it.
spark.sql.defaultCatalog sample_catalog
197 changes: 197 additions & 0 deletions scalardb-analytics-sample/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
services:
  # ===========================================
  # ScalarDB Analytics Services
  # ===========================================

  # Catalog database for Analytics Server metadata
  analytics-catalog-postgres:
    image: postgres:17
    expose:
      - 5432
    volumes:
      - analytics-catalog-data:/var/lib/postgresql/data
    environment:
      - POSTGRES_USER=analytics
      - POSTGRES_PASSWORD=analytics
      - POSTGRES_DB=catalogdb
    networks:
      - scalar-network
    healthcheck:
      # psql connects over the local unix socket inside the container.
      test:
        ["CMD", "psql", "-U", "analytics", "-d", "catalogdb", "-c", "select 1"]
      interval: 5s
      timeout: 3s
      retries: 10
      start_period: 5s

  # ScalarDB Analytics Server
  scalardb-analytics-server:
    image: ghcr.io/scalar-labs/scalardb-analytics-server-without-licensing:3.16.2
    platform: linux/amd64
    expose:
      - 11051 # Catalog service port
      - 11052 # Metering service port
    volumes:
      - ./config/scalardb-analytics-server.properties:/scalardb-analytics-server/server.properties:ro
      - ./config/scalardb.properties:/etc/scalardb.properties:ro
    networks:
      - scalar-network
    depends_on:
      # Wait for the catalog database to pass its healthcheck; otherwise the
      # server can start before PostgreSQL accepts connections and crash-loop.
      analytics-catalog-postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "/usr/local/bin/grpc_health_probe", "-addr=:11051"]
      interval: 5s
      timeout: 3s
      retries: 10
      start_period: 5s

  # ScalarDB Analytics CLI
  scalardb-analytics-cli:
    image: ghcr.io/scalar-labs/scalardb-analytics-cli:3.16.2
    volumes:
      - ./config/scalardb-analytics-cli.properties:/config/client.properties:ro
      - ./config/data-sources:/config/data-sources:ro
    networks:
      - scalar-network
    profiles:
      - dev
    entrypoint:
      [
        "java",
        "-jar",
        "/scalardb-analytics-cli/scalardb-analytics-cli.jar",
        "-c",
        "/config/client.properties",
      ]
    command: ["--help"] # Default command, will be overridden when running specific commands

  # ===========================================
  # Data Storage Services (Sample Data)
  # ===========================================

  # ScalarDB managed storage - Cassandra
  scalardb-cassandra:
    image: cassandra:3.11
    expose:
      - 9042 # CQL native transport
    volumes:
      - scalardb-cassandra-data:/var/lib/cassandra
    environment:
      - CASSANDRA_DC=dc1
      - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
    networks:
      - scalar-network
    healthcheck:
      test: ["CMD", "cqlsh", "-e", "exit"]
      interval: 5s
      timeout: 3s
      retries: 10
      start_period: 5s

  # ScalarDB managed storage - MySQL
  scalardb-mysql:
    image: mysql:9
    expose:
      - 3306
    volumes:
      - scalardb-mysql-data:/var/lib/mysql
    environment:
      - MYSQL_ROOT_PASSWORD=mysql
      - MYSQL_DATABASE=sampledb
    networks:
      - scalar-network
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root"]
      interval: 5s
      timeout: 3s
      retries: 10
      start_period: 5s

  # Direct access storage - PostgreSQL (for federated queries)
  postgres:
    image: postgres:17
    expose:
      - 5432
    volumes:
      - postgres-data:/var/lib/postgresql/data
      - ./data/customer.csv:/opt/customer.csv
      - ./sql/postgres_copy.sql:/docker-entrypoint-initdb.d/postgres_copy.sql
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=postgres
      - POSTGRES_DB=sampledb
    networks:
      - scalar-network
    healthcheck:
      test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
      interval: 5s
      timeout: 3s
      retries: 10
      start_period: 5s

  # ===========================================
  # Data Loading Services
  # ===========================================

  # Sample data loader for initial data setup
  sample-data-loader:
    build:
      context: sample-data-loader
      dockerfile: Dockerfile
    volumes:
      - ./config/scalardb.properties:/etc/scalardb.properties
      - ./schema.json:/etc/schema.json
      - ./data:/data
      - sample-data-flags:/flags
    networks:
      - scalar-network
    depends_on:
      scalardb-cassandra:
        condition: service_healthy
      scalardb-mysql:
        condition: service_healthy
    # Idempotent loader: a flag file on the shared volume marks completion so
    # data is only loaded on the first `docker compose up`.
    entrypoint: |
      sh -c '
      if [ -f /flags/.data-loaded ]; then
        echo "Sample data already loaded, skipping..."
        exit 0
      fi
      echo "Loading sample data..."
      /app/bin/sample-data-loader && touch /flags/.data-loaded
      '

  # ===========================================
  # Query Execution Services
  # ===========================================

  # Spark SQL interactive shell
  spark-sql:
    build:
      context: ./docker
      dockerfile: Dockerfile.spark
    volumes:
      - ./config/scalardb.properties:/etc/scalardb.properties
      - ./config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
      - .scala_history:/root/.scala_history
      - spark-ivy-cache:/root/.ivy2
      - spark-m2-cache:/root/.m2
    networks:
      - scalar-network
    profiles:
      - dev
    depends_on:
      # The server exposes a gRPC healthcheck; wait for it so the shell does
      # not fail its first catalog request while the server is still booting.
      scalardb-analytics-server:
        condition: service_healthy
    command:
      - "/opt/spark/bin/spark-sql"

volumes:
  scalardb-cassandra-data: {}
  scalardb-mysql-data: {}
  postgres-data: {}
  analytics-catalog-data: {}
  spark-ivy-cache: {}
  spark-m2-cache: {}
  sample-data-flags: {}

networks:
  scalar-network: {}
28 changes: 28 additions & 0 deletions scalardb-analytics-sample/docker/Dockerfile.spark
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
FROM eclipse-temurin:17-jre-jammy

ENV SPARK_VERSION=3.5.6 \
    HADOOP_VERSION=3 \
    SPARK_HOME=/opt/spark \
    PATH="/opt/spark/bin:/opt/spark/sbin:${PATH}" \
    SPARK_NO_DAEMONIZE=true

WORKDIR /tmp

# Install dependencies (procps for Spark scripts, curl/certs for the download)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    procps \
    curl \
    ca-certificates && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Download and verify Spark.
# BUGFIX: the archive must be saved under its original filename. The Apache
# .sha512 file references "spark-<ver>-bin-hadoop<ver>.tgz" internally, so
# `sha512sum -c` against a file renamed to spark.tgz fails with
# "No such file or directory" and breaks the build.
RUN curl -fsSL -O "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
    curl -fsSL -O "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512" && \
    sha512sum -c "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512" && \
    tar -xzf "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /opt && \
    mv "/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}" && \
    rm -f "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
          "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512"

WORKDIR /opt/spark
10 changes: 10 additions & 0 deletions scalardb-analytics-sample/sample-data-loader/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Stage 1: build the loader with the Gradle wrapper (needs a JDK).
FROM eclipse-temurin:17-jdk-jammy AS builder

COPY . /app
WORKDIR /app

# --no-daemon: a persistent Gradle daemon is useless inside a one-shot image
# build layer and only wastes memory.
RUN chmod +x gradlew && ./gradlew installDist --no-daemon

# Stage 2: runtime image — JRE only, with just the installed distribution.
FROM eclipse-temurin:17-jre-jammy

COPY --from=builder /app/build/install/sample-data-loader /app
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
plugins {
application
id("com.gradleup.shadow") version "8.3.5"
id("com.diffplug.spotless") version "6.24.0"
}

Expand All @@ -9,8 +8,8 @@ repositories {
}

dependencies {
implementation("com.scalar-labs:scalardb:3.14.0")
implementation("com.scalar-labs:scalardb-schema-loader:3.14.0")
implementation("com.scalar-labs:scalardb:3.16.1")
implementation("com.scalar-labs:scalardb-schema-loader:3.16.1")
implementation("org.apache.commons:commons-csv:1.10.0")

implementation("io.netty:netty-transport-native-epoll:4.1.99.Final:linux-x86_64")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
create schema sample_ns;
create table sample_ns.customer (
create table customer (
c_custkey int,
c_name text,
c_address text,
Expand All @@ -10,4 +9,4 @@ create table sample_ns.customer (
c_comment text,
PRIMARY KEY (c_custkey)
);
\copy sample_ns.customer from '/opt/customer.csv' delimiter ',' csv;
\copy customer from '/opt/customer.csv' delimiter ',' csv;
Loading