diff --git a/getting-started/jdbc/docker-compose-bootstrap-db.yml b/getting-started/jdbc/docker-compose-bootstrap-db.yml index d23235b2d5..4d74e18cba 100644 --- a/getting-started/jdbc/docker-compose-bootstrap-db.yml +++ b/getting-started/jdbc/docker-compose-bootstrap-db.yml @@ -26,11 +26,10 @@ services: - QUARKUS_DATASOURCE_JDBC_URL=${QUARKUS_DATASOURCE_JDBC_URL} - QUARKUS_DATASOURCE_USERNAME=${QUARKUS_DATASOURCE_USERNAME} - QUARKUS_DATASOURCE_PASSWORD=${QUARKUS_DATASOURCE_PASSWORD} - command: - - "bootstrap" - - "--realm=POLARIS" - - "--credential=POLARIS,root,s3cr3t" - + command: >- + bootstrap + --realm=POLARIS_MINIO_REALM + --credential=POLARIS_MINIO_REALM,root,s3cr3t polaris: depends_on: polaris-bootstrap: diff --git a/getting-started/jdbc/docker-compose.yml b/getting-started/jdbc/docker-compose.yml index fbfd427ee2..1d480764f9 100644 --- a/getting-started/jdbc/docker-compose.yml +++ b/getting-started/jdbc/docker-compose.yml @@ -21,32 +21,52 @@ services: polaris: image: apache/polaris:postgres-latest + depends_on: + postgres-minio: # Polaris server depends on PostgreSQL being healthy + condition: service_healthy + # polaris-bootstrap-minio is a setup task; polaris server doesn't need to wait for it on every start + # after the initial successful bootstrap. Other services that *use* Polaris data + # (like polaris-setup-catalog-minio) should depend on polaris: service_healthy.
ports: - # API port - - "8181:8181" - # Management port (metrics and health checks) - - "8182:8182" - # Optional, allows attaching a debugger to the Polaris JVM - - "5005:5005" + # The host port is defined by POLARIS_MINIO_API_PORT from .env, container port is 8181 + - "${POLARIS_MINIO_API_PORT:-8183}:${QUARKUS_HTTP_PORT:-8181}" # Or just - "${POLARIS_MINIO_API_PORT:-8183}:8181" + # The host port is defined by POLARIS_MINIO_MGMT_PORT from .env, container port is 8182 + - "${POLARIS_MINIO_MGMT_PORT:-8184}:${QUARKUS_MANAGEMENT_PORT:-8182}" # Or just - "${POLARIS_MINIO_MGMT_PORT:-8184}:8182" environment: - - JAVA_DEBUG=true - - JAVA_DEBUG_PORT=*:5005 - - POLARIS_PERSISTENCE_TYPE=relational-jdbc - - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_RETRIES=5 - - POLARIS_PERSISTENCE_RELATIONAL_JDBC_INITIAL_DELAY_IN_MS=100 - - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_DELAY_IN_MS=5000 - - QUARKUS_DATASOURCE_DB_KIND=pgsql - - QUARKUS_DATASOURCE_JDBC_URL=${QUARKUS_DATASOURCE_JDBC_URL} - - QUARKUS_DATASOURCE_USERNAME=${QUARKUS_DATASOURCE_USERNAME} - - QUARKUS_DATASOURCE_PASSWORD=${QUARKUS_DATASOURCE_PASSWORD} - - POLARIS_REALM_CONTEXT_REALMS=POLARIS - - QUARKUS_OTEL_SDK_DISABLED=true + # These variables will be sourced from the .env file (or shell environment). + # Docker Compose makes them available to the container if they are defined. 
+ - QUARKUS_DATASOURCE_DB_KIND + - QUARKUS_DATASOURCE_JDBC_URL + - QUARKUS_DATASOURCE_USERNAME + - QUARKUS_DATASOURCE_PASSWORD + + - POLARIS_PERSISTENCE_TYPE + - POLARIS_REALM_CONTEXT_REALMS + + # Optional JDBC retry settings + - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_RETRIES + - POLARIS_PERSISTENCE_RELATIONAL_JDBC_INITIAL_DELAY_IN_MS + - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_DELAY_IN_MS + + # Other Quarkus/App settings from .env + - QUARKUS_OTEL_SDK_DISABLED + - QUARKUS_HTTP_PORT # Tells Quarkus which port to bind to inside the container + - QUARKUS_MANAGEMENT_PORT # Tells Quarkus which management port to bind to inside the container + + # Optional: Debug logging settings (will be sourced from .env if uncommented there) + - QUARKUS_LOG_CONSOLE_LEVEL + - QUARKUS_LOG_CATEGORY_IO_SMALLRYE_CONFIG_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_APACHE_POLARIS_LEVEL + - QUARKUS_LOG_CATEGORY_IO_QUARKUS_DATASOURCE_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_AGROAL_LEVEL healthcheck: - test: ["CMD", "curl", "http://localhost:8182/q/health"] - interval: 2s - timeout: 10s - retries: 10 - start_period: 10s + # Uses the management port defined by POLARIS_MINIO_MGMT_PORT (which sets QUARKUS_MANAGEMENT_PORT for inside the container) + # The healthcheck runs INSIDE the container network, so it checks localhost:QUARKUS_MANAGEMENT_PORT (e.g. 
localhost:8182) + test: ["CMD-SHELL", "curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/live || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/ready || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health"] + interval: 10s + timeout: 5s + retries: 15 + start_period: 30s polaris-setup: image: alpine/curl diff --git a/getting-started/minio/.env b/getting-started/minio/.env new file mode 100644 index 0000000000..a0f56f808d --- /dev/null +++ b/getting-started/minio/.env @@ -0,0 +1,63 @@ +# .env +# Default environment variables for Polaris Minio S3 example + +# Minio Root Credentials (used by minio service and mc script) +MINIO_ROOT_USER=minioadmin +MINIO_ROOT_PASSWORD=minioadmin + +# Minio S3 User Credentials (created by mc script, used by services) +POLARIS_S3_USER=polaris_s3_user +POLARIS_S3_PASSWORD=polaris_s3_password_val + +SPARK_MINIO_S3_USER=spark_minio_s3_user +SPARK_MINIO_S3_PASSWORD=spark_minio_s3_password_val + +TRINO_MINIO_S3_USER=trino_minio_s3_user +TRINO_MINIO_S3_PASSWORD=trino_minio_s3_password_val + +# Polaris Client Credentials (for Spark & Trino to auth to Polaris) +# These are used by: +# - polaris-bootstrap-minio (command to create them) +# - polaris-setup-governance (environment for script to know client IDs, and to create credentials if bootstrap doesn't) +# - spark-sql-minio (environment for Spark's Polaris catalog auth) +# - trino-minio (environment for Trino's Polaris catalog auth) +SPARK_POLARIS_CLIENT_ID=spark_app_client +SPARK_POLARIS_CLIENT_SECRET=spark_client_secret_val + +TRINO_POLARIS_CLIENT_ID=trino_app_client +TRINO_POLARIS_CLIENT_SECRET=trino_client_secret_val + +# These specific _ENV suffixed versions are referenced by the spark-sql-minio service environment block +# Setting them explicitly here to match the defaults and avoid Docker Compose warnings. 
+SPARK_POLARIS_CLIENT_ID_ENV=spark_app_client +SPARK_POLARIS_CLIENT_SECRET_ENV=spark_client_secret_val + +# --- Polaris Service Specific Configuration --- +POLARIS_PERSISTENCE_TYPE=in-memory +POLARIS_REALM_CONTEXT_REALMS=POLARIS_MINIO_REALM +POLARIS_BOOTSTRAP_CREDENTIALS="POLARIS_MINIO_REALM,root,s3cr3t" # Custom root credentials for the realm +# --- Other Quarkus and Port Mappings for Services --- +QUARKUS_OTEL_SDK_DISABLED=true # For polaris service + +# Port Mappings (defaults used in docker-compose.yml) +MINIO_API_PORT=9000 +MINIO_CONSOLE_PORT=9001 +POSTGRES_MINIO_PORT=5433 +POLARIS_MINIO_API_PORT=8183 +POLARIS_MINIO_MGMT_PORT=8184 # Host port for the management/health endpoints + +SPARK_UI_MINIO_START_PORT=4050 +SPARK_UI_MINIO_END_PORT=4055 # Used in port range mapping + +TRINO_MINIO_PORT=8083 + +# Quarkus HTTP/Management ports INSIDE the Polaris container; keep 8181/8182 because the setup scripts and Spark address polaris:8181 directly +QUARKUS_HTTP_PORT=8181 +QUARKUS_MANAGEMENT_PORT=8182 + +# --- Optional: Debug Logging for Polaris Service (uncomment if needed) --- +# QUARKUS_LOG_CONSOLE_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_IO_SMALLRYE_CONFIG_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_ORG_APACHE_POLARIS_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_IO_QUARKUS_DATASOURCE_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_ORG_AGROAL_LEVEL=DEBUG diff --git a/getting-started/minio/README.md b/getting-started/minio/README.md new file mode 100644 index 0000000000..cf187ab435 --- /dev/null +++ b/getting-started/minio/README.md @@ -0,0 +1,158 @@ +# Getting Started with Apache Polaris: Minio S3, Governance with Spark & Trino (Read-Only) + +This example demonstrates setting up Apache Polaris to manage an Iceberg data lake in Minio S3, focusing on governance. +Polaris uses Postgres for its metadata. Spark SQL is configured for read/write access to create and populate Iceberg tables. Trino is configured for **strict read-only access** to query these tables.
Access control is enforced by Polaris, with underlying S3 permissions managed by Minio. + +**Prerequisites:** +* Docker and Docker Compose. +* `jq` installed on your host machine. +* Apache Polaris images (`apache/polaris-admin-tool:postgres-latest`, `apache/polaris:postgres-latest`) built from source with JDBC support, tagged as `postgres-latest`. + +Run + +```shell + ./gradlew \ + :polaris-quarkus-server:assemble \ + :polaris-quarkus-server:quarkusAppPartsBuild --rerun \ + :polaris-quarkus-admin:assemble \ + :polaris-quarkus-admin:quarkusAppPartsBuild --rerun \ + -Dquarkus.container-image.tag=postgres-latest \ + -Dquarkus.container-image.build=true +``` + +**Security Overview:** +* **Minio (S3 Storage):** + * `polaris_s3_user` (R/W): Used by Polaris service for warehouse management. + * `spark_minio_s3_user` (R/W): Used by Spark engine for data R/W operations. + * `trino_minio_s3_user` (R/O): Used by Trino engine for data read operations. +* **Polaris (Catalog & Governance):** + * `root` user: Admin access to Polaris. + * `spark_app_client`: Polaris client ID for Spark, assigned `polaris_spark_role` (R/W permissions on `minio_catalog.ns_governed`). + * `trino_app_client`: Polaris client ID for Trino, assigned `polaris_trino_role` (R/O permissions on `minio_catalog.ns_governed`). + +**Setup and Execution:** + +1. **Environment Variables (Optional):** + Create a `.env` file in this directory (`getting-started/minio/.env`) to customize credentials and ports. 
Example: + ```env + # Minio Settings + MINIO_ROOT_USER=minioadmin + MINIO_ROOT_PASSWORD=minioadmin + MINIO_API_PORT=9000 + MINIO_CONSOLE_PORT=9001 + + # Minio S3 User Credentials (used by services, created by mc) + POLARIS_S3_USER=polaris_s3_user + POLARIS_S3_PASSWORD=polaris_s3_password_val + SPARK_MINIO_S3_USER=spark_minio_s3_user + SPARK_MINIO_S3_PASSWORD=spark_minio_s3_password_val + TRINO_MINIO_S3_USER=trino_minio_s3_user + TRINO_MINIO_S3_PASSWORD=trino_minio_s3_password_val + + # Polaris Client Credentials (for Spark & Trino to auth to Polaris, created by bootstrap) + SPARK_POLARIS_CLIENT_ID=spark_app_client + SPARK_POLARIS_CLIENT_SECRET=spark_client_secret_val + TRINO_POLARIS_CLIENT_ID=trino_app_client + TRINO_POLARIS_CLIENT_SECRET=trino_client_secret_val + + # Ports + POSTGRES_MINIO_PORT=5433 + POLARIS_MINIO_API_PORT=8183 + POLARIS_MINIO_MGMT_PORT=8184 + SPARK_UI_MINIO_START_PORT=4050 + # SPARK_UI_MINIO_END_PORT=4055 # Not strictly needed if using start port only for mapping range + TRINO_MINIO_PORT=8083 + ``` + +2. **Ensure Scripts are Executable:** + ```bash + chmod +x getting-started/minio/minio-config/setup-minio.sh + chmod +x getting-started/minio/polaris-config/create-catalog-minio.sh + chmod +x getting-started/minio/polaris-config/setup-polaris-governance.sh + ``` + +3. **Start Services:** + Navigate to `getting-started/minio` and run: + ```shell + docker compose up -d --build + ``` + This will start all services, including Minio setup, Polaris bootstrap (creating `root`, `spark_app_client`, `trino_app_client` principals), Polaris catalog creation, and Polaris governance setup (creating roles and assigning grants). Check logs with `docker compose logs -f`. + +4. **Access Minio Console:** + `http://localhost:${MINIO_CONSOLE_PORT:-9001}` (default: `minioadmin`/`minioadmin`). Verify `polaris-bucket`. + +5. **Using Spark SQL (Read/Write Access):** + Attach to Spark: `docker attach spark-sql-minio-gov` (Press ENTER for prompt). 
+ The default catalog is `polaris_minio_gov`. + ```sql + -- Create a namespace governed by Polaris policies + CREATE NAMESPACE IF NOT EXISTS ns_governed + COMMENT 'Namespace for governed data access' + LOCATION 's3a://polaris-bucket/iceberg_warehouse/minio_catalog/ns_governed/'; -- Optional but good practice + + USE ns_governed; + + -- Create an Iceberg table + CREATE TABLE IF NOT EXISTS my_gov_table (id INT, name STRING, value DOUBLE) + USING iceberg + COMMENT 'Governed table for Spark R/W and Trino R/O demo' + TBLPROPERTIES ('format-version'='2'); + + -- Insert data + INSERT INTO my_gov_table VALUES (1, 'SparkRecordOne', 10.1), (2, 'SparkRecordTwo', 20.2); + + -- Select data + SELECT * FROM my_gov_table ORDER BY id; + -- Expected: Shows inserted records. + ``` + +6. **Using Trino CLI (Strict Read-Only Access):** + Access Trino CLI: `docker exec -it minio-trino-gov trino` + The Polaris catalog is mapped to `iceberg` in Trino. + ```sql + SHOW CATALOGS; + -- Expected: iceberg, system, ... + + SHOW SCHEMAS FROM iceberg; + -- Expected: information_schema, ns_governed + + SHOW TABLES FROM iceberg.ns_governed; + -- Expected: my_gov_table + + DESCRIBE iceberg.ns_governed.my_gov_table; + -- Expected: Schema of my_gov_table + + SELECT * FROM iceberg.ns_governed.my_gov_table ORDER BY id; + -- Expected: Shows records inserted by Spark. + + -- Test Read-Only: Attempt to create a table (SHOULD FAIL) + -- CREATE TABLE iceberg.ns_governed.trino_test_table (id INT) WITH (location = 's3a://polaris-bucket/iceberg_warehouse/minio_catalog/ns_governed/trino_test_table/'); + -- Expected: Error from Polaris indicating permission denied for CREATE_TABLE. + + -- Test Read-Only: Attempt to insert data (SHOULD FAIL) + -- INSERT INTO iceberg.ns_governed.my_gov_table VALUES (3, 'TrinoRecord', 30.3); + -- Expected: Error, as Trino's Polaris role and Minio S3 user are read-only. + ``` + +7. 
**Accessing Polaris API (Optional):** + Get token for `trino_app_client` (should have limited scope): + ```shell + export POLARIS_API_ENDPOINT="http://localhost:${POLARIS_MINIO_API_PORT:-8183}" + export TRINO_APP_TOKEN=$(curl -s "${POLARIS_API_ENDPOINT}/api/catalog/v1/oauth/tokens" \ + --user "${TRINO_POLARIS_CLIENT_ID:-trino_app_client}:${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d 'grant_type=client_credentials' \ + -d 'realmName=POLARIS_MINIO_REALM' | jq -r .access_token) + echo "Trino App Token: $TRINO_APP_TOKEN" + + # Try to list tables using Trino's token + curl -v "${POLARIS_API_ENDPOINT}/api/catalog/v1/minio_catalog/namespaces/ns_governed/tables" -H "Authorization: Bearer $TRINO_APP_TOKEN" + # This should succeed. + ``` + +8. **Cleanup:** + ```shell + docker compose down -v + ``` + +This set of scripts and configurations should enforce the desired access controls, with Trino having strictly read-only capabilities. 
\ No newline at end of file diff --git a/getting-started/minio/docker-compose.yml b/getting-started/minio/docker-compose.yml new file mode 100644 index 0000000000..3e88396909 --- /dev/null +++ b/getting-started/minio/docker-compose.yml @@ -0,0 +1,187 @@ +services: + minio: + image: minio/minio:latest + ports: + - "${MINIO_API_PORT:-9000}:9000" + - "${MINIO_CONSOLE_PORT:-9001}:9001" + volumes: + - minio_data:/data + environment: + MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} + # Credentials for Minio users (to be created by mc) + POLARIS_S3_USER: ${POLARIS_S3_USER:-polaris_s3_user} + POLARIS_S3_PASSWORD: ${POLARIS_S3_PASSWORD:-polaris_s3_password_val} # Changed default + SPARK_MINIO_S3_USER: ${SPARK_MINIO_S3_USER:-spark_minio_s3_user} + SPARK_MINIO_S3_PASSWORD: ${SPARK_MINIO_S3_PASSWORD:-spark_minio_s3_password_val} # Changed default + TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} + TRINO_MINIO_S3_PASSWORD: ${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} # Changed default + command: server /data --console-address ":9001" + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:9000/minio/health/live"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + mc: + image: minio/mc:latest + depends_on: + minio: + condition: service_healthy + volumes: + - ./minio-config:/config + entrypoint: /bin/sh + command: /config/setup-minio.sh + environment: + MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} + POLARIS_S3_USER: ${POLARIS_S3_USER:-polaris_s3_user} + POLARIS_S3_PASSWORD: ${POLARIS_S3_PASSWORD:-polaris_s3_password_val} + SPARK_MINIO_S3_USER: ${SPARK_MINIO_S3_USER:-spark_minio_s3_user} + SPARK_MINIO_S3_PASSWORD: ${SPARK_MINIO_S3_PASSWORD:-spark_minio_s3_password_val} + TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} + TRINO_MINIO_S3_PASSWORD: 
${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} + + polaris: + image: apache/polaris:postgres-latest + depends_on: + minio: # Polaris server waits for Minio to be healthy + condition: service_healthy + # polaris-bootstrap-minio is a setup task; polaris server doesn't need to wait for it on every start + # after the initial successful bootstrap. Other services that *use* Polaris data + # (like polaris-setup-catalog-minio) should depend on polaris: service_healthy. + ports: + # The host port is defined by POLARIS_MINIO_API_PORT from .env, container port is 8181 + - "${POLARIS_MINIO_API_PORT:-8183}:${QUARKUS_HTTP_PORT:-8181}" # Or just - "${POLARIS_MINIO_API_PORT:-8183}:8181" + # The host port is defined by POLARIS_MINIO_MGMT_PORT from .env, container port is 8182 + - "${POLARIS_MINIO_MGMT_PORT:-8184}:${QUARKUS_MANAGEMENT_PORT:-8182}" # Or just - "${POLARIS_MINIO_MGMT_PORT:-8184}:8182" + environment: + # These variables will be sourced from the .env file (or shell environment). + # Docker Compose makes them available to the container if they are defined. + - POLARIS_PERSISTENCE_TYPE + - POLARIS_REALM_CONTEXT_REALMS + + # Other Quarkus/App settings from .env + - QUARKUS_OTEL_SDK_DISABLED + - QUARKUS_HTTP_PORT # Tells Quarkus which port to bind to inside the container + - QUARKUS_MANAGEMENT_PORT # Tells Quarkus which management port to bind to inside the container + + # Optional: Debug logging settings (will be sourced from .env if uncommented there) + - QUARKUS_LOG_CONSOLE_LEVEL + - QUARKUS_LOG_CATEGORY_IO_SMALLRYE_CONFIG_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_APACHE_POLARIS_LEVEL + - QUARKUS_LOG_CATEGORY_IO_QUARKUS_DATASOURCE_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_AGROAL_LEVEL + healthcheck: + # Uses the management port defined by POLARIS_MINIO_MGMT_PORT (which sets QUARKUS_MANAGEMENT_PORT for inside the container) + # The healthcheck runs INSIDE the container network, so it checks localhost:QUARKUS_MANAGEMENT_PORT (e.g.
localhost:8182) + test: ["CMD-SHELL", "curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/live || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/ready || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 12s # Grace period for app init before failed checks count + + polaris-setup-catalog-minio: + image: alpine/curl:latest + depends_on: + polaris: + condition: service_healthy + volumes: + - ./polaris-config:/polaris-config + entrypoint: /bin/sh + command: '-c "apk add --no-cache jq && chmod +x /polaris-config/create-catalog-minio.sh && /polaris-config/create-catalog-minio.sh"' + environment: + - POLARIS_S3_USER + - POLARIS_S3_PASSWORD + - QUARKUS_HTTP_PORT + - POLARIS_REALM_CONTEXT_REALMS + + polaris-setup-governance: + image: alpine/curl:latest + depends_on: + polaris-setup-catalog-minio: # The catalog must be created before roles and grants are set up + condition: service_completed_successfully + polaris: # Also ensure polaris service itself is healthy for API calls + condition: service_healthy + volumes: + - ./polaris-config:/polaris-config + entrypoint: /bin/sh + command: '-c "apk add --no-cache jq && chmod +x /polaris-config/setup-polaris-governance.sh && /polaris-config/setup-polaris-governance.sh"' + environment: + - SPARK_POLARIS_CLIENT_ID + - SPARK_POLARIS_CLIENT_SECRET + - TRINO_POLARIS_CLIENT_ID + - TRINO_POLARIS_CLIENT_SECRET + - POLARIS_REALM_CONTEXT_REALMS + - QUARKUS_HTTP_PORT # To construct http://polaris:${QUARKUS_HTTP_PORT} + + + spark-sql-minio: + image: apache/spark:3.5.5-java17-python3 + container_name: spark-sql-minio-gov + depends_on: + polaris-setup-governance: + condition: service_completed_successfully + minio: + condition: service_healthy + stdin_open: true + tty: true + ports: + - "${SPARK_UI_MINIO_START_PORT:-4050}-${SPARK_UI_MINIO_END_PORT:-4055}:4040-4045" + environment: + # Minio S3 credentials for Spark data
plane (R/W) + AWS_ACCESS_KEY_ID: ${SPARK_MINIO_S3_USER:-spark_minio_s3_user} + AWS_SECRET_ACCESS_KEY: ${SPARK_MINIO_S3_PASSWORD:-spark_minio_s3_password_val} + # Polaris client credentials for Spark control plane + SPARK_POLARIS_CLIENT_ID_ENV: ${SPARK_POLARIS_CLIENT_ID:-spark_app_client} + SPARK_POLARIS_CLIENT_SECRET_ENV: ${SPARK_POLARIS_CLIENT_SECRET:-spark_client_secret_val} + command: [ + "/opt/spark/bin/spark-sql", + "--packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,software.amazon.awssdk:bundle:2.25.31,software.amazon.awssdk:url-connection-client:2.25.31,org.apache.hadoop:hadoop-aws:3.3.6", # Updated Iceberg, AWS SDK versions + "--conf", "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", + "--conf", "spark.sql.catalog.polaris_minio_gov=org.apache.iceberg.spark.SparkCatalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.catalog-impl=org.apache.iceberg.rest.RESTCatalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.uri=http://polaris:8181/api/catalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.type=oauth2", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.grant-type=client_credentials", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.client-id=${SPARK_POLARIS_CLIENT_ID_ENV}", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.client-secret=${SPARK_POLARIS_CLIENT_SECRET_ENV}", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.token-endpoint=http://polaris:8181/api/catalog/v1/oauth/tokens", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.additional-parameters=realmName=POLARIS_MINIO_REALM", + "--conf", "spark.sql.catalog.polaris_minio_gov.warehouse=minio_catalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.io-impl=org.apache.iceberg.aws.s3.S3FileIO", + "--conf", "spark.hadoop.fs.s3a.endpoint=http://minio:9000", + "--conf", "spark.hadoop.fs.s3a.path.style.access=true", + "--conf", 
"spark.hadoop.fs.s3a.connection.ssl.enabled=false", + "--conf", "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem", + "--conf", "spark.sql.defaultCatalog=polaris_minio_gov", + "--conf", "spark.driver.extraJavaOptions=-Divy.cache.dir=/tmp -Divy.home=/tmp" + ] + + trino-minio: + image: trinodb/trino:449 # Using a specific Trino version + container_name: minio-trino-gov + depends_on: + polaris-setup-governance: + condition: service_completed_successfully + minio: + condition: service_healthy + ports: + - "${TRINO_MINIO_PORT:-8083}:8080" + volumes: + - ./trino-catalog:/etc/trino/catalog + environment: + # Polaris client credentials for Trino control plane (R/O role in Polaris) + TRINO_POLARIS_CLIENT_ID: ${TRINO_POLARIS_CLIENT_ID:-trino_app_client} + TRINO_POLARIS_CLIENT_SECRET: ${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val} + # Minio S3 credentials for Trino data plane (R/O Minio user) + TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} + TRINO_MINIO_S3_PASSWORD: ${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} + +volumes: + minio_data: + +networks: + default: + name: polaris_minio_gov_network diff --git a/getting-started/minio/minio-config/polaris-s3-rw-policy.json b/getting-started/minio/minio-config/polaris-s3-rw-policy.json new file mode 100644 index 0000000000..c26b1cfe1b --- /dev/null +++ b/getting-started/minio/minio-config/polaris-s3-rw-policy.json @@ -0,0 +1,22 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:ListBucketMultipartUploads", + "s3:AbortMultipartUpload", + "s3:ListMultipartUploadParts" + ], + "Resource": [ + "arn:aws:s3:::polaris-bucket", + "arn:aws:s3:::polaris-bucket/*" + ] + } + ] +} diff --git a/getting-started/minio/minio-config/setup-minio.sh b/getting-started/minio/minio-config/setup-minio.sh new file mode 100755 index 0000000000..130ba41d07 --- 
/dev/null +++ b/getting-started/minio/minio-config/setup-minio.sh @@ -0,0 +1,24 @@ +#!/bin/sh +set -e + +mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} +mc mb myminio/polaris-bucket --ignore-existing + +# Create Minio policies from JSON files +mc admin policy create myminio polaris-s3-rw-policy /config/polaris-s3-rw-policy.json +mc admin policy create myminio spark-minio-rw-policy /config/spark-minio-rw-policy.json +mc admin policy create myminio trino-minio-ro-policy /config/trino-minio-ro-policy.json + +# Create Minio user for Polaris Service (R/W) +mc admin user add myminio ${POLARIS_S3_USER} ${POLARIS_S3_PASSWORD} +mc admin policy attach myminio polaris-s3-rw-policy --user ${POLARIS_S3_USER} + +# Create Minio user for Spark Engine data access (R/W) +mc admin user add myminio ${SPARK_MINIO_S3_USER} ${SPARK_MINIO_S3_PASSWORD} +mc admin policy attach myminio spark-minio-rw-policy --user ${SPARK_MINIO_S3_USER} + +# Create Minio user for Trino Engine data access (R/O) +mc admin user add myminio ${TRINO_MINIO_S3_USER} ${TRINO_MINIO_S3_PASSWORD} +mc admin policy attach myminio trino-minio-ro-policy --user ${TRINO_MINIO_S3_USER} + +echo "Minio setup complete: users (polaris_s3_user, spark_minio_s3_user, trino_minio_s3_user) and policies configured." 
diff --git a/getting-started/minio/minio-config/spark-minio-rw-policy.json b/getting-started/minio/minio-config/spark-minio-rw-policy.json new file mode 100644 index 0000000000..c26b1cfe1b --- /dev/null +++ b/getting-started/minio/minio-config/spark-minio-rw-policy.json @@ -0,0 +1,22 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:ListBucketMultipartUploads", + "s3:AbortMultipartUpload", + "s3:ListMultipartUploadParts" + ], + "Resource": [ + "arn:aws:s3:::polaris-bucket", + "arn:aws:s3:::polaris-bucket/*" + ] + } + ] +} diff --git a/getting-started/minio/minio-config/trino-minio-ro-policy.json b/getting-started/minio/minio-config/trino-minio-ro-policy.json new file mode 100644 index 0000000000..b1923672fd --- /dev/null +++ b/getting-started/minio/minio-config/trino-minio-ro-policy.json @@ -0,0 +1,17 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket", + "s3:GetBucketLocation" + ], + "Resource": [ + "arn:aws:s3:::polaris-bucket", + "arn:aws:s3:::polaris-bucket/*" + ] + } + ] +} diff --git a/getting-started/minio/polaris-config/create-catalog-minio.sh b/getting-started/minio/polaris-config/create-catalog-minio.sh new file mode 100755 index 0000000000..9d00d23c12 --- /dev/null +++ b/getting-started/minio/polaris-config/create-catalog-minio.sh @@ -0,0 +1,94 @@ +#!/bin/sh +set -e + +POLARIS_SERVICE_URL="http://polaris:8181" +POLARIS_MGMT_API_URL="${POLARIS_SERVICE_URL}/api/management/v1/catalogs" +POLARIS_TOKEN_URL="${POLARIS_SERVICE_URL}/api/catalog/v1/oauth/tokens" +POLARIS_ADMIN_USER="root" +POLARIS_ADMIN_PASS="s3cr3t" +POLARIS_REALM="POLARIS_MINIO_REALM" + +echo "Waiting for Polaris service to be healthy..." 
+attempt_counter=0 +max_attempts=20 +until curl -s -f "${POLARIS_SERVICE_URL}/q/health/live" > /dev/null; do + if [ ${attempt_counter} -eq ${max_attempts} ]; then + echo "Max attempts reached. Failed to connect to Polaris health check." + exit 1 + fi + echo "Attempting to connect to Polaris (${attempt_counter}/${max_attempts})..." + attempt_counter=$((attempt_counter+1)) + sleep 5 +done +echo "Polaris service is live." + +echo "Attempting to get Polaris admin token..." +ADMIN_TOKEN_RESPONSE=$(curl -s -w "%{http_code}" -X POST "${POLARIS_TOKEN_URL}" \ + --user "${POLARIS_ADMIN_USER}:${POLARIS_ADMIN_PASS}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=client_credentials" -d "scope=PRINCIPAL_ROLE:ALL" -d "realmName=${POLARIS_REALM}") + +HTTP_CODE=$(echo "$ADMIN_TOKEN_RESPONSE" | tail -n1) +TOKEN_BODY=$(echo "$ADMIN_TOKEN_RESPONSE" | sed '$d') + +if [ "$HTTP_CODE" -ne 200 ]; then + echo "Failed to get Polaris admin token. HTTP Code: $HTTP_CODE. Response:" + echo "$TOKEN_BODY" + exit 1 +fi +ADMIN_TOKEN=$(echo "$TOKEN_BODY" | jq -r .access_token) +if [ -z "$ADMIN_TOKEN" ] || [ "$ADMIN_TOKEN" = "null" ]; then echo "Failed to parse admin token"; exit 1; fi +echo "Polaris admin token obtained." + +CATALOG_NAME="minio_catalog" +BUCKET_NAME="polaris-bucket" +CATALOG_WAREHOUSE_PATH="s3a://${BUCKET_NAME}/iceberg_warehouse/${CATALOG_NAME}" + +S3_ACCESS_KEY="${POLARIS_S3_USER}" # Polaris service's S3 user +S3_SECRET_KEY="${POLARIS_S3_PASSWORD}" +S3_ENDPOINT="http://minio:9000" + +CREATE_CATALOG_PAYLOAD=$(cat <