Skip to content

WIP docker: lightweight setup #13085

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions docker/docker-compose.lightweight.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# Docker compose file for DataHub Lightweight
#
# This configuration provides a minimal setup of DataHub with:
# - Redpanda (instead of Kafka + Zookeeper + Schema Registry)
# - MySQL/MariaDB database
# - Elasticsearch
---
services:
  # DataHub web UI. Starts only once GMS reports healthy.
  datahub-frontend-react:
    hostname: datahub-frontend-react
    image: ${DATAHUB_FRONTEND_IMAGE:-acryldata/datahub-frontend-react}:${DATAHUB_VERSION:-head}
    ports:
      - "${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002"
    build:
      context: ../
      dockerfile: docker/datahub-frontend/Dockerfile
    env_file: datahub-frontend/env/docker.env
    environment:
      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
    depends_on:
      datahub-gms:
        condition: service_healthy
    volumes:
      - ${HOME}/.datahub/plugins:/etc/datahub/plugins

  # DataHub metadata service (GMS). Waits for the Elasticsearch and MySQL setup
  # jobs to finish and for Redpanda to be healthy.
  datahub-gms:
    hostname: datahub-gms
    image: ${DATAHUB_GMS_IMAGE:-acryldata/datahub-gms}:${DATAHUB_VERSION:-head}
    ports:
      - "${DATAHUB_MAPPED_GMS_PORT:-8080}:8080"
    build:
      context: ../
      dockerfile: docker/datahub-gms/Dockerfile
    env_file: datahub-gms/env/docker-without-neo4j.env
    environment:
      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
      - METADATA_SERVICE_AUTH_ENABLED=${METADATA_SERVICE_AUTH_ENABLED:-false}
      # Ensure Elasticsearch indices are created properly
      - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
      - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
    healthcheck:
      test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
      start_period: 90s
      interval: 1s
      retries: 3
      timeout: 5s
    depends_on:
      elasticsearch-setup:
        condition: service_completed_successfully
      redpanda:
        condition: service_healthy
      mysql-setup:
        condition: service_completed_successfully
    volumes:
      - ${HOME}/.datahub/plugins:/etc/datahub/plugins

  # One-shot job that runs the SystemUpdate upgrade once all setup jobs are done.
  datahub-upgrade:
    hostname: datahub-upgrade
    image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
    command:
      - -u
      - SystemUpdate
    build:
      context: ../
      dockerfile: docker/datahub-upgrade/Dockerfile
    env_file: datahub-upgrade/env/docker-without-neo4j.env
    environment:
      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
      # Ensure all indices are created during system upgrade
      - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
      - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
    depends_on:
      elasticsearch-setup:
        condition: service_completed_successfully
      kafka-setup:
        condition: service_completed_successfully
      mysql-setup:
        condition: service_completed_successfully
    labels:
      datahub_setup_job: "true"

  # Pre-create Elasticsearch indices
  elasticsearch-setup:
    hostname: elasticsearch-setup
    image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-acryldata/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-head}
    build:
      context: ../
      dockerfile: docker/elasticsearch-setup/Dockerfile
    environment:
      - ELASTICSEARCH_HOST=elasticsearch
      - ELASTICSEARCH_PORT=9200
      - ELASTICSEARCH_USE_SSL=false
      - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
      - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
    depends_on:
      elasticsearch:
        condition: service_healthy
    labels:
      datahub_setup_job: "true"

  # One-shot Kafka setup job, pointed at Redpanda instead of a Kafka broker.
  kafka-setup:
    hostname: kafka-setup
    image: ${DATAHUB_KAFKA_SETUP_IMAGE:-acryldata/datahub-kafka-setup}:${DATAHUB_VERSION:-head}
    build:
      context: ../
      dockerfile: docker/kafka-setup/Dockerfile
    env_file: kafka-setup/env/docker.env
    environment:
      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
      # Disable attempts to configure the internal "_schemas" topic
      # Redpanda manages schema registry topics differently than Confluent
      - USE_CONFLUENT_SCHEMA_REGISTRY=FALSE
    depends_on:
      redpanda:
        condition: service_healthy
    labels:
      datahub_setup_job: "true"

  # One-shot MySQL setup job; runs once the database is healthy.
  mysql-setup:
    hostname: mysql-setup
    image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:${DATAHUB_VERSION:-head}
    build:
      context: ../
      dockerfile: docker/mysql-setup/Dockerfile
    env_file: mysql-setup/env/docker.env
    depends_on:
      mysql:
        condition: service_healthy
    labels:
      datahub_setup_job: "true"

  # Database. Defaults to MySQL on linux/amd64; set MYSQL_IMAGE and
  # MYSQL_PLATFORM to use MariaDB on ARM64 platforms like M1/M2 Macs.
  mysql:
    hostname: mysql
    image: ${MYSQL_IMAGE:-mysql:8.2}
    platform: ${MYSQL_PLATFORM:-linux/amd64}
    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin --default-authentication-plugin=mysql_native_password
    ports:
      - "${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306"
    env_file: mysql/env/docker.env
    restart: on-failure
    healthcheck:
      # `$$` escapes Compose interpolation so the variables are expanded by the
      # shell inside the container.
      test: mysqladmin ping -h mysql -u $$MYSQL_USER --password=$$MYSQL_PASSWORD
      start_period: 20s
      interval: 1s
      retries: 5
      timeout: 5s
    volumes:
      - mysqldata:/var/lib/mysql

  # ElasticSearch (minimal configuration)
  elasticsearch:
    hostname: elasticsearch
    image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1}
    ports:
      - "${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200"
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms256m -Xmx256m"
    healthcheck:
      # A string-form test is run through the container's shell, so the URL has
      # to be quoted: a bare `&` would background curl and make the check
      # succeed unconditionally.
      test: "curl -sS --fail 'http://elasticsearch:9200/_cluster/health?wait_for_status=yellow&timeout=0s'"
      # Failures during the start period do not count against `retries`; keep it
      # generous because the check now really waits for the cluster.
      start_period: 60s
      interval: 5s
      retries: 3
      timeout: 5s
    volumes:
      - esdata:/usr/share/elasticsearch/data
    deploy:
      resources:
        limits:
          memory: 512M

  # Redpanda as a lightweight Kafka replacement
  redpanda:
    # Defaults to the floating `latest` tag; set REDPANDA_IMAGE to pin a version.
    image: ${REDPANDA_IMAGE:-redpandadata/redpanda:latest}
    hostname: redpanda
    container_name: redpanda
    command:
      - redpanda
      - start
      - --smp=1
      - --memory=1G
      - --reserve-memory=0M
      - --overprovisioned
      - --node-id=0
      - --check=false
      - --kafka-addr=PLAINTEXT://0.0.0.0:9092
      - --advertise-kafka-addr=PLAINTEXT://redpanda:9092
      - --rpc-addr=0.0.0.0:33145
      - --advertise-rpc-addr=redpanda:33145
      - --schema-registry-addr=0.0.0.0:8081
    ports:
      - "${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092"
      - "${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081"
    healthcheck:
      test: ["CMD-SHELL", "rpk cluster health | grep -E 'Healthy:.+true' || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5

networks:
  default:
    name: datahub_network_lightweight

# Named volumes that persist Elasticsearch and MySQL data.
volumes:
  esdata:
  mysqldata:
106 changes: 106 additions & 0 deletions docker/lightweight.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/bin/bash
#
# Starts the lightweight DataHub stack defined in docker-compose.lightweight.yml
# and then polls the GMS health endpoint until it answers with HTTP 200 or the
# timeout expires.
#
# Usage: lightweight.sh [--no-frontend] [--help]
#
# Environment:
#   DATAHUB_MAPPED_GMS_PORT       host port for GMS (default 8080)
#   DATAHUB_MAPPED_FRONTEND_PORT  host port for the frontend (default 9002)
#   MYSQL_IMAGE / MYSQL_PLATFORM  database image overrides; defaulted on ARM64

set -euo pipefail

# The compose file is referenced by a relative path, so run from the directory
# that contains this script regardless of where it was invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

COMPOSE_FILE="docker-compose.lightweight.yml"

# Host ports: use the same override variables the compose file reads, so the
# health poll and the printed URLs match the ports that are actually published.
GMS_PORT="${DATAHUB_MAPPED_GMS_PORT:-8080}"
FRONTEND_PORT="${DATAHUB_MAPPED_FRONTEND_PORT:-9002}"

# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Print the command-line help text.
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo "Options:"
    echo "  --no-frontend    Start without the frontend"
    echo "  --help           Display this help message"
}

# Print banner
echo -e "${GREEN}==========================================${NC}"
echo -e "${GREEN}  DataHub Lightweight Setup${NC}"
echo -e "${GREEN}  Using: Redpanda, MySQL/MariaDB, Elasticsearch${NC}"
echo -e "${GREEN}==========================================${NC}"

# Parse command-line arguments
FRONTEND=true
for arg in "$@"; do
    case "$arg" in
        --no-frontend)
            FRONTEND=false
            ;;
        --help|-h)
            usage
            exit 0
            ;;
        *)
            # Reject typos instead of silently starting the default stack.
            echo -e "${RED}Error: unknown option '$arg'${NC}" >&2
            usage >&2
            exit 1
            ;;
    esac
done

# Check Docker is running
if ! docker info > /dev/null 2>&1; then
    echo -e "${RED}Error: Docker is not running. Please start Docker and try again.${NC}"
    exit 1
fi

# Prefer the Compose v2 plugin ("docker compose"); fall back to the standalone
# "docker-compose" binary when the plugin is not installed.
if docker compose version > /dev/null 2>&1; then
    COMPOSE=(docker compose)
elif command -v docker-compose > /dev/null 2>&1; then
    COMPOSE=(docker-compose)
else
    echo -e "${RED}Error: neither 'docker compose' nor 'docker-compose' is available.${NC}"
    exit 1
fi
COMPOSE_CMD="${COMPOSE[*]} -f $COMPOSE_FILE"

# The readiness poll below relies on curl.
if ! command -v curl > /dev/null 2>&1; then
    echo -e "${RED}Error: curl is required to check GMS health.${NC}"
    exit 1
fi

# On ARM64 hosts (macOS reports arm64, Linux reports aarch64) default to MariaDB,
# but keep any image/platform the caller has already exported.
case "$(uname -m)" in
    arm64|aarch64)
        export MYSQL_IMAGE="${MYSQL_IMAGE:-mariadb:10.5.8}"
        export MYSQL_PLATFORM="${MYSQL_PLATFORM:-linux/arm64}"
        echo -e "${YELLOW}ARM64 host detected. Using database image ${MYSQL_IMAGE} (${MYSQL_PLATFORM}).${NC}"
        ;;
esac

# Configure services based on command-line arguments
SERVICES=(redpanda elasticsearch elasticsearch-setup mysql mysql-setup kafka-setup datahub-upgrade datahub-gms)
if [ "$FRONTEND" = true ]; then
    SERVICES+=(datahub-frontend-react)
fi

# Print configuration
echo -e "${YELLOW}Starting with configuration:${NC}"
echo -e "- Frontend: $([ "$FRONTEND" = true ] && echo 'Enabled' || echo 'Disabled')"
echo ""

# Start the services
echo -e "${YELLOW}Starting services: ${SERVICES[*]}${NC}"
"${COMPOSE[@]}" -f "$COMPOSE_FILE" up -d "${SERVICES[@]}"

echo ""
echo -e "${GREEN}DataHub is starting up...${NC}"
echo -e "${YELLOW}Waiting for GMS to be healthy...${NC}"

# Wait for GMS to be ready
GMS_URL="http://localhost:${GMS_PORT}/health"
MAX_WAIT=300 # 5 minutes
SECONDS=0    # bash increments SECONDS once per second from this point

while true; do
    ELAPSED=$SECONDS

    if (( ELAPSED > MAX_WAIT )); then
        echo -e "${RED}GMS did not become healthy within the timeout period.${NC}"
        echo -e "${YELLOW}Check logs with: ${COMPOSE_CMD} logs datahub-gms${NC}"
        exit 1
    fi

    # When the connection fails curl still prints "000" through -w; "|| true"
    # only stops "set -e" from aborting the script on curl's non-zero exit.
    STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$GMS_URL" || true)

    if [ "$STATUS" = "200" ]; then
        echo -e "${GREEN}GMS is healthy!${NC}"
        break
    fi

    echo -e "${YELLOW}Waiting for GMS to be ready... (${ELAPSED}s)${NC}"
    sleep 5
done

# Display success message
echo -e "${GREEN}==========================================${NC}"
echo -e "${GREEN}DataHub is now running!${NC}"
echo ""
echo -e "${YELLOW}Access points:${NC}"
echo -e "- GMS API: http://localhost:${GMS_PORT}"
if [ "$FRONTEND" = true ]; then
    echo -e "- Frontend: http://localhost:${FRONTEND_PORT}"
fi
echo ""
echo -e "${YELLOW}Useful commands:${NC}"
echo -e "- View logs: ${COMPOSE_CMD} logs -f"
echo -e "- Stop: ${COMPOSE_CMD} down"
echo -e "${GREEN}==========================================${NC}"
Loading