diff --git a/docker/docker-compose.lightweight.yml b/docker/docker-compose.lightweight.yml
new file mode 100644
index 00000000000000..e2c57ec91a9bc5
--- /dev/null
+++ b/docker/docker-compose.lightweight.yml
@@ -0,0 +1,222 @@
+# Docker compose file for DataHub Lightweight
+#
+# This configuration provides a minimal setup of DataHub with:
+# - Redpanda (instead of Kafka + Zookeeper + Schema Registry)
+# - MySQL/MariaDB database
+# - Elasticsearch
+---
+services:
+  # Web UI; started only after GMS reports healthy
+  datahub-frontend-react:
+    hostname: datahub-frontend-react
+    image: ${DATAHUB_FRONTEND_IMAGE:-acryldata/datahub-frontend-react}:${DATAHUB_VERSION:-head}
+    ports:
+      - "${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002"
+    build:
+      context: ../
+      dockerfile: docker/datahub-frontend/Dockerfile
+    env_file: datahub-frontend/env/docker.env
+    environment:
+      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
+      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
+    depends_on:
+      datahub-gms:
+        condition: service_healthy
+    volumes:
+      - ${HOME}/.datahub/plugins:/etc/datahub/plugins
+
+  # Metadata service (GMS); waits for Redpanda to be healthy and for the
+  # Elasticsearch and MySQL setup jobs to complete
+  datahub-gms:
+    hostname: datahub-gms
+    image: ${DATAHUB_GMS_IMAGE:-acryldata/datahub-gms}:${DATAHUB_VERSION:-head}
+    ports:
+      - "${DATAHUB_MAPPED_GMS_PORT:-8080}:8080"
+    build:
+      context: ../
+      dockerfile: docker/datahub-gms/Dockerfile
+    env_file: datahub-gms/env/docker-without-neo4j.env
+    environment:
+      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
+      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
+      - METADATA_SERVICE_AUTH_ENABLED=${METADATA_SERVICE_AUTH_ENABLED:-false}
+      # Ensure Elasticsearch indices are created properly
+      - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
+      - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
+    healthcheck:
+      test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
+      start_period: 90s
+      interval: 1s
+      retries: 3
+      timeout: 5s
+    depends_on:
+      elasticsearch-setup:
+        condition: service_completed_successfully
+      redpanda:
+        condition: service_healthy
+      mysql-setup:
+        condition: service_completed_successfully
+    volumes:
+      - ${HOME}/.datahub/plugins:/etc/datahub/plugins
+
+  # One-shot SystemUpdate job; runs after the Elasticsearch, Kafka and MySQL
+  # setup jobs complete
+  datahub-upgrade:
+    hostname: datahub-upgrade
+    image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
+    command:
+      - -u
+      - SystemUpdate
+    build:
+      context: ../
+      dockerfile: docker/datahub-upgrade/Dockerfile
+    env_file: datahub-upgrade/env/docker-without-neo4j.env
+    environment:
+      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
+      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
+      # Ensure all indices are created during system upgrade
+      - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
+      - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
+    depends_on:
+      elasticsearch-setup:
+        condition: service_completed_successfully
+      kafka-setup:
+        condition: service_completed_successfully
+      mysql-setup:
+        condition: service_completed_successfully
+    labels:
+      datahub_setup_job: true
+
+  # Pre-create Elasticsearch indices
+  elasticsearch-setup:
+    hostname: elasticsearch-setup
+    image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-acryldata/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-head}
+    build:
+      context: ../
+      dockerfile: docker/elasticsearch-setup/Dockerfile
+    environment:
+      - ELASTICSEARCH_HOST=elasticsearch
+      - ELASTICSEARCH_PORT=9200
+      - ELASTICSEARCH_USE_SSL=false
+      - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
+      - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
+    depends_on:
+      elasticsearch:
+        condition: service_healthy
+    labels:
+      datahub_setup_job: true
+
+  # Kafka setup job, pointed at Redpanda; runs once the broker is healthy
+  kafka-setup:
+    hostname: kafka-setup
+    image: ${DATAHUB_KAFKA_SETUP_IMAGE:-acryldata/datahub-kafka-setup}:${DATAHUB_VERSION:-head}
+    build:
+      dockerfile: ./docker/kafka-setup/Dockerfile
+      context: ../
+    env_file: kafka-setup/env/docker.env
+    environment:
+      - KAFKA_BOOTSTRAP_SERVER=redpanda:9092
+      - KAFKA_SCHEMAREGISTRY_URL=http://redpanda:8081
+      # Disable attempts to configure the internal "_schemas" topic
+      # Redpanda manages schema registry topics differently than Confluent
+      - USE_CONFLUENT_SCHEMA_REGISTRY=FALSE
+    depends_on:
+      redpanda:
+        condition: service_healthy
+    labels:
+      datahub_setup_job: true
+
+  # MySQL setup job; runs once the database is healthy
+  mysql-setup:
+    hostname: mysql-setup
+    image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:${DATAHUB_VERSION:-head}
+    build:
+      context: ../
+      dockerfile: docker/mysql-setup/Dockerfile
+    env_file: mysql-setup/env/docker.env
+    depends_on:
+      mysql:
+        condition: service_healthy
+    labels:
+      datahub_setup_job: true
+
+  # Use MariaDB on ARM64 platforms like M1/M2 Macs
+  mysql:
+    hostname: mysql
+    image: ${MYSQL_IMAGE:-mysql:8.2}
+    platform: ${MYSQL_PLATFORM:-linux/amd64}
+    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin --default-authentication-plugin=mysql_native_password
+    ports:
+      - "${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306"
+    env_file: mysql/env/docker.env
+    restart: on-failure
+    healthcheck:
+      test: mysqladmin ping -h mysql -u $$MYSQL_USER --password=$$MYSQL_PASSWORD
+      start_period: 20s
+      interval: 1s
+      retries: 5
+      timeout: 5s
+    volumes:
+      - mysqldata:/var/lib/mysql
+
+  # ElasticSearch (minimal configuration)
+  elasticsearch:
+    hostname: elasticsearch
+    image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1}
+    ports:
+      - "${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200"
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - "ES_JAVA_OPTS=-Xms256m -Xmx256m"
+    healthcheck:
+      # The URL is quoted so the shell running the check does not treat "&" as
+      # the background operator, which would make the check always succeed
+      test: 'curl -sS --fail "http://elasticsearch:9200/_cluster/health?wait_for_status=yellow&timeout=0s"'
+      start_period: 20s
+      interval: 5s
+      retries: 3
+      timeout: 5s
+    volumes:
+      - esdata:/usr/share/elasticsearch/data
+    deploy:
+      resources:
+        limits:
+          memory: 512M
+
+  # Redpanda as a lightweight Kafka replacement
+  redpanda:
+    image: redpandadata/redpanda:latest
+    hostname: redpanda
+    container_name: redpanda
+    command:
+      - redpanda
+      - start
+      - --smp=1
+      - --memory=1G
+      - --reserve-memory=0M
+      - --overprovisioned
+      - --node-id=0
+      - --check=false
+      - --kafka-addr=PLAINTEXT://0.0.0.0:9092
+      - --advertise-kafka-addr=PLAINTEXT://redpanda:9092
+      - --rpc-addr=0.0.0.0:33145
+      - --advertise-rpc-addr=redpanda:33145
+      - --schema-registry-addr=0.0.0.0:8081
+    ports:
+      - "${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092"
+      - "${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081"
+    healthcheck:
+      test: ["CMD-SHELL", "rpk cluster health | grep -E 'Healthy:.+true' || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+networks:
+  default:
+    name: datahub_network_lightweight
+
+volumes:
+  esdata:
+  mysqldata:
diff --git a/docker/lightweight.sh b/docker/lightweight.sh
new file mode 100755
index 00000000000000..ec63f0c55367ef
--- /dev/null
+++ b/docker/lightweight.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Run from this script's directory so that the relative compose file path used
+# below resolves regardless of where the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Define color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Print banner
+echo -e "${GREEN}==========================================${NC}"
+echo -e "${GREEN} DataHub Lightweight Setup${NC}"
+echo -e "${GREEN} Using: Redpanda, MySQL/MariaDB, Elasticsearch${NC}"
+echo -e "${GREEN}==========================================${NC}"
+
+# Check Docker is running
+if ! docker info > /dev/null 2>&1; then
+  echo -e "${RED}Error: Docker is not running. Please start Docker and try again.${NC}"
+  exit 1
+fi
+
+# Check for M1/M2 Mac and set appropriate MySQL image
+if [[ $(uname -m) == 'arm64' ]]; then
+  export MYSQL_IMAGE="mariadb:10.5.8"
+  export MYSQL_PLATFORM="linux/arm64"
+  echo -e "${YELLOW}M1/M2 Mac detected. Using MariaDB instead of MySQL.${NC}"
+fi
+
+# Parse command-line arguments; unrecognized arguments are ignored
+FRONTEND=true
+for arg in "$@"; do
+  case $arg in
+    --no-frontend)
+      FRONTEND=false
+      ;;
+    --help)
+      echo "Usage: $0 [OPTIONS]"
+      echo "Options:"
+      echo " --no-frontend Start without the frontend"
+      echo " --help Display this help message"
+      exit 0
+      ;;
+  esac
+done
+
+# Configure services based on command-line arguments
+SERVICES="redpanda elasticsearch elasticsearch-setup mysql mysql-setup kafka-setup datahub-upgrade datahub-gms"
+if [ "$FRONTEND" = true ]; then
+  SERVICES="$SERVICES datahub-frontend-react"
+fi
+
+# Print configuration
+echo -e "${YELLOW}Starting with configuration:${NC}"
+echo -e "- Frontend: $([ "$FRONTEND" = true ] && echo 'Enabled' || echo 'Disabled')"
+echo ""
+
+# Start the services ($SERVICES is intentionally unquoted so that it splits
+# into one argument per service name)
+echo -e "${YELLOW}Starting services: $SERVICES${NC}"
+docker-compose -f docker-compose.lightweight.yml up -d $SERVICES
+
+echo ""
+echo -e "${GREEN}DataHub is starting up...${NC}"
+echo -e "${YELLOW}Waiting for GMS to be healthy...${NC}"
+
+# Host ports honor the same overrides as the port mappings in the compose file
+GMS_PORT="${DATAHUB_MAPPED_GMS_PORT:-8080}"
+FRONTEND_PORT="${DATAHUB_MAPPED_FRONTEND_PORT:-9002}"
+
+# Wait for GMS to be ready
+GMS_URL="http://localhost:${GMS_PORT}/health"
+MAX_WAIT=300 # 5 minutes
+START_TIME=$(date +%s)
+
+while true; do
+  CURRENT_TIME=$(date +%s)
+  ELAPSED=$((CURRENT_TIME - START_TIME))
+
+  if [ "$ELAPSED" -gt "$MAX_WAIT" ]; then
+    echo -e "${RED}GMS did not become healthy within the timeout period.${NC}"
+    echo -e "${YELLOW}Check logs with: docker-compose -f docker-compose.lightweight.yml logs datahub-gms${NC}"
+    exit 1
+  fi
+
+  # curl's -w output is already "000" when the request fails, so only its
+  # non-zero exit status needs to be suppressed (because of set -e)
+  STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$GMS_URL" || true)
+
+  if [ "$STATUS" = "200" ]; then
+    echo -e "${GREEN}GMS is healthy!${NC}"
+    break
+  fi
+
+  echo -e "${YELLOW}Waiting for GMS to be ready... (${ELAPSED}s)${NC}"
+  sleep 5
+done
+
+# Display success message
+echo -e "${GREEN}==========================================${NC}"
+echo -e "${GREEN}DataHub is now running!${NC}"
+echo ""
+echo -e "${YELLOW}Access points:${NC}"
+echo -e "- GMS API: http://localhost:${GMS_PORT}"
+if [ "$FRONTEND" = true ]; then
+  echo -e "- Frontend: http://localhost:${FRONTEND_PORT}"
+fi
+echo ""
+echo -e "${YELLOW}Useful commands:${NC}"
+echo -e "- View logs: docker-compose -f docker-compose.lightweight.yml logs -f"
+echo -e "- Stop: docker-compose -f docker-compose.lightweight.yml down"
+echo -e "${GREEN}==========================================${NC}"