diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..e2b3699
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,344 @@
+# Copyright 2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: CI
+
+on:
+  push:
+    branches: [main, develop]
+  pull_request:
+    branches: [main, develop]
+
+env:
+  PYTHONPATH: ${{ github.workspace }}
+
+jobs:
+  # Lint gate: every later job depends (transitively) on this one.
+  pre-commit:
+    name: Pre-commit Checks
+    runs-on: self-hosted
+    container:
+      image: python:3.12-slim-bullseye
+    steps:
+      # The slim image ships without git, which checkout and pre-commit need.
+      - name: Install Git
+        run: |
+          apt-get update -y
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git
+          git config --global --add safe.directory /__w/SWAGGER/SWAGGER
+          # Disable Git hooks completely
+          git config --global core.hooksPath /dev/null
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Cache Python packages
+        uses: actions/cache@v4
+        with:
+          # Cache the hook environments pre-commit builds as well as pip's
+          # wheel cache: the key below hashes the pre-commit config, and the
+          # hook environments are what that config determines.
+          path: |
+            ~/.cache/pip
+            ~/.cache/pre-commit
+          key: precommit-${{ hashFiles('.pre-commit-config.yaml') }}
+          restore-keys: precommit
+
+      - name: Install dependencies
+        run: |
+          python3 -m pip install --upgrade pip pre-commit
+          pre-commit --version
+
+      - name: Run pre-commit
+        run: |
+          pre-commit run --all-files
+        timeout-minutes: 60
+
+  package-check:
+    name: Package Check
+    needs: pre-commit
+    runs-on: self-hosted
+    container:
+      image: python:3.12-slim-bullseye
+    steps:
+      - name: Install Git
+        run: |
+          apt-get update -y
+          DEBIAN_FRONTEND=noninteractive
apt-get install -y --no-install-recommends git
+          git config --global --add safe.directory /__w/SWAGGER/SWAGGER
+          # Disable Git hooks completely
+          git config --global core.hooksPath /dev/null
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Start from empty caches so the check exercises a clean resolve.
+      - name: Clear Poetry cache
+        run: |
+          rm -rf ~/.cache/pypoetry
+          rm -rf ~/.cache/pip
+
+      - name: Install dependencies
+        run: |
+          python3 -m pip install --upgrade pip poetry
+          poetry env use python3.12
+          poetry install --no-cache
+
+      - name: Run Poetry check
+        run: poetry check
+        timeout-minutes: 15
+
+  unit-tests:
+    name: Unit Tests
+    needs: package-check
+    runs-on: self-hosted
+    env:
+      # CUDA_VISIBLE_DEVICES is intentionally not set: it takes device
+      # indices or UUIDs, and "all" is not a valid entry, which can hide
+      # every GPU from the CUDA runtime. Leaving it unset exposes all
+      # devices; NVIDIA_VISIBLE_DEVICES is the container-runtime variable
+      # that does accept "all".
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: all
+      CUDA_HOME: /usr/local/cuda
+      LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
+    container:
+      image: nvidia/cuda:12.6.0-devel-ubuntu22.04
+      options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
+    steps:
+      - name: Install Git
+        run: |
+          apt-get update -y
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git
+          git config --global --add safe.directory /__w/SWAGGER/SWAGGER
+          # Disable Git hooks completely
+          git config --global core.hooksPath /dev/null
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Verify CUDA setup
+        run: |
+          # Verify NVIDIA drivers and CUDA
+          nvcc --version
+          nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv,noheader
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Cache Python packages
+        uses: actions/cache@v4
+        with:
+          # Dependencies are installed with Poetry, so cache Poetry's own
+          # cache/virtualenv directory and key on the files that define the
+          # dependency set. requirements.txt stays in the hash so any
+          # existing file still invalidates the cache.
+          path: |
+            ~/.cache/pip
+            ~/.cache/pypoetry
+          key: unit-tests-cuda-${{ hashFiles('pyproject.toml', 'poetry.lock', '**/requirements.txt') }}
+          restore-keys: unit-tests-cuda-
+
+      - name: Install dependencies
+        run: |
+          apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+            python3-pip python3-venv \
+            libgl1-mesa-glx libglib2.0-0 \
+
libcudnn8 libcudnn8-dev \
+            nvidia-cuda-toolkit
+          python3 -m pip install --upgrade pip poetry
+          poetry env use python3.12
+          poetry install
+
+      - name: Run unit tests
+        run: |
+          mkdir -p test-results
+          # unittest writes its report to stderr, so capture both streams.
+          # Remember the exit status instead of aborting here, so the XML
+          # report is still produced (and uploaded) when tests fail.
+          status=0
+          poetry run python -m unittest discover -p "test_*.py" -v > test-results/unit-tests.txt 2>&1 || status=$?
+          cat test-results/unit-tests.txt
+          # Convert the verbose text report into a minimal JUnit-style XML file.
+          poetry run python - <<'PY'
+          import re
+          import xml.etree.ElementTree as ET
+
+          root = ET.Element("testsuites")
+          suite = ET.SubElement(root, "testsuite", name="unit-tests")
+          pattern = re.compile(r"test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)")
+          with open("test-results/unit-tests.txt") as report:
+              for m in pattern.finditer(report.read()):
+                  # SubElement already attaches the node; do not append it again.
+                  case = ET.SubElement(suite, "testcase", classname=m.group(2), name=m.group(1))
+                  if m.group(3) != "ok":
+                      ET.SubElement(case, "failure", message=f"Test {m.group(1)} in {m.group(2)} {m.group(3).lower()}ed")
+          ET.ElementTree(root).write("test-results/unit-tests.xml", encoding="utf-8", xml_declaration=True)
+          PY
+          # Propagate the real test result as the step result.
+          exit "$status"
+
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: unit-test-results
+          path: test-results/
+          retention-days: 7
+
+  e2e-tests:
+    name: End-to-End Tests
+    needs: unit-tests
+    runs-on: self-hosted
+    env:
+      # CUDA_VISIBLE_DEVICES is intentionally not set: it takes device
+      # indices or UUIDs, and "all" is not a valid entry. Unset means every
+      # GPU is visible.
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: all
+      CUDA_HOME: /usr/local/cuda
+      LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
+    container:
+      image: nvidia/cuda:12.6.0-devel-ubuntu22.04
+      options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
+    steps:
+      - name: Install Git and LFS
+        run: |
+          apt-get update -y
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs
+          git config --global --add safe.directory /__w/SWAGGER/SWAGGER
+          # Remove any existing hooks
+          rm -f .git/hooks/*
+          # Configure Git LFS
+          git lfs install --system
+          git lfs update --force
+          # Verify Git LFS installation
+          git lfs version
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Pull LFS files
+        run: |
+          git lfs pull
+          git lfs ls-files
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+            python3-pip python3-venv \
+            libgl1-mesa-glx libglib2.0-0 \
+            libcudnn8 libcudnn8-dev \
+            nvidia-cuda-toolkit
+          python3 -m pip install --upgrade pip poetry
+          poetry env use python3.12
+          poetry install
+
+      # Generate a graph from the sample map, then evaluate the result.
+      - name: Run E2E tests
+        run: |
+          # Set CUDA device order
+          export CUDA_DEVICE_ORDER=PCI_BUS_ID
+          poetry run python scripts/generate_graph.py \
+            --map-path maps/carter_warehouse_navigation.png \
+            --graph-eval.active \
+            --perf-eval.active
+          poetry run python scripts/evaluate_graph.py \
+            --graph-path graphs/graph.gml \
+            --map-path maps/carter_warehouse_navigation.png \
+            --output-dir results \
+            --resolution 0.05 \
+            --safety-distance 0.3 \
+            --occupancy-threshold 127
+
+      # Self-hosted workspaces persist between runs, so remove outputs.
+      - name: Cleanup
+        if: always()
+        run: rm -rf results graphs
+
+  docker-build:
+    name: Docker Build and Test
+    needs: e2e-tests
+    runs-on: self-hosted
+    env:
+      # CUDA_VISIBLE_DEVICES is intentionally not set: it takes device
+      # indices or UUIDs, and "all" is not a valid entry. Unset means every
+      # GPU is visible.
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: all
+      CUDA_HOME: /usr/local/cuda
+      LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
+      DOCKER_HOST: unix:///var/run/docker.sock
+      DOCKER_TLS_CERTDIR: ""
+    container:
+      image: nvidia/cuda:12.6.0-devel-ubuntu22.04
+      options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
+    steps:
+      - name: Install Git and LFS
+        run: |
+          apt-get update -y
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs
+          git config --global --add safe.directory /__w/SWAGGER/SWAGGER
+          # Remove any existing hooks
+          rm -f .git/hooks/*
+          # Configure Git LFS
+          git lfs install --system
+          git lfs update --force
+          # Verify Git LFS installation
+
git lfs version
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      # Fetch the LFS objects explicitly and list them in the job log.
+      - name: Pull LFS files
+        run: |
+          git lfs pull
+          git lfs ls-files
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      # System libraries, then the Docker engine/CLI from Docker's apt
+      # repository, then the project itself through Poetry.
+      - name: Install dependencies
+        run: |
+          apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+            python3-pip python3-venv \
+            libgl1-mesa-glx libglib2.0-0 \
+            libcudnn8 libcudnn8-dev \
+            nvidia-cuda-toolkit curl ca-certificates gnupg
+          # Trust Docker's signing key
+          install -m 0755 -d /etc/apt/keyrings
+          curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
+          chmod a+r /etc/apt/keyrings/docker.gpg
+          # Register Docker's apt repository for this release and architecture
+          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
+          # Docker engine, CLI, Buildx and the Compose plugin
+          apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+            docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          python3 -m pip install --upgrade pip poetry
+          poetry env use python3.12
+          poetry install
+
+      # Diagnostics only: each command fails the step if a tool is missing.
+      - name: Verify Docker and CUDA setup
+        run: |
+          # Docker daemon reachable, Compose plugin present
+          docker info
+          docker compose version
+          # Driver and toolkit
+          nvidia-smi
+          nvcc --version
+          # NVIDIA libraries known to the dynamic linker
+          ldconfig -p | grep nvidia
+          # CUDA libraries known to the dynamic linker
+          ldconfig -p | grep cuda
+          # CUDA-related environment
+          echo "CUDA_HOME: $CUDA_HOME"
+          echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+          ls -l $CUDA_HOME/lib64/libcuda*
+          # Device-selection variables as seen from Python
+          python3 -c 'import os; print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES")); print("NVIDIA_VISIBLE_DEVICES:", os.environ.get("NVIDIA_VISIBLE_DEVICES")); print("NVIDIA_DRIVER_CAPABILITIES:",
os.environ.get("NVIDIA_DRIVER_CAPABILITIES"))'
+
+      - name: Build and test
+        run: |
+          cd docker || { echo "Missing docker directory"; exit 1; }
+          # Build with GPU support
+          DOCKER_BUILDKIT=1 docker compose build --no-cache
+          # Run with GPU support and NVIDIA runtime
+          docker compose up -d rest-api
+          sleep 10
+          # Verify CUDA is available in the container. Address the service
+          # through Compose instead of a hard-coded container name: the
+          # generated name depends on the project name and Compose version.
+          # -T disables TTY allocation, which CI does not provide.
+          docker compose exec -T rest-api nvidia-smi
+          cd ../
+          # Run tests
+          poetry run python scripts/test_api_client.py \
+            --map_path maps/carter_warehouse_navigation.png \
+            --host localhost
+
+      - name: Cleanup
+        if: always()
+        run: cd docker && docker compose down -v
diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml
new file mode 100644
index 0000000..4771e65
--- /dev/null
+++ b/infrastructure/cloudformation/github-runners-stack.yml
@@ -0,0 +1,752 @@
+# Copyright 2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+AWSTemplateFormatVersion: '2010-09-09'
+Description: 'CloudFormation template for CI/CD infrastructure'
+
+Parameters:
+  Environment:
+    Type: String
+    Default: production
+    AllowedValues:
+      - production
+      - staging
+    Description: Environment name for resource naming
+
+  ProjectName:
+    Type: String
+    Default: projectName
+    Description: Name of the project (used in resource naming)
+
+  GitHubRepoPath:
+    Type: String
+    Description: Full GitHub repository path (e.g., 'microsoft/vscode' or 'octocat/Hello-World').
For organization runners, use the organization name.
+    # Repository names may contain '.' and '_' in addition to letters,
+    # digits and '-', so the repository half of the pattern allows them.
+    AllowedPattern: ^[a-zA-Z0-9-]+/[a-zA-Z0-9._-]+$|^[a-zA-Z0-9-]+$
+
+  VpcCidr:
+    Type: String
+    Default: 10.0.0.0/16
+    Description: CIDR block for the VPC
+
+  PublicSubnetCidr:
+    Type: String
+    Default: 10.0.1.0/24
+    Description: CIDR block for the public subnet
+
+  GitHubToken:
+    Type: String
+    Description: GitHub personal access token with 'repo' and 'workflow' scopes. For organization runners, requires 'admin:org' scope.
+    NoEcho: true
+
+  RunnerCount:
+    Type: Number
+    Description: Number of GitHub Actions runners to create
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
+
+  InstanceType:
+    Type: String
+    Description: EC2 instance type for runners
+    Default: g4dn.xlarge
+    AllowedValues:
+      - g4dn.xlarge
+      - g4dn.2xlarge
+      - g4dn.4xlarge
+      - g5.xlarge
+      - g5.2xlarge
+      - g5.4xlarge
+
+  LogRetentionDays:
+    Type: Number
+    Description: Number of days to retain logs
+    Default: 90
+    MinValue: 1
+    MaxValue: 3653
+
+# The full (non-shorthand) name of the reference intrinsic is "Ref";
+# "Fn::Ref" is not a CloudFormation function and fails template validation.
+Conditions:
+  IsProduction:
+    Fn::Equals:
+      - Ref: Environment
+      - production
+
+Resources:
+  # VPC Resources
+  VPC:
+    Type: AWS::EC2::VPC
+    Properties:
+      CidrBlock:
+        Ref: VpcCidr
+      EnableDnsHostnames: true
+      EnableDnsSupport: true
+      Tags:
+        - Key: Name
+          Value:
+            Fn::Sub: ${Environment}-${ProjectName}-vpc
+        - Key: Environment
+          Value:
+            Ref: Environment
+        - Key: Project
+          Value:
+            Ref: ProjectName
+        - Key: ManagedBy
+          Value: CloudFormation
+
+  InternetGateway:
+    Type: AWS::EC2::InternetGateway
+    Properties:
+      Tags:
+        - Key: Name
+          Value:
+            Fn::Sub: ${Environment}-${ProjectName}-igw
+        - Key: Environment
+          Value:
+            Ref: Environment
+        - Key: Project
+          Value:
+            Ref: ProjectName
+        - Key: ManagedBy
+          Value: CloudFormation
+
+  VpcGatewayAttachment:
+    Type: AWS::EC2::VPCGatewayAttachment
+    Properties:
+      VpcId:
+        Ref: VPC
+      InternetGatewayId:
+        Ref: InternetGateway
+
+  PublicSubnet:
+    Type: AWS::EC2::Subnet
+    Properties:
+      VpcId:
+        Ref: VPC
+      CidrBlock:
+        Ref: PublicSubnetCidr
+
AvailabilityZone:
+        Fn::Select:
+          - 0
+          - Fn::GetAZs: ''
+      MapPublicIpOnLaunch: true
+      Tags:
+        - Key: Name
+          Value:
+            Fn::Sub: ${Environment}-${ProjectName}-public-subnet
+        - Key: Environment
+          Value:
+            Ref: Environment
+        - Key: Project
+          Value:
+            Ref: ProjectName
+        - Key: ManagedBy
+          Value: CloudFormation
+
+  RouteTable:
+    Type: AWS::EC2::RouteTable
+    Properties:
+      VpcId:
+        Ref: VPC
+      Tags:
+        - Key: Name
+          Value:
+            Fn::Sub: ${Environment}-${ProjectName}-rt
+        - Key: Environment
+          Value:
+            Ref: Environment
+        - Key: Project
+          Value:
+            Ref: ProjectName
+        - Key: ManagedBy
+          Value: CloudFormation
+
+  # Default route to the internet; must wait for the gateway attachment.
+  Route:
+    Type: AWS::EC2::Route
+    DependsOn: VpcGatewayAttachment
+    Properties:
+      RouteTableId:
+        Ref: RouteTable
+      DestinationCidrBlock: 0.0.0.0/0
+      GatewayId:
+        Ref: InternetGateway
+
+  SubnetRouteTableAssociation:
+    Type: AWS::EC2::SubnetRouteTableAssociation
+    Properties:
+      SubnetId:
+        Ref: PublicSubnet
+      RouteTableId:
+        Ref: RouteTable
+
+  # Security Group
+  # NOTE(review): SSH (22) is open to 0.0.0.0/0, and 80/443 are open
+  # inbound although nothing in this template serves on them. Consider
+  # restricting or removing these rules - confirm with whoever needs
+  # inbound access before changing.
+  BuildSecurityGroup:
+    Type: AWS::EC2::SecurityGroup
+    Properties:
+      GroupName:
+        Fn::Sub: ${Environment}-${ProjectName}-build-sg
+      GroupDescription: Security group for build instances
+      VpcId:
+        Ref: VPC
+      SecurityGroupIngress:
+        - IpProtocol: tcp
+          FromPort: 22
+          ToPort: 22
+          CidrIp: 0.0.0.0/0
+          Description: Allow SSH access
+        - IpProtocol: tcp
+          FromPort: 80
+          ToPort: 80
+          CidrIp: 0.0.0.0/0
+          Description: Allow HTTP access
+        - IpProtocol: tcp
+          FromPort: 443
+          ToPort: 443
+          CidrIp: 0.0.0.0/0
+          Description: Allow HTTPS access
+      SecurityGroupEgress:
+        # IpProtocol is a string property; quote it so it is not read as an integer.
+        - IpProtocol: '-1'
+          CidrIp: 0.0.0.0/0
+          Description: Allow all outbound traffic
+      Tags:
+        - Key: Name
+          Value:
+            Fn::Sub: ${Environment}-${ProjectName}-build-sg
+        - Key: Environment
+          Value:
+            Ref: Environment
+        - Key: Project
+          Value:
+            Ref: ProjectName
+        - Key: ManagedBy
+          Value: CloudFormation
+
+  # IAM Role for EC2 instances
+  RunnerRole:
+    Type: AWS::IAM::Role
+    Properties:
+      RoleName:
+        Fn::Sub:
${Environment}-${ProjectName}-runner-role
+      AssumeRolePolicyDocument:
+        Version: '2012-10-17'
+        Statement:
+          - Effect: Allow
+            Principal:
+              Service: ec2.amazonaws.com
+            Action: sts:AssumeRole
+      ManagedPolicyArns:
+        - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
+      Tags:
+        - Key: Environment
+          Value:
+            Ref: Environment
+        - Key: Project
+          Value:
+            Ref: ProjectName
+        - Key: ManagedBy
+          Value: CloudFormation
+
+  RunnerInstanceProfile:
+    Type: AWS::IAM::InstanceProfile
+    Properties:
+      Path: /
+      Roles:
+        - Ref: RunnerRole
+      InstanceProfileName:
+        Fn::Sub: ${Environment}-${ProjectName}-runner-profile
+
+  # Launch Template for runners
+  RunnerLaunchTemplate:
+    Type: AWS::EC2::LaunchTemplate
+    Properties:
+      LaunchTemplateName:
+        Fn::Sub: ${Environment}-${ProjectName}-runner-template
+      LaunchTemplateData:
+        # NOTE(review): AMI IDs are region-specific, so this value pins the
+        # stack to one region - confirm that is intended.
+        ImageId: ami-0eabc4ddf08279fc3 # Ubuntu 22.04 LTS
+        InstanceType:
+          Ref: InstanceType
+        NetworkInterfaces:
+          - DeviceIndex: 0
+            AssociatePublicIpAddress: true
+            SubnetId:
+              Ref: PublicSubnet
+            Groups:
+              - Ref: BuildSecurityGroup
+        IamInstanceProfile:
+          Name:
+            Ref: RunnerInstanceProfile
+        BlockDeviceMappings:
+          # Root volume
+          - DeviceName: /dev/sda1
+            Ebs:
+              VolumeSize: 100
+              VolumeType: gp3
+              DeleteOnTermination: true
+          # Data volume that the UserData script mounts at /var/lib/docker-cache
+          - DeviceName: /dev/xvdf
+            Ebs:
+              VolumeSize: 1024
+              VolumeType: gp3
+              DeleteOnTermination: true
+              Encrypted: true
+              Iops: 3000
+              Throughput: 125
+        TagSpecifications:
+          - ResourceType: instance
+            Tags:
+              - Key: Name
+                Value:
+                  Fn::Sub: ${Environment}-${ProjectName}-runner
+              - Key: Environment
+                Value:
+                  Ref: Environment
+              - Key: Project
+                Value:
+                  Ref: ProjectName
+              - Key: ManagedBy
+                Value: CloudFormation
+        # The script passes through Fn::Sub: "${Name}" is substituted by
+        # CloudFormation, so shell variables below use the brace-less
+        # "$name" form.
+        UserData:
+          Fn::Base64:
+            Fn::Sub: |
+              #!/bin/bash
+              set -e
+
+              # Wait for apt lock to be released
+              while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do
+                echo "Waiting for other package manager to finish..."
+                sleep 1
+              done
+
+              # Install Docker
+              apt-get update
+              apt-get install -y ca-certificates curl gnupg
+              install -m 0755 -d /etc/apt/keyrings
+              curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
+              chmod a+r /etc/apt/keyrings/docker.gpg
+              echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+              # Wait for apt lock again before updating
+              while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do
+                echo "Waiting for other package manager to finish..."
+                sleep 1
+              done
+
+              apt-get update
+              apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+              # Debug information
+              echo "=== Debug Information ==="
+              lsblk
+              echo "=== Mount Points ==="
+              mount
+              echo "=== FSTAB ==="
+              cat /etc/fstab
+
+              # Wait for the EBS data volume and find its device node.
+              # NVMe device names are not tied to the mapping order: the root
+              # volume is normally /dev/nvme0n1 and an instance-store disk can
+              # take /dev/nvme1n1. Pick the EBS-model disk that is not the
+              # root disk instead of guessing a name.
+              echo "Waiting for EBS volume to be attached..."
+              ROOT_DISK="/dev/$(lsblk -no PKNAME "$(findmnt -no SOURCE /)")"
+              EBS_DEVICE=""
+              while [ -z "$EBS_DEVICE" ]; do
+                for candidate in $(lsblk -dpno NAME,MODEL | awk '/Amazon Elastic Block Store/ {print $1}'); do
+                  if [ "$candidate" != "$ROOT_DISK" ]; then
+                    EBS_DEVICE="$candidate"
+                    break
+                  fi
+                done
+                if [ -z "$EBS_DEVICE" ]; then
+                  echo "Waiting for EBS volume..."
+                  sleep 5
+                fi
+              done
+
+              echo "Found EBS volume at $EBS_DEVICE"
+
+              # Wait for the volume to be ready
+              sleep 10
+
+              # Check if the volume is already formatted
+              if ! blkid "$EBS_DEVICE"; then
+                echo "Formatting EBS volume..."
+                mkfs -t ext4 $EBS_DEVICE
+              fi
+
+              # Mount point for the Docker data volume
+              mkdir -p /var/lib/docker-cache
+
+              # Drop any earlier fstab line for this mount point
+              sed -i '/\/var\/lib\/docker-cache/d' /etc/fstab
+
+              # Register the volume in fstab (nofail: boot continues without it)
+              echo "$EBS_DEVICE /var/lib/docker-cache ext4 defaults,nofail 0 2" >> /etc/fstab
+
+              # Start from an unmounted state; ignore "not mounted"
+              umount /var/lib/docker-cache 2>/dev/null || true
+
+              # Mount everything listed in fstab, including the new entry
+              mount -a
+
+              # Show the result; grep fails the script if the mount is missing
+              echo "=== After Mount ==="
+              mount | grep docker-cache
+              df -h /var/lib/docker-cache
+
+              # Point Docker's data-root at the mounted volume and cap log size
+              mkdir -p /etc/docker
+              cat > /etc/docker/daemon.json << EOF
+              {
+                "data-root": "/var/lib/docker-cache",
+                "storage-driver": "overlay2",
+                "log-driver": "json-file",
+                "log-opts": {
+                  "max-size": "100m",
+                  "max-file": "3"
+                }
+              }
+              EOF
+
+              # Docker must be stopped while its data directory is moved
+              systemctl stop docker
+
+              # Carry over whatever Docker already wrote to the default location
+              if [ -d "/var/lib/docker" ] && [ "$(ls -A /var/lib/docker)" ]; then
+                echo "Moving existing Docker data..."
+                mv /var/lib/docker/* /var/lib/docker-cache/ 2>/dev/null || true
+              fi
+
+              # Start Docker again so it picks up daemon.json
+              systemctl start docker
+
+              # Write the periodic cache cleanup script (quoted heredoc: no
+              # expansion happens while the file is written)
+              cat > /usr/local/bin/cleanup-docker-cache.sh << 'EOF'
+              #!/bin/bash
+
+              # Usage percentage above which cleanup runs
+              THRESHOLD=80
+
+              # Current usage of the cache volume, as a bare number
+              USAGE=$(df -h /var/lib/docker-cache | awk 'NR==2 {print $5}' | sed 's/%//')
+
+              if [ "$USAGE" -gt "$THRESHOLD" ]; then
+                echo "Docker cache usage is at $USAGE%, cleaning up..."
+
+                # Stopped containers
+                docker container prune -f
+
+                # Images not used by any container
+                docker image prune -a -f
+
+                # Unused volumes
+                docker volume prune -f
+
+                # Build cache
+                docker builder prune -f
+
+                echo "Cleanup completed"
+              else
+                echo "Docker cache usage is at $USAGE%, no cleanup needed"
+              fi
+              EOF
+
+              chmod +x /usr/local/bin/cleanup-docker-cache.sh
+
+              # Schedule the cleanup every four hours, keeping existing crontab entries
+              (crontab -l 2>/dev/null; echo "0 */4 * * * /usr/local/bin/cleanup-docker-cache.sh") | crontab -
+
+              # Wait for the apt/dpkg locks before the next install
+              while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do
+                echo "Waiting for other package manager to finish..."
+                sleep 1
+              done
+
+              # jq extracts the registration token from the GitHub API response
+              apt-get install -y jq
+
+              # Download the Actions runner, verify its checksum, unpack it
+              mkdir -p /opt/github-runner
+              cd /opt/github-runner
+              curl -o actions-runner-linux-x64-2.323.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-linux-x64-2.323.0.tar.gz
+              echo "0dbc9bf5a58620fc52cb6cc0448abcca964a8d74b5f39773b7afcad9ab691e19  actions-runner-linux-x64-2.323.0.tar.gz" | shasum -a 256 -c
+              tar xzf ./actions-runner-linux-x64-2.323.0.tar.gz
+
+              # Dedicated service account with access to the Docker socket
+              useradd -m -s /bin/bash github-runner
+              usermod -aG docker github-runner
+
+              # The runner directory belongs to that account
+              chown -R github-runner:github-runner /opt/github-runner
+
+              # systemd unit for the runner. The heredoc is unquoted, so the
+              # shell joins backslash-continued lines and turns \$ into $
+              # while writing the file.
+              cat > /etc/systemd/system/github-runner.service << EOF
+              [Unit]
+              Description=GitHub Actions Runner
+              After=network.target docker.service
+              StartLimitIntervalSec=0
+
+              [Service]
+              Type=simple
+              User=github-runner
+              WorkingDirectory=/opt/github-runner
+              ExecStartPre=/bin/bash -c 'if [ ! -f /opt/github-runner/.runner ] || [ !
-f /opt/github-runner/.credentials ]; then \
+                rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials; \
+                ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \
+                -H "Accept: application/vnd.github+json" \
+                -H "Authorization: Bearer ${GitHubToken}" \
+                -H "X-GitHub-Api-Version: 2022-11-28" \
+                "https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token" | jq -r ".token") \
+                --labels self-hosted,gpu --unattended --ephemeral --replace || exit 1; fi'
+              ExecStart=/opt/github-runner/run.sh
+              ExecStopPost=/bin/bash -c 'rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials'
+              Restart=always
+              RestartSec=10
+              StartLimitBurst=10
+              TimeoutStartSec=300
+              TimeoutStopSec=300
+              KillMode=process
+              KillSignal=SIGTERM
+              Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+              Environment="DOCKER_HOST=unix:///var/run/docker.sock"
+              Environment="ACTIONS_RUNNER_HOOK_JOB_STARTED=/usr/local/bin/runner-hook-job_started.sh"
+              Environment="ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/local/bin/runner-hook-job_completed.sh"
+              Environment="RUNNER_ALLOW_RUNASROOT=0"
+
+              [Install]
+              WantedBy=multi-user.target
+              EOF
+
+              # NOTE(review): the unit written above embeds the GitHub token
+              # in plain text (and the token is also part of this UserData).
+              # Consider fetching it at start-up from a secret store instead.
+
+              # The hook variables in the unit point at two per-event wrappers.
+              # runner-hook.sh (written below) decides what to do from its
+              # first argument, and the hook variables name only a script path
+              # with no argument, so each wrapper supplies the event name.
+              for event in job_started job_completed; do
+                printf '#!/bin/bash\nexec /usr/local/bin/runner-hook.sh %s\n' "$event" > "/usr/local/bin/runner-hook-$event.sh"
+                chmod +x "/usr/local/bin/runner-hook-$event.sh"
+              done
+
+              # Create runner hook script to handle job lifecycle
+              cat > /usr/local/bin/runner-hook.sh << 'EOF'
+              #!/bin/bash
+              set -e
+
+              # Log the event with timestamp and more details
+              log() {
+                echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner.log
+              }
+
+              log "Runner hook called with event: $1"
+              log "Current Docker status: $(systemctl status docker | grep Active)"
+
+              case "$1" in
+                "job_started")
+                  # Verify Docker is running and healthy
+                  if ! docker info >/dev/null 2>&1; then
+                    log "Docker appears to be unhealthy, attempting restart"
+                    systemctl restart docker
+                    sleep 10
+                    if !
docker info >/dev/null 2>&1; then
+                      log "Docker failed to recover after restart"
+                      exit 1
+                    fi
+                    log "Docker successfully restarted"
+                  fi
+                  # Start each job with unused Docker resources removed
+                  docker system prune -f
+                  log "System pruned before job start"
+                  ;;
+                "job_completed")
+                  # Remove whatever the finished job left behind
+                  docker system prune -f
+                  log "System pruned after job completion"
+                  ;;
+              esac
+              EOF
+
+              chmod +x /usr/local/bin/runner-hook.sh
+
+              # Health-check script: restarts Docker if it is unhealthy and
+              # resets the runner when Docker or the runner stays broken
+              cat > /usr/local/bin/reconfigure-runner.sh << 'EOF'
+              #!/bin/bash
+              set -e
+
+              # Append a timestamped line to the health log
+              log() {
+                echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner-health.log
+              }
+
+              # Succeeds when systemd reports the runner unit as active
+              check_runner_service() {
+                if ! systemctl is-active github-runner >/dev/null 2>&1; then
+                  log "Runner service is not active"
+                  return 1
+                fi
+                return 0
+              }
+
+              # Succeeds when a process matching run.sh exists
+              check_runner_process() {
+                if ! pgrep -f "run.sh" >/dev/null; then
+                  log "Runner process is not running"
+                  return 1
+                fi
+                return 0
+              }
+
+              # Succeeds when the Docker daemon answers "docker info"
+              check_docker_health() {
+                if ! docker info >/dev/null 2>&1; then
+                  log "Docker is not healthy"
+                  return 1
+                fi
+                return 0
+              }
+
+              # Health check proper
+              log "Starting health check"
+
+              NEEDS_RESTART=0
+
+              # Docker first: one restart attempt before asking for a full reset
+              if ! check_docker_health; then
+                log "Attempting to restart Docker"
+                systemctl restart docker
+                sleep 10
+                if ! check_docker_health; then
+                  log "Docker failed to recover"
+                  NEEDS_RESTART=1
+                fi
+              fi
+
+              # Then the runner: unit state and process presence
+              if ! check_runner_service || !
check_runner_process; then
+                NEEDS_RESTART=1
+              fi
+
+              if [ $NEEDS_RESTART -eq 1 ]; then
+                log "Issues detected, performing full runner reset"
+
+                # Stop services
+                systemctl stop github-runner
+
+                # Clean up runner files
+                rm -rf /opt/github-runner/.runner
+                rm -rf /opt/github-runner/.credentials
+                rm -rf /opt/github-runner/.env
+
+                # Clean up Docker
+                docker system prune -af --volumes
+
+                # Restart Docker
+                systemctl restart docker
+                sleep 10
+
+                # Start runner service
+                systemctl start github-runner
+
+                log "Runner reset completed"
+              else
+                log "Health check passed"
+              fi
+              EOF
+
+              chmod +x /usr/local/bin/reconfigure-runner.sh
+
+              # Service unit that the timer below activates. Without it the
+              # timer cannot be started ("Unit ... not found") and the health
+              # check never runs. No User= is set, so it runs as root, which
+              # the script's systemctl calls need.
+              cat > /etc/systemd/system/github-runner-healthcheck.service << EOF
+              [Unit]
+              Description=GitHub Runner Health Check
+              After=docker.service
+
+              [Service]
+              Type=oneshot
+              ExecStart=/usr/local/bin/reconfigure-runner.sh
+              EOF
+
+              # Create a more frequent health check timer
+              cat > /etc/systemd/system/github-runner-healthcheck.timer << EOF
+              [Unit]
+              Description=Run GitHub Runner Health Check frequently
+
+              [Timer]
+              OnBootSec=1min
+              OnUnitActiveSec=5min
+              RandomizedDelaySec=30
+              Unit=github-runner-healthcheck.service
+
+              [Install]
+              WantedBy=multi-user.target
+              EOF
+
+              # Create log rotation for runner logs
+              cat > /etc/logrotate.d/github-runner << EOF
+              /var/log/github-runner*.log {
+                daily
+                rotate 7
+                compress
+                delaycompress
+                missingok
+                notifempty
+                create 0644 github-runner github-runner
+              }
+              EOF
+
+              # Create runner logs with proper permissions
+              touch /var/log/github-runner.log /var/log/github-runner-health.log
+              chown github-runner:github-runner /var/log/github-runner.log /var/log/github-runner-health.log
+              chmod 644 /var/log/github-runner.log /var/log/github-runner-health.log
+
+              # Enable and start the services
+              systemctl daemon-reload
+              systemctl enable github-runner
+              systemctl enable github-runner-healthcheck.timer
+              systemctl start github-runner
+              systemctl start github-runner-healthcheck.timer
+
+  # Auto Scaling Group for runners
+  # (The full name of the reference intrinsic is "Ref", not "Fn::Ref".)
+  RunnerAutoScalingGroup:
+    Type: AWS::AutoScaling::AutoScalingGroup
+    Properties:
+      AutoScalingGroupName:
+        Fn::Sub: ${Environment}-${ProjectName}-runners
+      LaunchTemplate:
+        LaunchTemplateId:
+          Ref:
RunnerLaunchTemplate
+        Version:
+          Fn::GetAtt:
+            - RunnerLaunchTemplate
+            - LatestVersionNumber
+      # Fixed-size group: min, max and desired all follow RunnerCount.
+      # "Ref" is the full name of the reference intrinsic; "Fn::Ref" is not
+      # a CloudFormation function.
+      MinSize:
+        Ref: RunnerCount
+      MaxSize:
+        Ref: RunnerCount
+      DesiredCapacity:
+        Ref: RunnerCount
+      VPCZoneIdentifier:
+        - Ref: PublicSubnet
+      Tags:
+        - Key: Name
+          Value:
+            Fn::Sub: ${Environment}-${ProjectName}-runner
+          PropagateAtLaunch: true
+        - Key: Environment
+          Value:
+            Ref: Environment
+          PropagateAtLaunch: true
+        - Key: Project
+          Value:
+            Ref: ProjectName
+          PropagateAtLaunch: true
+        - Key: ManagedBy
+          Value: CloudFormation
+          PropagateAtLaunch: true
+
+Outputs:
+  VpcId:
+    Description: ID of the VPC
+    Value:
+      Ref: VPC
+
+  PublicSubnetId:
+    Description: ID of the public subnet
+    Value:
+      Ref: PublicSubnet
+
+  SecurityGroupId:
+    Description: ID of the build security group
+    Value:
+      Ref: BuildSecurityGroup
+
+  RunnerRoleArn:
+    Description: ARN of the runner IAM role
+    Value:
+      Fn::GetAtt:
+        - RunnerRole
+        - Arn
+
+  RunnerAutoScalingGroupName:
+    Description: Name of the runner Auto Scaling Group
+    Value:
+      Ref: RunnerAutoScalingGroup
diff --git a/pyproject.toml b/pyproject.toml
index 2017c11..ccff549 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,10 +22,10 @@ name = "swagger"
 version = "1.0.0"
 description = "A library for generating waypoint graphs from occupancy grid maps"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.10,<4.0"
 classifiers = [
     "Development Status :: 3 - Alpha",
-    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.12",
 ]
 dependencies = [
     "colorlog",
@@ -37,7 +37,7 @@ dependencies = [
     "networkx>=2.6.0",
     "numba",
     "numpy>=1.21.0,<2.0",
-    "numpydantic",
+    "numpydantic>=1.6.9",
     "opencv-python>=4.5.0",
     "psutil",
     "pydantic==2.10.6",