From a968734308b6650ab4ac51b139cb14f6d3621898 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 12 May 2025 17:36:50 -0700 Subject: [PATCH 01/72] Adding CI --- .github/workflows/ci.yml | 160 +++++++ .../cloudformation/github-runners-stack.yml | 397 ++++++++++++++++++ 2 files changed, 557 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 infrastructure/cloudformation/github-runners-stack.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..d2550e2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,160 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: CI + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main, develop ] + +env: + DOCKER_HOST: tcp://localhost:2375 + DOCKER_TLS_CERTDIR: "" + +jobs: + pre-commit: + runs-on: [self-hosted, gpu] + steps: + - uses: actions/checkout@v4 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends git + pip install pre-commit + + - name: Run pre-commit + run: pre-commit run --all-files + + unit-tests: + needs: pre-commit + runs-on: [self-hosted, gpu] + steps: + - uses: actions/checkout@v4 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 + pip install --upgrade pip + pip install -e . + pip install pytest pytest-xdist junitparser + + - name: Run unit tests + run: | + python3 -m pytest --junitxml=test-results/unit-tests.xml --cov=. --cov-report=xml:test-results/coverage.xml + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: unit-test-results + path: test-results/ + retention-days: 7 + + e2e-tests: + needs: unit-tests + runs-on: [self-hosted, gpu] + steps: + - uses: actions/checkout@v4 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 + pip install --upgrade pip + pip install -e . + pip install pytest pytest-xdist junitparser + + - name: Run E2E tests + run: | + mkdir -p test-results + python3 scripts/generate_graph.py --map-path maps/carter_warehouse_navigation.png --graph-eval.active --perf-eval.active + python3 scripts/evaluate_graph.py --map-path maps/carter_warehouse_navigation.png --output-dir results --resolution 0.05 --safety-distance 0.3 --occupancy-threshold 127 + # Convert test results to JUnit XML format + python3 -c " +import junitparser +import os +import json + +suite = junitparser.TestSuite('E2E Tests') +test = junitparser.TestCase('Graph Generation and Evaluation') +test.result = junitparser.junitparser.TestResult() +test.result.message = 'E2E test completed successfully' +suite.add_testcase(test) + +with open('test-results/e2e-tests.xml', 'w') as f: + junitparser.JUnitXml().add_testsuite(suite).write(f) + " + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: e2e-test-results + path: test-results/ + retention-days: 7 + + - name: Cleanup + if: always() + run: rm -rf results graphs test-results + + docker-build: + needs: e2e-tests + runs-on: [self-hosted, gpu] + container: + image: nvidia/dind:cuda-12.6.0 + options: --privileged + services: + - name: docker + image: nvidia/dind:cuda-12.6.0 + options: --privileged + steps: + - uses: actions/checkout@v4 + + - name: Install dependencies + run: | + apt-get update + apt-get install -y python3-full ffmpeg + + - name: Wait for Docker daemon + run: | + for i in $(seq 1 10); do + if docker info; then + break + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + sleep 10 + done + + - name: Build and test Docker + run: | + cd docker + docker compose build --no-cache + docker compose up rest-api & + sleep 10 + cd ../ + python3 -m venv venv + source venv/bin/activate + pip install -e . + docker ps + python scripts/test_api_client.py --map_path maps/carter_warehouse_navigation.png --host docker + + - name: Cleanup + if: always() + run: | + cd docker + docker compose down \ No newline at end of file diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml new file mode 100644 index 0000000..0e89074 --- /dev/null +++ b/infrastructure/cloudformation/github-runners-stack.yml @@ -0,0 +1,397 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +AWSTemplateFormatVersion: '2010-09-09' +Description: 'CloudFormation template for CI/CD infrastructure' + +Parameters: + Environment: + Type: String + Default: production + AllowedValues: + - production + - staging + Description: Environment name for resource naming + + ProjectName: + Type: String + Default: projectName + Description: Name of the project (used in resource naming) + + GitHubRepoPath: + Type: String + Description: Full GitHub repository path (e.g., 'microsoft/vscode' or 'octocat/Hello-World') + + VpcCidr: + Type: String + Default: 10.0.0.0/16 + Description: CIDR block for the VPC + + PublicSubnetCidr: + Type: String + Default: 10.0.1.0/24 + Description: CIDR block for the public subnet + + GitHubToken: + Type: String + Description: GitHub personal access token with 'repo' and 'workflow' scopes. If adding to an organization, also requires 'admin:org' scope. + NoEcho: true + + RunnerCount: + Type: Number + Description: Number of GitHub Actions runners to create + Default: 1 + MinValue: 1 + MaxValue: 10 + + InstanceType: + Type: String + Description: EC2 instance type for runners + Default: g4dn.xlarge + AllowedValues: + - g4dn.xlarge + - g4dn.2xlarge + - g4dn.4xlarge + - g5.xlargexr bv + - g5.2xlarge + - g5.4xlarge + + LogRetentionDays: + Type: Number + Description: Number of days to retain logs + Default: 90 + MinValue: 1 + MaxValue: 3653 + +Conditions: + IsProduction: !Equals [!Ref Environment, 'production'] + +Resources: + # VPC Resources + VPC: + Type: AWS::EC2::VPC + Properties: + CidrBlock: !Ref VpcCidr + EnableDnsHostnames: true + EnableDnsSupport: true + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-vpc + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + + InternetGateway: + Type: AWS::EC2::InternetGateway + Properties: + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-igw + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + + VpcGatewayAttachment: + Type: AWS::EC2::VPCGatewayAttachment + Properties: + VpcId: !Ref VPC + InternetGatewayId: !Ref InternetGateway + + PublicSubnet: + Type: AWS::EC2::Subnet + Properties: + VpcId: !Ref VPC + CidrBlock: !Ref PublicSubnetCidr + AvailabilityZone: !Select [0, !GetAZs ''] + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-public-subnet + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + + RouteTable: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref VPC + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-rt + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + + Route: + Type: AWS::EC2::Route + DependsOn: VpcGatewayAttachment + Properties: + RouteTableId: !Ref RouteTable + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: !Ref InternetGateway + + SubnetRouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref PublicSubnet + RouteTableId: !Ref RouteTable + + # Security Group + BuildSecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupName: !Sub ${Environment}-${ProjectName}-build-sg + GroupDescription: Security group for build instances + VpcId: !Ref VPC + SecurityGroupIngress: + - IpProtocol: tcp + FromPort: 22 + ToPort: 22 + CidrIp: 0.0.0.0/0 + Description: Allow SSH access + - IpProtocol: tcp + FromPort: 80 + ToPort: 80 + CidrIp: 0.0.0.0/0 + Description: Allow HTTP access + - IpProtocol: tcp + FromPort: 443 + ToPort: 443 + CidrIp: 0.0.0.0/0 + Description: Allow HTTPS access + SecurityGroupEgress: + - IpProtocol: -1 + CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-build-sg + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + + # IAM Role for EC2 instances + RunnerRole: + Type: AWS::IAM::Role + Properties: + RoleName: !Sub ${Environment}-${ProjectName}-runner-role + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: ec2.amazonaws.com + Action: sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly + Tags: + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + + RunnerInstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Path: / + Roles: + - !Ref RunnerRole + InstanceProfileName: !Sub ${Environment}-${ProjectName}-runner-profile + + # Launch Template for runners + RunnerLaunchTemplate: + Type: AWS::EC2::LaunchTemplate + Properties: + LaunchTemplateName: !Sub ${Environment}-${ProjectName}-runner-template + LaunchTemplateData: + ImageId: ami-0eabc4ddf08279fc3 # Ubuntu 22.04 LTS + InstanceType: !Ref InstanceType + NetworkInterfaces: + - DeviceIndex: 0 + AssociatePublicIpAddress: true + SubnetId: !Ref PublicSubnet + Groups: + - !Ref BuildSecurityGroup + IamInstanceProfile: + Name: !Ref RunnerInstanceProfile + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 100 + VolumeType: gp3 + DeleteOnTermination: true + - DeviceName: /dev/sdf + Ebs: + VolumeSize: 1024 + VolumeType: gp3 + DeleteOnTermination: true + TagSpecifications: + - ResourceType: instance + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-runner + - Key: Environment + Value: !Ref Environment + - Key: Project + Value: !Ref ProjectName + - Key: ManagedBy + Value: CloudFormation + UserData: + Fn::Base64: !Sub | + #!/bin/bash + set -e + + # Install Docker + apt-get update + apt-get install -y ca-certificates curl gnupg + install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + chmod a+r /etc/apt/keyrings/docker.gpg + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + apt-get update + apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + + # Format and mount EBS volume for Docker cache + mkfs -t ext4 /dev/sdf + mkdir -p /var/lib/docker-cache + mount /dev/sdf /var/lib/docker-cache + + # Configure Docker to use the mounted volume for cache + mkdir -p /etc/docker + cat > /etc/docker/daemon.json << EOF + { + "data-root": "/var/lib/docker-cache", + "storage-driver": "overlay2", + "log-driver": "json-file", + "log-opts": { + "max-size": "100m", + "max-file": "3" + } + } + EOF + + # Restart Docker to apply new configuration + systemctl restart docker + + # Create Docker cache cleanup script + cat > /usr/local/bin/cleanup-docker-cache.sh << 'EOF' + #!/bin/bash + + # Set threshold (in percentage) for cleanup + THRESHOLD=80 + + # Get current disk usage + USAGE=$(df -h /var/lib/docker-cache | awk 'NR==2 {print $5}' | sed 's/%//') + + if [ "$USAGE" -gt "$THRESHOLD" ]; then + echo "Docker cache usage is at $USAGE%, cleaning up..." + + # Remove unused containers + docker container prune -f + + # Remove unused images + docker image prune -a -f + + # Remove unused volumes + docker volume prune -f + + # Remove build cache + docker builder prune -f + + echo "Cleanup completed" + else + echo "Docker cache usage is at $USAGE%, no cleanup needed" + fi + EOF + + chmod +x /usr/local/bin/cleanup-docker-cache.sh + + # Add cleanup script to crontab + (crontab -l 2>/dev/null; echo "0 */4 * * * /usr/local/bin/cleanup-docker-cache.sh") | crontab - + + # Install GitHub Actions runner + mkdir -p /opt/actions-runner + cd /opt/actions-runner + curl -o actions-runner-linux-x64.tar.gz -L https://github.com/actions/runner/releases/download/v3.1.0/actions-runner-linux-x64-3.1.0.tar.gz + tar xzf ./actions-runner-linux-x64.tar.gz + ./bin/installdependencies.sh + + # Configure and start the runner + ./config.sh --url https://github.com/${GitHubRepoPath} --token ${GitHubToken} --labels self-hosted,gpu --unattended + ./run.sh + + # Auto Scaling Group for runners + RunnerAutoScalingGroup: + Type: AWS::AutoScaling::AutoScalingGroup + Properties: + AutoScalingGroupName: !Sub ${Environment}-${ProjectName}-runners + LaunchTemplate: + LaunchTemplateId: !Ref RunnerLaunchTemplate + Version: !GetAtt RunnerLaunchTemplate.LatestVersionNumber + MinSize: !Ref RunnerCount + MaxSize: !Ref RunnerCount + DesiredCapacity: !Ref RunnerCount + VPCZoneIdentifier: + - !Ref PublicSubnet + Tags: + - Key: Name + Value: !Sub ${Environment}-${ProjectName}-runner + PropagateAtLaunch: true + - Key: Environment + Value: !Ref Environment + PropagateAtLaunch: true + - Key: Project + Value: !Ref ProjectName + PropagateAtLaunch: true + - Key: ManagedBy + Value: CloudFormation + PropagateAtLaunch: true + +Outputs: + VpcId: + Description: ID of the VPC + Value: !Ref VPC + + PublicSubnetId: + Description: ID of the public subnet + Value: !Ref PublicSubnet + + SecurityGroupId: + Description: ID of the build security group + Value: !Ref BuildSecurityGroup + + RunnerRoleArn: + Description: ARN of the runner IAM role + Value: !GetAtt RunnerRole.Arn + + RunnerAutoScalingGroupName: + Description: Name of the runner Auto Scaling Group + Value: !Ref RunnerAutoScalingGroup \ No newline at end of file From eb53fe3d04780410f52293c1c7182773e8b9a97e Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 12 May 2025 18:18:05 -0700 Subject: [PATCH 02/72] Running CI steps in docker containers --- .github/workflows/ci.yml | 93 +++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d2550e2..58748db 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,13 +27,30 @@ env: jobs: pre-commit: runs-on: [self-hosted, gpu] + container: + image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 + options: --privileged + services: + - name: docker + image: nvidia/dind:cuda-12.6.0 + options: --privileged steps: - uses: actions/checkout@v4 + - name: Wait for Docker daemon + run: | + for i in $(seq 1 10); do + if docker info; then + break + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + sleep 10 + done + - name: Install dependencies run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git + apt-get update + apt-get install -y --no-install-recommends git python3-pip pip install pre-commit - name: Run pre-commit @@ -42,13 +59,30 @@ jobs: unit-tests: needs: pre-commit runs-on: [self-hosted, gpu] + container: + image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 + options: --privileged + services: + - name: docker + image: nvidia/dind:cuda-12.6.0 + options: --privileged steps: - uses: actions/checkout@v4 + - name: Wait for Docker daemon + run: | + for i in $(seq 1 10); do + if docker info; then + break + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + sleep 10 + done + - name: Install dependencies run: | - sudo apt-get update - sudo apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 + apt-get update + apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 pip install --upgrade pip pip install -e . pip install pytest pytest-xdist junitparser @@ -68,37 +102,38 @@ jobs: e2e-tests: needs: unit-tests runs-on: [self-hosted, gpu] + container: + image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 + options: --privileged + services: + - name: docker + image: nvidia/dind:cuda-12.6.0 + options: --privileged steps: - uses: actions/checkout@v4 + - name: Wait for Docker daemon + run: | + for i in $(seq 1 10); do + if docker info; then + break + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + sleep 10 + done + - name: Install dependencies run: | - sudo apt-get update - sudo apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 + apt-get update + apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 pip install --upgrade pip pip install -e . - pip install pytest pytest-xdist junitparser + pip install pytest pytest-xdist - name: Run E2E tests run: | - mkdir -p test-results python3 scripts/generate_graph.py --map-path maps/carter_warehouse_navigation.png --graph-eval.active --perf-eval.active python3 scripts/evaluate_graph.py --map-path maps/carter_warehouse_navigation.png --output-dir results --resolution 0.05 --safety-distance 0.3 --occupancy-threshold 127 - # Convert test results to JUnit XML format - python3 -c " -import junitparser -import os -import json - -suite = junitparser.TestSuite('E2E Tests') -test = junitparser.TestCase('Graph Generation and Evaluation') -test.result = junitparser.junitparser.TestResult() -test.result.message = 'E2E test completed successfully' -suite.add_testcase(test) - -with open('test-results/e2e-tests.xml', 'w') as f: - junitparser.JUnitXml().add_testsuite(suite).write(f) - " - name: Upload test results uses: actions/upload-artifact@v4 @@ -116,7 +151,7 @@ with open('test-results/e2e-tests.xml', 'w') as f: needs: e2e-tests runs-on: [self-hosted, gpu] container: - image: nvidia/dind:cuda-12.6.0 + image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged services: - name: docker @@ -125,11 +160,6 @@ with open('test-results/e2e-tests.xml', 'w') as f: steps: - uses: actions/checkout@v4 - - name: Install dependencies - run: | - apt-get update - apt-get install -y python3-full ffmpeg - - name: Wait for Docker daemon run: | for i in $(seq 1 10); do @@ -139,6 +169,11 @@ with open('test-results/e2e-tests.xml', 'w') as f: echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." sleep 10 done + + - name: Install dependencies + run: | + apt-get update + apt-get install -y python3-full ffmpeg - name: Build and test Docker run: | From f8b5bfe3f0c461a7737bce576398ca01a93a2f0d Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 15:32:50 -0700 Subject: [PATCH 03/72] Updating infra setup --- .github/workflows/ci.yml | 8 +- .../cloudformation/github-runners-stack.yml | 148 ++++++++++++++++-- 2 files changed, 136 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58748db..2f86a68 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ env: jobs: pre-commit: - runs-on: [self-hosted, gpu] + runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged @@ -58,7 +58,7 @@ jobs: unit-tests: needs: pre-commit - runs-on: [self-hosted, gpu] + runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged @@ -101,7 +101,7 @@ jobs: e2e-tests: needs: unit-tests - runs-on: [self-hosted, gpu] + runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged @@ -149,7 +149,7 @@ jobs: docker-build: needs: e2e-tests - runs-on: [self-hosted, gpu] + runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml index 0e89074..4056951 100644 --- a/infrastructure/cloudformation/github-runners-stack.yml +++ b/infrastructure/cloudformation/github-runners-stack.yml @@ -31,7 +31,8 @@ Parameters: GitHubRepoPath: Type: String - Description: Full GitHub repository path (e.g., 'microsoft/vscode' or 'octocat/Hello-World') + Description: Full GitHub repository path (e.g., 'microsoft/vscode' or 'octocat/Hello-World'). For organization runners, use the organization name. + AllowedPattern: ^[a-zA-Z0-9-]+/[a-zA-Z0-9-]+$|^[a-zA-Z0-9-]+$ VpcCidr: Type: String @@ -45,7 +46,7 @@ Parameters: GitHubToken: Type: String - Description: GitHub personal access token with 'repo' and 'workflow' scopes. If adding to an organization, also requires 'admin:org' scope. + Description: GitHub personal access token with 'repo' and 'workflow' scopes. For organization runners, requires 'admin:org' scope. NoEcho: true RunnerCount: @@ -248,11 +249,14 @@ Resources: VolumeSize: 100 VolumeType: gp3 DeleteOnTermination: true - - DeviceName: /dev/sdf + - DeviceName: /dev/xvdf Ebs: VolumeSize: 1024 VolumeType: gp3 DeleteOnTermination: true + Encrypted: true + Iops: 3000 + Throughput: 125 TagSpecifications: - ResourceType: instance Tags: @@ -269,6 +273,12 @@ Resources: #!/bin/bash set -e + # Wait for apt lock to be released + while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "Waiting for other package manager to finish..." + sleep 1 + done + # Install Docker apt-get update apt-get install -y ca-certificates curl gnupg @@ -276,13 +286,69 @@ Resources: curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Wait for apt lock again before updating + while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "Waiting for other package manager to finish..." + sleep 1 + done + apt-get update apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Format and mount EBS volume for Docker cache - mkfs -t ext4 /dev/sdf + # Debug information + echo "=== Debug Information ===" + lsblk + echo "=== Mount Points ===" + mount + echo "=== FSTAB ===" + cat /etc/fstab + + # Wait for EBS volume to be available and find the correct device + echo "Waiting for EBS volume to be attached..." + while true; do + # Check for NVMe devices + if [ -e /dev/nvme1n1 ]; then + EBS_DEVICE="/dev/nvme1n1" + break + elif [ -e /dev/nvme0n1 ]; then + EBS_DEVICE="/dev/nvme0n1" + break + fi + echo "Waiting for EBS volume..." + sleep 5 + done + + echo "Found EBS volume at $EBS_DEVICE" + + # Wait for the volume to be ready + sleep 10 + + # Check if the volume is already formatted + if ! blkid $EBS_DEVICE; then + echo "Formatting EBS volume..." + mkfs -t ext4 $EBS_DEVICE + fi + + # Create mount point and add to fstab mkdir -p /var/lib/docker-cache - mount /dev/sdf /var/lib/docker-cache + + # Remove any existing mount entry for this device + sed -i '/\/var\/lib\/docker-cache/d' /etc/fstab + + # Add new mount entry + echo "$EBS_DEVICE /var/lib/docker-cache ext4 defaults,nofail 0 2" >> /etc/fstab + + # Unmount if already mounted + umount /var/lib/docker-cache 2>/dev/null || true + + # Mount the volume + mount -a + + # Verify mount + echo "=== After Mount ===" + mount | grep docker-cache + df -h /var/lib/docker-cache # Configure Docker to use the mounted volume for cache mkdir -p /etc/docker @@ -298,8 +364,17 @@ Resources: } EOF + # Stop Docker before moving data + systemctl stop docker + + # Move existing Docker data if it exists + if [ -d "/var/lib/docker" ] && [ "$(ls -A /var/lib/docker)" ]; then + echo "Moving existing Docker data..." + mv /var/lib/docker/* /var/lib/docker-cache/ 2>/dev/null || true + fi + # Restart Docker to apply new configuration - systemctl restart docker + systemctl start docker # Create Docker cache cleanup script cat > /usr/local/bin/cleanup-docker-cache.sh << 'EOF' @@ -337,16 +412,57 @@ Resources: # Add cleanup script to crontab (crontab -l 2>/dev/null; echo "0 */4 * * * /usr/local/bin/cleanup-docker-cache.sh") | crontab - + # Wait for apt lock before installing jq + while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "Waiting for other package manager to finish..." + sleep 1 + done + + # Install jq for JSON parsing + apt-get install -y jq + # Install GitHub Actions runner - mkdir -p /opt/actions-runner - cd /opt/actions-runner - curl -o actions-runner-linux-x64.tar.gz -L https://github.com/actions/runner/releases/download/v3.1.0/actions-runner-linux-x64-3.1.0.tar.gz - tar xzf ./actions-runner-linux-x64.tar.gz - ./bin/installdependencies.sh - - # Configure and start the runner - ./config.sh --url https://github.com/${GitHubRepoPath} --token ${GitHubToken} --labels self-hosted,gpu --unattended - ./run.sh + mkdir -p /opt/github-runner + cd /opt/github-runner + curl -o actions-runner-linux-x64-2.323.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-linux-x64-2.323.0.tar.gz + echo "0dbc9bf5a58620fc52cb6cc0448abcca964a8d74b5f39773b7afcad9ab691e19 actions-runner-linux-x64-2.323.0.tar.gz" | shasum -a 256 -c + tar xzf ./actions-runner-linux-x64-2.323.0.tar.gz + + # Create runner user + useradd -m -s /bin/bash github-runner + + # Set ownership of runner directory + chown -R github-runner:github-runner /opt/github-runner + + # Create systemd service for GitHub runner + cat > /etc/systemd/system/github-runner.service << EOF + [Unit] + Description=GitHub Actions Runner + After=network.target docker.service + + [Service] + Type=simple + User=github-runner + WorkingDirectory=/opt/github-runner + ExecStart=/opt/github-runner/run.sh + Restart=always + RestartSec=10 + + [Install] + WantedBy=multi-user.target + EOF + + # Configure the runner as non-root user using the registration token + su - github-runner -c "cd /opt/github-runner && ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \ + -H \"Accept: application/vnd.github+json\" \ + -H \"Authorization: Bearer ${GitHubToken}\" \ + -H \"X-GitHub-Api-Version: 2022-11-28\" \ + \"https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token\" | jq -r '.token') --labels self-hosted,gpu --unattended --ephemeral" + + # Enable and start the service + systemctl daemon-reload + systemctl enable github-runner + systemctl start github-runner # Auto Scaling Group for runners RunnerAutoScalingGroup: From 2ad5e7366c051cf89b14a11a60184d2554a06687 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 15:35:22 -0700 Subject: [PATCH 04/72] Updating github workflow --- .github/workflows/ci.yml | 119 ++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 28 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2f86a68..ec9ce6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,73 +23,104 @@ on: env: DOCKER_HOST: tcp://localhost:2375 DOCKER_TLS_CERTDIR: "" + PYTHONPATH: ${{ github.workspace }} + CUDA_VISIBLE_DEVICES: all jobs: pre-commit: + name: Pre-commit Checks runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged + options: --privileged --gpus all services: - name: docker image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for pre-commit - name: Wait for Docker daemon run: | for i in $(seq 1 10); do - if docker info; then + if docker info >/dev/null 2>&1; then + echo "Docker daemon is ready" break fi echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." sleep 10 done + if [ $i -eq 10 ]; then + echo "Docker daemon failed to start" + exit 1 + fi - name: Install dependencies run: | apt-get update - apt-get install -y --no-install-recommends git python3-pip - pip install pre-commit + apt-get install -y --no-install-recommends \ + git \ + python3-pip \ + python3-venv + python3 -m pip install --upgrade pip + python3 -m pip install pre-commit - name: Run pre-commit run: pre-commit run --all-files unit-tests: + name: Unit Tests needs: pre-commit runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged + options: --privileged --gpus all services: - name: docker image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - name: Wait for Docker daemon run: | for i in $(seq 1 10); do - if docker info; then + if docker info >/dev/null 2>&1; then + echo "Docker daemon is ready" break fi echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." sleep 10 done + if [ $i -eq 10 ]; then + echo "Docker daemon failed to start" + exit 1 + fi - name: Install dependencies run: | apt-get update - apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 - pip install --upgrade pip - pip install -e . - pip install pytest pytest-xdist junitparser + apt-get install -y --no-install-recommends \ + python3-pip \ + python3-venv \ + libgl1-mesa-glx \ + libglib2.0-0 + python3 -m pip install --upgrade pip + python3 -m pip install -e . + python3 -m pip install pytest pytest-xdist pytest-cov junitparser - name: Run unit tests run: | - python3 -m pytest --junitxml=test-results/unit-tests.xml --cov=. --cov-report=xml:test-results/coverage.xml + mkdir -p test-results + python3 -m pytest \ + --junitxml=test-results/unit-tests.xml \ + --cov=. \ + --cov-report=xml:test-results/coverage.xml \ + --cov-report=term-missing - name: Upload test results uses: actions/upload-artifact@v4 @@ -100,40 +131,60 @@ jobs: retention-days: 7 e2e-tests: + name: End-to-End Tests needs: unit-tests runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged + options: --privileged --gpus all services: - name: docker image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - name: Wait for Docker daemon run: | for i in $(seq 1 10); do - if docker info; then + if docker info >/dev/null 2>&1; then + echo "Docker daemon is ready" break fi echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." sleep 10 done + if [ $i -eq 10 ]; then + echo "Docker daemon failed to start" + exit 1 + fi - name: Install dependencies run: | apt-get update - apt-get install -y python3-pip libgl1-mesa-glx libglib2.0-0 - pip install --upgrade pip - pip install -e . - pip install pytest pytest-xdist + apt-get install -y --no-install-recommends \ + python3-pip \ + python3-venv \ + libgl1-mesa-glx \ + libglib2.0-0 + python3 -m pip install --upgrade pip + python3 -m pip install -e . + python3 -m pip install pytest pytest-xdist - name: Run E2E tests run: | - python3 scripts/generate_graph.py --map-path maps/carter_warehouse_navigation.png --graph-eval.active --perf-eval.active - python3 scripts/evaluate_graph.py --map-path maps/carter_warehouse_navigation.png --output-dir results --resolution 0.05 --safety-distance 0.3 --occupancy-threshold 127 + mkdir -p test-results + python3 scripts/generate_graph.py \ + --map-path maps/carter_warehouse_navigation.png \ + --graph-eval.active \ + --perf-eval.active + python3 scripts/evaluate_graph.py \ + --map-path maps/carter_warehouse_navigation.png \ + --output-dir results \ + --resolution 0.05 \ + --safety-distance 0.3 \ + --occupancy-threshold 127 - name: Upload test results uses: actions/upload-artifact@v4 @@ -148,45 +199,57 @@ jobs: run: rm -rf results graphs test-results docker-build: + name: Docker Build and Test needs: e2e-tests runs-on: self-hosted container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged + options: --privileged --gpus all services: - name: docker image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - name: Wait for Docker daemon run: | for i in $(seq 1 10); do - if docker info; then + if docker info >/dev/null 2>&1; then + echo "Docker daemon is ready" break fi echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." sleep 10 done + if [ $i -eq 10 ]; then + echo "Docker daemon failed to start" + exit 1 + fi - name: Install dependencies run: | apt-get update - apt-get install -y python3-full ffmpeg + apt-get install -y --no-install-recommends \ + python3-full \ + python3-venv \ + ffmpeg - name: Build and test Docker run: | cd docker docker compose build --no-cache - docker compose up rest-api & + docker compose up -d rest-api sleep 10 cd ../ python3 -m venv venv source venv/bin/activate pip install -e . docker ps - python scripts/test_api_client.py --map_path maps/carter_warehouse_navigation.png --host docker + python scripts/test_api_client.py \ + --map_path maps/carter_warehouse_navigation.png \ + --host docker - name: Cleanup if: always() From e42abc95a51602e482a4de53383f6b544d9dc86b Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 15:37:30 -0700 Subject: [PATCH 05/72] Updating github workflow structure --- .github/workflows/ci.yml | 44 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec9ce6e..446b9a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,10 +61,7 @@ jobs: - name: Install dependencies run: | apt-get update - apt-get install -y --no-install-recommends \ - git \ - python3-pip \ - python3-venv + apt-get install -y --no-install-recommends git python3-pip python3-venv python3 -m pip install --upgrade pip python3 -m pip install pre-commit @@ -104,11 +101,7 @@ jobs: - name: Install dependencies run: | apt-get update - apt-get install -y --no-install-recommends \ - python3-pip \ - python3-venv \ - libgl1-mesa-glx \ - libglib2.0-0 + apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip python3 -m pip install -e . python3 -m pip install pytest pytest-xdist pytest-cov junitparser @@ -116,11 +109,7 @@ jobs: - name: Run unit tests run: | mkdir -p test-results - python3 -m pytest \ - --junitxml=test-results/unit-tests.xml \ - --cov=. \ - --cov-report=xml:test-results/coverage.xml \ - --cov-report=term-missing + python3 -m pytest --junitxml=test-results/unit-tests.xml --cov=. --cov-report=xml:test-results/coverage.xml --cov-report=term-missing - name: Upload test results uses: actions/upload-artifact@v4 @@ -163,11 +152,7 @@ jobs: - name: Install dependencies run: | apt-get update - apt-get install -y --no-install-recommends \ - python3-pip \ - python3-venv \ - libgl1-mesa-glx \ - libglib2.0-0 + apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip python3 -m pip install -e . python3 -m pip install pytest pytest-xdist @@ -175,16 +160,8 @@ jobs: - name: Run E2E tests run: | mkdir -p test-results - python3 scripts/generate_graph.py \ - --map-path maps/carter_warehouse_navigation.png \ - --graph-eval.active \ - --perf-eval.active - python3 scripts/evaluate_graph.py \ - --map-path maps/carter_warehouse_navigation.png \ - --output-dir results \ - --resolution 0.05 \ - --safety-distance 0.3 \ - --occupancy-threshold 127 + python3 scripts/generate_graph.py --map-path maps/carter_warehouse_navigation.png --graph-eval.active --perf-eval.active + python3 scripts/evaluate_graph.py --map-path maps/carter_warehouse_navigation.png --output-dir results --resolution 0.05 --safety-distance 0.3 --occupancy-threshold 127 - name: Upload test results uses: actions/upload-artifact@v4 @@ -231,10 +208,7 @@ jobs: - name: Install dependencies run: | apt-get update - apt-get install -y --no-install-recommends \ - python3-full \ - python3-venv \ - ffmpeg + apt-get install -y --no-install-recommends python3-full python3-venv ffmpeg - name: Build and test Docker run: | @@ -247,9 +221,7 @@ jobs: source venv/bin/activate pip install -e . docker ps - python scripts/test_api_client.py \ - --map_path maps/carter_warehouse_navigation.png \ - --host docker + python scripts/test_api_client.py --map_path maps/carter_warehouse_navigation.png --host docker - name: Cleanup if: always() From 6ecd02912f836cd18eddbb6bd224f31d19c7296d Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 15:50:59 -0700 Subject: [PATCH 06/72] Fixing github workflow structure --- .github/workflows/ci.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 446b9a4..bd2a05b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,9 +41,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 # Fetch all history for pre-commit + fetch-depth: 0 - name: Wait for Docker daemon + shell: bash run: | for i in $(seq 1 10); do if docker info >/dev/null 2>&1; then @@ -59,6 +60,7 @@ jobs: fi - name: Install dependencies + shell: bash run: | apt-get update apt-get install -y --no-install-recommends git python3-pip python3-venv @@ -66,6 +68,7 @@ jobs: python3 -m pip install pre-commit - name: Run pre-commit + shell: bash run: pre-commit run --all-files unit-tests: @@ -84,6 +87,7 @@ jobs: uses: actions/checkout@v4 - name: Wait for Docker daemon + shell: bash run: | for i in $(seq 1 10); do if docker info >/dev/null 2>&1; then @@ -99,6 +103,7 @@ jobs: fi - name: Install dependencies + shell: bash run: | apt-get update apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 @@ -107,6 +112,7 @@ jobs: python3 -m pip install pytest pytest-xdist pytest-cov junitparser - name: Run unit tests + shell: bash run: | mkdir -p test-results python3 -m pytest --junitxml=test-results/unit-tests.xml --cov=. --cov-report=xml:test-results/coverage.xml --cov-report=term-missing @@ -135,6 +141,7 @@ jobs: uses: actions/checkout@v4 - name: Wait for Docker daemon + shell: bash run: | for i in $(seq 1 10); do if docker info >/dev/null 2>&1; then @@ -150,6 +157,7 @@ jobs: fi - name: Install dependencies + shell: bash run: | apt-get update apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 @@ -158,6 +166,7 @@ jobs: python3 -m pip install pytest pytest-xdist - name: Run E2E tests + shell: bash run: | mkdir -p test-results python3 scripts/generate_graph.py --map-path maps/carter_warehouse_navigation.png --graph-eval.active --perf-eval.active @@ -173,6 +182,7 @@ jobs: - name: Cleanup if: always() + shell: bash run: rm -rf results graphs test-results docker-build: @@ -191,6 +201,7 @@ jobs: uses: actions/checkout@v4 - name: Wait for Docker daemon + shell: bash run: | for i in $(seq 1 10); do if docker info >/dev/null 2>&1; then @@ -206,11 +217,13 @@ jobs: fi - name: Install dependencies + shell: bash run: | apt-get update apt-get install -y --no-install-recommends python3-full python3-venv ffmpeg - name: Build and test Docker + shell: bash run: | cd docker docker compose build --no-cache @@ -225,6 +238,7 @@ jobs: - name: Cleanup if: always() + shell: bash run: | cd docker docker compose down \ No newline at end of file From abae2f25cd9a2192a93cdcbdfbbdb72bc7845ece Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 16:02:44 -0700 Subject: [PATCH 07/72] Updating github workflow structure --- .github/workflows/ci.yml | 228 +++++++++++++++++++-------------------- 1 file changed, 111 insertions(+), 117 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd2a05b..6094f20 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,20 +16,21 @@ name: CI on: push: - branches: [ main, develop ] + branches: [main, develop] pull_request: - branches: [ main, develop ] + branches: [main, develop] env: - DOCKER_HOST: tcp://localhost:2375 + DOCKER_HOST: tcp://docker:2375 DOCKER_TLS_CERTDIR: "" PYTHONPATH: ${{ github.workspace }} - CUDA_VISIBLE_DEVICES: all jobs: pre-commit: name: Pre-commit Checks runs-on: self-hosted + env: + CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all @@ -37,44 +38,51 @@ jobs: - name: docker image: nvidia/dind:cuda-12.6.0 options: --privileged + with: + healthcheck: | + until docker info; do + sleep 1 + done steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - - - name: Wait for Docker daemon - shell: bash + + - name: Validate workflow syntax + uses: rhysd/actionlint@v1 + + - name: Wait for Docker run: | - for i in $(seq 1 10); do - if docker info >/dev/null 2>&1; then - echo "Docker daemon is ready" - break - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + timeout 120s bash -c 'until docker info; do sleep 10 - done - if [ $i -eq 10 ]; then - echo "Docker daemon failed to start" - exit 1 - fi - + done' + + - name: Cache Python packages + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: precommit-${{ hashFiles('.pre-commit-config.yaml') }} + - name: Install dependencies - shell: bash run: | apt-get update - apt-get install -y --no-install-recommends git python3-pip python3-venv + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + --no-install-recommends \ + git python3-pip python3-venv python3 -m pip install --upgrade pip python3 -m pip install pre-commit - + - name: Run pre-commit - shell: bash run: pre-commit run --all-files + timeout-minutes: 60 unit-tests: name: Unit Tests needs: pre-commit runs-on: self-hosted + env: + CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all @@ -83,43 +91,45 @@ jobs: image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Wait for Docker daemon - shell: bash + - uses: actions/checkout@v4 + + - name: Wait for Docker run: | - for i in $(seq 1 10); do - if docker info >/dev/null 2>&1; then - echo "Docker daemon is ready" - break - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + timeout 120s bash -c 'until docker info; do sleep 10 - done - if [ $i -eq 10 ]; then - echo "Docker daemon failed to start" - exit 1 - fi - + done' + + - name: Cache Python packages + uses: actions/cache@v4 + with: + path: | + ~/.cache/pip + venv/ + key: unit-tests-${{ hashFiles('**/requirements.txt') }} + - name: Install dependencies - shell: bash run: | apt-get update - apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + --no-install-recommends \ + python3-pip python3-venv \ + libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip - python3 -m pip install -e . - python3 -m pip install pytest pytest-xdist pytest-cov junitparser - + python3 -m pip install -e .[test] + - name: Run unit tests - shell: bash run: | mkdir -p test-results - python3 -m pytest --junitxml=test-results/unit-tests.xml --cov=. --cov-report=xml:test-results/coverage.xml --cov-report=term-missing - + pytest \ + --junitxml=test-results/unit-tests.xml \ + --cov=. \ + --cov-report=xml:test-results/coverage.xml \ + --cov-report=term-missing \ + tests/unit + - name: Upload test results - uses: actions/upload-artifact@v4 if: always() + uses: actions/upload-artifact@v4 with: name: unit-test-results path: test-results/ @@ -129,6 +139,8 @@ jobs: name: End-to-End Tests needs: unit-tests runs-on: self-hosted + env: + CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all @@ -137,58 +149,59 @@ jobs: image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Wait for Docker daemon - shell: bash + - uses: actions/checkout@v4 + + - name: Wait for Docker run: | - for i in $(seq 1 10); do - if docker info >/dev/null 2>&1; then - echo "Docker daemon is ready" - break - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + timeout 120s bash -c 'until docker info; do sleep 10 - done - if [ $i -eq 10 ]; then - echo "Docker daemon failed to start" - exit 1 - fi - + done' + - name: Install dependencies - shell: bash run: | apt-get update - apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + --no-install-recommends \ + python3-pip python3-venv \ + libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip python3 -m pip install -e . - python3 -m pip install pytest pytest-xdist - + - name: Run E2E tests - shell: bash run: | mkdir -p test-results - python3 scripts/generate_graph.py --map-path maps/carter_warehouse_navigation.png --graph-eval.active --perf-eval.active - python3 scripts/evaluate_graph.py --map-path maps/carter_warehouse_navigation.png --output-dir results --resolution 0.05 --safety-distance 0.3 --occupancy-threshold 127 - - - name: Upload test results - uses: actions/upload-artifact@v4 + python3 scripts/generate_graph.py \ + --map-path maps/carter_warehouse_navigation.png \ + --graph-eval.active \ + --perf-eval.active + python3 scripts/evaluate_graph.py \ + --map-path maps/carter_warehouse_navigation.png \ + --output-dir results \ + --resolution 0.05 \ + --safety-distance 0.3 \ + --occupancy-threshold 127 + + - name: Upload artifacts if: always() + uses: actions/upload-artifact@v4 with: - name: e2e-test-results - path: test-results/ + name: e2e-results + path: | + test-results/ + results/ + graphs/ retention-days: 7 - + - name: Cleanup if: always() - shell: bash run: rm -rf results graphs test-results docker-build: name: Docker Build and Test needs: e2e-tests runs-on: self-hosted + env: + CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all @@ -197,48 +210,29 @@ jobs: image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Wait for Docker daemon - shell: bash + - uses: actions/checkout@v4 + + - name: Wait for Docker run: | - for i in $(seq 1 10); do - if docker info >/dev/null 2>&1; then - echo "Docker daemon is ready" - break - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/10)..." + timeout 120s bash -c 'until docker info; do sleep 10 - done - if [ $i -eq 10 ]; then - echo "Docker daemon failed to start" - exit 1 - fi - - - name: Install dependencies - shell: bash - run: | - apt-get update - apt-get install -y --no-install-recommends python3-full python3-venv ffmpeg - - - name: Build and test Docker - shell: bash + done' + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Build and test run: | - cd docker + cd docker || { echo "Missing docker directory"; exit 1; } docker compose build --no-cache docker compose up -d rest-api - sleep 10 - cd ../ - python3 -m venv venv - source venv/bin/activate - pip install -e . - docker ps - python scripts/test_api_client.py --map_path maps/carter_warehouse_navigation.png --host docker - + sleep 15 # Wait for service initialization + docker ps -a + pytest ../tests/integration/test_api.py -v + - name: Cleanup if: always() - shell: bash run: | - cd docker - docker compose down \ No newline at end of file + cd docker && docker compose down -v From 19849844d2d7352d826f17858c341d9e30da5863 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 16:11:00 -0700 Subject: [PATCH 08/72] Removed lint verification as not approved action --- .github/workflows/ci.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6094f20..1c608be 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,9 +49,6 @@ jobs: with: fetch-depth: 0 - - name: Validate workflow syntax - uses: rhysd/actionlint@v1 - - name: Wait for Docker run: | timeout 120s bash -c 'until docker info; do From ce8986ad5ae47df9c6de9f4bd34b5d08292b24f2 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 16:12:34 -0700 Subject: [PATCH 09/72] Fixing github workflow structure --- .github/workflows/ci.yml | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1c608be..66ad2bd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,10 +50,7 @@ jobs: fetch-depth: 0 - name: Wait for Docker - run: | - timeout 120s bash -c 'until docker info; do - sleep 10 - done' + run: timeout 120s bash -c 'until docker info; do sleep 10; done' - name: Cache Python packages uses: actions/cache@v4 @@ -91,10 +88,7 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Docker - run: | - timeout 120s bash -c 'until docker info; do - sleep 10 - done' + run: timeout 120s bash -c 'until docker info; do sleep 10; done' - name: Cache Python packages uses: actions/cache@v4 @@ -149,10 +143,7 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Docker - run: | - timeout 120s bash -c 'until docker info; do - sleep 10 - done' + run: timeout 120s bash -c 'until docker info; do sleep 10; done' - name: Install dependencies run: | @@ -210,10 +201,7 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Docker - run: | - timeout 120s bash -c 'until docker info; do - sleep 10 - done' + run: timeout 120s bash -c 'until docker info; do sleep 10; done' - name: Setup Python uses: actions/setup-python@v5 From 107349da40a248ffa44fefaaef88583d3c345558 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 16:25:08 -0700 Subject: [PATCH 10/72] Fixing github workflow structure --- .github/workflows/ci.yml | 98 ++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 66ad2bd..962923f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,14 +35,13 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all services: - - name: docker + docker: image: nvidia/dind:cuda-12.6.0 options: --privileged - with: - healthcheck: | - until docker info; do - sleep 1 - done + ports: + - 2375:2375 + env: + DOCKER_TLS_CERTDIR: "" steps: - name: Checkout code uses: actions/checkout@v4 @@ -50,7 +49,19 @@ jobs: fetch-depth: 0 - name: Wait for Docker - run: timeout 120s bash -c 'until docker info; do sleep 10; done' + run: | + i=1 + while [ $i -le 12 ]; do + if docker ps >/dev/null 2>&1; then + echo "Docker daemon is ready" + exit 0 + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." + sleep 10 + i=$((i + 1)) + done + echo "Docker daemon failed to start" + exit 1 - name: Cache Python packages uses: actions/cache@v4 @@ -60,10 +71,9 @@ jobs: - name: Install dependencies run: | + set -e apt-get update - DEBIAN_FRONTEND=noninteractive apt-get install -y \ - --no-install-recommends \ - git python3-pip python3-venv + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv python3 -m pip install --upgrade pip python3 -m pip install pre-commit @@ -81,14 +91,26 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all services: - - name: docker + docker: image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - uses: actions/checkout@v4 - name: Wait for Docker - run: timeout 120s bash -c 'until docker info; do sleep 10; done' + run: | + i=1 + while [ $i -le 12 ]; do + if docker ps >/dev/null 2>&1; then + echo "Docker daemon is ready" + exit 0 + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." + sleep 10 + i=$((i + 1)) + done + echo "Docker daemon failed to start" + exit 1 - name: Cache Python packages uses: actions/cache@v4 @@ -100,13 +122,12 @@ jobs: - name: Install dependencies run: | + set -e apt-get update - DEBIAN_FRONTEND=noninteractive apt-get install -y \ - --no-install-recommends \ - python3-pip python3-venv \ - libgl1-mesa-glx libglib2.0-0 + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip - python3 -m pip install -e .[test] + python3 -m pip install -e . + python3 -m pip install pytest pytest-xdist pytest-cov junitparser - name: Run unit tests run: | @@ -136,22 +157,32 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all services: - - name: docker + docker: image: nvidia/dind:cuda-12.6.0 options: --privileged steps: - uses: actions/checkout@v4 - name: Wait for Docker - run: timeout 120s bash -c 'until docker info; do sleep 10; done' + run: | + i=1 + while [ $i -le 12 ]; do + if docker ps >/dev/null 2>&1; then + echo "Docker daemon is ready" + exit 0 + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." + sleep 10 + i=$((i + 1)) + done + echo "Docker daemon failed to start" + exit 1 - name: Install dependencies run: | + set -e apt-get update - DEBIAN_FRONTEND=noninteractive apt-get install -y \ - --no-install-recommends \ - python3-pip python3-venv \ - libgl1-mesa-glx libglib2.0-0 + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip python3 -m pip install -e . @@ -194,14 +225,30 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all services: - - name: docker + docker: image: nvidia/dind:cuda-12.6.0 options: --privileged + ports: + - 2375:2375 + env: + DOCKER_TLS_CERTDIR: "" steps: - uses: actions/checkout@v4 - name: Wait for Docker - run: timeout 120s bash -c 'until docker info; do sleep 10; done' + run: | + i=1 + while [ $i -le 12 ]; do + if docker ps >/dev/null 2>&1; then + echo "Docker daemon is ready" + exit 0 + fi + echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." + sleep 10 + i=$((i + 1)) + done + echo "Docker daemon failed to start" + exit 1 - name: Setup Python uses: actions/setup-python@v5 @@ -210,6 +257,7 @@ jobs: - name: Build and test run: | + set -e cd docker || { echo "Missing docker directory"; exit 1; } docker compose build --no-cache docker compose up -d rest-api From 2a95b36d9d042b4cab035053a9f0c91bf4690dd3 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Tue, 13 May 2025 16:29:22 -0700 Subject: [PATCH 11/72] Fixing github workflow pre-commit checks --- .github/workflows/ci.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 962923f..f1dd876 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,11 +37,10 @@ jobs: services: docker: image: nvidia/dind:cuda-12.6.0 - options: --privileged + options: --privileged --network=host ports: - 2375:2375 - env: - DOCKER_TLS_CERTDIR: "" + steps: - name: Checkout code uses: actions/checkout@v4 @@ -93,7 +92,10 @@ jobs: services: docker: image: nvidia/dind:cuda-12.6.0 - options: --privileged + options: --privileged --network=host + ports: + - 2375:2375 + steps: - uses: actions/checkout@v4 @@ -159,7 +161,10 @@ jobs: services: docker: image: nvidia/dind:cuda-12.6.0 - options: --privileged + options: --privileged --network=host + ports: + - 2375:2375 + steps: - uses: actions/checkout@v4 @@ -227,11 +232,10 @@ jobs: services: docker: image: nvidia/dind:cuda-12.6.0 - options: --privileged + options: --privileged --network=host ports: - 2375:2375 - env: - DOCKER_TLS_CERTDIR: "" + steps: - uses: actions/checkout@v4 From 89a9031d46f8d9a31936dcb10b85b764cf63302e Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 11:24:41 -0700 Subject: [PATCH 12/72] Making CI infra more reliable --- .github/workflows/ci.yml | 8 +- .../cloudformation/github-runners-stack.yml | 116 ++++++++++++++++-- 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f1dd876..2e8962e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ jobs: CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged --gpus all + options: --privileged --gpus all --group-add 998 services: docker: image: nvidia/dind:cuda-12.6.0 @@ -88,7 +88,7 @@ jobs: CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged --gpus all + options: --privileged --gpus all --group-add 998 services: docker: image: nvidia/dind:cuda-12.6.0 @@ -157,7 +157,7 @@ jobs: CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged --gpus all + options: --privileged --gpus all --group-add 998 services: docker: image: nvidia/dind:cuda-12.6.0 @@ -228,7 +228,7 @@ jobs: CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged --gpus all + options: --privileged --gpus all --group-add 998 services: docker: image: nvidia/dind:cuda-12.6.0 diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml index 4056951..32c55cf 100644 --- a/infrastructure/cloudformation/github-runners-stack.yml +++ b/infrastructure/cloudformation/github-runners-stack.yml @@ -439,30 +439,128 @@ Resources: [Unit] Description=GitHub Actions Runner After=network.target docker.service + StartLimitIntervalSec=0 [Service] Type=simple User=github-runner WorkingDirectory=/opt/github-runner + ExecStartPre=/bin/bash -c 'if [ ! -f /opt/github-runner/.runner ]; then \ + ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${GitHubToken}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token" | jq -r ".token") \ + --labels self-hosted,gpu --unattended --ephemeral; fi' ExecStart=/opt/github-runner/run.sh Restart=always RestartSec=10 + StartLimitBurst=5 + TimeoutStartSec=300 + TimeoutStopSec=300 + Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + Environment="DOCKER_HOST=unix:///var/run/docker.sock" + Environment="ACTIONS_RUNNER_HOOK_JOB_STARTED=/usr/local/bin/runner-hook.sh" + Environment="ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/local/bin/runner-hook.sh" [Install] WantedBy=multi-user.target EOF - - # Configure the runner as non-root user using the registration token - su - github-runner -c "cd /opt/github-runner && ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \ - -H \"Accept: application/vnd.github+json\" \ - -H \"Authorization: Bearer ${GitHubToken}\" \ - -H \"X-GitHub-Api-Version: 2022-11-28\" \ - \"https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token\" | jq -r '.token') --labels self-hosted,gpu --unattended --ephemeral" - - # Enable and start the service + + # Create runner hook script to handle job lifecycle + cat > /usr/local/bin/runner-hook.sh << 'EOF' + #!/bin/bash + set -e + + # Log the event + echo "$(date): Runner hook called with event: $1" >> /var/log/github-runner.log + + case "$1" in + "job_started") + # Clean up any stale Docker resources + docker system prune -f + # Reset Docker daemon if needed + if ! docker info >/dev/null 2>&1; then + systemctl restart docker + sleep 10 + fi + ;; + "job_completed") + # Clean up after job completion + docker system prune -f + ;; + esac + EOF + + chmod +x /usr/local/bin/runner-hook.sh + + # Create a script to handle runner reconfiguration + cat > /usr/local/bin/reconfigure-runner.sh << 'EOF' + #!/bin/bash + set -e + + # Check if runner is responsive + if ! curl -s http://localhost:8080/health >/dev/null 2>&1; then + echo "Runner is not responsive, reconfiguring..." + + # Stop the service + systemctl stop github-runner + + # Remove existing configuration + rm -rf /opt/github-runner/.runner + rm -rf /opt/github-runner/.credentials + rm -rf /opt/github-runner/.env + + # Clean up Docker + docker system prune -f + + # Restart Docker + systemctl restart docker + sleep 10 + + # Start the service + systemctl start github-runner + else + echo "Runner is healthy" + fi + EOF + + chmod +x /usr/local/bin/reconfigure-runner.sh + + # Create a systemd timer to check runner health + cat > /etc/systemd/system/github-runner-healthcheck.service << EOF + [Unit] + Description=GitHub Runner Health Check + After=github-runner.service + + [Service] + Type=oneshot + ExecStart=/usr/local/bin/reconfigure-runner.sh + User=root + + [Install] + WantedBy=multi-user.target + EOF + + cat > /etc/systemd/system/github-runner-healthcheck.timer << EOF + [Unit] + Description=Run GitHub Runner Health Check every hour + + [Timer] + OnBootSec=5min + OnUnitActiveSec=1h + Unit=github-runner-healthcheck.service + + [Install] + WantedBy=multi-user.target + EOF + + # Enable and start the services systemctl daemon-reload systemctl enable github-runner + systemctl enable github-runner-healthcheck.timer systemctl start github-runner + systemctl start github-runner-healthcheck.timer # Auto Scaling Group for runners RunnerAutoScalingGroup: From 6131c38b5bb1445c5e9a58c6636b33835fae333c Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 13:46:31 -0700 Subject: [PATCH 13/72] Making CI infra more reliable --- .github/workflows/ci.yml | 8 +- .../cloudformation/github-runners-stack.yml | 178 +++++++++++++----- 2 files changed, 132 insertions(+), 54 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e8962e..e2c0ec8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: options: --privileged --gpus all --group-add 998 services: docker: - image: nvidia/dind:cuda-12.6.0 + image: docker:26-dind options: --privileged --network=host ports: - 2375:2375 @@ -91,7 +91,7 @@ jobs: options: --privileged --gpus all --group-add 998 services: docker: - image: nvidia/dind:cuda-12.6.0 + image: docker:26-dind options: --privileged --network=host ports: - 2375:2375 @@ -160,7 +160,7 @@ jobs: options: --privileged --gpus all --group-add 998 services: docker: - image: nvidia/dind:cuda-12.6.0 + image: docker:26-dind options: --privileged --network=host ports: - 2375:2375 @@ -231,7 +231,7 @@ jobs: options: --privileged --gpus all --group-add 998 services: docker: - image: nvidia/dind:cuda-12.6.0 + image: docker:26-dind options: --privileged --network=host ports: - 2375:2375 diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml index 32c55cf..f5ceadb 100644 --- a/infrastructure/cloudformation/github-runners-stack.yml +++ b/infrastructure/cloudformation/github-runners-stack.yml @@ -428,8 +428,9 @@ Resources: echo "0dbc9bf5a58620fc52cb6cc0448abcca964a8d74b5f39773b7afcad9ab691e19 actions-runner-linux-x64-2.323.0.tar.gz" | shasum -a 256 -c tar xzf ./actions-runner-linux-x64-2.323.0.tar.gz - # Create runner user + # Create runner user and add to docker group useradd -m -s /bin/bash github-runner + usermod -aG docker github-runner # Set ownership of runner directory chown -R github-runner:github-runner /opt/github-runner @@ -445,23 +446,28 @@ Resources: Type=simple User=github-runner WorkingDirectory=/opt/github-runner - ExecStartPre=/bin/bash -c 'if [ ! -f /opt/github-runner/.runner ]; then \ + ExecStartPre=/bin/bash -c 'if [ ! -f /opt/github-runner/.runner ] || [ ! -f /opt/github-runner/.credentials ]; then \ + rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials; \ ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer ${GitHubToken}" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token" | jq -r ".token") \ - --labels self-hosted,gpu --unattended --ephemeral; fi' + --labels self-hosted,gpu --unattended --ephemeral --replace || exit 1; fi' ExecStart=/opt/github-runner/run.sh + ExecStopPost=/bin/bash -c 'rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials' Restart=always RestartSec=10 - StartLimitBurst=5 + StartLimitBurst=10 TimeoutStartSec=300 TimeoutStopSec=300 + KillMode=process + KillSignal=SIGTERM Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" Environment="DOCKER_HOST=unix:///var/run/docker.sock" Environment="ACTIONS_RUNNER_HOOK_JOB_STARTED=/usr/local/bin/runner-hook.sh" Environment="ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/local/bin/runner-hook.sh" + Environment="RUNNER_ALLOW_RUNASROOT=0" [Install] WantedBy=multi-user.target @@ -472,89 +478,161 @@ Resources: #!/bin/bash set -e - # Log the event - echo "$(date): Runner hook called with event: $1" >> /var/log/github-runner.log + # Log the event with timestamp and more details + log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner.log + } + + log "Runner hook called with event: $1" + log "Current Docker status: $(systemctl status docker | grep Active)" case "$1" in "job_started") - # Clean up any stale Docker resources - docker system prune -f - # Reset Docker daemon if needed + # Verify Docker is running and healthy if ! docker info >/dev/null 2>&1; then + log "Docker appears to be unhealthy, attempting restart" systemctl restart docker sleep 10 + if ! docker info >/dev/null 2>&1; then + log "Docker failed to recover after restart" + exit 1 + fi + log "Docker successfully restarted" fi + # Clean up any stale Docker resources + docker system prune -f + log "System pruned before job start" ;; "job_completed") # Clean up after job completion docker system prune -f + log "System pruned after job completion" ;; esac EOF chmod +x /usr/local/bin/runner-hook.sh - # Create a script to handle runner reconfiguration + # Create a more robust runner health check script cat > /usr/local/bin/reconfigure-runner.sh << 'EOF' #!/bin/bash set -e - # Check if runner is responsive - if ! curl -s http://localhost:8080/health >/dev/null 2>&1; then - echo "Runner is not responsive, reconfiguring..." - - # Stop the service - systemctl stop github-runner - - # Remove existing configuration - rm -rf /opt/github-runner/.runner - rm -rf /opt/github-runner/.credentials - rm -rf /opt/github-runner/.env - - # Clean up Docker - docker system prune -f - - # Restart Docker - systemctl restart docker - sleep 10 - - # Start the service - systemctl start github-runner - else - echo "Runner is healthy" - fi - EOF + # Log function + log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner-health.log + } - chmod +x /usr/local/bin/reconfigure-runner.sh + # Check if the runner service is active + check_runner_service() { + if ! systemctl is-active github-runner >/dev/null 2>&1; then + log "Runner service is not active" + return 1 + fi + return 0 + } - # Create a systemd timer to check runner health - cat > /etc/systemd/system/github-runner-healthcheck.service << EOF - [Unit] - Description=GitHub Runner Health Check - After=github-runner.service + # Check if runner process is running + check_runner_process() { + if ! pgrep -f "run.sh" >/dev/null; then + log "Runner process is not running" + return 1 + fi + return 0 + } - [Service] - Type=oneshot - ExecStart=/usr/local/bin/reconfigure-runner.sh - User=root + # Check Docker health + check_docker_health() { + if ! docker info >/dev/null 2>&1; then + log "Docker is not healthy" + return 1 + fi + return 0 + } - [Install] - WantedBy=multi-user.target + # Main health check logic + log "Starting health check" + + NEEDS_RESTART=0 + + # Check Docker first + if ! check_docker_health; then + log "Attempting to restart Docker" + systemctl restart docker + sleep 10 + if ! check_docker_health; then + log "Docker failed to recover" + NEEDS_RESTART=1 + fi + fi + + # Check runner service and process + if ! check_runner_service || ! check_runner_process; then + NEEDS_RESTART=1 + fi + + if [ $NEEDS_RESTART -eq 1 ]; then + log "Issues detected, performing full runner reset" + + # Stop services + systemctl stop github-runner + + # Clean up runner files + rm -rf /opt/github-runner/.runner + rm -rf /opt/github-runner/.credentials + rm -rf /opt/github-runner/.env + + # Clean up Docker + docker system prune -af --volumes + + # Restart Docker + systemctl restart docker + sleep 10 + + # Start runner service + systemctl start github-runner + + log "Runner reset completed" + else + log "Health check passed" + fi EOF + chmod +x /usr/local/bin/reconfigure-runner.sh + + # Create a more frequent health check timer cat > /etc/systemd/system/github-runner-healthcheck.timer << EOF [Unit] - Description=Run GitHub Runner Health Check every hour + Description=Run GitHub Runner Health Check frequently [Timer] - OnBootSec=5min - OnUnitActiveSec=1h + OnBootSec=1min + OnUnitActiveSec=5min + RandomizedDelaySec=30 Unit=github-runner-healthcheck.service [Install] WantedBy=multi-user.target EOF + # Create log rotation for runner logs + cat > /etc/logrotate.d/github-runner << EOF + /var/log/github-runner*.log { + daily + rotate 7 + compress + delaycompress + missingok + notifempty + create 0644 github-runner github-runner + } + EOF + + # Create runner logs with proper permissions + touch /var/log/github-runner.log /var/log/github-runner-health.log + chown github-runner:github-runner /var/log/github-runner.log /var/log/github-runner-health.log + chmod 644 /var/log/github-runner.log /var/log/github-runner-health.log + # Enable and start the services systemctl daemon-reload systemctl enable github-runner From 063b9f6f40f38b88aa2b15440da1893798620c61 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 13:48:20 -0700 Subject: [PATCH 14/72] Making CI infra more reliable --- .github/workflows/ci.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2c0ec8..e06030f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,8 +38,6 @@ jobs: docker: image: docker:26-dind options: --privileged --network=host - ports: - - 2375:2375 steps: - name: Checkout code @@ -93,8 +91,6 @@ jobs: docker: image: docker:26-dind options: --privileged --network=host - ports: - - 2375:2375 steps: - uses: actions/checkout@v4 @@ -162,8 +158,6 @@ jobs: docker: image: docker:26-dind options: --privileged --network=host - ports: - - 2375:2375 steps: - uses: actions/checkout@v4 @@ -233,8 +227,6 @@ jobs: docker: image: docker:26-dind options: --privileged --network=host - ports: - - 2375:2375 steps: - uses: actions/checkout@v4 From 4ef42e7b48517ad92549104f0c4f277dc409139b Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 13:56:53 -0700 Subject: [PATCH 15/72] Docker fixes --- .github/workflows/ci.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e06030f..55c8170 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,9 @@ jobs: services: docker: image: docker:26-dind - options: --privileged --network=host + options: --privileged + ports: + - 2375:2375 steps: - name: Checkout code @@ -90,7 +92,9 @@ jobs: services: docker: image: docker:26-dind - options: --privileged --network=host + options: --privileged + ports: + - 2375:2375 steps: - uses: actions/checkout@v4 @@ -157,7 +161,9 @@ jobs: services: docker: image: docker:26-dind - options: --privileged --network=host + options: --privileged + ports: + - 2375:2375 steps: - uses: actions/checkout@v4 @@ -226,7 +232,9 @@ jobs: services: docker: image: docker:26-dind - options: --privileged --network=host + options: --privileged + ports: + - 2375:2375 steps: - uses: actions/checkout@v4 From a54c6867584ce5c42b8f691171cbe9922aa13256 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 15:57:52 -0700 Subject: [PATCH 16/72] Added package check stage --- .github/workflows/ci.yml | 130 +++++++++++---------------------------- 1 file changed, 35 insertions(+), 95 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 55c8170..34324e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,52 +21,27 @@ on: branches: [main, develop] env: - DOCKER_HOST: tcp://docker:2375 - DOCKER_TLS_CERTDIR: "" PYTHONPATH: ${{ github.workspace }} jobs: pre-commit: name: Pre-commit Checks runs-on: self-hosted - env: - CUDA_VISIBLE_DEVICES: all container: - image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 - options: --privileged --gpus all --group-add 998 - services: - docker: - image: docker:26-dind - options: --privileged - ports: - - 2375:2375 - + image: python:3.12-slim-bullseye steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Wait for Docker - run: | - i=1 - while [ $i -le 12 ]; do - if docker ps >/dev/null 2>&1; then - echo "Docker daemon is ready" - exit 0 - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." - sleep 10 - i=$((i + 1)) - done - echo "Docker daemon failed to start" - exit 1 - - name: Cache Python packages uses: actions/cache@v4 with: path: ~/.cache/pip key: precommit-${{ hashFiles('.pre-commit-config.yaml') }} + restore-keys: | + precommit- - name: Install dependencies run: | @@ -80,40 +55,47 @@ jobs: run: pre-commit run --all-files timeout-minutes: 60 + package-check: + name: Package Check + runs-on: self-hosted + needs: pre-commit + container: + image: python:3.12-slim-bullseye + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Cache pip packages for Poetry + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: poetry-pip-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + poetry-pip- + + - name: Install dependencies + run: | + set -e + apt-get update && apt-get install -y --no-install-recommends git + python3 -m pip install --upgrade pip + python3 -m pip install poetry + + - name: Run Poetry check + run: poetry check + timeout-minutes: 15 + unit-tests: name: Unit Tests - needs: pre-commit + needs: package-check runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 - services: - docker: - image: docker:26-dind - options: --privileged - ports: - - 2375:2375 - steps: - uses: actions/checkout@v4 - - name: Wait for Docker - run: | - i=1 - while [ $i -le 12 ]; do - if docker ps >/dev/null 2>&1; then - echo "Docker daemon is ready" - exit 0 - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." - sleep 10 - i=$((i + 1)) - done - echo "Docker daemon failed to start" - exit 1 - - name: Cache Python packages uses: actions/cache@v4 with: @@ -158,31 +140,9 @@ jobs: container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 - services: - docker: - image: docker:26-dind - options: --privileged - ports: - - 2375:2375 - steps: - uses: actions/checkout@v4 - - name: Wait for Docker - run: | - i=1 - while [ $i -le 12 ]; do - if docker ps >/dev/null 2>&1; then - echo "Docker daemon is ready" - exit 0 - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." - sleep 10 - i=$((i + 1)) - done - echo "Docker daemon failed to start" - exit 1 - - name: Install dependencies run: | set -e @@ -226,34 +186,14 @@ jobs: runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: all + DOCKER_HOST: tcp://docker:2375 + DOCKER_TLS_CERTDIR: "" container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 - services: - docker: - image: docker:26-dind - options: --privileged - ports: - - 2375:2375 - steps: - uses: actions/checkout@v4 - - name: Wait for Docker - run: | - i=1 - while [ $i -le 12 ]; do - if docker ps >/dev/null 2>&1; then - echo "Docker daemon is ready" - exit 0 - fi - echo "Docker daemon not ready yet, waiting 10s (attempt $i/12)..." - sleep 10 - i=$((i + 1)) - done - echo "Docker daemon failed to start" - exit 1 - - name: Setup Python uses: actions/setup-python@v5 with: From f8a31f8d8e4b817d8c75eac3035641ba6ed41107 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 16:10:30 -0700 Subject: [PATCH 17/72] Fixing pre-check stage --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34324e0..1c7b54b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,7 @@ jobs: run: | set -e apt-get update - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git python3 -m pip install --upgrade pip python3 -m pip install pre-commit From 0b81c55db9a57491a7d21c94885cbab4bce4399d Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 16:26:25 -0700 Subject: [PATCH 18/72] Fixing pre-check stage --- .github/workflows/ci.yml | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1c7b54b..8256022 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,16 +43,43 @@ jobs: restore-keys: | precommit- - - name: Install dependencies + - name: Install dependencies and verify Git run: | set -e - apt-get update + echo "Updating package lists..." + apt-get update -y + echo "Installing git..." DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + + echo "Verifying git installation..." + if ! command -v git &> /dev/null; then + echo "ERROR: git command not found after installation attempt!" + exit 1 + fi + git --version + + echo "Verifying Git repository status..." + if ! git rev-parse --is-inside-work-tree; then + echo "ERROR: Not inside a Git work tree, or git command failed to determine it." + ls -la ${{ github.workspace }} # List files to see if .git is present + # exit 1 # Optionally exit if not in a git repo is unexpected here + else + echo "Successfully inside a Git work tree." + fi + + echo "Installing Python dependencies (pip, pre-commit)..." python3 -m pip install --upgrade pip python3 -m pip install pre-commit + pre-commit --version - name: Run pre-commit - run: pre-commit run --all-files + working-directory: ${{ github.workspace }} + run: | + echo "Current directory: $(pwd)" + echo "Listing files in workspace:" + ls -la + echo "Running pre-commit..." + pre-commit run --all-files timeout-minutes: 60 package-check: From 2cfacc2c92560846e1aae749b5d4abf798d45e64 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 16:30:13 -0700 Subject: [PATCH 19/72] Debugging pre-check stage --- .github/workflows/ci.yml | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8256022..97546ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,18 +51,42 @@ jobs: echo "Installing git..." DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git - echo "Verifying git installation..." - if ! command -v git &> /dev/null; then - echo "ERROR: git command not found after installation attempt!" + echo "--- Git Installation Diagnostics ---" + echo "Current PATH: $PATH" + + echo "Checking for /usr/bin/git existence and permissions:" + if [ -f "/usr/bin/git" ]; then + echo "/usr/bin/git exists." + if [ -x "/usr/bin/git" ]; then + echo "/usr/bin/git is executable." + echo "Version from /usr/bin/git --version:" + /usr/bin/git --version + else + echo "ERROR: /usr/bin/git exists but is NOT executable." + ls -l /usr/bin/git exit 1 + fi + else + echo "ERROR: /usr/bin/git does NOT exist after apt-get install attempt." + exit 1 + fi + + echo "Checking 'command -v git' (locates git via PATH):" + if command -v git &> /dev/null; then + echo "'command -v git' succeeded. Git found in PATH." + echo "Version from 'git --version' (via PATH):" + git --version + else + echo "ERROR: 'command -v git' FAILED. Git NOT found in PATH." + echo "This means pre-commit will likely fail. Exiting." + exit 1 # Exit if git is not found in PATH, as pre-commit needs it. fi - git --version + echo "--- End Git Installation Diagnostics ---" echo "Verifying Git repository status..." if ! git rev-parse --is-inside-work-tree; then echo "ERROR: Not inside a Git work tree, or git command failed to determine it." - ls -la ${{ github.workspace }} # List files to see if .git is present - # exit 1 # Optionally exit if not in a git repo is unexpected here + ls -la ${{ github.workspace }} else echo "Successfully inside a Git work tree." fi From dea5f24f07d21cbd4cdd9d3e1e45da7c494e5a5b Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Wed, 14 May 2025 16:31:33 -0700 Subject: [PATCH 20/72] Debugging pre-check stage --- .github/workflows/ci.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97546ff..556085c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,14 +83,6 @@ jobs: fi echo "--- End Git Installation Diagnostics ---" - echo "Verifying Git repository status..." - if ! git rev-parse --is-inside-work-tree; then - echo "ERROR: Not inside a Git work tree, or git command failed to determine it." - ls -la ${{ github.workspace }} - else - echo "Successfully inside a Git work tree." - fi - echo "Installing Python dependencies (pip, pre-commit)..." python3 -m pip install --upgrade pip python3 -m pip install pre-commit From 700b5d5dc586004c779fa61bd24e33203696ca2a Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 12:38:31 -0700 Subject: [PATCH 21/72] debugging git failure --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 556085c..d87f3d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -94,6 +94,10 @@ jobs: echo "Current directory: $(pwd)" echo "Listing files in workspace:" ls -la + echo "Checking git status:" + git status + echo "Checking git config:" + git config --list echo "Running pre-commit..." pre-commit run --all-files timeout-minutes: 60 From ba8b9d3f073d1837e7f8a8bf916f9d9eab4aafee Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 13:14:45 -0700 Subject: [PATCH 22/72] debugging git failure --- .github/workflows/ci.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d87f3d4..2e14f27 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + persist-credentials: false - name: Cache Python packages uses: actions/cache@v4 @@ -88,6 +89,15 @@ jobs: python3 -m pip install pre-commit pre-commit --version + - name: Initialize Git Repository + run: | + git init + git config --global --add safe.directory ${{ github.workspace }} + git remote add origin ${{ github.server_url }}/${{ github.repository }} + git fetch origin + git checkout -b ${{ github.head_ref || github.ref_name }} + git reset --hard origin/${{ github.head_ref || github.ref_name }} + - name: Run pre-commit working-directory: ${{ github.workspace }} run: | From f4ec0ce023eadf8dca720ecbe6eb88fd91f30263 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 13:23:02 -0700 Subject: [PATCH 23/72] debugging git failure --- .github/workflows/ci.yml | 77 ++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 51 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e14f27..c3f9761 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,11 +30,15 @@ jobs: container: image: python:3.12-slim-bullseye steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - persist-credentials: false - name: Cache Python packages uses: actions/cache@v4 @@ -44,70 +48,21 @@ jobs: restore-keys: | precommit- - - name: Install dependencies and verify Git + - name: Install dependencies run: | set -e - echo "Updating package lists..." apt-get update -y - echo "Installing git..." DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git - - echo "--- Git Installation Diagnostics ---" - echo "Current PATH: $PATH" - - echo "Checking for /usr/bin/git existence and permissions:" - if [ -f "/usr/bin/git" ]; then - echo "/usr/bin/git exists." - if [ -x "/usr/bin/git" ]; then - echo "/usr/bin/git is executable." - echo "Version from /usr/bin/git --version:" - /usr/bin/git --version - else - echo "ERROR: /usr/bin/git exists but is NOT executable." - ls -l /usr/bin/git - exit 1 - fi - else - echo "ERROR: /usr/bin/git does NOT exist after apt-get install attempt." - exit 1 - fi - - echo "Checking 'command -v git' (locates git via PATH):" - if command -v git &> /dev/null; then - echo "'command -v git' succeeded. Git found in PATH." - echo "Version from 'git --version' (via PATH):" - git --version - else - echo "ERROR: 'command -v git' FAILED. Git NOT found in PATH." - echo "This means pre-commit will likely fail. Exiting." - exit 1 # Exit if git is not found in PATH, as pre-commit needs it. - fi - echo "--- End Git Installation Diagnostics ---" - - echo "Installing Python dependencies (pip, pre-commit)..." python3 -m pip install --upgrade pip python3 -m pip install pre-commit pre-commit --version - - name: Initialize Git Repository - run: | - git init - git config --global --add safe.directory ${{ github.workspace }} - git remote add origin ${{ github.server_url }}/${{ github.repository }} - git fetch origin - git checkout -b ${{ github.head_ref || github.ref_name }} - git reset --hard origin/${{ github.head_ref || github.ref_name }} - - name: Run pre-commit working-directory: ${{ github.workspace }} run: | echo "Current directory: $(pwd)" echo "Listing files in workspace:" ls -la - echo "Checking git status:" - git status - echo "Checking git config:" - git config --list echo "Running pre-commit..." pre-commit run --all-files timeout-minutes: 60 @@ -119,6 +74,11 @@ jobs: container: image: python:3.12-slim-bullseye steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + - name: Checkout code uses: actions/checkout@v4 @@ -151,6 +111,11 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + - uses: actions/checkout@v4 - name: Cache Python packages @@ -198,6 +163,11 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + - uses: actions/checkout@v4 - name: Install dependencies @@ -249,6 +219,11 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + - uses: actions/checkout@v4 - name: Setup Python From 55300d945e1610591a321962fb4472230c78790a Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 13:33:00 -0700 Subject: [PATCH 24/72] debugging git failure --- .github/workflows/ci.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c3f9761..0f45284 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,6 +63,17 @@ jobs: echo "Current directory: $(pwd)" echo "Listing files in workspace:" ls -la + + # Ensure we're in a Git repository + if [ ! -d .git ]; then + echo "Initializing Git repository..." + git init + git config --global user.email "github-actions@github.com" + git config --global user.name "GitHub Actions" + git add . + git commit -m "Initial commit for pre-commit" + fi + echo "Running pre-commit..." pre-commit run --all-files timeout-minutes: 60 From fb7179993e1619b9ff2c4471189f3134b5bb576a Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 13:42:05 -0700 Subject: [PATCH 25/72] debugging git failure --- .github/workflows/ci.yml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0f45284..e201cfa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,22 +58,12 @@ jobs: pre-commit --version - name: Run pre-commit - working-directory: ${{ github.workspace }} run: | echo "Current directory: $(pwd)" echo "Listing files in workspace:" ls -la - - # Ensure we're in a Git repository - if [ ! -d .git ]; then - echo "Initializing Git repository..." - git init - git config --global user.email "github-actions@github.com" - git config --global user.name "GitHub Actions" - git add . - git commit -m "Initial commit for pre-commit" - fi - + git status + git log -1 echo "Running pre-commit..." pre-commit run --all-files timeout-minutes: 60 From 28850e1d7f89cdfbaaf911d5505fdf825305ebdb Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 13:43:51 -0700 Subject: [PATCH 26/72] debugging git failure --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e201cfa..5125fe6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,6 +62,7 @@ jobs: echo "Current directory: $(pwd)" echo "Listing files in workspace:" ls -la + git config --global --add safe.directory ${{ github.workspace }} git status git log -1 echo "Running pre-commit..." From 7cc2f3ecbce4b78ee6a01f2e0c646e4e471d6acf Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 13:51:20 -0700 Subject: [PATCH 27/72] debugging git failure --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5125fe6..7b89a4a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory ${{ github.workspace }} - name: Checkout code uses: actions/checkout@v4 @@ -62,7 +63,6 @@ jobs: echo "Current directory: $(pwd)" echo "Listing files in workspace:" ls -la - git config --global --add safe.directory ${{ github.workspace }} git status git log -1 echo "Running pre-commit..." @@ -80,6 +80,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory ${{ github.workspace }} - name: Checkout code uses: actions/checkout@v4 @@ -117,6 +118,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 @@ -169,6 +171,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 @@ -225,6 +228,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 From af187d482a074814a5378105ac09a1bc93f163da Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 14:34:24 -0700 Subject: [PATCH 28/72] debugging git failure --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b89a4a..13d1d72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER git config --global --add safe.directory ${{ github.workspace }} - name: Checkout code @@ -80,6 +81,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER git config --global --add safe.directory ${{ github.workspace }} - name: Checkout code @@ -118,6 +120,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 @@ -171,6 +174,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 @@ -228,6 +232,7 @@ jobs: run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 From 0578546df2f53bd54b60c6baf360874e196c1f16 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 14:46:52 -0700 Subject: [PATCH 29/72] Addressing pre-che issues --- .github/workflows/ci.yml | 5 ----- infrastructure/cloudformation/github-runners-stack.yml | 7 ++++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 13d1d72..776679d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,6 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git config --global --add safe.directory ${{ github.workspace }} - name: Checkout code uses: actions/checkout@v4 @@ -82,7 +81,6 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git config --global --add safe.directory ${{ github.workspace }} - name: Checkout code uses: actions/checkout@v4 @@ -121,7 +119,6 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 @@ -175,7 +172,6 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 @@ -233,7 +229,6 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git config --global --add safe.directory ${{ github.workspace }} - uses: actions/checkout@v4 diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml index f5ceadb..c4d8561 100644 --- a/infrastructure/cloudformation/github-runners-stack.yml +++ b/infrastructure/cloudformation/github-runners-stack.yml @@ -64,7 +64,7 @@ Parameters: - g4dn.xlarge - g4dn.2xlarge - g4dn.4xlarge - - g5.xlargexr bv + - g5.xlarge - g5.2xlarge - g5.4xlarge @@ -76,7 +76,8 @@ Parameters: MaxValue: 3653 Conditions: - IsProduction: !Equals [!Ref Environment, 'production'] + IsProduction: + Fn::Equals: [!Ref Environment, 'production'] Resources: # VPC Resources @@ -686,4 +687,4 @@ Outputs: RunnerAutoScalingGroupName: Description: Name of the runner Auto Scaling Group - Value: !Ref RunnerAutoScalingGroup \ No newline at end of file + Value: !Ref RunnerAutoScalingGroup From e582a16992b5dd4d190bc870e98adb7e237c1962 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 15:26:30 -0700 Subject: [PATCH 30/72] Addressing pre-che issues --- .../cloudformation/github-runners-stack.yml | 908 ++++++++++-------- 1 file changed, 485 insertions(+), 423 deletions(-) diff --git a/infrastructure/cloudformation/github-runners-stack.yml b/infrastructure/cloudformation/github-runners-stack.yml index c4d8561..4771e65 100644 --- a/infrastructure/cloudformation/github-runners-stack.yml +++ b/infrastructure/cloudformation/github-runners-stack.yml @@ -76,24 +76,30 @@ Parameters: MaxValue: 3653 Conditions: - IsProduction: - Fn::Equals: [!Ref Environment, 'production'] + IsProduction: + Fn::Equals: + - Fn::Ref: Environment + - production Resources: # VPC Resources VPC: Type: AWS::EC2::VPC Properties: - CidrBlock: !Ref VpcCidr + CidrBlock: + Fn::Ref: VpcCidr EnableDnsHostnames: true EnableDnsSupport: true Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-vpc + Value: + Fn::Sub: ${Environment}-${ProjectName}-vpc - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation @@ -102,48 +108,65 @@ Resources: Properties: Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-igw + Value: + Fn::Sub: ${Environment}-${ProjectName}-igw - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation VpcGatewayAttachment: Type: AWS::EC2::VPCGatewayAttachment Properties: - VpcId: !Ref VPC - InternetGatewayId: !Ref InternetGateway + VpcId: + Fn::Ref: VPC + InternetGatewayId: + Fn::Ref: InternetGateway PublicSubnet: Type: AWS::EC2::Subnet Properties: - VpcId: !Ref VPC - CidrBlock: !Ref PublicSubnetCidr - AvailabilityZone: !Select [0, !GetAZs ''] + VpcId: + Fn::Ref: VPC + CidrBlock: + Fn::Ref: PublicSubnetCidr + AvailabilityZone: + Fn::Select: + - 0 + - Fn::GetAZs: '' MapPublicIpOnLaunch: true Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-public-subnet + Value: + Fn::Sub: ${Environment}-${ProjectName}-public-subnet - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation RouteTable: Type: AWS::EC2::RouteTable Properties: - VpcId: !Ref VPC + VpcId: + Fn::Ref: VPC Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-rt + Value: + Fn::Sub: ${Environment}-${ProjectName}-rt - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation @@ -151,23 +174,29 @@ Resources: Type: AWS::EC2::Route DependsOn: VpcGatewayAttachment Properties: - RouteTableId: !Ref RouteTable + RouteTableId: + Fn::Ref: RouteTable DestinationCidrBlock: 0.0.0.0/0 - GatewayId: !Ref InternetGateway + GatewayId: + Fn::Ref: InternetGateway SubnetRouteTableAssociation: Type: AWS::EC2::SubnetRouteTableAssociation Properties: - SubnetId: !Ref PublicSubnet - RouteTableId: !Ref RouteTable + SubnetId: + Fn::Ref: PublicSubnet + RouteTableId: + Fn::Ref: RouteTable # Security Group BuildSecurityGroup: Type: AWS::EC2::SecurityGroup Properties: - GroupName: !Sub ${Environment}-${ProjectName}-build-sg + GroupName: + Fn::Sub: ${Environment}-${ProjectName}-build-sg GroupDescription: Security group for build instances - VpcId: !Ref VPC + VpcId: + Fn::Ref: VPC SecurityGroupIngress: - IpProtocol: tcp FromPort: 22 @@ -190,11 +219,14 @@ Resources: Description: Allow all outbound traffic Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-build-sg + Value: + Fn::Sub: ${Environment}-${ProjectName}-build-sg - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation @@ -202,7 +234,8 @@ Resources: RunnerRole: Type: AWS::IAM::Role Properties: - RoleName: !Sub ${Environment}-${ProjectName}-runner-role + RoleName: + Fn::Sub: ${Environment}-${ProjectName}-runner-role AssumeRolePolicyDocument: Version: '2012-10-17' Statement: @@ -214,9 +247,11 @@ Resources: - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly Tags: - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation @@ -225,25 +260,30 @@ Resources: Properties: Path: / Roles: - - !Ref RunnerRole - InstanceProfileName: !Sub ${Environment}-${ProjectName}-runner-profile + - Fn::Ref: RunnerRole + InstanceProfileName: + Fn::Sub: ${Environment}-${ProjectName}-runner-profile # Launch Template for runners RunnerLaunchTemplate: Type: AWS::EC2::LaunchTemplate Properties: - LaunchTemplateName: !Sub ${Environment}-${ProjectName}-runner-template + LaunchTemplateName: + Fn::Sub: ${Environment}-${ProjectName}-runner-template LaunchTemplateData: ImageId: ami-0eabc4ddf08279fc3 # Ubuntu 22.04 LTS - InstanceType: !Ref InstanceType + InstanceType: + Fn::Ref: InstanceType NetworkInterfaces: - DeviceIndex: 0 AssociatePublicIpAddress: true - SubnetId: !Ref PublicSubnet + SubnetId: + Fn::Ref: PublicSubnet Groups: - - !Ref BuildSecurityGroup + - Fn::Ref: BuildSecurityGroup IamInstanceProfile: - Name: !Ref RunnerInstanceProfile + Name: + Fn::Ref: RunnerInstanceProfile BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -262,407 +302,422 @@ Resources: - ResourceType: instance Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-runner + Value: + Fn::Sub: ${Environment}-${ProjectName}-runner - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName - Key: ManagedBy Value: CloudFormation UserData: - Fn::Base64: !Sub | - #!/bin/bash - set -e - - # Wait for apt lock to be released - while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do - echo "Waiting for other package manager to finish..." - sleep 1 - done - - # Install Docker - apt-get update - apt-get install -y ca-certificates curl gnupg - install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg - chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - - # Wait for apt lock again before updating - while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do - echo "Waiting for other package manager to finish..." - sleep 1 - done - - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - - # Debug information - echo "=== Debug Information ===" - lsblk - echo "=== Mount Points ===" - mount - echo "=== FSTAB ===" - cat /etc/fstab - - # Wait for EBS volume to be available and find the correct device - echo "Waiting for EBS volume to be attached..." - while true; do - # Check for NVMe devices - if [ -e /dev/nvme1n1 ]; then - EBS_DEVICE="/dev/nvme1n1" - break - elif [ -e /dev/nvme0n1 ]; then - EBS_DEVICE="/dev/nvme0n1" - break + Fn::Base64: + Fn::Sub: | + #!/bin/bash + set -e + + # Wait for apt lock to be released + while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "Waiting for other package manager to finish..." + sleep 1 + done + + # Install Docker + apt-get update + apt-get install -y ca-certificates curl gnupg + install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + chmod a+r /etc/apt/keyrings/docker.gpg + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Wait for apt lock again before updating + while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "Waiting for other package manager to finish..." + sleep 1 + done + + apt-get update + apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + + # Debug information + echo "=== Debug Information ===" + lsblk + echo "=== Mount Points ===" + mount + echo "=== FSTAB ===" + cat /etc/fstab + + # Wait for EBS volume to be available and find the correct device + echo "Waiting for EBS volume to be attached..." + while true; do + # Check for NVMe devices + if [ -e /dev/nvme1n1 ]; then + EBS_DEVICE="/dev/nvme1n1" + break + elif [ -e /dev/nvme0n1 ]; then + EBS_DEVICE="/dev/nvme0n1" + break + fi + echo "Waiting for EBS volume..." + sleep 5 + done + + echo "Found EBS volume at $EBS_DEVICE" + + # Wait for the volume to be ready + sleep 10 + + # Check if the volume is already formatted + if ! blkid $EBS_DEVICE; then + echo "Formatting EBS volume..." + mkfs -t ext4 $EBS_DEVICE fi - echo "Waiting for EBS volume..." - sleep 5 - done - - echo "Found EBS volume at $EBS_DEVICE" - - # Wait for the volume to be ready - sleep 10 - - # Check if the volume is already formatted - if ! blkid $EBS_DEVICE; then - echo "Formatting EBS volume..." - mkfs -t ext4 $EBS_DEVICE - fi - - # Create mount point and add to fstab - mkdir -p /var/lib/docker-cache - - # Remove any existing mount entry for this device - sed -i '/\/var\/lib\/docker-cache/d' /etc/fstab - - # Add new mount entry - echo "$EBS_DEVICE /var/lib/docker-cache ext4 defaults,nofail 0 2" >> /etc/fstab - - # Unmount if already mounted - umount /var/lib/docker-cache 2>/dev/null || true - - # Mount the volume - mount -a - - # Verify mount - echo "=== After Mount ===" - mount | grep docker-cache - df -h /var/lib/docker-cache - - # Configure Docker to use the mounted volume for cache - mkdir -p /etc/docker - cat > /etc/docker/daemon.json << EOF - { - "data-root": "/var/lib/docker-cache", - "storage-driver": "overlay2", - "log-driver": "json-file", - "log-opts": { - "max-size": "100m", - "max-file": "3" + + # Create mount point and add to fstab + mkdir -p /var/lib/docker-cache + + # Remove any existing mount entry for this device + sed -i '/\/var\/lib\/docker-cache/d' /etc/fstab + + # Add new mount entry + echo "$EBS_DEVICE /var/lib/docker-cache ext4 defaults,nofail 0 2" >> /etc/fstab + + # Unmount if already mounted + umount /var/lib/docker-cache 2>/dev/null || true + + # Mount the volume + mount -a + + # Verify mount + echo "=== After Mount ===" + mount | grep docker-cache + df -h /var/lib/docker-cache + + # Configure Docker to use the mounted volume for cache + mkdir -p /etc/docker + cat > /etc/docker/daemon.json << EOF + { + "data-root": "/var/lib/docker-cache", + "storage-driver": "overlay2", + "log-driver": "json-file", + "log-opts": { + "max-size": "100m", + "max-file": "3" + } } - } - EOF - - # Stop Docker before moving data - systemctl stop docker - - # Move existing Docker data if it exists - if [ -d "/var/lib/docker" ] && [ "$(ls -A /var/lib/docker)" ]; then - echo "Moving existing Docker data..." - mv /var/lib/docker/* /var/lib/docker-cache/ 2>/dev/null || true - fi - - # Restart Docker to apply new configuration - systemctl start docker - - # Create Docker cache cleanup script - cat > /usr/local/bin/cleanup-docker-cache.sh << 'EOF' - #!/bin/bash - - # Set threshold (in percentage) for cleanup - THRESHOLD=80 - - # Get current disk usage - USAGE=$(df -h /var/lib/docker-cache | awk 'NR==2 {print $5}' | sed 's/%//') - - if [ "$USAGE" -gt "$THRESHOLD" ]; then - echo "Docker cache usage is at $USAGE%, cleaning up..." - - # Remove unused containers - docker container prune -f - - # Remove unused images - docker image prune -a -f - - # Remove unused volumes - docker volume prune -f - - # Remove build cache - docker builder prune -f - - echo "Cleanup completed" - else - echo "Docker cache usage is at $USAGE%, no cleanup needed" - fi - EOF - - chmod +x /usr/local/bin/cleanup-docker-cache.sh - - # Add cleanup script to crontab - (crontab -l 2>/dev/null; echo "0 */4 * * * /usr/local/bin/cleanup-docker-cache.sh") | crontab - - - # Wait for apt lock before installing jq - while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do - echo "Waiting for other package manager to finish..." - sleep 1 - done - - # Install jq for JSON parsing - apt-get install -y jq - - # Install GitHub Actions runner - mkdir -p /opt/github-runner - cd /opt/github-runner - curl -o actions-runner-linux-x64-2.323.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-linux-x64-2.323.0.tar.gz - echo "0dbc9bf5a58620fc52cb6cc0448abcca964a8d74b5f39773b7afcad9ab691e19 actions-runner-linux-x64-2.323.0.tar.gz" | shasum -a 256 -c - tar xzf ./actions-runner-linux-x64-2.323.0.tar.gz - - # Create runner user and add to docker group - useradd -m -s /bin/bash github-runner - usermod -aG docker github-runner - - # Set ownership of runner directory - chown -R github-runner:github-runner /opt/github-runner - - # Create systemd service for GitHub runner - cat > /etc/systemd/system/github-runner.service << EOF - [Unit] - Description=GitHub Actions Runner - After=network.target docker.service - StartLimitIntervalSec=0 - - [Service] - Type=simple - User=github-runner - WorkingDirectory=/opt/github-runner - ExecStartPre=/bin/bash -c 'if [ ! -f /opt/github-runner/.runner ] || [ ! -f /opt/github-runner/.credentials ]; then \ - rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials; \ - ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${GitHubToken}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token" | jq -r ".token") \ - --labels self-hosted,gpu --unattended --ephemeral --replace || exit 1; fi' - ExecStart=/opt/github-runner/run.sh - ExecStopPost=/bin/bash -c 'rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials' - Restart=always - RestartSec=10 - StartLimitBurst=10 - TimeoutStartSec=300 - TimeoutStopSec=300 - KillMode=process - KillSignal=SIGTERM - Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" - Environment="DOCKER_HOST=unix:///var/run/docker.sock" - Environment="ACTIONS_RUNNER_HOOK_JOB_STARTED=/usr/local/bin/runner-hook.sh" - Environment="ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/local/bin/runner-hook.sh" - Environment="RUNNER_ALLOW_RUNASROOT=0" - - [Install] - WantedBy=multi-user.target - EOF - - # Create runner hook script to handle job lifecycle - cat > /usr/local/bin/runner-hook.sh << 'EOF' - #!/bin/bash - set -e - - # Log the event with timestamp and more details - log() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner.log - } - - log "Runner hook called with event: $1" - log "Current Docker status: $(systemctl status docker | grep Active)" - - case "$1" in - "job_started") - # Verify Docker is running and healthy - if ! docker info >/dev/null 2>&1; then - log "Docker appears to be unhealthy, attempting restart" + EOF + + # Stop Docker before moving data + systemctl stop docker + + # Move existing Docker data if it exists + if [ -d "/var/lib/docker" ] && [ "$(ls -A /var/lib/docker)" ]; then + echo "Moving existing Docker data..." + mv /var/lib/docker/* /var/lib/docker-cache/ 2>/dev/null || true + fi + + # Restart Docker to apply new configuration + systemctl start docker + + # Create Docker cache cleanup script + cat > /usr/local/bin/cleanup-docker-cache.sh << 'EOF' + #!/bin/bash + + # Set threshold (in percentage) for cleanup + THRESHOLD=80 + + # Get current disk usage + USAGE=$(df -h /var/lib/docker-cache | awk 'NR==2 {print $5}' | sed 's/%//') + + if [ "$USAGE" -gt "$THRESHOLD" ]; then + echo "Docker cache usage is at $USAGE%, cleaning up..." + + # Remove unused containers + docker container prune -f + + # Remove unused images + docker image prune -a -f + + # Remove unused volumes + docker volume prune -f + + # Remove build cache + docker builder prune -f + + echo "Cleanup completed" + else + echo "Docker cache usage is at $USAGE%, no cleanup needed" + fi + EOF + + chmod +x /usr/local/bin/cleanup-docker-cache.sh + + # Add cleanup script to crontab + (crontab -l 2>/dev/null; echo "0 */4 * * * /usr/local/bin/cleanup-docker-cache.sh") | crontab - + + # Wait for apt lock before installing jq + while fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "Waiting for other package manager to finish..." + sleep 1 + done + + # Install jq for JSON parsing + apt-get install -y jq + + # Install GitHub Actions runner + mkdir -p /opt/github-runner + cd /opt/github-runner + curl -o actions-runner-linux-x64-2.323.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-linux-x64-2.323.0.tar.gz + echo "0dbc9bf5a58620fc52cb6cc0448abcca964a8d74b5f39773b7afcad9ab691e19 actions-runner-linux-x64-2.323.0.tar.gz" | shasum -a 256 -c + tar xzf ./actions-runner-linux-x64-2.323.0.tar.gz + + # Create runner user and add to docker group + useradd -m -s /bin/bash github-runner + usermod -aG docker github-runner + + # Set ownership of runner directory + chown -R github-runner:github-runner /opt/github-runner + + # Create systemd service for GitHub runner + cat > /etc/systemd/system/github-runner.service << EOF + [Unit] + Description=GitHub Actions Runner + After=network.target docker.service + StartLimitIntervalSec=0 + + [Service] + Type=simple + User=github-runner + WorkingDirectory=/opt/github-runner + ExecStartPre=/bin/bash -c 'if [ ! -f /opt/github-runner/.runner ] || [ ! -f /opt/github-runner/.credentials ]; then \ + rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials; \ + ./config.sh --url https://github.com/${GitHubRepoPath} --token \$(curl -s -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${GitHubToken}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/${GitHubRepoPath}/actions/runners/registration-token" | jq -r ".token") \ + --labels self-hosted,gpu --unattended --ephemeral --replace || exit 1; fi' + ExecStart=/opt/github-runner/run.sh + ExecStopPost=/bin/bash -c 'rm -rf /opt/github-runner/.runner /opt/github-runner/.credentials' + Restart=always + RestartSec=10 + StartLimitBurst=10 + TimeoutStartSec=300 + TimeoutStopSec=300 + KillMode=process + KillSignal=SIGTERM + Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + Environment="DOCKER_HOST=unix:///var/run/docker.sock" + Environment="ACTIONS_RUNNER_HOOK_JOB_STARTED=/usr/local/bin/runner-hook.sh" + Environment="ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/local/bin/runner-hook.sh" + Environment="RUNNER_ALLOW_RUNASROOT=0" + + [Install] + WantedBy=multi-user.target + EOF + + # Create runner hook script to handle job lifecycle + cat > /usr/local/bin/runner-hook.sh << 'EOF' + #!/bin/bash + set -e + + # Log the event with timestamp and more details + log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner.log + } + + log "Runner hook called with event: $1" + log "Current Docker status: $(systemctl status docker | grep Active)" + + case "$1" in + "job_started") + # Verify Docker is running and healthy + if ! docker info >/dev/null 2>&1; then + log "Docker appears to be unhealthy, attempting restart" + systemctl restart docker + sleep 10 + if ! docker info >/dev/null 2>&1; then + log "Docker failed to recover after restart" + exit 1 + fi + log "Docker successfully restarted" + fi + # Clean up any stale Docker resources + docker system prune -f + log "System pruned before job start" + ;; + "job_completed") + # Clean up after job completion + docker system prune -f + log "System pruned after job completion" + ;; + esac + EOF + + chmod +x /usr/local/bin/runner-hook.sh + + # Create a more robust runner health check script + cat > /usr/local/bin/reconfigure-runner.sh << 'EOF' + #!/bin/bash + set -e + + # Log function + log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner-health.log + } + + # Check if the runner service is active + check_runner_service() { + if ! systemctl is-active github-runner >/dev/null 2>&1; then + log "Runner service is not active" + return 1 + fi + return 0 + } + + # Check if runner process is running + check_runner_process() { + if ! pgrep -f "run.sh" >/dev/null; then + log "Runner process is not running" + return 1 + fi + return 0 + } + + # Check Docker health + check_docker_health() { + if ! docker info >/dev/null 2>&1; then + log "Docker is not healthy" + return 1 + fi + return 0 + } + + # Main health check logic + log "Starting health check" + + NEEDS_RESTART=0 + + # Check Docker first + if ! check_docker_health; then + log "Attempting to restart Docker" systemctl restart docker sleep 10 - if ! docker info >/dev/null 2>&1; then - log "Docker failed to recover after restart" - exit 1 + if ! check_docker_health; then + log "Docker failed to recover" + NEEDS_RESTART=1 fi - log "Docker successfully restarted" - fi - # Clean up any stale Docker resources - docker system prune -f - log "System pruned before job start" - ;; - "job_completed") - # Clean up after job completion - docker system prune -f - log "System pruned after job completion" - ;; - esac - EOF - - chmod +x /usr/local/bin/runner-hook.sh - - # Create a more robust runner health check script - cat > /usr/local/bin/reconfigure-runner.sh << 'EOF' - #!/bin/bash - set -e - - # Log function - log() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner-health.log - } - - # Check if the runner service is active - check_runner_service() { - if ! systemctl is-active github-runner >/dev/null 2>&1; then - log "Runner service is not active" - return 1 - fi - return 0 - } - - # Check if runner process is running - check_runner_process() { - if ! pgrep -f "run.sh" >/dev/null; then - log "Runner process is not running" - return 1 - fi - return 0 - } - - # Check Docker health - check_docker_health() { - if ! docker info >/dev/null 2>&1; then - log "Docker is not healthy" - return 1 - fi - return 0 - } - - # Main health check logic - log "Starting health check" - - NEEDS_RESTART=0 - - # Check Docker first - if ! check_docker_health; then - log "Attempting to restart Docker" - systemctl restart docker - sleep 10 - if ! check_docker_health; then - log "Docker failed to recover" - NEEDS_RESTART=1 - fi - fi - - # Check runner service and process - if ! check_runner_service || ! check_runner_process; then - NEEDS_RESTART=1 - fi - - if [ $NEEDS_RESTART -eq 1 ]; then - log "Issues detected, performing full runner reset" - - # Stop services - systemctl stop github-runner - - # Clean up runner files - rm -rf /opt/github-runner/.runner - rm -rf /opt/github-runner/.credentials - rm -rf /opt/github-runner/.env - - # Clean up Docker - docker system prune -af --volumes - - # Restart Docker - systemctl restart docker - sleep 10 - - # Start runner service - systemctl start github-runner - - log "Runner reset completed" - else - log "Health check passed" - fi - EOF - - chmod +x /usr/local/bin/reconfigure-runner.sh - - # Create a more frequent health check timer - cat > /etc/systemd/system/github-runner-healthcheck.timer << EOF - [Unit] - Description=Run GitHub Runner Health Check frequently - - [Timer] - OnBootSec=1min - OnUnitActiveSec=5min - RandomizedDelaySec=30 - Unit=github-runner-healthcheck.service - - [Install] - WantedBy=multi-user.target - EOF - - # Create log rotation for runner logs - cat > /etc/logrotate.d/github-runner << EOF - /var/log/github-runner*.log { - daily - rotate 7 - compress - delaycompress - missingok - notifempty - create 0644 github-runner github-runner - } - EOF - - # Create runner logs with proper permissions - touch /var/log/github-runner.log /var/log/github-runner-health.log - chown github-runner:github-runner /var/log/github-runner.log /var/log/github-runner-health.log - chmod 644 /var/log/github-runner.log /var/log/github-runner-health.log - - # Enable and start the services - systemctl daemon-reload - systemctl enable github-runner - systemctl enable github-runner-healthcheck.timer - systemctl start github-runner - systemctl start github-runner-healthcheck.timer + fi + + # Check runner service and process + if ! check_runner_service || ! check_runner_process; then + NEEDS_RESTART=1 + fi + + if [ $NEEDS_RESTART -eq 1 ]; then + log "Issues detected, performing full runner reset" + + # Stop services + systemctl stop github-runner + + # Clean up runner files + rm -rf /opt/github-runner/.runner + rm -rf /opt/github-runner/.credentials + rm -rf /opt/github-runner/.env + + # Clean up Docker + docker system prune -af --volumes + + # Restart Docker + systemctl restart docker + sleep 10 + + # Start runner service + systemctl start github-runner + + log "Runner reset completed" + else + log "Health check passed" + fi + EOF + + chmod +x /usr/local/bin/reconfigure-runner.sh + + # Create a more frequent health check timer + cat > /etc/systemd/system/github-runner-healthcheck.timer << EOF + [Unit] + Description=Run GitHub Runner Health Check frequently + + [Timer] + OnBootSec=1min + OnUnitActiveSec=5min + RandomizedDelaySec=30 + Unit=github-runner-healthcheck.service + + [Install] + WantedBy=multi-user.target + EOF + + # Create log rotation for runner logs + cat > /etc/logrotate.d/github-runner << EOF + /var/log/github-runner*.log { + daily + rotate 7 + compress + delaycompress + missingok + notifempty + create 0644 github-runner github-runner + } + EOF + + # Create runner logs with proper permissions + touch /var/log/github-runner.log /var/log/github-runner-health.log + chown github-runner:github-runner /var/log/github-runner.log /var/log/github-runner-health.log + chmod 644 /var/log/github-runner.log /var/log/github-runner-health.log + + # Enable and start the services + systemctl daemon-reload + systemctl enable github-runner + systemctl enable github-runner-healthcheck.timer + systemctl start github-runner + systemctl start github-runner-healthcheck.timer # Auto Scaling Group for runners RunnerAutoScalingGroup: Type: AWS::AutoScaling::AutoScalingGroup Properties: - AutoScalingGroupName: !Sub ${Environment}-${ProjectName}-runners + AutoScalingGroupName: + Fn::Sub: ${Environment}-${ProjectName}-runners LaunchTemplate: - LaunchTemplateId: !Ref RunnerLaunchTemplate - Version: !GetAtt RunnerLaunchTemplate.LatestVersionNumber - MinSize: !Ref RunnerCount - MaxSize: !Ref RunnerCount - DesiredCapacity: !Ref RunnerCount + LaunchTemplateId: + Fn::Ref: RunnerLaunchTemplate + Version: + Fn::GetAtt: + - RunnerLaunchTemplate + - LatestVersionNumber + MinSize: + Fn::Ref: RunnerCount + MaxSize: + Fn::Ref: RunnerCount + DesiredCapacity: + Fn::Ref: RunnerCount VPCZoneIdentifier: - - !Ref PublicSubnet + - Fn::Ref: PublicSubnet Tags: - Key: Name - Value: !Sub ${Environment}-${ProjectName}-runner + Value: + Fn::Sub: ${Environment}-${ProjectName}-runner PropagateAtLaunch: true - Key: Environment - Value: !Ref Environment + Value: + Fn::Ref: Environment PropagateAtLaunch: true - Key: Project - Value: !Ref ProjectName + Value: + Fn::Ref: ProjectName PropagateAtLaunch: true - Key: ManagedBy Value: CloudFormation @@ -671,20 +726,27 @@ Resources: Outputs: VpcId: Description: ID of the VPC - Value: !Ref VPC + Value: + Fn::Ref: VPC PublicSubnetId: Description: ID of the public subnet - Value: !Ref PublicSubnet + Value: + Fn::Ref: PublicSubnet SecurityGroupId: Description: ID of the build security group - Value: !Ref BuildSecurityGroup + Value: + Fn::Ref: BuildSecurityGroup RunnerRoleArn: Description: ARN of the runner IAM role - Value: !GetAtt RunnerRole.Arn + Value: + Fn::GetAtt: + - RunnerRole + - Arn RunnerAutoScalingGroupName: Description: Name of the runner Auto Scaling Group - Value: !Ref RunnerAutoScalingGroup + Value: + Fn::Ref: RunnerAutoScalingGroup From 97517495408878c29d1e1c1e3485f0dd656bf0a2 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 15:42:25 -0700 Subject: [PATCH 31/72] Addressing pre-che issues --- .github/workflows/ci.yml | 48 +++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 776679d..a00a812 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -136,18 +136,30 @@ jobs: apt-get update DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip - python3 -m pip install -e . - python3 -m pip install pytest pytest-xdist pytest-cov junitparser + python3 -m pip install poetry + poetry install - name: Run unit tests run: | mkdir -p test-results - pytest \ - --junitxml=test-results/unit-tests.xml \ - --cov=. \ - --cov-report=xml:test-results/coverage.xml \ - --cov-report=term-missing \ - tests/unit + poetry run python -m unittest discover -p "test_*.py" -v > test-results/unit-tests.txt + # Convert unittest output to JUnit XML format + poetry run python -c 'import xml.etree.ElementTree as ET; import re; import sys; \ + def parse_unittest_output(filename): \ + root = ET.Element("testsuites"); \ + suite = ET.SubElement(root, "testsuite", name="unit-tests"); \ + with open(filename, "r") as f: \ + content = f.read(); \ + test_pattern = r"test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)"; \ + for match in re.finditer(test_pattern, content): \ + test_name, test_class, status = match.groups(); \ + testcase = ET.SubElement(suite, "testcase", classname=test_class, name=test_name); \ + if status != "ok": \ + failure = ET.SubElement(testcase, "failure"); \ + failure.text = f"Test {test_name} in {test_class} {status.lower()}ed"; \ + tree = ET.ElementTree(root); \ + tree.write("test-results/unit-tests.xml", encoding="utf-8", xml_declaration=True); \ + parse_unittest_output("test-results/unit-tests.txt")' - name: Upload test results if: always() @@ -181,16 +193,19 @@ jobs: apt-get update DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip - python3 -m pip install -e . + python3 -m pip install poetry + poetry install - name: Run E2E tests run: | + set -e mkdir -p test-results - python3 scripts/generate_graph.py \ + poetry run python scripts/generate_graph.py \ --map-path maps/carter_warehouse_navigation.png \ --graph-eval.active \ --perf-eval.active - python3 scripts/evaluate_graph.py \ + poetry run python scripts/evaluate_graph.py \ + --graph-path graphs/graph.gml \ --map-path maps/carter_warehouse_navigation.png \ --output-dir results \ --resolution 0.05 \ @@ -242,10 +257,13 @@ jobs: set -e cd docker || { echo "Missing docker directory"; exit 1; } docker compose build --no-cache - docker compose up -d rest-api - sleep 15 # Wait for service initialization - docker ps -a - pytest ../tests/integration/test_api.py -v + docker compose up rest-api & + sleep 10 + cd ../ + poetry install + poetry run python scripts/test_api_client.py \ + --map_path maps/carter_warehouse_navigation.png \ + --host docker - name: Cleanup if: always() From eb7d7f3a19d0ec8c61e5c680eae2a1e357d3035c Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 15:52:34 -0700 Subject: [PATCH 32/72] Addressing pre-che issues --- .github/workflows/ci.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a00a812..81abce7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,8 +55,9 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git python3 -m pip install --upgrade pip - python3 -m pip install pre-commit - pre-commit --version + python3 -m pip install poetry + poetry env use python3.12 + poetry install - name: Run pre-commit run: | @@ -99,6 +100,8 @@ jobs: apt-get update && apt-get install -y --no-install-recommends git python3 -m pip install --upgrade pip python3 -m pip install poetry + poetry env use python3.12 + poetry install - name: Run Poetry check run: poetry check @@ -137,6 +140,7 @@ jobs: DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip python3 -m pip install poetry + poetry env use python3.12 poetry install - name: Run unit tests @@ -194,6 +198,7 @@ jobs: DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 python3 -m pip install --upgrade pip python3 -m pip install poetry + poetry env use python3.12 poetry install - name: Run E2E tests @@ -250,7 +255,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.12' - name: Build and test run: | From 350551c55375a537b2380439ad05359398735aff Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 16:29:21 -0700 Subject: [PATCH 33/72] Addressing pre-che issues --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81abce7..e0bca59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,9 +55,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git python3 -m pip install --upgrade pip - python3 -m pip install poetry - poetry env use python3.12 - poetry install + python3 -m pip install pre-commit + pre-commit --version - name: Run pre-commit run: | @@ -265,6 +264,7 @@ jobs: docker compose up rest-api & sleep 10 cd ../ + poetry env use python3.12 poetry install poetry run python scripts/test_api_client.py \ --map_path maps/carter_warehouse_navigation.png \ From e246194823937989b547c70c7163f7245bd05ffc Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 16:39:30 -0700 Subject: [PATCH 34/72] Addressing pre-che issues --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2017c11..b39eb35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,10 +22,10 @@ name = "swagger" version = "1.0.0" description = "A library for generating waypoint graphs from occupancy grid maps" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.12,<4.0" classifiers = [ "Development Status :: 3 - Alpha", - "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.12", ] dependencies = [ "colorlog", @@ -37,7 +37,7 @@ dependencies = [ "networkx>=2.6.0", "numba", "numpy>=1.21.0,<2.0", - "numpydantic", + "numpydantic>=1.6.9", "opencv-python>=4.5.0", "psutil", "pydantic==2.10.6", From e57c8ad0e5ccb04924389d54ce7709834f76094d Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 16:49:28 -0700 Subject: [PATCH 35/72] Addressing pre-che issues --- .github/workflows/ci.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e0bca59..45fdafa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: name: Pre-commit Checks runs-on: self-hosted container: - image: python:3.12-slim-bullseye + image: python:3.12-slim steps: - name: Install Git run: | @@ -74,7 +74,7 @@ jobs: runs-on: self-hosted needs: pre-commit container: - image: python:3.12-slim-bullseye + image: python:3.12-slim steps: - name: Install Git run: | @@ -124,6 +124,11 @@ jobs: - uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Cache Python packages uses: actions/cache@v4 with: @@ -190,6 +195,11 @@ jobs: - uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install dependencies run: | set -e From 117418ba22494b3c37aa0735db15f97a0baeee4f Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 16:54:46 -0700 Subject: [PATCH 36/72] Addressing pre-che issues --- .github/workflows/ci.yml | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45fdafa..268c837 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -152,22 +152,7 @@ jobs: mkdir -p test-results poetry run python -m unittest discover -p "test_*.py" -v > test-results/unit-tests.txt # Convert unittest output to JUnit XML format - poetry run python -c 'import xml.etree.ElementTree as ET; import re; import sys; \ - def parse_unittest_output(filename): \ - root = ET.Element("testsuites"); \ - suite = ET.SubElement(root, "testsuite", name="unit-tests"); \ - with open(filename, "r") as f: \ - content = f.read(); \ - test_pattern = r"test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)"; \ - for match in re.finditer(test_pattern, content): \ - test_name, test_class, status = match.groups(); \ - testcase = ET.SubElement(suite, "testcase", classname=test_class, name=test_name); \ - if status != "ok": \ - failure = ET.SubElement(testcase, "failure"); \ - failure.text = f"Test {test_name} in {test_class} {status.lower()}ed"; \ - tree = ET.ElementTree(root); \ - tree.write("test-results/unit-tests.xml", encoding="utf-8", xml_declaration=True); \ - parse_unittest_output("test-results/unit-tests.txt")' + poetry run python -c "import xml.etree.ElementTree as ET, re; root = ET.Element('testsuites'); suite = ET.SubElement(root, 'testsuite', name='unit-tests'); [suite.append(ET.SubElement(suite, 'testcase', classname=m.group(2), name=m.group(1))) if m.group(3) == 'ok' else suite.append(ET.SubElement(suite, 'testcase', classname=m.group(2), name=m.group(1)).append(ET.SubElement(suite, 'failure', message=f'Test {m.group(1)} in {m.group(2)} {m.group(3).lower()}ed'))) for m in re.finditer(r'test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)', open('test-results/unit-tests.txt').read())]; ET.ElementTree(root).write('test-results/unit-tests.xml', encoding='utf-8', xml_declaration=True)" - name: Upload test results if: always() From a787b6be7e310c03e568d5348b6ac191bff42067 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:03:07 -0700 Subject: [PATCH 37/72] Fixing e2e stage --- .github/workflows/ci.yml | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 268c837..215f43c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -168,6 +168,7 @@ jobs: runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: all + NVIDIA_VISIBLE_DEVICES: all container: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 @@ -195,10 +196,14 @@ jobs: poetry env use python3.12 poetry install + - name: Verify CUDA setup + run: | + nvidia-smi + python3 -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('CUDA device count:', torch.cuda.device_count()); print('CUDA device name:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None')" + - name: Run E2E tests run: | set -e - mkdir -p test-results poetry run python scripts/generate_graph.py \ --map-path maps/carter_warehouse_navigation.png \ --graph-eval.active \ @@ -211,20 +216,9 @@ jobs: --safety-distance 0.3 \ --occupancy-threshold 127 - - name: Upload artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: e2e-results - path: | - test-results/ - results/ - graphs/ - retention-days: 7 - - name: Cleanup if: always() - run: rm -rf results graphs test-results + run: rm -rf results graphs docker-build: name: Docker Build and Test From 3108f7b3ea525f7922995e7bdf6925eb4c9b7485 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:11:32 -0700 Subject: [PATCH 38/72] Fixing e2e stage --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 215f43c..f53aba9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -180,6 +180,8 @@ jobs: git config --global --add safe.directory /__w/SWAGGER/SWAGGER - uses: actions/checkout@v4 + with: + lfs: true - name: Setup Python uses: actions/setup-python@v5 @@ -199,7 +201,6 @@ jobs: - name: Verify CUDA setup run: | nvidia-smi - python3 -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('CUDA device count:', torch.cuda.device_count()); print('CUDA device name:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None')" - name: Run E2E tests run: | From 997f8afb9702e0849d16dedceadffe788d24c75f Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:21:14 -0700 Subject: [PATCH 39/72] Fixing e2e stage --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f53aba9..835b6bc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -176,8 +176,9 @@ jobs: - name: Install Git run: | apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install - uses: actions/checkout@v4 with: From 6d4aeec2bfa8ed2794dd3313849cf1ba7ba8590d Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:28:05 -0700 Subject: [PATCH 40/72] Fixing e2e stage --- .github/workflows/ci.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 835b6bc..7aed685 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -193,7 +193,11 @@ jobs: run: | set -e apt-get update - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3-pip python3-venv \ + libgl1-mesa-glx libglib2.0-0 \ + libcudnn8 libcudnn8-dev \ + nvidia-cuda-toolkit python3 -m pip install --upgrade pip python3 -m pip install poetry poetry env use python3.12 @@ -201,7 +205,9 @@ jobs: - name: Verify CUDA setup run: | + echo "Checking CUDA setup..." nvidia-smi + python3 -c 'import os, sys, cv2, numpy as np; print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES")); print("NVIDIA_VISIBLE_DEVICES:", os.environ.get("NVIDIA_VISIBLE_DEVICES")); try: img = cv2.imread("maps/carter_warehouse_navigation.png"); print("Image exists:", os.path.exists("maps/carter_warehouse_navigation.png")); print("Image size:", os.path.getsize("maps/carter_warehouse_navigation.png")); if img is None: print("Failed to read image with OpenCV"); sys.exit(1); print("Image shape:", img.shape); print("Image type:", img.dtype); except Exception as e: print("Error reading image:", str(e)); sys.exit(1)' - name: Run E2E tests run: | From d325b45905d159bd7aeef5cb802de34b0fbe69d2 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:32:38 -0700 Subject: [PATCH 41/72] Fixing e2e stage --- .github/workflows/ci.yml | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7aed685..61f65ee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,13 +33,14 @@ jobs: - name: Install Git run: | apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 + lfs: true - name: Cache Python packages uses: actions/cache@v4 @@ -79,11 +80,14 @@ jobs: - name: Install Git run: | apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install - name: Checkout code uses: actions/checkout@v4 + with: + lfs: true - name: Cache pip packages for Poetry uses: actions/cache@v4 @@ -119,10 +123,13 @@ jobs: - name: Install Git run: | apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install - uses: actions/checkout@v4 + with: + lfs: true - name: Setup Python uses: actions/setup-python@v5 @@ -243,10 +250,13 @@ jobs: - name: Install Git run: | apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install - uses: actions/checkout@v4 + with: + lfs: true - name: Setup Python uses: actions/setup-python@v5 From 60f8a580cf20e39321866ee6b0d632da53384dc5 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:49:40 -0700 Subject: [PATCH 42/72] Fixing e2e stage --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 61f65ee..7e641a1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -120,12 +120,16 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Install Git + - name: Install Git and Git LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER git lfs install + # Remove pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi - uses: actions/checkout@v4 with: From afebeaf533498f91419d87fef40ec97c58dae200 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 17:56:11 -0700 Subject: [PATCH 43/72] Fixing e2e stage --- .github/workflows/ci.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7e641a1..9cdd204 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,15 +125,16 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install - # Remove pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi + git lfs install --force + git config --global lfs.allowincompletepush true + git config --global lfs.concurrenttransfers 8 + git config --global lfs.batch true + git config --global lfs.verify false - uses: actions/checkout@v4 with: lfs: true + fetch-depth: 0 - name: Setup Python uses: actions/setup-python@v5 @@ -165,7 +166,7 @@ jobs: # Convert unittest output to JUnit XML format poetry run python -c "import xml.etree.ElementTree as ET, re; root = ET.Element('testsuites'); suite = ET.SubElement(root, 'testsuite', name='unit-tests'); [suite.append(ET.SubElement(suite, 'testcase', classname=m.group(2), name=m.group(1))) if m.group(3) == 'ok' else suite.append(ET.SubElement(suite, 'testcase', classname=m.group(2), name=m.group(1)).append(ET.SubElement(suite, 'failure', message=f'Test {m.group(1)} in {m.group(2)} {m.group(3).lower()}ed'))) for m in re.finditer(r'test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)', open('test-results/unit-tests.txt').read())]; ET.ElementTree(root).write('test-results/unit-tests.xml', encoding='utf-8', xml_declaration=True)" - - name: Upload test results + - name: Upload test resul ts if: always() uses: actions/upload-artifact@v4 with: From dfbe9417e9f06d4ea932d828632ee9a778a796c4 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:00:31 -0700 Subject: [PATCH 44/72] Fixing e2e stage --- .github/workflows/ci.yml | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9cdd204..322f271 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,17 +30,22 @@ jobs: container: image: python:3.12-slim steps: - - name: Install Git + - name: Setup Git and LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install + git lfs install --force + mkdir -p .git/hooks + rm -f .git/hooks/pre-push + rm -f .git/hooks/post-checkout + git config --global core.hooksPath /dev/null - name: Checkout code uses: actions/checkout@v4 with: lfs: true + fetch-depth: 0 - name: Cache Python packages uses: actions/cache@v4 @@ -53,8 +58,6 @@ jobs: - name: Install dependencies run: | set -e - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git python3 -m pip install --upgrade pip python3 -m pip install pre-commit pre-commit --version @@ -77,7 +80,7 @@ jobs: container: image: python:3.12-slim steps: - - name: Install Git + - name: Setup Git and LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs @@ -100,7 +103,6 @@ jobs: - name: Install dependencies run: | set -e - apt-get update && apt-get install -y --no-install-recommends git python3 -m pip install --upgrade pip python3 -m pip install poetry poetry env use python3.12 @@ -120,21 +122,16 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Install Git and Git LFS + - name: Setup Git and LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install --force - git config --global lfs.allowincompletepush true - git config --global lfs.concurrenttransfers 8 - git config --global lfs.batch true - git config --global lfs.verify false + git lfs install - uses: actions/checkout@v4 with: lfs: true - fetch-depth: 0 - name: Setup Python uses: actions/setup-python@v5 @@ -164,9 +161,9 @@ jobs: mkdir -p test-results poetry run python -m unittest discover -p "test_*.py" -v > test-results/unit-tests.txt # Convert unittest output to JUnit XML format - poetry run python -c "import xml.etree.ElementTree as ET, re; root = ET.Element('testsuites'); suite = ET.SubElement(root, 'testsuite', name='unit-tests'); [suite.append(ET.SubElement(suite, 'testcase', classname=m.group(2), name=m.group(1))) if m.group(3) == 'ok' else suite.append(ET.SubElement(suite, 'testcase', classname=m.group(2), name=m.group(1)).append(ET.SubElement(suite, 'failure', message=f'Test {m.group(1)} in {m.group(2)} {m.group(3).lower()}ed'))) for m in re.finditer(r'test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)', open('test-results/unit-tests.txt').read())]; ET.ElementTree(root).write('test-results/unit-tests.xml', encoding='utf-8', xml_declaration=True)" + poetry run python -c 'import xml.etree.ElementTree as ET, re; root = ET.Element("testsuites"); suite = ET.SubElement(root, "testsuite", name="unit-tests"); [suite.append(ET.SubElement(suite, "testcase", classname=m.group(2), name=m.group(1))) if m.group(3) == "ok" else suite.append(ET.SubElement(suite, "testcase", classname=m.group(2), name=m.group(1)).append(ET.SubElement(suite, "failure", message=f"Test {m.group(1)} in {m.group(2)} {m.group(3).lower()}ed"))) for m in re.finditer(r"test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)", open("test-results/unit-tests.txt").read())]; ET.ElementTree(root).write("test-results/unit-tests.xml", encoding="utf-8", xml_declaration=True)' - - name: Upload test resul ts + - name: Upload test results if: always() uses: actions/upload-artifact@v4 with: @@ -185,7 +182,7 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Install Git + - name: Setup Git and LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs @@ -252,7 +249,7 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Install Git + - name: Setup Git and LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs From 28e940ad76ed00a6e88c098c48ef8e6b50672c51 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:02:34 -0700 Subject: [PATCH 45/72] Fixing e2e stage --- .github/workflows/ci.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 322f271..00e2126 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,11 +35,12 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install --force - mkdir -p .git/hooks - rm -f .git/hooks/pre-push - rm -f .git/hooks/post-checkout - git config --global core.hooksPath /dev/null + # Configure Git LFS + git lfs install --local + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 From 7fca9c2ca8c66be709be9cf1ee61bea439d6cade Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:07:05 -0700 Subject: [PATCH 46/72] Fixing e2e stage --- .github/actions/setup-git/action.yml | 23 ++++++++++++++ .github/workflows/ci.yml | 45 +++++++--------------------- 2 files changed, 33 insertions(+), 35 deletions(-) create mode 100644 .github/actions/setup-git/action.yml diff --git a/.github/actions/setup-git/action.yml b/.github/actions/setup-git/action.yml new file mode 100644 index 0000000..83abada --- /dev/null +++ b/.github/actions/setup-git/action.yml @@ -0,0 +1,23 @@ +name: 'Setup Git and Git LFS' +description: 'Installs and configures Git and Git LFS for CI environment' + +runs: + using: "composite" + steps: + - name: Install Git and Git LFS + shell: bash + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Configure Git LFS + git lfs install --local + # Remove existing pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi + # Reinstall Git LFS hooks + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 00e2126..9f4b22a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,17 +30,8 @@ jobs: container: image: python:3.12-slim steps: - - name: Setup Git and LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS - git lfs install --local - git lfs update --force - # Configure Git to skip hooks in CI - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks + - name: Setup Git and Git LFS + uses: ./.github/actions/setup-git - name: Checkout code uses: actions/checkout@v4 @@ -81,12 +72,8 @@ jobs: container: image: python:3.12-slim steps: - - name: Setup Git and LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install + - name: Setup Git and Git LFS + uses: ./.github/actions/setup-git - name: Checkout code uses: actions/checkout@v4 @@ -123,12 +110,8 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Setup Git and LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install + - name: Setup Git and Git LFS + uses: ./.github/actions/setup-git - uses: actions/checkout@v4 with: @@ -183,12 +166,8 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Setup Git and LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install + - name: Setup Git and Git LFS + uses: ./.github/actions/setup-git - uses: actions/checkout@v4 with: @@ -250,12 +229,8 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Setup Git and LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install + - name: Setup Git and Git LFS + uses: ./.github/actions/setup-git - uses: actions/checkout@v4 with: From a3cd0d9a56936803b0501f3b085bcb67a440359d Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:16:09 -0700 Subject: [PATCH 47/72] Fixing e2e stage --- .github/actions/setup-git/action.yml | 23 -------- .github/workflows/ci.yml | 80 ++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 28 deletions(-) delete mode 100644 .github/actions/setup-git/action.yml diff --git a/.github/actions/setup-git/action.yml b/.github/actions/setup-git/action.yml deleted file mode 100644 index 83abada..0000000 --- a/.github/actions/setup-git/action.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: 'Setup Git and Git LFS' -description: 'Installs and configures Git and Git LFS for CI environment' - -runs: - using: "composite" - steps: - - name: Install Git and Git LFS - shell: bash - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS - git lfs install --local - # Remove existing pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi - # Reinstall Git LFS hooks - git lfs update --force - # Configure Git to skip hooks in CI - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f4b22a..c25b927 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,21 @@ jobs: image: python:3.12-slim steps: - name: Setup Git and Git LFS - uses: ./.github/actions/setup-git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Configure Git LFS + git lfs install --local + # Remove existing pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi + # Reinstall Git LFS hooks + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 @@ -73,7 +87,21 @@ jobs: image: python:3.12-slim steps: - name: Setup Git and Git LFS - uses: ./.github/actions/setup-git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Configure Git LFS + git lfs install --local + # Remove existing pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi + # Reinstall Git LFS hooks + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 @@ -111,7 +139,21 @@ jobs: options: --privileged --gpus all --group-add 998 steps: - name: Setup Git and Git LFS - uses: ./.github/actions/setup-git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Configure Git LFS + git lfs install --local + # Remove existing pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi + # Reinstall Git LFS hooks + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - uses: actions/checkout@v4 with: @@ -167,7 +209,21 @@ jobs: options: --privileged --gpus all --group-add 998 steps: - name: Setup Git and Git LFS - uses: ./.github/actions/setup-git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Configure Git LFS + git lfs install --local + # Remove existing pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi + # Reinstall Git LFS hooks + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - uses: actions/checkout@v4 with: @@ -230,7 +286,21 @@ jobs: options: --privileged --gpus all --group-add 998 steps: - name: Setup Git and Git LFS - uses: ./.github/actions/setup-git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Configure Git LFS + git lfs install --local + # Remove existing pre-push hook if it exists + if [ -f .git/hooks/pre-push ]; then + rm .git/hooks/pre-push + fi + # Reinstall Git LFS hooks + git lfs update --force + # Configure Git to skip hooks in CI + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - uses: actions/checkout@v4 with: From db39a638094ebae42ab8e05248eda3143f127fa6 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:22:22 -0700 Subject: [PATCH 48/72] Fixing e2e stage --- .github/workflows/ci.yml | 145 +++++++-------------------------------- 1 file changed, 23 insertions(+), 122 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c25b927..b5b68d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,26 +24,20 @@ env: PYTHONPATH: ${{ github.workspace }} jobs: - pre-commit: - name: Pre-commit Checks + setup-git: + name: Setup Git and LFS runs-on: self-hosted container: image: python:3.12-slim steps: - - name: Setup Git and Git LFS + - name: Install Git and LFS run: | apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS git lfs install --local - # Remove existing pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi - # Reinstall Git LFS hooks + if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi git lfs update --force - # Configure Git to skip hooks in CI git config --global core.hooksPath /tmp/git-hooks mkdir -p /tmp/git-hooks @@ -53,74 +47,51 @@ jobs: lfs: true fetch-depth: 0 + pre-commit: + name: Pre-commit Checks + needs: setup-git + runs-on: self-hosted + container: + image: python:3.12-slim + steps: - name: Cache Python packages uses: actions/cache@v4 with: path: ~/.cache/pip key: precommit-${{ hashFiles('.pre-commit-config.yaml') }} - restore-keys: | - precommit- + restore-keys: precommit- - name: Install dependencies run: | - set -e - python3 -m pip install --upgrade pip - python3 -m pip install pre-commit + python3 -m pip install --upgrade pip pre-commit pre-commit --version - name: Run pre-commit run: | echo "Current directory: $(pwd)" - echo "Listing files in workspace:" ls -la git status git log -1 - echo "Running pre-commit..." pre-commit run --all-files timeout-minutes: 60 package-check: name: Package Check - runs-on: self-hosted needs: pre-commit + runs-on: self-hosted container: image: python:3.12-slim steps: - - name: Setup Git and Git LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS - git lfs install --local - # Remove existing pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi - # Reinstall Git LFS hooks - git lfs update --force - # Configure Git to skip hooks in CI - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - - - name: Checkout code - uses: actions/checkout@v4 - with: - lfs: true - - - name: Cache pip packages for Poetry + - name: Cache pip packages uses: actions/cache@v4 with: path: ~/.cache/pip key: poetry-pip-${{ hashFiles('**/pyproject.toml') }} - restore-keys: | - poetry-pip- + restore-keys: poetry-pip- - name: Install dependencies run: | - set -e - python3 -m pip install --upgrade pip - python3 -m pip install poetry + python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install @@ -138,27 +109,6 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Setup Git and Git LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS - git lfs install --local - # Remove existing pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi - # Reinstall Git LFS hooks - git lfs update --force - # Configure Git to skip hooks in CI - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - - - uses: actions/checkout@v4 - with: - lfs: true - - name: Setup Python uses: actions/setup-python@v5 with: @@ -174,11 +124,10 @@ jobs: - name: Install dependencies run: | - set -e apt-get update - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 - python3 -m pip install --upgrade pip - python3 -m pip install poetry + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 + python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install @@ -186,7 +135,6 @@ jobs: run: | mkdir -p test-results poetry run python -m unittest discover -p "test_*.py" -v > test-results/unit-tests.txt - # Convert unittest output to JUnit XML format poetry run python -c 'import xml.etree.ElementTree as ET, re; root = ET.Element("testsuites"); suite = ET.SubElement(root, "testsuite", name="unit-tests"); [suite.append(ET.SubElement(suite, "testcase", classname=m.group(2), name=m.group(1))) if m.group(3) == "ok" else suite.append(ET.SubElement(suite, "testcase", classname=m.group(2), name=m.group(1)).append(ET.SubElement(suite, "failure", message=f"Test {m.group(1)} in {m.group(2)} {m.group(3).lower()}ed"))) for m in re.finditer(r"test_(\w+) \((.*?)\) \.\.\. (ok|FAIL|ERROR)", open("test-results/unit-tests.txt").read())]; ET.ElementTree(root).write("test-results/unit-tests.xml", encoding="utf-8", xml_declaration=True)' - name: Upload test results @@ -208,27 +156,6 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Setup Git and Git LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS - git lfs install --local - # Remove existing pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi - # Reinstall Git LFS hooks - git lfs update --force - # Configure Git to skip hooks in CI - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - - - uses: actions/checkout@v4 - with: - lfs: true - - name: Setup Python uses: actions/setup-python@v5 with: @@ -236,27 +163,23 @@ jobs: - name: Install dependencies run: | - set -e apt-get update DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ python3-pip python3-venv \ libgl1-mesa-glx libglib2.0-0 \ libcudnn8 libcudnn8-dev \ nvidia-cuda-toolkit - python3 -m pip install --upgrade pip - python3 -m pip install poetry + python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install - name: Verify CUDA setup run: | - echo "Checking CUDA setup..." nvidia-smi python3 -c 'import os, sys, cv2, numpy as np; print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES")); print("NVIDIA_VISIBLE_DEVICES:", os.environ.get("NVIDIA_VISIBLE_DEVICES")); try: img = cv2.imread("maps/carter_warehouse_navigation.png"); print("Image exists:", os.path.exists("maps/carter_warehouse_navigation.png")); print("Image size:", os.path.getsize("maps/carter_warehouse_navigation.png")); if img is None: print("Failed to read image with OpenCV"); sys.exit(1); print("Image shape:", img.shape); print("Image type:", img.dtype); except Exception as e: print("Error reading image:", str(e)); sys.exit(1)' - name: Run E2E tests run: | - set -e poetry run python scripts/generate_graph.py \ --map-path maps/carter_warehouse_navigation.png \ --graph-eval.active \ @@ -285,27 +208,6 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - - name: Setup Git and Git LFS - run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs - git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Configure Git LFS - git lfs install --local - # Remove existing pre-push hook if it exists - if [ -f .git/hooks/pre-push ]; then - rm .git/hooks/pre-push - fi - # Reinstall Git LFS hooks - git lfs update --force - # Configure Git to skip hooks in CI - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - - - uses: actions/checkout@v4 - with: - lfs: true - - name: Setup Python uses: actions/setup-python@v5 with: @@ -313,12 +215,12 @@ jobs: - name: Build and test run: | - set -e cd docker || { echo "Missing docker directory"; exit 1; } docker compose build --no-cache docker compose up rest-api & sleep 10 cd ../ + python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install poetry run python scripts/test_api_client.py \ @@ -327,5 +229,4 @@ jobs: - name: Cleanup if: always() - run: | - cd docker && docker compose down -v + run: cd docker && docker compose down -v From 189e8ccce283a705c0bf7d3693aefc21378533c0 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:26:32 -0700 Subject: [PATCH 49/72] Fixing e2e stage --- .github/workflows/ci.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5b68d4..bd19e34 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,12 +28,10 @@ jobs: name: Setup Git and LFS runs-on: self-hosted container: - image: python:3.12-slim + image: python:3.12-slim-bullseye steps: - - name: Install Git and LFS + - name: Configure Git and LFS run: | - apt-get update -y - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER git lfs install --local if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi @@ -52,7 +50,7 @@ jobs: needs: setup-git runs-on: self-hosted container: - image: python:3.12-slim + image: python:3.12-slim-bullseye steps: - name: Cache Python packages uses: actions/cache@v4 @@ -80,7 +78,7 @@ jobs: needs: pre-commit runs-on: self-hosted container: - image: python:3.12-slim + image: python:3.12-slim-bullseye steps: - name: Cache pip packages uses: actions/cache@v4 From 1080e7ba6bc8785b2adfae655872c7a7ce647e20 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:32:33 -0700 Subject: [PATCH 50/72] Fixing e2e stage --- .github/workflows/ci.yml | 71 +++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd19e34..0b07eb3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,34 +24,23 @@ env: PYTHONPATH: ${{ github.workspace }} jobs: - setup-git: - name: Setup Git and LFS + pre-commit: + name: Pre-commit Checks runs-on: self-hosted container: image: python:3.12-slim-bullseye steps: - - name: Configure Git and LFS + - name: Install Git run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install --local - if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi - git lfs update --force - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 with: - lfs: true fetch-depth: 0 - pre-commit: - name: Pre-commit Checks - needs: setup-git - runs-on: self-hosted - container: - image: python:3.12-slim-bullseye - steps: - name: Cache Python packages uses: actions/cache@v4 with: @@ -80,6 +69,15 @@ jobs: container: image: python:3.12-slim-bullseye steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + + - name: Checkout code + uses: actions/checkout@v4 + - name: Cache pip packages uses: actions/cache@v4 with: @@ -107,6 +105,15 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: + - name: Install Git + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup Python uses: actions/setup-python@v5 with: @@ -154,6 +161,22 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: + - name: Install Git and LFS + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install --local + if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi + git lfs update --force + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks + + - name: Checkout code + uses: actions/checkout@v4 + with: + lfs: true + - name: Setup Python uses: actions/setup-python@v5 with: @@ -206,6 +229,22 @@ jobs: image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: + - name: Install Git and LFS + run: | + apt-get update -y + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs + git config --global --add safe.directory /__w/SWAGGER/SWAGGER + git lfs install --local + if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi + git lfs update --force + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks + + - name: Checkout code + uses: actions/checkout@v4 + with: + lfs: true + - name: Setup Python uses: actions/setup-python@v5 with: From 39ffab11c4d685c2c12ee2eb219045b910d331e0 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Mon, 19 May 2025 18:45:01 -0700 Subject: [PATCH 51/72] Fixing e2e stage --- .github/workflows/ci.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b07eb3..936bd1b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,9 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Disable Git LFS hooks + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 @@ -74,6 +77,9 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Disable Git LFS hooks + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 @@ -110,6 +116,9 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Disable Git LFS hooks + git config --global core.hooksPath /tmp/git-hooks + mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 From 8226d1ede880729744591b92b579e1598fea0752 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 07:11:40 -0700 Subject: [PATCH 52/72] Fixing e2e stage --- .github/workflows/ci.yml | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 936bd1b..159fdf5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,9 +35,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Disable Git LFS hooks - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks + # Disable Git hooks completely + git config --global core.hooksPath /dev/null - name: Checkout code uses: actions/checkout@v4 @@ -77,9 +76,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Disable Git LFS hooks - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks + # Disable Git hooks completely + git config --global core.hooksPath /dev/null - name: Checkout code uses: actions/checkout@v4 @@ -116,9 +114,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Disable Git LFS hooks - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks + # Disable Git hooks completely + git config --global core.hooksPath /dev/null - name: Checkout code uses: actions/checkout@v4 @@ -176,10 +173,7 @@ jobs: DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER git lfs install --local - if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi git lfs update --force - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 @@ -244,10 +238,7 @@ jobs: DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER git lfs install --local - if [ -f .git/hooks/pre-push ]; then rm .git/hooks/pre-push; fi git lfs update --force - git config --global core.hooksPath /tmp/git-hooks - mkdir -p /tmp/git-hooks - name: Checkout code uses: actions/checkout@v4 From df37604cdea01c65dd21d2c1c635846daab389c8 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 07:39:54 -0700 Subject: [PATCH 53/72] Fixing e2e stage --- .github/workflows/ci.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 159fdf5..712b8c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,10 +57,6 @@ jobs: - name: Run pre-commit run: | - echo "Current directory: $(pwd)" - ls -la - git status - git log -1 pre-commit run --all-files timeout-minutes: 60 @@ -86,8 +82,8 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pip - key: poetry-pip-${{ hashFiles('**/pyproject.toml') }} - restore-keys: poetry-pip- + key: poetry-pip-bullseye-${{ hashFiles('**/pyproject.toml') }} + restore-keys: poetry-pip-bullseye- - name: Install dependencies run: | @@ -106,7 +102,7 @@ jobs: env: CUDA_VISIBLE_DEVICES: all container: - image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 + image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - name: Install Git @@ -131,7 +127,8 @@ jobs: path: | ~/.cache/pip venv/ - key: unit-tests-${{ hashFiles('**/requirements.txt') }} + key: unit-tests-cuda-${{ hashFiles('**/requirements.txt') }} + restore-keys: unit-tests-cuda- - name: Install dependencies run: | @@ -164,7 +161,7 @@ jobs: CUDA_VISIBLE_DEVICES: all NVIDIA_VISIBLE_DEVICES: all container: - image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 + image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - name: Install Git and LFS @@ -229,7 +226,7 @@ jobs: DOCKER_HOST: tcp://docker:2375 DOCKER_TLS_CERTDIR: "" container: - image: nvidia/cuda:12.6.0-runtime-ubuntu22.04 + image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 steps: - name: Install Git and LFS From cf697a7dc2faf0de0649fd12f9389c63f6fc41f2 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 07:50:57 -0700 Subject: [PATCH 54/72] Fixing e2e stage --- .github/workflows/ci.yml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 712b8c7..6cac1d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,7 +48,7 @@ jobs: with: path: ~/.cache/pip key: precommit-${{ hashFiles('.pre-commit-config.yaml') }} - restore-keys: precommit- + restore-keys: precommit - name: Install dependencies run: | @@ -78,18 +78,16 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Cache pip packages - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: poetry-pip-bullseye-${{ hashFiles('**/pyproject.toml') }} - restore-keys: poetry-pip-bullseye- + - name: Clear Poetry cache + run: | + rm -rf ~/.cache/pypoetry + rm -rf ~/.cache/pip - name: Install dependencies run: | python3 -m pip install --upgrade pip poetry poetry env use python3.12 - poetry install + poetry install --no-cache - name: Run Poetry check run: poetry check From c90dc897718b03cdf00265d9b9f6d74747a22aee Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 07:57:48 -0700 Subject: [PATCH 55/72] Fixing e2e stage --- .github/workflows/ci.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cac1d5..0700f6f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -167,8 +167,11 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install --local + # Configure Git LFS + git lfs install --system git lfs update --force + # Verify Git LFS installation + git lfs version - name: Checkout code uses: actions/checkout@v4 @@ -232,8 +235,11 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - git lfs install --local + # Configure Git LFS + git lfs install --system git lfs update --force + # Verify Git LFS installation + git lfs version - name: Checkout code uses: actions/checkout@v4 From f8f5fa095e0b06aee7ffa96f8d350014ccc4a871 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 08:05:18 -0700 Subject: [PATCH 56/72] Fixing e2e stage --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0700f6f..17839d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -167,6 +167,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Remove any existing pre-push hook + rm -f .git/hooks/pre-push # Configure Git LFS git lfs install --system git lfs update --force @@ -235,6 +237,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER + # Remove any existing pre-push hook + rm -f .git/hooks/pre-push # Configure Git LFS git lfs install --system git lfs update --force From b9183a13d96755bee537d7a0101f6dc49c7acd2b Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 08:12:24 -0700 Subject: [PATCH 57/72] Fixing e2e stage --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 17839d2..828386e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -167,8 +167,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Remove any existing pre-push hook - rm -f .git/hooks/pre-push + # Remove any existing hooks + rm -f .git/hooks/* # Configure Git LFS git lfs install --system git lfs update --force @@ -237,8 +237,8 @@ jobs: apt-get update -y DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git git-lfs git config --global --add safe.directory /__w/SWAGGER/SWAGGER - # Remove any existing pre-push hook - rm -f .git/hooks/pre-push + # Remove any existing hooks + rm -f .git/hooks/* # Configure Git LFS git lfs install --system git lfs update --force From 17410843f0af1725f91c3dc180fbcdde3071d3e4 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 08:30:31 -0700 Subject: [PATCH 58/72] Fixing e2e stage --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 828386e..aeb9f5d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -200,7 +200,6 @@ jobs: - name: Verify CUDA setup run: | nvidia-smi - python3 -c 'import os, sys, cv2, numpy as np; print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES")); print("NVIDIA_VISIBLE_DEVICES:", os.environ.get("NVIDIA_VISIBLE_DEVICES")); try: img = cv2.imread("maps/carter_warehouse_navigation.png"); print("Image exists:", os.path.exists("maps/carter_warehouse_navigation.png")); print("Image size:", os.path.getsize("maps/carter_warehouse_navigation.png")); if img is None: print("Failed to read image with OpenCV"); sys.exit(1); print("Image shape:", img.shape); print("Image type:", img.dtype); except Exception as e: print("Error reading image:", str(e)); sys.exit(1)' - name: Run E2E tests run: | From 60581e0a1520a8b7340343cd39a900dd8b805617 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 08:41:10 -0700 Subject: [PATCH 59/72] Fixing e2e stage --- .github/workflows/ci.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aeb9f5d..f469b0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -180,6 +180,11 @@ jobs: with: lfs: true + - name: Pull LFS files + run: | + git lfs pull + git lfs ls-files + - name: Setup Python uses: actions/setup-python@v5 with: @@ -199,7 +204,14 @@ jobs: - name: Verify CUDA setup run: | + # Verify NVIDIA drivers and CUDA nvidia-smi + nvcc --version + # Verify CUDA device is available + python3 -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("CUDA device count:", torch.cuda.device_count()); print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")' + # Verify image file + ls -l maps/carter_warehouse_navigation.png + file maps/carter_warehouse_navigation.png - name: Run E2E tests run: | From c8a5f82f6891d45bd30db6afd116bd4309d3a5ff Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 08:49:30 -0700 Subject: [PATCH 60/72] Fixing e2e stage --- .github/workflows/ci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f469b0c..ce09b62 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -207,12 +207,16 @@ jobs: # Verify NVIDIA drivers and CUDA nvidia-smi nvcc --version - # Verify CUDA device is available - python3 -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("CUDA device count:", torch.cuda.device_count()); print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")' + # Verify CUDA device is available using nvidia-smi + nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv,noheader # Verify image file ls -l maps/carter_warehouse_navigation.png file maps/carter_warehouse_navigation.png + - name: Verify PyTorch CUDA + run: | + poetry run python -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("CUDA device count:", torch.cuda.device_count()); print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")' + - name: Run E2E tests run: | poetry run python scripts/generate_graph.py \ From 9a7e7e29d6125abbe30d2e6cb964ea810a915d15 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 08:58:17 -0700 Subject: [PATCH 61/72] Fixing e2e stage --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ce09b62..07ae2f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -197,7 +197,8 @@ jobs: python3-pip python3-venv \ libgl1-mesa-glx libglib2.0-0 \ libcudnn8 libcudnn8-dev \ - nvidia-cuda-toolkit + nvidia-cuda-toolkit \ + file python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install @@ -211,7 +212,7 @@ jobs: nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv,noheader # Verify image file ls -l maps/carter_warehouse_navigation.png - file maps/carter_warehouse_navigation.png + echo "Image size: $(stat -c %s maps/carter_warehouse_navigation.png) bytes" - name: Verify PyTorch CUDA run: | From 8a113d1f4c86c7ec84eac1855d7536e95e192822 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 09:11:28 -0700 Subject: [PATCH 62/72] Fixing e2e stage --- .github/workflows/ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 07ae2f6..afaf691 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -214,10 +214,6 @@ jobs: ls -l maps/carter_warehouse_navigation.png echo "Image size: $(stat -c %s maps/carter_warehouse_navigation.png) bytes" - - name: Verify PyTorch CUDA - run: | - poetry run python -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("CUDA device count:", torch.cuda.device_count()); print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")' - - name: Run E2E tests run: | poetry run python scripts/generate_graph.py \ From 3b76e72ea0b0f2f20a65864d915cf962852378a5 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 09:35:22 -0700 Subject: [PATCH 63/72] Fixing docker stage --- .github/workflows/ci.yml | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index afaf691..c6994cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -197,8 +197,7 @@ jobs: python3-pip python3-venv \ libgl1-mesa-glx libglib2.0-0 \ libcudnn8 libcudnn8-dev \ - nvidia-cuda-toolkit \ - file + nvidia-cuda-toolkit python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install @@ -206,13 +205,8 @@ jobs: - name: Verify CUDA setup run: | # Verify NVIDIA drivers and CUDA - nvidia-smi nvcc --version - # Verify CUDA device is available using nvidia-smi nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv,noheader - # Verify image file - ls -l maps/carter_warehouse_navigation.png - echo "Image size: $(stat -c %s maps/carter_warehouse_navigation.png) bytes" - name: Run E2E tests run: | @@ -238,7 +232,7 @@ jobs: runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: all - DOCKER_HOST: tcp://docker:2375 + DOCKER_HOST: unix:///var/run/docker.sock DOCKER_TLS_CERTDIR: "" container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 @@ -262,11 +256,35 @@ jobs: with: lfs: true + - name: Pull LFS files + run: | + git lfs pull + git lfs ls-files + - name: Setup Python uses: actions/setup-python@v5 with: python-version: '3.12' + - name: Install dependencies + run: | + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3-pip python3-venv \ + libgl1-mesa-glx libglib2.0-0 \ + libcudnn8 libcudnn8-dev \ + nvidia-cuda-toolkit + python3 -m pip install --upgrade pip poetry + poetry env use python3.12 + poetry install + + - name: Verify Docker setup + run: | + # Verify Docker daemon is accessible + docker info + # Verify Docker Compose is available + docker compose version + - name: Build and test run: | cd docker || { echo "Missing docker directory"; exit 1; } @@ -274,12 +292,9 @@ jobs: docker compose up rest-api & sleep 10 cd ../ - python3 -m pip install --upgrade pip poetry - poetry env use python3.12 - poetry install poetry run python scripts/test_api_client.py \ --map_path maps/carter_warehouse_navigation.png \ - --host docker + --host localhost - name: Cleanup if: always() From 572cf80e862ee3bad1899f6c8dde3755e5e213a3 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 09:50:42 -0700 Subject: [PATCH 64/72] Fixing docker stage --- .github/workflows/ci.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6994cc..270834b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -273,7 +273,21 @@ jobs: python3-pip python3-venv \ libgl1-mesa-glx libglib2.0-0 \ libcudnn8 libcudnn8-dev \ - nvidia-cuda-toolkit + nvidia-cuda-toolkit \ + ca-certificates curl gnupg + # Add Docker's official GPG key + install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + chmod a+r /etc/apt/keyrings/docker.gpg + # Add Docker repository + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + # Install Docker and Docker Compose + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + # Verify Docker installation + docker --version + docker compose version python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install From 52c3ea653e9708e32a07c55eb850b1c19597830a Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 10:07:15 -0700 Subject: [PATCH 65/72] Fixing docker stage --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b39eb35..ccff549 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ name = "swagger" version = "1.0.0" description = "A library for generating waypoint graphs from occupancy grid maps" readme = "README.md" -requires-python = ">=3.12,<4.0" +requires-python = ">=3.10,<4.0" classifiers = [ "Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.12", From 47b3f846367ba87841260639cf8e440f21cb4d4c Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 10:40:52 -0700 Subject: [PATCH 66/72] Fixing docker stage --- .github/workflows/ci.yml | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 270834b..b746879 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -99,6 +99,8 @@ jobs: runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: all + NVIDIA_VISIBLE_DEVICES: all + NVIDIA_DRIVER_CAPABILITIES: all container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 @@ -114,6 +116,12 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Verify CUDA setup + run: | + # Verify NVIDIA drivers and CUDA + nvcc --version + nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv,noheader + - name: Setup Python uses: actions/setup-python@v5 with: @@ -132,7 +140,10 @@ jobs: run: | apt-get update DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - python3-pip python3-venv libgl1-mesa-glx libglib2.0-0 + python3-pip python3-venv \ + libgl1-mesa-glx libglib2.0-0 \ + libcudnn8 libcudnn8-dev \ + nvidia-cuda-toolkit python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install @@ -158,6 +169,7 @@ jobs: env: CUDA_VISIBLE_DEVICES: all NVIDIA_VISIBLE_DEVICES: all + NVIDIA_DRIVER_CAPABILITIES: all container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 @@ -202,18 +214,16 @@ jobs: poetry env use python3.12 poetry install - - name: Verify CUDA setup - run: | - # Verify NVIDIA drivers and CUDA - nvcc --version - nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv,noheader - - name: Run E2E tests run: | + # Set CUDA device order + export CUDA_DEVICE_ORDER=PCI_BUS_ID + # Run tests with explicit CUDA device poetry run python scripts/generate_graph.py \ --map-path maps/carter_warehouse_navigation.png \ --graph-eval.active \ - --perf-eval.active + --perf-eval.active \ + --cuda-device 0 poetry run python scripts/evaluate_graph.py \ --graph-path graphs/graph.gml \ --map-path maps/carter_warehouse_navigation.png \ @@ -232,6 +242,8 @@ jobs: runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: all + NVIDIA_VISIBLE_DEVICES: all + NVIDIA_DRIVER_CAPABILITIES: all DOCKER_HOST: unix:///var/run/docker.sock DOCKER_TLS_CERTDIR: "" container: @@ -285,27 +297,31 @@ jobs: apt-get update DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Verify Docker installation - docker --version - docker compose version python3 -m pip install --upgrade pip poetry poetry env use python3.12 poetry install - - name: Verify Docker setup + - name: Verify Docker and CUDA setup run: | - # Verify Docker daemon is accessible + # Verify Docker setup docker info - # Verify Docker Compose is available docker compose version + # Verify CUDA setup + nvidia-smi + nvcc --version - name: Build and test run: | cd docker || { echo "Missing docker directory"; exit 1; } - docker compose build --no-cache - docker compose up rest-api & + # Build with GPU support + DOCKER_BUILDKIT=1 docker compose build --no-cache + # Run with GPU support + docker compose up -d rest-api sleep 10 cd ../ + # Verify CUDA is available in the container + docker exec rest-api nvidia-smi + # Run tests poetry run python scripts/test_api_client.py \ --map_path maps/carter_warehouse_navigation.png \ --host localhost From 1f5d9354e3c062bc60edbc694bd713e408f19cac Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 10:48:21 -0700 Subject: [PATCH 67/72] Fixing docker stage --- .github/workflows/ci.yml | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b746879..41c22af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,7 +103,18 @@ jobs: NVIDIA_DRIVER_CAPABILITIES: all container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 - options: --privileged --gpus all --group-add 998 + options: | + --privileged + --gpus all + --group-add 998 + --device /dev/nvidia0:/dev/nvidia0 + --device /dev/nvidiactl:/dev/nvidiactl + --device /dev/nvidia-modeset:/dev/nvidia-modeset + --device /dev/nvidia-uvm:/dev/nvidia-uvm + --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools + -v /usr/lib/x86_64-linux-gnu/nvidia:/usr/lib/x86_64-linux-gnu/nvidia + -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi + -v /usr/bin/nvcc:/usr/bin/nvcc steps: - name: Install Git run: | @@ -172,7 +183,18 @@ jobs: NVIDIA_DRIVER_CAPABILITIES: all container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 - options: --privileged --gpus all --group-add 998 + options: | + --privileged + --gpus all + --group-add 998 + --device /dev/nvidia0:/dev/nvidia0 + --device /dev/nvidiactl:/dev/nvidiactl + --device /dev/nvidia-modeset:/dev/nvidia-modeset + --device /dev/nvidia-uvm:/dev/nvidia-uvm + --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools + -v /usr/lib/x86_64-linux-gnu/nvidia:/usr/lib/x86_64-linux-gnu/nvidia + -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi + -v /usr/bin/nvcc:/usr/bin/nvcc steps: - name: Install Git and LFS run: | @@ -248,7 +270,18 @@ jobs: DOCKER_TLS_CERTDIR: "" container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 - options: --privileged --gpus all --group-add 998 + options: | + --privileged + --gpus all + --group-add 998 + --device /dev/nvidia0:/dev/nvidia0 + --device /dev/nvidiactl:/dev/nvidiactl + --device /dev/nvidia-modeset:/dev/nvidia-modeset + --device /dev/nvidia-uvm:/dev/nvidia-uvm + --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools + -v /usr/lib/x86_64-linux-gnu/nvidia:/usr/lib/x86_64-linux-gnu/nvidia + -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi + -v /usr/bin/nvcc:/usr/bin/nvcc steps: - name: Install Git and LFS run: | From ae75ac4155facd90fd38e446a84fbf0163a33778 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 10:59:40 -0700 Subject: [PATCH 68/72] Fixing docker stage --- .github/workflows/ci.yml | 45 +++++++--------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 41c22af..2eadb91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,18 +103,7 @@ jobs: NVIDIA_DRIVER_CAPABILITIES: all container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 - options: | - --privileged - --gpus all - --group-add 998 - --device /dev/nvidia0:/dev/nvidia0 - --device /dev/nvidiactl:/dev/nvidiactl - --device /dev/nvidia-modeset:/dev/nvidia-modeset - --device /dev/nvidia-uvm:/dev/nvidia-uvm - --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools - -v /usr/lib/x86_64-linux-gnu/nvidia:/usr/lib/x86_64-linux-gnu/nvidia - -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi - -v /usr/bin/nvcc:/usr/bin/nvcc + options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 steps: - name: Install Git run: | @@ -183,18 +172,7 @@ jobs: NVIDIA_DRIVER_CAPABILITIES: all container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 - options: | - --privileged - --gpus all - --group-add 998 - --device /dev/nvidia0:/dev/nvidia0 - --device /dev/nvidiactl:/dev/nvidiactl - --device /dev/nvidia-modeset:/dev/nvidia-modeset - --device /dev/nvidia-uvm:/dev/nvidia-uvm - --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools - -v /usr/lib/x86_64-linux-gnu/nvidia:/usr/lib/x86_64-linux-gnu/nvidia - -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi - -v /usr/bin/nvcc:/usr/bin/nvcc + options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 steps: - name: Install Git and LFS run: | @@ -270,18 +248,7 @@ jobs: DOCKER_TLS_CERTDIR: "" container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 - options: | - --privileged - --gpus all - --group-add 998 - --device /dev/nvidia0:/dev/nvidia0 - --device /dev/nvidiactl:/dev/nvidiactl - --device /dev/nvidia-modeset:/dev/nvidia-modeset - --device /dev/nvidia-uvm:/dev/nvidia-uvm - --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools - -v /usr/lib/x86_64-linux-gnu/nvidia:/usr/lib/x86_64-linux-gnu/nvidia - -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi - -v /usr/bin/nvcc:/usr/bin/nvcc + options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 steps: - name: Install Git and LFS run: | @@ -342,13 +309,17 @@ jobs: # Verify CUDA setup nvidia-smi nvcc --version + # Verify NVIDIA runtime + ldconfig -p | grep nvidia + # Verify CUDA libraries + ldconfig -p | grep cuda - name: Build and test run: | cd docker || { echo "Missing docker directory"; exit 1; } # Build with GPU support DOCKER_BUILDKIT=1 docker compose build --no-cache - # Run with GPU support + # Run with GPU support and NVIDIA runtime docker compose up -d rest-api sleep 10 cd ../ From 1391723faa0f8db74a4f45511c8d9e4c6b5aff45 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 11:32:38 -0700 Subject: [PATCH 69/72] Fixing docker stage --- .github/workflows/ci.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2eadb91..f6dc25c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -101,6 +101,8 @@ jobs: CUDA_VISIBLE_DEVICES: all NVIDIA_VISIBLE_DEVICES: all NVIDIA_DRIVER_CAPABILITIES: all + CUDA_HOME: /usr/local/cuda + LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 @@ -170,6 +172,8 @@ jobs: CUDA_VISIBLE_DEVICES: all NVIDIA_VISIBLE_DEVICES: all NVIDIA_DRIVER_CAPABILITIES: all + CUDA_HOME: /usr/local/cuda + LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 container: image: nvidia/cuda:12.6.0-devel-ubuntu22.04 options: --privileged --gpus all --group-add 998 --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 @@ -244,6 +248,8 @@ jobs: CUDA_VISIBLE_DEVICES: all NVIDIA_VISIBLE_DEVICES: all NVIDIA_DRIVER_CAPABILITIES: all + CUDA_HOME: /usr/local/cuda + LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 DOCKER_HOST: unix:///var/run/docker.sock DOCKER_TLS_CERTDIR: "" container: @@ -285,8 +291,7 @@ jobs: python3-pip python3-venv \ libgl1-mesa-glx libglib2.0-0 \ libcudnn8 libcudnn8-dev \ - nvidia-cuda-toolkit \ - ca-certificates curl gnupg + nvidia-cuda-toolkit # Add Docker's official GPG key install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg @@ -313,6 +318,12 @@ jobs: ldconfig -p | grep nvidia # Verify CUDA libraries ldconfig -p | grep cuda + # Verify CUDA environment + echo "CUDA_HOME: $CUDA_HOME" + echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH" + ls -l $CUDA_HOME/lib64/libcuda* + # Verify CUDA device + python3 -c 'import os; print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES")); print("NVIDIA_VISIBLE_DEVICES:", os.environ.get("NVIDIA_VISIBLE_DEVICES")); print("NVIDIA_DRIVER_CAPABILITIES:", os.environ.get("NVIDIA_DRIVER_CAPABILITIES"))' - name: Build and test run: | From 2e5726a88f44f9f4b73f5dfffe810c17d0ccde21 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 14:07:22 -0700 Subject: [PATCH 70/72] Fixing docker stage --- .github/workflows/ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f6dc25c..5120f66 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -222,12 +222,10 @@ jobs: run: | # Set CUDA device order export CUDA_DEVICE_ORDER=PCI_BUS_ID - # Run tests with explicit CUDA device poetry run python scripts/generate_graph.py \ --map-path maps/carter_warehouse_navigation.png \ --graph-eval.active \ - --perf-eval.active \ - --cuda-device 0 + --perf-eval.active poetry run python scripts/evaluate_graph.py \ --graph-path graphs/graph.gml \ --map-path maps/carter_warehouse_navigation.png \ From 466ba4653250b85eb3c72be3b6a0d658b9773f56 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 14:23:45 -0700 Subject: [PATCH 71/72] Fixing docker stage --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5120f66..c35f240 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -289,7 +289,7 @@ jobs: python3-pip python3-venv \ libgl1-mesa-glx libglib2.0-0 \ libcudnn8 libcudnn8-dev \ - nvidia-cuda-toolkit + nvidia-cuda-toolkit curl ca-certificates gnupg # Add Docker's official GPG key install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg From db23e535211579982209a430e751bdee8142e933 Mon Sep 17 00:00:00 2001 From: Alexander Poddubny Date: Thu, 22 May 2025 14:45:04 -0700 Subject: [PATCH 72/72] Fixing docker stage --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c35f240..e2b3699 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -333,7 +333,7 @@ jobs: sleep 10 cd ../ # Verify CUDA is available in the container - docker exec rest-api nvidia-smi + docker exec docker-rest-api-1 nvidia-smi # Run tests poetry run python scripts/test_api_client.py \ --map_path maps/carter_warehouse_navigation.png \