|
13 | 13 | CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
|
14 | 14 |
|
15 | 15 | jobs:
|
# Job: execute demo-notebooks/additional-demos/hf_interactive.ipynb with
# papermill against a KinD cluster that has the CodeFlare stack and MinIO
# deployed, then collect operator and pod logs as a build artifact.
verify-hf_interactive:
  # NOTE(review): the label gate below is commented out, so this job has no
  # job-level condition. Restore it if the job should only run when the
  # 'test-additional-notebooks' label is applied.
  # if: ${{ github.event.label.name == 'test-additional-notebooks' }}
  runs-on: ubuntu-20.04-4core-gpu

  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        submodules: recursive

    # Shared composite actions (./common/github-actions/...) used below come
    # from this checkout.
    - name: Checkout common repo code
      uses: actions/checkout@v4
      with:
        repository: 'project-codeflare/codeflare-common'
        ref: 'main'
        path: 'common'

    - name: Checkout CodeFlare operator repository
      uses: actions/checkout@v4
      with:
        repository: project-codeflare/codeflare-operator
        path: codeflare-operator

    # Go version and module cache key are taken from the operator checkout.
    - name: Set Go
      uses: actions/setup-go@v5
      with:
        go-version-file: './codeflare-operator/go.mod'
        cache-dependency-path: "./codeflare-operator/go.sum"

    - name: Set up gotestfmt
      uses: gotesttools/gotestfmt-action@v2
      with:
        token: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up specific Python version
      uses: actions/setup-python@v5
      with:
        python-version: '3.9'
        cache: 'pip'  # caching pip dependencies

    - name: Setup NVidia GPU environment for KinD
      uses: ./common/github-actions/nvidia-gpu-setup

    - name: Setup and start KinD cluster
      uses: ./common/github-actions/kind

    - name: Install NVidia GPU operator for KinD
      uses: ./common/github-actions/nvidia-gpu-operator
      with:
        enable-time-slicing: 'true'

    # The step id is referenced by the log-collection steps further down, which
    # only run when this deployment succeeded.
    - name: Deploy CodeFlare stack
      id: deploy
      run: |
        cd codeflare-operator
        echo Setting up CodeFlare stack
        make setup-e2e
        echo Deploying CodeFlare operator
        make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
        kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
        cd ..

    - name: Install MINIO
      run: |
        kubectl apply -f ./tests/e2e/minio_deployment.yaml
        kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio

    - name: Setup Additional demo notebooks execution
      run: |
        echo "Installing papermill and dependencies..."
        pip install poetry papermill ipython ipykernel
        # Disable virtualenv creation: papermill has problems using packages installed in a virtualenv
        poetry config virtualenvs.create false

        echo "Installing SDK..."
        poetry install --with test,docs

    # Patches the notebook in place (jq/sed) so it can run unattended against
    # KinD, then executes it with papermill.
    - name: Run hf_interactive.ipynb
      working-directory: demo-notebooks/additional-demos
      env:
        GRPC_DNS_RESOLVER: "native"
      run: |
        set -euo pipefail
        set -x

        # Remove login/logout cells, as KinD doesn't support authentication using token
        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
        # Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
        sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb
        # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
        JOB_WAIT=$(jq -r '.' "${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json")
        jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
        # Add MINIO related modules to runtime environment
        sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" hf_interactive.ipynb
        # Replace markdown cell with remote configuration for MINIO
        MINIO_CONFIG=$(jq -r '.' "${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json")
        jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Transfer learning code from huggingface"))) |= $minio_config' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
        # Change cluster parameters (need to decrease)
        sed -i "s/{'nvidia.com\/gpu':1}/{'nvidia.com\/gpu':0}/g" hf_interactive.ipynb
        sed -i "s/worker_cpu_requests=8,/worker_cpu_requests='250m', namespace='default',/" hf_interactive.ipynb
        sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=4,/" hf_interactive.ipynb
        sed -i "s/use_gpu=True/use_gpu=False/" hf_interactive.ipynb
        # Configure persistent storage for Ray trainer
        sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" hf_interactive.ipynb
        # Dump the patched notebook into the job log for debugging
        cat hf_interactive.ipynb
        # Run notebook
        poetry run papermill hf_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200

    # Log collection: each step runs even after an earlier failure, as long as
    # the CodeFlare stack deployment itself succeeded.
    # NOTE(review): TEMP_DIR is not defined in the visible part of this
    # workflow; presumably exported by an earlier action - confirm.
    - name: Print CodeFlare operator logs
      if: always() && steps.deploy.outcome == 'success'
      run: |
        echo "Printing CodeFlare operator logs"
        kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee "${TEMP_DIR}/codeflare-operator.log"

    - name: Print Kueue operator logs
      if: always() && steps.deploy.outcome == 'success'
      run: |
        echo "Printing Kueue operator logs"
        KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
        kubectl logs -n kueue-system --tail -1 "${KUEUE_CONTROLLER_POD}" | tee "${TEMP_DIR}/kueue.log"

    - name: Print KubeRay operator logs
      if: always() && steps.deploy.outcome == 'success'
      run: |
        echo "Printing KubeRay operator logs"
        kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee "${TEMP_DIR}/kuberay.log"

    - name: Export all KinD pod logs
      uses: ./common/github-actions/kind-export-logs
      if: always() && steps.deploy.outcome == 'success'
      with:
        output-directory: ${TEMP_DIR}

    - name: Upload logs
      uses: actions/upload-artifact@v4
      if: always() && steps.deploy.outcome == 'success'
      with:
        name: logs-verify-hf_interactive
        retention-days: 10
        path: |
          ${{ env.TEMP_DIR }}/**/*.log
16 | 158 | verify-local_interactive:
|
17 |
| - if: ${{ github.event.label.name == 'test-additional-notebooks' }} |
| 159 | + #if: ${{ github.event.label.name == 'test-additional-notebooks' }} |
18 | 160 | runs-on: ubuntu-20.04-4core
|
19 | 161 |
|
20 | 162 | steps:
|
@@ -132,7 +274,7 @@ jobs:
|
132 | 274 | ${{ env.TEMP_DIR }}/**/*.log
|
133 | 275 |
|
134 | 276 | verify-ray_job_client:
|
135 |
| - if: ${{ github.event.label.name == 'test-additional-notebooks' }} |
| 277 | + #if: ${{ github.event.label.name == 'test-additional-notebooks' }} |
136 | 278 | runs-on: ubuntu-20.04-4core
|
137 | 279 |
|
138 | 280 | steps:
|
|
0 commit comments