Skip to content

Commit 7112e56

Browse files
committed
Debug
1 parent ea67ed3 commit 7112e56

File tree

1 file changed

+146
-2
lines changed

1 file changed

+146
-2
lines changed

.github/workflows/additional_demo_notebook_tests.yaml

+146-2
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,152 @@ env:
1313
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
1414

1515
jobs:
16+
# Job that executes the hf_interactive demo notebook against a KinD cluster.
# The label condition below is commented out, so the job is not restricted to
# events carrying the 'test-additional-notebooks' label.
verify-hf_interactive:
17+
#if: ${{ github.event.label.name == 'test-additional-notebooks' }}
18+
runs-on: ubuntu-20.04-4core-gpu
19+
20+
steps:
21+
# Check out the workflow's own repository, fetching submodules recursively.
- name: Checkout code
22+
uses: actions/checkout@v4
23+
with:
24+
submodules: recursive
25+
26+
# Check out project-codeflare/codeflare-common (ref 'main') into ./common.
# Later steps reference composite actions under ./common/github-actions/.
- name: Checkout common repo code
27+
uses: actions/checkout@v4
28+
with:
29+
repository: 'project-codeflare/codeflare-common'
30+
ref: 'main'
31+
path: 'common'
32+
33+
# Check out project-codeflare/codeflare-operator into ./codeflare-operator.
# Its go.mod/go.sum and make targets are used by the Go setup and deploy steps.
- name: Checkout CodeFlare operator repository
34+
uses: actions/checkout@v4
35+
with:
36+
repository: project-codeflare/codeflare-operator
37+
path: codeflare-operator
38+
39+
# Install Go, taking the version from the operator checkout's go.mod and
# pointing the action's dependency cache at the operator's go.sum.
- name: Set Go
40+
uses: actions/setup-go@v5
41+
with:
42+
go-version-file: './codeflare-operator/go.mod'
43+
cache-dependency-path: "./codeflare-operator/go.sum"
44+
45+
# Install gotestfmt through its setup action; the workflow's GITHUB_TOKEN is
# passed as the action's `token` input.
- name: Set up gotestfmt
46+
uses: gotesttools/gotestfmt-action@v2
47+
with:
48+
token: ${{ secrets.GITHUB_TOKEN }}
49+
50+
# Install Python 3.9 (quoted so the version stays a string) with pip caching.
- name: Set up specific Python version
51+
uses: actions/setup-python@v5
52+
with:
53+
python-version: '3.9'
54+
cache: 'pip' # caching pip dependencies
55+
56+
# Run the nvidia-gpu-setup composite action from the codeflare-common checkout
# (requires the "Checkout common repo code" step to have populated ./common).
- name: Setup NVidia GPU environment for KinD
57+
uses: ./common/github-actions/nvidia-gpu-setup
58+
59+
# Run the `kind` composite action from the codeflare-common checkout to set up
# and start the KinD cluster that the following kubectl commands target.
- name: Setup and start KinD cluster
60+
uses: ./common/github-actions/kind
61+
62+
# Run the nvidia-gpu-operator composite action from the codeflare-common
# checkout with its `enable-time-slicing` input set to the string 'true'.
- name: Install NVidia GPU operator for KinD
63+
uses: ./common/github-actions/nvidia-gpu-operator
64+
with:
65+
enable-time-slicing: 'true'
66+
67+
# From the operator checkout: run `make setup-e2e`, deploy the operator image
# given by CODEFLARE_OPERATOR_IMG, then wait up to 120s for the
# codeflare-operator-manager deployment to become Available.
# The step id `deploy` is referenced by the `if:` conditions of the
# log-collection steps further down.
- name: Deploy CodeFlare stack
68+
id: deploy
69+
run: |
70+
cd codeflare-operator
71+
echo Setting up CodeFlare stack
72+
make setup-e2e
73+
echo Deploying CodeFlare operator
74+
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
75+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
76+
cd ..
77+
78+
# Apply the MinIO manifest from this repository's e2e tests and wait up to
# 120s for the `minio` deployment in the `default` namespace to be Available.
- name: Install MINIO
79+
run: |
80+
kubectl apply -f ./tests/e2e/minio_deployment.yaml
81+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio
82+
83+
# Install Poetry, papermill and the IPython kernel packages with pip, then
# install the SDK with its `test` and `docs` dependency groups. Poetry's
# virtualenv creation is disabled first, so the install does not go into a
# Poetry-created virtualenv.
- name: Setup Additional demo notebooks execution
84+
run: |
85+
echo "Installing papermill and dependencies..."
86+
pip install poetry papermill ipython ipykernel
87+
# Disable virtualenv due to problems using packaged in virtualenv in papermill
88+
poetry config virtualenvs.create false
89+
90+
echo "Installing SDK..."
91+
poetry install --with test,docs
92+
93+
# Patch hf_interactive.ipynb in place so it can run against the KinD cluster
# (no token auth, out-of-cluster client URL, MinIO-backed storage, reduced
# resources, no GPU), then execute it with papermill.
# Every jq rewrite follows the same pattern: write to hf_interactive.ipynb.tmp,
# then move the temp file over the notebook. The temp file name must be the
# same on both sides of `&&`, otherwise `mv` fails and `set -e` aborts the step.
- name: Run hf_interactive.ipynb
94+
run: |
95+
set -euo pipefail
96+
set -x
97+
98+
# Remove login/logout cells, as KinD doesn't support authentication using token
99+
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
100+
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
101+
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
102+
sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb
103+
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
104+
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
105+
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
106+
# Add MINIO related modules to runtime environment
107+
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" hf_interactive.ipynb
108+
# Replace markdown cell with remote configuration for MINIO
109+
MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json)
110+
jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Transfer learning code from huggingface"))) |= $minio_config' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
111+
# Change cluster parameters (need to decrease)
112+
sed -i "s/{'nvidia.com\/gpu':1}/{'nvidia.com\/gpu':0}/g" hf_interactive.ipynb
113+
sed -i "s/worker_cpu_requests=8,/worker_cpu_requests='250m', namespace='default',/" hf_interactive.ipynb
114+
sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=4,/" hf_interactive.ipynb
115+
sed -i "s/worker_memory_requests=16,/worker_memory_requests=12,/" hf_interactive.ipynb
116+
sed -i "s/worker_memory_limits=16,/worker_memory_limits=12,/" hf_interactive.ipynb
117+
sed -i "s/use_gpu=True/use_gpu=False/" hf_interactive.ipynb
118+
# Configure persistent storage for Ray trainer
119+
sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" hf_interactive.ipynb
120+
cat hf_interactive.ipynb
121+
# Run notebook
122+
poetry run papermill hf_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
123+
env:
124+
GRPC_DNS_RESOLVER: "native"
125+
working-directory: demo-notebooks/additional-demos
126+
127+
# Runs even if an earlier step failed (always()), as long as the `deploy` step
# succeeded. Prints the logs of pods labelled
# app.kubernetes.io/name=codeflare-operator and also writes them to
# ${TEMP_DIR}/codeflare-operator.log.
# NOTE(review): TEMP_DIR is not defined in the visible part of the workflow —
# confirm it is set in the top-level `env:`.
- name: Print CodeFlare operator logs
128+
if: always() && steps.deploy.outcome == 'success'
129+
run: |
130+
echo "Printing CodeFlare operator logs"
131+
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
132+
133+
# Runs even if an earlier step failed (always()), as long as the `deploy` step
# succeeded. Looks up the pod whose name contains `kueue-controller` in the
# kueue-system namespace, prints its logs and also writes them to
# ${TEMP_DIR}/kueue.log.
- name: Print Kueue operator logs
134+
if: always() && steps.deploy.outcome == 'success'
135+
run: |
136+
echo "Printing Kueue operator logs"
137+
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
138+
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
139+
140+
# Runs even if an earlier step failed (always()), as long as the `deploy` step
# succeeded. Prints the logs of pods labelled app.kubernetes.io/name=kuberay in
# the ray-system namespace and also writes them to ${TEMP_DIR}/kuberay.log.
- name: Print KubeRay operator logs
141+
if: always() && steps.deploy.outcome == 'success'
142+
run: |
143+
echo "Printing KubeRay operator logs"
144+
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
145+
146+
# Runs even if an earlier step failed (always()), as long as the `deploy` step
# succeeded. Calls the kind-export-logs composite action from the
# codeflare-common checkout.
# NOTE(review): the input is the literal text ${TEMP_DIR} (shell syntax, not an
# Actions expression); presumably the composite action expands it in a shell
# step — confirm, or pass ${{ env.TEMP_DIR }} as the upload step does.
- name: Export all KinD pod logs
147+
uses: ./common/github-actions/kind-export-logs
148+
if: always() && steps.deploy.outcome == 'success'
149+
with:
150+
output-directory: ${TEMP_DIR}
151+
152+
# Runs even if an earlier step failed (always()), as long as the `deploy` step
# succeeded. Uploads every *.log found under TEMP_DIR (any depth below it) as
# the artifact `logs-verify-hf_interactive`, retained for 10 days.
- name: Upload logs
153+
uses: actions/upload-artifact@v4
154+
if: always() && steps.deploy.outcome == 'success'
155+
with:
156+
name: logs-verify-hf_interactive
157+
retention-days: 10
158+
path: |
159+
${{ env.TEMP_DIR }}/**/*.log
16160
verify-local_interactive:
17-
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
161+
#if: ${{ github.event.label.name == 'test-additional-notebooks' }}
18162
runs-on: ubuntu-20.04-4core
19163

20164
steps:
@@ -132,7 +276,7 @@ jobs:
132276
${{ env.TEMP_DIR }}/**/*.log
133277
134278
verify-ray_job_client:
135-
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
279+
#if: ${{ github.event.label.name == 'test-additional-notebooks' }}
136280
runs-on: ubuntu-20.04-4core
137281

138282
steps:

0 commit comments

Comments
 (0)