Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions .github/workflows/_test_pax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ on:
description: Extra command line args to pass to test-pax.sh
default: ""
required: false
PUBLISH:
type: boolean
description: Publish badge?
default: false
required: false
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
Expand Down Expand Up @@ -201,8 +206,8 @@ jobs:
if: ( always() )
secrets: inherit
with:
ENDPOINT_FILENAME: 'pax-test-status.json'
PUBLISH: false
ENDPOINT_FILENAME: 'upstream-pax-test-overall-status.deleteme.json'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is that deleteme infix?

PUBLISH: ${{ inputs.PUBLISH }}
SCRIPT: |
EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*FSDP*TP*PP/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
Expand Down Expand Up @@ -244,7 +249,7 @@ jobs:
BADGE_COLOR=yellow
fi
echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT}
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "LABEL='Upstream tests'" >> $GITHUB_OUTPUT
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT

Expand Down
11 changes: 8 additions & 3 deletions .github/workflows/_test_t5x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ on:
description: Extra gin args to pass to test-t5x.sh
default: ""
required: false
PUBLISH:
type: boolean
description: Publish badge?
default: false
required: false
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
Expand Down Expand Up @@ -312,8 +317,8 @@ jobs:
if: ( always() )
secrets: inherit
with:
ENDPOINT_FILENAME: 't5x-test-completion-status.json'
PUBLISH: false
ENDPOINT_FILENAME: 'upstream-t5x-test-overall-status.deleteme.json'
PUBLISH: ${{ inputs.PUBLISH }}
SCRIPT: |
EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
Expand Down Expand Up @@ -356,7 +361,7 @@ jobs:
BADGE_COLOR=yellow
fi
echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT}
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "LABEL='Upstream tests'" >> $GITHUB_OUTPUT
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT

Expand Down
173 changes: 69 additions & 104 deletions .github/workflows/nightly-pax-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,112 +55,77 @@ jobs:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
with:
PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
secrets: inherit

publish:
needs: [metadata, run-jobs]
runs-on: ubuntu-22.04
steps:
- name: Setup SSH agent
uses: webfactory/[email protected]
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

- name: Setup SSH known hosts
id: ssh-known-hosts
run: |
mkdir -p ~/.ssh
cat >> ~/.ssh/known_hosts << EOF
${{ vars.SSH_KNOWN_HOSTS }}
EOF
chmod 600 ~/.ssh/known_hosts
echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT

- name: Setup SSH config
id: ssh-config
run: |
mkdir -p ~/.ssh
cat >> ~/.ssh/config << EOF
${{ vars.SSH_CONFIG }}
EOF
chmod 600 ~/.ssh/config

- name: Create dated folder and generate TensorBoard query URL
id: mkdir
shell: bash -x -e {0}
run: |
FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX"
# copy folder
ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
# generate query URL
(
cat << EOF

## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}

[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)

EOF
) | tee $GITHUB_STEP_SUMMARY

publish-completion:
needs: [metadata, run-jobs]
uses: ./.github/workflows/_publish_badge.yaml
if: success() || failure()
secrets: inherit
with:
ENDPOINT_FILENAME: 'pax-test-completion-status.json'
PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
SCRIPT: |
STATUS=failure
if [[ ${{ needs.run-jobs.result }} == "success" ]]; then
EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)

echo "Test statuses:"
jq -rc 'input_filename,.' $EXIT_STATUSES

if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then
BADGE_COLOR=brightgreen
STATUS=success
elif [[ $PASSED_TESTS -eq 0 ]]; then
BADGE_COLOR=red
else
BADGE_COLOR=yellow
fi
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
else
echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
echo "COLOR='red'" >> $GITHUB_OUTPUT
fi
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT

publish-verified:
if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
needs: [metadata, publish-completion]
uses: ./.github/workflows/_publish_container.yaml
secrets: inherit
with:
SOURCE_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
TARGET_IMAGE: upstream-pax
TARGET_TAGS: |
type=raw,value=latest-verified,priority=1000

triage:
needs: [metadata, publish-completion]
uses: ./.github/workflows/_triage.yaml
if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
secrets: inherit
with:
BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified
REPO_DIRS: "/opt/paxml /opt/praxis"
FILE_ISSUE: true

#publish:
# needs: [metadata, run-jobs]
# runs-on: ubuntu-22.04
# steps:
# - name: Setup SSH agent
# uses: webfactory/[email protected]
# with:
# ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

# - name: Setup SSH known hosts
# id: ssh-known-hosts
# run: |
# mkdir -p ~/.ssh
# cat >> ~/.ssh/known_hosts << EOF
# ${{ vars.SSH_KNOWN_HOSTS }}
# EOF
# chmod 600 ~/.ssh/known_hosts
# echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT

# - name: Setup SSH config
# id: ssh-config
# run: |
# mkdir -p ~/.ssh
# cat >> ~/.ssh/config << EOF
# ${{ vars.SSH_CONFIG }}
# EOF
# chmod 600 ~/.ssh/config

# - name: Create dated folder and generate TensorBoard query URL
# id: mkdir
# shell: bash -x -e {0}
# run: |
# FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX"
# # copy folder
# ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
# ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
# # generate query URL
# (
# cat << EOF

# ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}

# [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)

# EOF
# ) | tee $GITHUB_STEP_SUMMARY

#publish-verified:
# if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
# needs: [metadata, run-jobs]
# uses: ./.github/workflows/_publish_container.yaml
# secrets: inherit
# with:
# SOURCE_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
# TARGET_IMAGE: upstream-pax
# TARGET_TAGS: |
# type=raw,value=latest-verified,priority=1000

#triage:
# needs: [metadata, run-jobs]
# uses: ./.github/workflows/_triage.yaml
# if: needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
# secrets: inherit
# with:
# BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
# BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified
# REPO_DIRS: "/opt/paxml /opt/praxis"
# FILE_ISSUE: true

if-upstream-failed:
runs-on: ubuntu-latest
Expand Down
93 changes: 29 additions & 64 deletions .github/workflows/nightly-t5x-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,74 +55,39 @@ jobs:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
with:
T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
secrets: inherit

publish:
needs: [metadata, run-jobs]
uses: ./.github/workflows/_publish_t5x_results.yaml
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
with:
BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
EXPERIMENT_SUBDIR: T5X
secrets: inherit

publish-completion:
needs: [metadata, run-jobs]
uses: ./.github/workflows/_publish_badge.yaml
if: success() || failure()
secrets: inherit
with:
ENDPOINT_FILENAME: 't5x-test-overall-status.json'
PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
SCRIPT: |
STATUS=failure
if [[ ${{ needs.run-jobs.result }} == "success" ]]; then
EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
secrets: inherit

echo "Test statuses:"
jq -rc 'input_filename,.' $EXIT_STATUSES
#publish:
# needs: [metadata, run-jobs]
# uses: ./.github/workflows/_publish_t5x_results.yaml
# if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
# with:
# BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
# EXPERIMENT_SUBDIR: T5X
# secrets: inherit

if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then
BADGE_COLOR=brightgreen
STATUS=success
elif [[ $PASSED_TESTS -eq 0 ]]; then
BADGE_COLOR=red
else
BADGE_COLOR=yellow
fi
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
else
echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
echo "COLOR='red'" >> $GITHUB_OUTPUT
fi
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT
#publish-verified:
# if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
# needs: [metadata, run-jobs]
# uses: ./.github/workflows/_publish_container.yaml
# secrets: inherit
# with:
# SOURCE_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
# TARGET_IMAGE: upstream-t5x
# TARGET_TAGS: |
# type=raw,value=latest-verified,priority=1000

publish-verified:
if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
needs: [metadata, publish-completion]
uses: ./.github/workflows/_publish_container.yaml
secrets: inherit
with:
SOURCE_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
TARGET_IMAGE: upstream-t5x
TARGET_TAGS: |
type=raw,value=latest-verified,priority=1000

triage:
needs: [metadata, publish-completion]
uses: ./.github/workflows/_triage.yaml
if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
secrets: inherit
with:
BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified
REPO_DIRS: "/opt/t5x /opt/flax"
FILE_ISSUE: true
#triage:
# needs: [metadata, run-jobs]
# uses: ./.github/workflows/_triage.yaml
# if: needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
# secrets: inherit
# with:
# BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
# BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified
# REPO_DIRS: "/opt/t5x /opt/flax"
# FILE_ISSUE: true

if-upstream-failed:
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@

[test-badge-jax-V100]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-jax-unit-test-V100.json&logo=nvidia
[test-badge-jax-A100]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-jax-unit-test-A100.json&logo=nvidia
[test-badge-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Ft5x-test-overall-status.json&logo=nvidia
[test-badge-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fpax-test-completion-status.json&logo=nvidia
[test-badge-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fupstream-t5x-test-overall-status.deleteme.json&logo=nvidia
[test-badge-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fupstream-pax-test-overall-status.deleteme.json&logo=nvidia
[unit-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-unit-test-status.json&logo=nvidia
[integration-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-integration-test-status.json&logo=nvidia
[test-badge-rosetta-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-t5x-overall-test-status.json&logo=nvidia
Expand Down