diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index b70e3c87a..9571e144c 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -13,6 +13,11 @@ on: description: Extra command line args to pass to test-pax.sh default: "" required: false + PUBLISH: + type: boolean + description: Publish badge? + default: false + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' @@ -201,8 +206,8 @@ jobs: if: ( always() ) secrets: inherit with: - ENDPOINT_FILENAME: 'pax-test-status.json' - PUBLISH: false + ENDPOINT_FILENAME: 'upstream-pax-test-overall-status.deleteme.json' + PUBLISH: ${{ inputs.PUBLISH }} SCRIPT: | EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*FSDP*TP*PP/*-status.json" PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) @@ -244,7 +249,7 @@ jobs: BADGE_COLOR=yellow fi echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT} - echo "LABEL='Completion'" >> $GITHUB_OUTPUT + echo "LABEL='Upstream tests'" >> $GITHUB_OUTPUT echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 4a4f9ab21..89e096473 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -18,6 +18,11 @@ on: description: Extra gin args to pass to test-t5x.sh default: "" required: false + PUBLISH: + type: boolean + description: Publish badge? 
+ default: false + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' @@ -312,8 +317,8 @@ jobs: if: ( always() ) secrets: inherit with: - ENDPOINT_FILENAME: 't5x-test-completion-status.json' - PUBLISH: false + ENDPOINT_FILENAME: 'upstream-t5x-test-overall-status.deleteme.json' + PUBLISH: ${{ inputs.PUBLISH }} SCRIPT: | EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json" PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) @@ -356,7 +361,7 @@ jobs: BADGE_COLOR=yellow fi echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT} - echo "LABEL='Completion'" >> $GITHUB_OUTPUT + echo "LABEL='Upstream tests'" >> $GITHUB_OUTPUT echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml index db041cd77..24de966cb 100644 --- a/.github/workflows/nightly-pax-test-mgmn.yaml +++ b/.github/workflows/nightly-pax-test-mgmn.yaml @@ -55,112 +55,77 @@ jobs: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} - secrets: inherit - - publish: - needs: [metadata, run-jobs] - runs-on: ubuntu-22.04 - steps: - - name: Setup SSH agent - uses: webfactory/ssh-agent@v0.8.0 - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - - name: Setup SSH known hosts - id: ssh-known-hosts - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/known_hosts << EOF - ${{ vars.SSH_KNOWN_HOSTS }} - EOF - chmod 600 ~/.ssh/known_hosts - echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - - name: Setup SSH config - id: ssh-config - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/config << EOF - ${{ vars.SSH_CONFIG }} - EOF - chmod 
600 ~/.ssh/config - - - name: Create dated folder and generate TensorBoard query URL - id: mkdir - shell: bash -x -e {0} - run: | - FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX" - # copy folder - ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER} - ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ - # generate query URL - ( - cat << EOF - - ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }} - - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per) - - EOF - ) | tee $GITHUB_STEP_SUMMARY - - publish-completion: - needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_badge.yaml - if: success() || failure() - secrets: inherit - with: - ENDPOINT_FILENAME: 'pax-test-completion-status.json' PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - STATUS=failure - if [[ ${{ needs.run-jobs.result }} == "success" ]]; then - EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - echo "Test statuses:" - jq -rc 'input_filename,.' 
$EXIT_STATUSES - - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then - BADGE_COLOR=brightgreen - STATUS=success - elif [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - else - echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT - echo "COLOR='red'" >> $GITHUB_OUTPUT - fi - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT - - publish-verified: - if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - needs: [metadata, publish-completion] - uses: ./.github/workflows/_publish_container.yaml secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} - TARGET_IMAGE: upstream-pax - TARGET_TAGS: | - type=raw,value=latest-verified,priority=1000 - - triage: - needs: [metadata, publish-completion] - uses: ./.github/workflows/_triage.yaml - if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') - secrets: inherit - with: - BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} - BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified - REPO_DIRS: "/opt/paxml /opt/praxis" - FILE_ISSUE: true + + #publish: + # needs: [metadata, run-jobs] + # runs-on: ubuntu-22.04 + # steps: + # - name: Setup SSH agent + # uses: webfactory/ssh-agent@v0.8.0 + # with: + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + # - name: Setup SSH known hosts + # id: ssh-known-hosts + # run: | + # mkdir -p ~/.ssh + # cat >> ~/.ssh/known_hosts << EOF + # ${{ vars.SSH_KNOWN_HOSTS }} + # EOF + # chmod 600 ~/.ssh/known_hosts + # echo "FILE=$(realpath 
~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + # - name: Setup SSH config + # id: ssh-config + # run: | + # mkdir -p ~/.ssh + # cat >> ~/.ssh/config << EOF + # ${{ vars.SSH_CONFIG }} + # EOF + # chmod 600 ~/.ssh/config + + # - name: Create dated folder and generate TensorBoard query URL + # id: mkdir + # shell: bash -x -e {0} + # run: | + # FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX" + # # copy folder + # ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER} + # ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ + # # generate query URL + # ( + # cat << EOF + + # ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }} + + # [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per) + + # EOF + # ) | tee $GITHUB_STEP_SUMMARY + + #publish-verified: + # if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) + # needs: [metadata, run-jobs] + # uses: ./.github/workflows/_publish_container.yaml + # secrets: inherit + # with: + # SOURCE_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} + # TARGET_IMAGE: upstream-pax + # TARGET_TAGS: | + # type=raw,value=latest-verified,priority=1000 + + #triage: + # needs: [metadata, run-jobs] + # uses: ./.github/workflows/_triage.yaml + # if: needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') + # secrets: inherit + # with: + # BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} + # BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified + # REPO_DIRS: "/opt/paxml /opt/praxis" + # FILE_ISSUE: true if-upstream-failed: runs-on: ubuntu-latest diff --git 
a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml index 40fa91819..84278b7f3 100644 --- a/.github/workflows/nightly-t5x-test-mgmn.yaml +++ b/.github/workflows/nightly-t5x-test-mgmn.yaml @@ -55,74 +55,39 @@ jobs: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} - secrets: inherit - - publish: - needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_t5x_results.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - EXPERIMENT_SUBDIR: T5X - secrets: inherit - - publish-completion: - needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_badge.yaml - if: success() || failure() - secrets: inherit - with: - ENDPOINT_FILENAME: 't5x-test-overall-status.json' PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - STATUS=failure - if [[ ${{ needs.run-jobs.result }} == "success" ]]; then - EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) + secrets: inherit - echo "Test statuses:" - jq -rc 'input_filename,.' 
$EXIT_STATUSES + #publish: + # needs: [metadata, run-jobs] + # uses: ./.github/workflows/_publish_t5x_results.yaml + # if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + # with: + # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + # EXPERIMENT_SUBDIR: T5X + # secrets: inherit - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then - BADGE_COLOR=brightgreen - STATUS=success - elif [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - else - echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT - echo "COLOR='red'" >> $GITHUB_OUTPUT - fi - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT + #publish-verified: + # if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) + # needs: [metadata, run-jobs] + # uses: ./.github/workflows/_publish_container.yaml + # secrets: inherit + # with: + # SOURCE_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} + # TARGET_IMAGE: upstream-t5x + # TARGET_TAGS: | + # type=raw,value=latest-verified,priority=1000 - publish-verified: - if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - needs: [metadata, publish-completion] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} - TARGET_IMAGE: upstream-t5x - TARGET_TAGS: | - type=raw,value=latest-verified,priority=1000 - - triage: - needs: [metadata, publish-completion] - uses: 
./.github/workflows/_triage.yaml - if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') - secrets: inherit - with: - BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} - BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified - REPO_DIRS: "/opt/t5x /opt/flax" - FILE_ISSUE: true + #triage: + # needs: [metadata, run-jobs] + # uses: ./.github/workflows/_triage.yaml + # if: needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') + # secrets: inherit + # with: + # BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} + # BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified + # REPO_DIRS: "/opt/t5x /opt/flax" + # FILE_ISSUE: true if-upstream-failed: runs-on: ubuntu-latest diff --git a/README.md b/README.md index 26adbcda5..80fa14d68 100644 --- a/README.md +++ b/README.md @@ -116,8 +116,8 @@ [test-badge-jax-V100]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-jax-unit-test-V100.json&logo=nvidia [test-badge-jax-A100]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-jax-unit-test-A100.json&logo=nvidia -[test-badge-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Ft5x-test-overall-status.json&logo=nvidia -[test-badge-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fpax-test-completion-status.json&logo=nvidia +[test-badge-t5x]: 
https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fupstream-t5x-test-overall-status.deleteme.json&logo=nvidia +[test-badge-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fupstream-pax-test-overall-status.deleteme.json&logo=nvidia [unit-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-unit-test-status.json&logo=nvidia [integration-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-integration-test-status.json&logo=nvidia [test-badge-rosetta-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-t5x-overall-test-status.json&logo=nvidia