From b886cfa22316e1047f5c725f0f6b96b08c607ab2 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 31 Jan 2025 14:23:01 +0300 Subject: [PATCH 1/5] Upgrade upload-artifacts action to 4.6.0 --- .github/actions/save_logs_and_results/action.yml | 2 +- .github/actions/upload_coverage/action.yml | 2 +- .github/workflows/build_and_test.yml | 2 +- .github/workflows/flaky_test_debugging.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/save_logs_and_results/action.yml b/.github/actions/save_logs_and_results/action.yml index 0f238835d19..b344c68f2ef 100644 --- a/.github/actions/save_logs_and_results/action.yml +++ b/.github/actions/save_logs_and_results/action.yml @@ -6,7 +6,7 @@ inputs: runs: using: composite steps: - - uses: actions/upload-artifact@v3.1.1 + - uses: actions/upload-artifact@v4.6.0 name: Upload logs with: name: ${{ inputs.folder }} diff --git a/.github/actions/upload_coverage/action.yml b/.github/actions/upload_coverage/action.yml index 0b5f581a6a4..abffa784a03 100644 --- a/.github/actions/upload_coverage/action.yml +++ b/.github/actions/upload_coverage/action.yml @@ -21,7 +21,7 @@ runs: mkdir -p /tmp/codeclimate cc-test-reporter format-coverage -t lcov -o /tmp/codeclimate/${{ inputs.flags }}.json lcov.info shell: bash - - uses: actions/upload-artifact@v3.1.1 + - uses: actions/upload-artifact@v4.6.0 with: path: "/tmp/codeclimate/*.json" name: codeclimate diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3d9b57c370a..f7683109935 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -122,7 +122,7 @@ jobs: - name: Build run: "./ci/build-citus.sh" shell: bash - - uses: actions/upload-artifact@v3.1.1 + - uses: actions/upload-artifact@v4.6.0 with: name: build-${{ env.PG_MAJOR }} path: |- diff --git a/.github/workflows/flaky_test_debugging.yml b/.github/workflows/flaky_test_debugging.yml index a666c1cd5df..574cb7813ff 100644 --- a/.github/workflows/flaky_test_debugging.yml +++ b/.github/workflows/flaky_test_debugging.yml @@ -34,7 +34,7 @@ jobs: echo "PG_MAJOR=${PG_MAJOR}" >> $GITHUB_ENV ./ci/build-citus.sh shell: bash - - uses: actions/upload-artifact@v3.1.1 + - uses: actions/upload-artifact@v4.6.0 with: name: build-${{ env.PG_MAJOR }} path: |- From cbe0de33a67772b21a9984b57f6133ca7316d2dc Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 31 Jan 2025 14:39:55 +0300 Subject: [PATCH 2/5] Upgrade download-artifacts action to 4.1.8 --- .github/actions/setup_extension/action.yml | 2 +- .github/workflows/build_and_test.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup_extension/action.yml b/.github/actions/setup_extension/action.yml index 96b408e7e43..33129f17de6 100644 --- a/.github/actions/setup_extension/action.yml +++ b/.github/actions/setup_extension/action.yml @@ -17,7 +17,7 @@ runs: echo "PG_MAJOR=${{ inputs.pg_major }}" >> $GITHUB_ENV fi shell: bash - - uses: actions/download-artifact@v3.0.1 + - uses: actions/download-artifact@v4.1.8 with: name: build-${{ env.PG_MAJOR }} - name: Install Extension diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f7683109935..0703d1a7e42 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -424,7 +424,7 @@ jobs: - test-citus-upgrade - test-pg-upgrade steps: - - uses: actions/download-artifact@v3.0.1 + - uses: actions/download-artifact@v4.1.8 with: name: "codeclimate" path: "codeclimate" @@ 
-525,7 +525,7 @@ jobs:
       matrix: ${{ fromJson(needs.prepare_parallelization_matrix_32.outputs.json) }}
     steps:
     - uses: actions/checkout@v3.5.0
-    - uses: actions/download-artifact@v3.0.1
+    - uses: actions/download-artifact@v4.1.8
     - uses: "./.github/actions/setup_extension"
     - name: Run minimal tests
       run: |-
From 684b4c6b9666954bdf13c9eb597390eb71a66d48 Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Fri, 31 Jan 2025 14:56:33 +0300
Subject: [PATCH 3/5] Release RowExclusiveLock on pg_dist_transaction as soon
 as remote xacts are recovered

As of this commit, after recovering the remote transactions, we release
the lock on pg_dist_transaction while closing it to avoid deadlocks that
might occur because of trying to acquire a lock on pg_dist_authinfo
while holding a lock on pg_dist_transaction. Such a scenario can only
cause a deadlock if another transaction is trying to acquire a strong
lock on pg_dist_transaction while holding a lock on pg_dist_authinfo.
As of today, we (implicitly) acquire a strong lock on
pg_dist_transaction only when upgrading Citus to 11.3-1, and this
happens when creating a REPLICA IDENTITY on pg_dist_transaction.

And regardless of the code-path we are in, it should be okay to release
the lock there because all we do after that point is to abort the
prepared transactions that are not part of an in-progress distributed
transaction, and releasing the lock before doing so should be just fine.

This also changes the blocking behavior between
citus_create_restore_point and the transaction recovery code-path, in
the sense that citus_create_restore_point no longer blocks until
transaction recovery completes aborting the prepared transactions that
are not part of an in-progress distributed transaction. However, this
should be fine because such a situation was already possible even
before this change, e.g., when transaction recovery fails to open a
remote connection to a node.
---
 .../transaction/transaction_recovery.c       | 18 +++++++++++++++++-
 .../isolation_create_restore_point.out       |  7 +++----
 .../spec/isolation_create_restore_point.spec |  5 ++++-
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/backend/distributed/transaction/transaction_recovery.c b/src/backend/distributed/transaction/transaction_recovery.c
index f25823b3064..c114a2ed39f 100644
--- a/src/backend/distributed/transaction/transaction_recovery.c
+++ b/src/backend/distributed/transaction/transaction_recovery.c
@@ -347,7 +347,23 @@ RecoverWorkerTransactions(WorkerNode *workerNode)
 	}
 
 	systable_endscan(scanDescriptor);
-	table_close(pgDistTransaction, NoLock);
+
+	/*
+	 * Here we release the lock on pg_dist_transaction while closing it to avoid
+	 * deadlocks that might occur because of trying to acquire a lock on
+	 * pg_dist_authinfo while holding a lock on pg_dist_transaction. Such a scenario
+	 * can only cause a deadlock if another transaction is trying to acquire a strong
+	 * lock on pg_dist_transaction while holding a lock on pg_dist_authinfo. As of
+	 * today, we (implicitly) acquire a strong lock on pg_dist_transaction only when
+	 * upgrading Citus to 11.3-1 and this happens when creating a REPLICA IDENTITY on
+	 * pg_dist_transaction.
+	 *
+	 * And regardless of the code-path we are in, it should be okay to release the
+	 * lock now because all we do after this point is to abort the prepared
+	 * transactions that are not part of an in-progress distributed transaction and
+	 * releasing the lock before doing so should be just fine.
+ */ + table_close(pgDistTransaction, RowExclusiveLock); if (!recoveryFailed) { diff --git a/src/test/regress/expected/isolation_create_restore_point.out b/src/test/regress/expected/isolation_create_restore_point.out index 3b1bdf9eb52..dce15a35d5c 100644 --- a/src/test/regress/expected/isolation_create_restore_point.out +++ b/src/test/regress/expected/isolation_create_restore_point.out @@ -147,16 +147,15 @@ recover_prepared_transactions step s2-create-restore: SELECT 1 FROM citus_create_restore_point('citus-test'); - -step s1-commit: - COMMIT; -step s2-create-restore: <... completed> ?column? --------------------------------------------------------------------- 1 (1 row) +step s1-commit: + COMMIT; + starting permutation: s1-begin s1-drop s2-create-restore s1-commit create_reference_table diff --git a/src/test/regress/spec/isolation_create_restore_point.spec b/src/test/regress/spec/isolation_create_restore_point.spec index 2cdc66f850a..c62a64a449c 100644 --- a/src/test/regress/spec/isolation_create_restore_point.spec +++ b/src/test/regress/spec/isolation_create_restore_point.spec @@ -154,7 +154,10 @@ permutation "s1-begin" "s1-ddl" "s2-create-restore" "s1-commit" // verify that citus_create_restore_point is not blocked by concurrent COPY (only commit) permutation "s1-begin" "s1-copy" "s2-create-restore" "s1-commit" -// verify that citus_create_restore_point is blocked by concurrent recover_prepared_transactions +// verify that citus_create_restore_point is partially blocked by concurrent recover_prepared_transactions. +// In the test output, we won't be able to explicitly observe this since +// recover_prepared_transactions unblocks citus_create_restore_point after in-progress prepared transactions +// are recovered. permutation "s1-begin" "s1-recover" "s2-create-restore" "s1-commit" // verify that citus_create_restore_point is blocked by concurrent DROP TABLE From 24758c39a1dad2ed7f549d3013137deb94de3bf6 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 31 Jan 2025 14:59:23 +0300 Subject: [PATCH 4/5] Fix flaky citus upgrade test --- .../upgrade_pg_dist_cleanup_after_0.out | 9 +++++++++ .../upgrade_pg_dist_cleanup_before_0.out | 17 +++++++++++++++++ .../sql/upgrade_pg_dist_cleanup_after.sql | 5 +++++ .../sql/upgrade_pg_dist_cleanup_before.sql | 10 ++++++++++ 4 files changed, 41 insertions(+) diff --git a/src/test/regress/expected/upgrade_pg_dist_cleanup_after_0.out b/src/test/regress/expected/upgrade_pg_dist_cleanup_after_0.out index d71fad887c9..168c64ccaa3 100644 --- a/src/test/regress/expected/upgrade_pg_dist_cleanup_after_0.out +++ b/src/test/regress/expected/upgrade_pg_dist_cleanup_after_0.out @@ -28,3 +28,12 @@ SELECT * FROM pg_dist_cleanup; CALL citus_cleanup_orphaned_resources(); NOTICE: cleaned up 1 orphaned resources DROP TABLE table_with_orphaned_shards; +-- Re-enable automatic shard cleanup by maintenance daemon as +-- we have disabled it in upgrade_pg_dist_cleanup_before.sql +ALTER SYSTEM RESET citus.defer_shard_delete_interval; +SELECT pg_reload_conf(); + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + diff --git a/src/test/regress/expected/upgrade_pg_dist_cleanup_before_0.out b/src/test/regress/expected/upgrade_pg_dist_cleanup_before_0.out index a0cf9ceb1ea..dd6c8868e32 100644 --- a/src/test/regress/expected/upgrade_pg_dist_cleanup_before_0.out +++ b/src/test/regress/expected/upgrade_pg_dist_cleanup_before_0.out @@ -30,6 +30,23 @@ SELECT COUNT(*) FROM pg_dist_placement WHERE shardstate = 1 AND shardid IN (SELE (1 row) 
-- create an orphaned placement based on an existing one +-- +-- But before doing that, first disable automatic shard cleanup +-- by maintenance daemon so that we can reliably test the cleanup +-- in upgrade_pg_dist_cleanup_after.sql. +ALTER SYSTEM SET citus.defer_shard_delete_interval TO -1; +SELECT pg_reload_conf(); + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + +SELECT pg_sleep(0.1); + pg_sleep +--------------------------------------------------------------------- + +(1 row) + INSERT INTO pg_dist_placement(placementid, shardid, shardstate, shardlength, groupid) SELECT nextval('pg_dist_placement_placementid_seq'::regclass), shardid, 4, shardlength, 3-groupid FROM pg_dist_placement diff --git a/src/test/regress/sql/upgrade_pg_dist_cleanup_after.sql b/src/test/regress/sql/upgrade_pg_dist_cleanup_after.sql index e84c35b608c..333ac60ca93 100644 --- a/src/test/regress/sql/upgrade_pg_dist_cleanup_after.sql +++ b/src/test/regress/sql/upgrade_pg_dist_cleanup_after.sql @@ -13,3 +13,8 @@ SELECT COUNT(*) FROM pg_dist_placement WHERE shardid IN (SELECT shardid FROM pg_ SELECT * FROM pg_dist_cleanup; CALL citus_cleanup_orphaned_resources(); DROP TABLE table_with_orphaned_shards; + +-- Re-enable automatic shard cleanup by maintenance daemon as +-- we have disabled it in upgrade_pg_dist_cleanup_before.sql +ALTER SYSTEM RESET citus.defer_shard_delete_interval; +SELECT pg_reload_conf(); diff --git a/src/test/regress/sql/upgrade_pg_dist_cleanup_before.sql b/src/test/regress/sql/upgrade_pg_dist_cleanup_before.sql index 62ec8a1fb46..ec0eef353dc 100644 --- a/src/test/regress/sql/upgrade_pg_dist_cleanup_before.sql +++ b/src/test/regress/sql/upgrade_pg_dist_cleanup_before.sql @@ -16,6 +16,16 @@ SELECT create_distributed_table('table_with_orphaned_shards', 'a'); -- show all 32 placements are active SELECT COUNT(*) FROM pg_dist_placement WHERE shardstate = 1 AND shardid IN (SELECT shardid FROM pg_dist_shard WHERE logicalrelid='table_with_orphaned_shards'::regclass); -- create an orphaned placement based on an existing one +-- +-- But before doing that, first disable automatic shard cleanup +-- by maintenance daemon so that we can reliably test the cleanup +-- in upgrade_pg_dist_cleanup_after.sql. + +ALTER SYSTEM SET citus.defer_shard_delete_interval TO -1; +SELECT pg_reload_conf(); + +SELECT pg_sleep(0.1); + INSERT INTO pg_dist_placement(placementid, shardid, shardstate, shardlength, groupid) SELECT nextval('pg_dist_placement_placementid_seq'::regclass), shardid, 4, shardlength, 3-groupid FROM pg_dist_placement From 26f16a76542eca2caca7e25848663286c0c8c6d5 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 31 Jan 2025 15:46:42 +0300 Subject: [PATCH 5/5] Avoid publishing artifacts with conflicting names .. as documented in actions/upload-artifact#480. 
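
With upload-artifact v4, artifact names must be unique within a workflow
run, so each job now publishes its coverage artifact under a name suffixed
with its coverage flags, and the aggregating job downloads the pieces by
pattern and merges them into one directory. Roughly, the pattern looks like
the sketch below (step layout is illustrative, not a verbatim copy of the
workflow files; the names and inputs come from the hunks that follow):

    # producer side: one uniquely named coverage artifact per job
    - uses: actions/upload-artifact@v4.6.0
      with:
        name: codeclimate-${{ inputs.flags }}
        path: "/tmp/codeclimate/*.json"

    # consumer side: fetch every matching artifact and merge them into a
    # single directory, mirroring the old shared-name behavior of v3
    - uses: actions/download-artifact@v4.1.8
      with:
        pattern: codeclimate*
        path: codeclimate
        merge-multiple: true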
--- .github/actions/upload_coverage/action.yml | 2 +- .github/workflows/build_and_test.yml | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/actions/upload_coverage/action.yml b/.github/actions/upload_coverage/action.yml index abffa784a03..ba80ba63afa 100644 --- a/.github/actions/upload_coverage/action.yml +++ b/.github/actions/upload_coverage/action.yml @@ -24,4 +24,4 @@ runs: - uses: actions/upload-artifact@v4.6.0 with: path: "/tmp/codeclimate/*.json" - name: codeclimate + name: codeclimate-${{ inputs.flags }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0703d1a7e42..fefcc5bcbcd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -303,10 +303,12 @@ jobs: check-arbitrary-configs parallel=4 CONFIGS=$TESTS - uses: "./.github/actions/save_logs_and_results" if: always() + with: + folder: ${{ env.PG_MAJOR }}_arbitrary_configs_${{ matrix.parallel }} - uses: "./.github/actions/upload_coverage" if: always() with: - flags: ${{ env.pg_major }}_upgrade + flags: ${{ env.PG_MAJOR }}_arbitrary_configs_${{ matrix.parallel }} codecov_token: ${{ secrets.CODECOV_TOKEN }} test-pg-upgrade: name: PG${{ matrix.old_pg_major }}-PG${{ matrix.new_pg_major }} - check-pg-upgrade @@ -360,6 +362,8 @@ jobs: if: failure() - uses: "./.github/actions/save_logs_and_results" if: always() + with: + folder: ${{ env.old_pg_major }}_${{ env.new_pg_major }}_upgrade - uses: "./.github/actions/upload_coverage" if: always() with: @@ -405,10 +409,12 @@ jobs: done; - uses: "./.github/actions/save_logs_and_results" if: always() + with: + folder: ${{ env.PG_MAJOR }}_citus_upgrade - uses: "./.github/actions/upload_coverage" if: always() with: - flags: ${{ env.pg_major }}_upgrade + flags: ${{ env.PG_MAJOR }}_citus_upgrade codecov_token: ${{ secrets.CODECOV_TOKEN }} upload-coverage: if: always() @@ -426,8 +432,9 @@ jobs: steps: - uses: actions/download-artifact@v4.1.8 with: - name: "codeclimate" - path: "codeclimate" + pattern: codeclimate* + path: codeclimate + merge-multiple: true - name: Upload coverage results to Code Climate run: |- cc-test-reporter sum-coverage codeclimate/*.json -o total.json