diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index d004b19894d..21ae6b4b5b0 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -2853,14 +2853,27 @@ ApplicationNameAssignHook(const char *newval, void *extra) DetermineCitusBackendType(newval); /* - * AssignGlobalPID might read from catalog tables to get the the local - * nodeid. But ApplicationNameAssignHook might be called before catalog - * access is available to the backend (such as in early stages of - * authentication). We use StartupCitusBackend to initialize the global pid - * after catalogs are available. After that happens this hook becomes - * responsible to update the global pid on later application_name changes. - * So we set the FinishedStartupCitusBackend flag in StartupCitusBackend to - * indicate when this responsibility handoff has happened. + * We use StartupCitusBackend to initialize the global pid after catalogs + * are available. After that happens this hook becomes responsible to update + * the global pid on later application_name changes. So we set the + * FinishedStartupCitusBackend flag in StartupCitusBackend to indicate when + * this responsibility handoff has happened. + * + * Also note that when application_name changes, we don't actually need to + * try re-assigning the global pid for external client backends and + * background workers because application_name doesn't affect the global + * pid for such backends - note that !IsExternalClientBackend() check covers + * both types of backends. Plus, + * trying to re-assign the global pid for such backends would unnecessarily + * cause performing a catalog access when the cached local node id is + * invalidated. However, accessing to the catalog tables is dangerous in + * certain situations like when we're not in a transaction block. And for + * the other types of backends, i.e., the Citus internal backends, we need + * to re-assign the global pid when the application_name changes because for + * such backends we simply extract the global pid inherited from the + * originating backend from the application_name -that's specified by + * originating backend when openning that connection- and this doesn't require + * catalog access. * * Another solution to the catalog table acccess problem would be to update * global pid lazily, like we do for HideShards. But that's not possible @@ -2870,7 +2883,7 @@ ApplicationNameAssignHook(const char *newval, void *extra) * as reasonably possible, which is also why we extract global pids in the * AuthHook already (extracting doesn't require catalog access). */ - if (FinishedStartupCitusBackend) + if (FinishedStartupCitusBackend && !IsExternalClientBackend()) { AssignGlobalPID(newval); } diff --git a/src/backend/distributed/test/run_from_same_connection.c b/src/backend/distributed/test/run_from_same_connection.c index 5663a42fa73..9e664b03d54 100644 --- a/src/backend/distributed/test/run_from_same_connection.c +++ b/src/backend/distributed/test/run_from_same_connection.c @@ -190,6 +190,9 @@ run_commands_on_session_level_connection_to_node(PG_FUNCTION_ARGS) /* * override_backend_data_gpid is a wrapper around SetBackendDataGpid(). + * Also sets distributedCommandOriginator to true since the only caller of + * this method calls this function actually wants this backend to + * be treated as a distributed command originator with the given global pid. */ Datum override_backend_data_gpid(PG_FUNCTION_ARGS) @@ -199,6 +202,7 @@ override_backend_data_gpid(PG_FUNCTION_ARGS) uint64 gpid = PG_GETARG_INT64(0); SetBackendDataGlobalPID(gpid); + SetBackendDataDistributedCommandOriginator(true); PG_RETURN_VOID(); } diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index e2afd18f74a..384954b81f4 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -964,6 +964,23 @@ SetBackendDataGlobalPID(uint64 gpid) } +/* + * SetBackendDataDistributedCommandOriginator sets the distributedCommandOriginator + * field on MyBackendData. + */ +void +SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator) +{ + if (!MyBackendData) + { + return; + } + SpinLockAcquire(&MyBackendData->mutex); + MyBackendData->distributedCommandOriginator = distributedCommandOriginator; + SpinLockRelease(&MyBackendData->mutex); +} + + /* * GetGlobalPID returns the global process id of the current backend. */ diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h index 8014fe5a6a9..5b3fcf2ac22 100644 --- a/src/include/distributed/backend_data.h +++ b/src/include/distributed/backend_data.h @@ -61,6 +61,7 @@ extern void AssignGlobalPID(const char *applicationName); extern uint64 GetGlobalPID(void); extern void SetBackendDataDatabaseId(void); extern void SetBackendDataGlobalPID(uint64 gpid); +extern void SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator); extern uint64 ExtractGlobalPID(const char *applicationName); extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk); extern int ExtractProcessIdFromGlobalPID(uint64 globalPID); diff --git a/src/test/regress/expected/remove_coordinator.out b/src/test/regress/expected/remove_coordinator.out index 0226a7cd073..e2fd5df027b 100644 --- a/src/test/regress/expected/remove_coordinator.out +++ b/src/test/regress/expected/remove_coordinator.out @@ -5,6 +5,37 @@ SELECT master_remove_node('localhost', :master_port); (1 row) +-- to silence -potentially flaky- "could not establish connection after" warnings in below test +SET client_min_messages TO ERROR; +-- to fail fast when the hostname is not resolvable, as it will be the case below +SET citus.node_connection_timeout to '1s'; +BEGIN; + SET application_name TO 'new_app_name'; + -- that should fail because of bad hostname & port + SELECT citus_add_node('200.200.200.200', 1, 200); +ERROR: connection to the remote node postgres@200.200.200.200:1 failed + -- Since above command failed, now Postgres will need to revert the + -- application_name change made in this transaction and this will + -- happen within abort-transaction callback, so we won't be in a + -- transaction block while Postgres does that. + -- + -- And when the application_name changes, Citus tries to re-assign + -- the global pid but it does so only for Citus internal backends, + -- and doing so for Citus internal backends doesn't require being + -- in a transaction block and is safe. + -- + -- However, for the client external backends (like us here), Citus + -- doesn't re-assign the global pid because it's not needed and it's + -- not safe to do so outside of a transaction block. This is because, + -- it would require performing a catalog access to retrive the local + -- node id when the cached local node is invalidated like what just + -- happened here because of the failed citus_add_node() call made + -- above. + -- + -- So by failing here (rather than crashing), we ensure this behavior. +ROLLBACK; +RESET client_min_messages; +RESET citus.node_connection_timeout; -- restore coordinator for the rest of the tests SELECT citus_set_coordinator_host('localhost', :master_port); citus_set_coordinator_host diff --git a/src/test/regress/sql/remove_coordinator.sql b/src/test/regress/sql/remove_coordinator.sql index b0df327d15f..35a8a57189b 100644 --- a/src/test/regress/sql/remove_coordinator.sql +++ b/src/test/regress/sql/remove_coordinator.sql @@ -1,5 +1,41 @@ -- removing coordinator from pg_dist_node should update pg_dist_colocation SELECT master_remove_node('localhost', :master_port); +-- to silence -potentially flaky- "could not establish connection after" warnings in below test +SET client_min_messages TO ERROR; + +-- to fail fast when the hostname is not resolvable, as it will be the case below +SET citus.node_connection_timeout to '1s'; + +BEGIN; + SET application_name TO 'new_app_name'; + + -- that should fail because of bad hostname & port + SELECT citus_add_node('200.200.200.200', 1, 200); + + -- Since above command failed, now Postgres will need to revert the + -- application_name change made in this transaction and this will + -- happen within abort-transaction callback, so we won't be in a + -- transaction block while Postgres does that. + -- + -- And when the application_name changes, Citus tries to re-assign + -- the global pid but it does so only for Citus internal backends, + -- and doing so for Citus internal backends doesn't require being + -- in a transaction block and is safe. + -- + -- However, for the client external backends (like us here), Citus + -- doesn't re-assign the global pid because it's not needed and it's + -- not safe to do so outside of a transaction block. This is because, + -- it would require performing a catalog access to retrive the local + -- node id when the cached local node is invalidated like what just + -- happened here because of the failed citus_add_node() call made + -- above. + -- + -- So by failing here (rather than crashing), we ensure this behavior. +ROLLBACK; + +RESET client_min_messages; +RESET citus.node_connection_timeout; + -- restore coordinator for the rest of the tests SELECT citus_set_coordinator_host('localhost', :master_port);