Avoid re-assigning the global pid for client backends and bg workers when the application_name changes #7791

Merged (12 commits) on Dec 23, 2024
10 changes: 10 additions & 0 deletions src/backend/distributed/metadata/metadata_cache.c
@@ -4545,6 +4545,16 @@ GetLocalNodeId(void)
}


/*
* CachedLocalNodeIdIsValid returns true if the cached local node id is valid.
*/
bool
CachedLocalNodeIdIsValid(void)
{
return LocalNodeId != -1;
}


/*
* RegisterLocalGroupIdCacheCallbacks registers the callbacks required to
* maintain LocalGroupId at a consistent value. It's separate from
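For context, here is a minimal sketch (not the actual Citus source; LookupLocalNodeIdFromCatalog and InvalidateLocalNodeIdCallback are hypothetical names) of the caching pattern this new helper checks against: GetLocalNodeId() memoizes the node id in the static LocalNodeId variable, and an invalidation callback resets it to -1, which is exactly the sentinel that CachedLocalNodeIdIsValid() tests for.

static int32 LocalNodeId = -1;

int32
GetLocalNodeId(void)
{
	if (LocalNodeId == -1)
	{
		/* needs catalog access, hence only safe inside a transaction */
		LocalNodeId = LookupLocalNodeIdFromCatalog();
	}

	return LocalNodeId;
}

static void
InvalidateLocalNodeIdCallback(Datum argument, Oid relationId)
{
	/* force the next GetLocalNodeId() call to consult the catalog again */
	LocalNodeId = -1;
}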
10 changes: 9 additions & 1 deletion src/backend/distributed/shared_library_init.c
@@ -2899,6 +2899,13 @@ ApplicationNameAssignHook(const char *newval, void *extra)
* So we set the FinishedStartupCitusBackend flag in StartupCitusBackend to
* indicate when this responsibility handoff has happened.
*
* On the other hand, even once it's this hook's responsibility to update
* the global pid, we cannot do so if the cached local node id has been
* invalidated and we're not allowed to access the catalog tables. Within a
* transaction block, catalog access is allowed. For this reason, in
* addition to checking FinishedStartupCitusBackend, we also require either
* being in a transaction block or the cached local node id being valid.
*
* Another solution to the catalog table access problem would be to update
* the global pid lazily, like we do for HideShards. But that's not possible
* for the global pid, since it is stored in shared memory instead of in a
@@ -2907,7 +2914,8 @@ ApplicationNameAssignHook(const char *newval, void *extra)
* as reasonably possible, which is also why we extract global pids in the
* AuthHook already (extracting doesn't require catalog access).
*/
if (FinishedStartupCitusBackend)
if (FinishedStartupCitusBackend &&
(IsTransactionState() || CachedLocalNodeIdIsValid()))
{
AssignGlobalPID(newval);
}
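To see why updating the global pid can require catalog access at all, here is a rough sketch of the composition (the multiplier value and the ComputeGlobalPID name are assumptions for illustration, not the exact Citus definitions): the global pid embeds the local node id, and obtaining that node id needs a pg_dist_node lookup whenever the cached value has been invalidated.

#define GLOBAL_PID_NODE_ID_MULTIPLIER 10000000000ULL

static uint64
ComputeGlobalPID(void)
{
	/* may trigger a catalog lookup if the cached local node id is invalid */
	int32 nodeId = GetLocalNodeId();

	return ((uint64) nodeId) * GLOBAL_PID_NODE_ID_MULTIPLIER + (uint64) MyProcPid;
}

With the guard above, this computation only runs when it is known to be safe: either we are inside a transaction (so the catalog lookup is allowed) or the cached node id is still valid (so no lookup is needed).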
1 change: 1 addition & 0 deletions src/include/distributed/metadata_cache.h
@@ -181,6 +181,7 @@ extern CitusTableCacheEntry * LookupCitusTableCacheEntry(Oid relationId)
extern DistObjectCacheEntry * LookupDistObjectCacheEntry(Oid classid, Oid objid, int32
objsubid);
extern int32 GetLocalGroupId(void);
extern bool CachedLocalNodeIdIsValid(void);
extern int32 GetLocalNodeId(void);
extern void CitusTableCacheFlushInvalidatedEntries(void);
extern Oid LookupShardRelationFromCatalog(int64 shardId, bool missing_ok);
18 changes: 11 additions & 7 deletions src/test/regress/expected/remove_coordinator.out
@@ -5,10 +5,14 @@ SELECT master_remove_node('localhost', :master_port);

(1 row)

-- restore coordinator for the rest of the tests
SELECT citus_set_coordinator_host('localhost', :master_port);
citus_set_coordinator_host
---------------------------------------------------------------------

(1 row)

-- to silence potentially flaky "could not establish connection after" warnings in the test below
SET client_min_messages TO ERROR;
-- to fail fast if the hostname is not resolvable
SET citus.node_connection_timeout to '1s';
BEGIN;
SET application_name TO 'new_app_name';
-- that should fail because of bad hostname & port
SELECT citus_add_node('200.200.200.200', 1, 200);
ERROR: connection to the remote node postgres@200.200.200.200:1 failed
SSL SYSCALL error: EOF detected
connection to server was lost
35 changes: 35 additions & 0 deletions src/test/regress/sql/remove_coordinator.sql
@@ -1,5 +1,40 @@
-- removing coordinator from pg_dist_node should update pg_dist_colocation
SELECT master_remove_node('localhost', :master_port);

-- to silence potentially flaky "could not establish connection after" warnings in the test below
SET client_min_messages TO ERROR;

-- to fail fast if the hostname is not resolvable
SET citus.node_connection_timeout to '1s';

BEGIN;
SET application_name TO 'new_app_name';

-- that should fail because of bad hostname & port
SELECT citus_add_node('200.200.200.200', 1, 200);

-- Since the above command failed, Postgres now needs to revert the
-- application_name change made in this transaction, and this will
-- happen within the abort-transaction callback, so we won't be in a
-- transaction block while Postgres does that.
--
-- And when the application_name changes, Citus tries to re-assign
-- the global pid, but it does so only for Citus internal backends,
-- for which the re-assignment doesn't require being in a
-- transaction block and is safe.
--
-- However, for client external backends (like us here), Citus
-- doesn't re-assign the global pid because it's not safe to do so
-- outside of a transaction block. This is because it would require
-- a catalog access to retrieve the local node id when the cached
-- local node id is invalidated, as just happened here due to the
-- failed citus_add_node() call above.
--
-- So by failing here (rather than crashing), we verify this behavior.
ROLLBACK;

RESET client_min_messages;
RESET citus.node_connection_timeout;

-- restore coordinator for the rest of the tests
SELECT citus_set_coordinator_host('localhost', :master_port);
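
As a usage note (not part of this PR), one way to observe the stability the fix is after, assuming the citus_backend_gpid() UDF available in recent Citus releases:

-- illustration only: a client backend's global pid should stay stable
-- across application_name changes
SELECT citus_backend_gpid();
SET application_name TO 'some_other_name';
SELECT citus_backend_gpid(); -- expect the same value as before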