
transport: log network reconnects with same peer process #128415


Open. Wants to merge 4 commits into base: main. Changes from 3 commits.

@@ -15,6 +15,7 @@
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.MockLog;
import org.elasticsearch.test.junit.annotations.TestLogging;
import org.elasticsearch.transport.ClusterConnectionManager;
import org.elasticsearch.transport.TcpTransport;
import org.elasticsearch.transport.TransportLogger;

@@ -27,7 +28,7 @@ public class ESLoggingHandlerIT extends ESNetty4IntegTestCase {

public void setUp() throws Exception {
super.setUp();
mockLog = MockLog.capture(ESLoggingHandler.class, TransportLogger.class, TcpTransport.class);
mockLog = MockLog.capture(ESLoggingHandler.class, TransportLogger.class, TcpTransport.class, ClusterConnectionManager.class);
}

public void tearDown() throws Exception {
@@ -17,15 +17,20 @@
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterApplier;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.injection.guice.Inject;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.Transport;
import org.elasticsearch.transport.TransportConnectionListener;
import org.elasticsearch.transport.TransportService;

import java.util.ArrayList;
Expand All @@ -35,6 +40,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

@@ -79,12 +85,14 @@ public class NodeConnectionsService extends AbstractLifecycleComponent {

private final TimeValue reconnectInterval;
private volatile ConnectionChecker connectionChecker;
private final ConnectionHistory connectionHistory;

@Inject
public NodeConnectionsService(Settings settings, ThreadPool threadPool, TransportService transportService) {
this.threadPool = threadPool;
this.transportService = transportService;
this.reconnectInterval = NodeConnectionsService.CLUSTER_NODE_RECONNECT_INTERVAL_SETTING.get(settings);
this.connectionHistory = new ConnectionHistory();
}

/**
@@ -101,6 +109,7 @@ public void connectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletion)
final List<Runnable> runnables = new ArrayList<>(discoveryNodes.getSize());
try (var refs = new RefCountingRunnable(onCompletion)) {
synchronized (mutex) {
connectionHistory.reserveConnectionHistoryForNodes(discoveryNodes);
// Ugly hack: when https://github.com/elastic/elasticsearch/issues/94946 is fixed, just iterate over discoveryNodes here
for (final Iterator<DiscoveryNode> iterator = discoveryNodes.mastersFirstStream().iterator(); iterator.hasNext();) {
final DiscoveryNode discoveryNode = iterator.next();
@@ -137,6 +146,7 @@ public void disconnectFromNodesExcept(DiscoveryNodes discoveryNodes) {
nodesToDisconnect.remove(discoveryNode);
}

connectionHistory.removeConnectionHistoryForNodes(nodesToDisconnect);
for (final DiscoveryNode discoveryNode : nodesToDisconnect) {
runnables.add(targetsByNode.remove(discoveryNode)::disconnect);
}
@@ -347,4 +357,113 @@ public String toString() {
}
}
}

private class ConnectionHistory {
Reviewer comment:
Yeah I like the look of this. Maybe ConnectionHistory implements TransportConnectionListener rather than having another layer of indirection?

Also this needs to be covered in NodeConnectionsServiceTests.
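
A minimal sketch of the shape that suggestion could take, assuming ConnectionHistory registers itself as the listener; the method bodies here are placeholders, not the PR's implementation:

    private class ConnectionHistory implements TransportConnectionListener {

        ConnectionHistory() {
            // register this object directly, removing the anonymous-listener indirection
            NodeConnectionsService.this.transportService.addConnectionListener(this);
        }

        @Override
        public void onNodeConnected(DiscoveryNode node, Transport.Connection connection) {
            // same reconnect-detection logic as the anonymous listener below
        }

        @Override
        public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection) {
            // same close-listener registration as the anonymous listener below
        }
    }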

record NodeConnectionHistory(String ephemeralId, long disconnectTime, Exception disconnectCause) {}

/**
* Maps each DiscoveryNode's nodeId to its connection history record.
*
* Entries for each node are reserved during NodeConnectionsService.connectToNodes, by placing a (nodeId, dummy) entry
* for each node in the cluster. On node disconnect, this entry is updated with its NodeConnectionHistory. On node
* connect, this entry is reset to the dummy value. On NodeConnectionsService.disconnectFromNodesExcept, node entries
* are removed.
*
* Each node in the cluster always has a nodeHistory entry that is either the dummy value or a connection history record. This
* allows node disconnect callbacks to discard their entry if the disconnect occurred because of a change in cluster state.
*/
private final NodeConnectionHistory dummy = new NodeConnectionHistory("", 0, null);
Reviewer comment:
Can be static I think, it's a global constant. We tend to name global constants in SHOUTY_SNAKE_CASE reflecting their meaning, so here I'd suggest CONNECTED or CONNECTED_MARKER or something like that. This way you get to say nodeConnectionHistory != CONNECTED_MARKER below which makes it clearer to the reader what this predicate means.
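
For illustration, a sketch of that suggestion (assuming the constant can be hoisted to a static context), with the predicate rewritten against it:

    // static sentinel meaning "currently connected, no disconnect recorded"
    private static final NodeConnectionHistory CONNECTED_MARKER = new NodeConnectionHistory("", 0, null);

    // the check in onNodeConnected then reads:
    if (nodeConnectionHistory != null
        && nodeConnectionHistory != CONNECTED_MARKER
        && nodeConnectionHistory.ephemeralId().equals(node.getEphemeralId())) {
        // ... log the reconnect-without-restart warning ...
    }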

Reviewer comment:
nit: also looks like the javadoc is for the nodeHistory field

private final ConcurrentMap<String, NodeConnectionHistory> nodeHistory = ConcurrentCollections.newConcurrentMap();

ConnectionHistory() {
NodeConnectionsService.this.transportService.addConnectionListener(new TransportConnectionListener() {
@Override
public void onNodeConnected(DiscoveryNode node, Transport.Connection connection) {
// log case where the remote node has same ephemeralId as its previous connection
// (the network was disrupted, but not the remote process)
NodeConnectionHistory nodeConnectionHistory = nodeHistory.get(node.getId());
if (nodeConnectionHistory != null) {
nodeHistory.replace(node.getId(), nodeConnectionHistory, dummy);
}
Reviewer comment:
This looks a little racy, although in practice I think it's fine because ClusterConnectionManager protects against opening multiple connections to the same node concurrently. Still, if we did all this (including the logging) within a nodeHistory.compute(node.getId(), ...) then there'd obviously be no races.
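
A rough sketch of that compute-based variant, with the check, the logging, and the reset folded into one atomic update. CONNECTED_MARKER is the sentinel suggested above, and logReconnectWithoutRestart is a hypothetical helper wrapping the warning below:

    nodeHistory.compute(node.getId(), (nodeId, previous) -> {
        if (previous != null
            && previous != CONNECTED_MARKER
            && previous.ephemeralId().equals(node.getEphemeralId())) {
            // same ephemeral ID as before the disconnect: the remote process did not restart,
            // so emit the warning inside the atomic remapping function
            logReconnectWithoutRestart(node, previous); // hypothetical helper
        }
        // absent stays absent (node no longer tracked); otherwise mark as connected again
        return previous == null ? null : CONNECTED_MARKER;
    });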


if (nodeConnectionHistory != null
&& nodeConnectionHistory != dummy
&& nodeConnectionHistory.ephemeralId.equals(node.getEphemeralId())) {
if (nodeConnectionHistory.disconnectCause != null) {
logger.warn(
() -> format(
"reopened transport connection to node [%s] "
+ "which disconnected exceptionally [%dms] ago but did not "
+ "restart, so the disconnection is unexpected; "
+ "if unexpected, see [{}] for troubleshooting guidance",
Reviewer comment:
No need for if unexpected here, I think the point is that this situation is always unexpected.

node.descriptionWithoutAttributes(),
nodeConnectionHistory.disconnectTime,
Reviewer comment:
This'll show the absolute disconnect time in milliseconds (i.e. since 1970) whereas I think we want to see the duration between the disconnect and the current time.
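
A small sketch of reporting the elapsed duration instead, using the same thread pool clock the disconnect path already records:

    // milliseconds since the recorded disconnect, rather than the epoch timestamp itself
    final long millisSinceDisconnect = threadPool.absoluteTimeInMillis() - nodeConnectionHistory.disconnectTime;

The [%dms] argument would then be millisSinceDisconnect rather than nodeConnectionHistory.disconnectTime.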

ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
),
nodeConnectionHistory.disconnectCause
);
} else {
logger.warn(
"""
reopened transport connection to node [{}] \
which disconnected gracefully [{}ms] ago but did not \
restart, so the disconnection is unexpected; \
if unexpected, see [{}] for troubleshooting guidance""",
node.descriptionWithoutAttributes(),
nodeConnectionHistory.disconnectTime,
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
);
}
}
}

@Override
public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection) {
Reviewer comment:
I just spotted we're already executing this in a close-listener, but one that runs under ActionListener.running(...) so it drops the exception. I think it'd be nicer to adjust this callback to take a @Nullable Exception e parameter rather than having to add a second close listener just to pick up the exception as done here.
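
For illustration, if that suggestion were taken and the disconnect callback carried the close exception directly (a hypothetical signature change, not the current interface), the handler could collapse to something like:

    @Override
    public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection, @Nullable Exception closeException) {
        // closeException is null for a graceful close; no second close listener is needed
        nodeHistory.computeIfPresent(
            node.getId(),
            (nodeId, previous) -> new NodeConnectionHistory(node.getEphemeralId(), threadPool.absoluteTimeInMillis(), closeException)
        );
    }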

connection.addCloseListener(new ActionListener<Void>() {
@Override
public void onResponse(Void ignored) {
insertNodeConnectionHistory(null);
}

@Override
public void onFailure(Exception e) {
insertNodeConnectionHistory(e);
}

private void insertNodeConnectionHistory(@Nullable Exception e) {
final long disconnectTime = threadPool.absoluteTimeInMillis();
final NodeConnectionHistory nodeConnectionHistory = new NodeConnectionHistory(
node.getEphemeralId(),
disconnectTime,
e
);
final String nodeId = node.getId();
NodeConnectionHistory previousConnectionHistory = nodeHistory.get(nodeId);
if (previousConnectionHistory != null) {
nodeHistory.replace(nodeId, previousConnectionHistory, nodeConnectionHistory);
}
}
});
}
});
}

void reserveConnectionHistoryForNodes(DiscoveryNodes nodes) {
Reviewer comment (@nicktindall, May 30, 2025):
nit: I wonder if this should be called something like startTrackingConnectionHistory (and the other method stop...), the "reserving" language seems like an implementation detail leaking?

I do like the implementation though, nice approach to fixing the race.

for (DiscoveryNode node : nodes) {
nodeHistory.put(node.getId(), dummy);
Reviewer comment:
This might need to be putIfAbsent so we don't over-write any actual current NodeConnectionHistory entries right?

Author reply:
I'm not sure. My read was these two calls would come from cluster state changing to add or remove nodes from this table. Inclusion is controlled by these calls, which unconditionally add or remove entries. The close callback has to be careful to check if it has an entry that's valid: this protects against long-running callbacks inserting garbage into the table.

Reviewer comment:
The DiscoveryNodes passed to connectToNodes contains all the nodes in the cluster, including any existing ones, so if there's a node which already exists in the cluster, and is currently disconnected, then it will have an entry in nodeHistory which isn't dummy that this line will overwrite on any cluster state update. So yeah I think putIfAbsent is what we want here.

Author reply:
I get it now -- for whatever reason, I thought it was passing in the deltas, but it's obvious from connectToNodes that the node connections service is doing that calculation.
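
Concretely, the suggested change to the reservation is a one-line swap (sketch):

    // keep any NodeConnectionHistory already recorded for a node that is still in the cluster;
    // only seed the connected marker for nodes that are not tracked yet
    nodeHistory.putIfAbsent(node.getId(), dummy);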

}
}

void removeConnectionHistoryForNodes(Set<DiscoveryNode> nodes) {
final int startSize = nodeHistory.size();
for (DiscoveryNode node : nodes) {
nodeHistory.remove(node.getId());
Reviewer comment:
There's kind of an implicit invariant here that org.elasticsearch.cluster.NodeConnectionsService.ConnectionHistory#nodeHistory and org.elasticsearch.cluster.NodeConnectionsService#targetsByNode have the same keys. At the very least we should be able to assert this. I also wonder if we should be calling nodeHistory.retainAll() to make it super-clear that we are keeping these keysets aligned.

But then that got me thinking, maybe we should be tracking the connection history of each target node in ConnectionTarget rather than trying to maintain two parallel maps. Could that work?

Author reply:
This is a great idea... ConnectionTarget has exactly the lifecycle needed. I think because I moved it from elsewhere and am having a rough week over here, this didn't occur to me.
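
If the two parallel maps are kept rather than folding the history into ConnectionTarget, the keyset invariant mentioned above could be asserted roughly as follows (a sketch; assumes java.util.stream.Collectors is imported and both maps are read under the same mutex):

    // nodeHistory and targetsByNode should always track exactly the same set of node IDs
    assert nodeHistory.keySet()
        .equals(targetsByNode.keySet().stream().map(DiscoveryNode::getId).collect(Collectors.toSet()))
        : "connection history out of sync with connection targets";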

}
logger.trace("Connection history garbage-collected from {} to {} entries", startSize, nodeHistory.size());
}

int connectionHistorySize() {
return nodeHistory.size();
}
}
}
@@ -235,25 +235,48 @@ private void connectToNodeOrRetry(
managerRefs.decRef();
}));

conn.addCloseListener(ActionListener.running(() -> {
if (connectingRefCounter.hasReferences() == false) {
logger.trace("connection manager shut down, closing transport connection to [{}]", node);
} else if (conn.hasReferences()) {
logger.info(
"""
transport connection to [{}] closed by remote; \
if unexpected, see [{}] for troubleshooting guidance""",
node.descriptionWithoutAttributes(),
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
);
// In production code we only close connections via ref-counting, so this message confirms that a
// 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a
// node leaves the cluster with "reason: disconnected" but without this message being logged then
// that's a bug.
} else {
logger.debug("closing unused transport connection to [{}]", node);
conn.addCloseListener(new ActionListener<Void>() {
@Override
public void onResponse(Void ignored) {
if (connectingRefCounter.hasReferences() == false) {
logger.trace("connection manager shut down, closing transport connection to [{}]", node);
} else if (conn.hasReferences()) {
logger.info(
"""
transport connection to [{}] closed by remote; \
if unexpected, see [{}] for troubleshooting guidance""",
node.descriptionWithoutAttributes(),
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
);
// In production code we only close connections via ref-counting, so this message confirms that
// a 'node-left ... reason: disconnected' event was caused by external factors. Put
// differently, if a node leaves the cluster with "reason: disconnected" but without this
// message being logged then that's a bug.
} else {
logger.debug("closing unused transport connection to [{}]", node);
}
}
}));

@Override
public void onFailure(Exception e) {
if (conn.hasReferences()) {
logger.warn(
"""
transport connection to [{}] closed by remote with exception [{}]; \
if unexpected, see [{}] for troubleshooting guidance""",
Reviewer comment:
I think this isn't guaranteed to be a WARN worthy event - if the node shut down then we might get a Connection reset or similar but that's not something that needs action, and we do log those exceptions elsewhere. On reflection I'd rather leave the logging in ClusterConnectionManager alone in this PR and just look at the new logs from the NodeConnectionsService.

node.descriptionWithoutAttributes(),
e,
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
);
} else {
logger.debug(
"closing unused transport connection to [{}], exception [{}]",
node.descriptionWithoutAttributes(),
e
);
Reviewer comment:
It looks like previously we would only have logged at debug level in this scenario? unless I'm reading it wrong. I'm not sure how interesting this case is (as we were disconnecting from the node anyway)?

}
}
});
}
}
} finally {
@@ -0,0 +1,50 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.transport;

import org.apache.logging.log4j.Level;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.MockLog;
import org.elasticsearch.test.junit.annotations.TestLogging;

@ESIntegTestCase.ClusterScope(numDataNodes = 2, scope = ESIntegTestCase.Scope.TEST)
public class ClusterConnectionManagerIntegTests extends ESIntegTestCase {
Reviewer comment:
nit: ESIntegTestCase tests should have names ending in IT and be in the internalClusterTest source set. But as mentioned in my previous comment we probably don't want to change this here.

private MockLog mockLog;

public void setUp() throws Exception {
super.setUp();
mockLog = MockLog.capture(ClusterConnectionManager.class);
}

public void tearDown() throws Exception {
mockLog.close();
super.tearDown();
}

@TestLogging(
value = "org.elasticsearch.transport.ClusterConnectionManager:WARN",
reason = "to ensure we log cluster manager disconnect events on WARN level"
)
public void testExceptionalDisconnectLoggingInClusterConnectionManager() throws Exception {
mockLog.addExpectation(
new MockLog.PatternSeenEventExpectation(
"cluster connection manager exceptional disconnect log",
ClusterConnectionManager.class.getCanonicalName(),
Level.WARN,
"transport connection to \\[.*\\] closed (by remote )?with exception .*"
)
);

final String nodeName = internalCluster().startNode();
internalCluster().restartNode(nodeName);

mockLog.assertAllExpectationsMatched();
}
}