Skip to content

Commit 09fa1cd

Browse files
committed
added documentation to leader election
1 parent b43bd65 commit 09fa1cd

File tree

5 files changed

+330
-179
lines changed

5 files changed

+330
-179
lines changed

coordination/src/main/java/tech/ydb/coordination/recipes/election/LeaderElection.java

Lines changed: 135 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@
1111
import java.util.concurrent.Executors;
1212
import java.util.concurrent.Future;
1313
import java.util.concurrent.ScheduledExecutorService;
14+
import java.util.concurrent.ThreadFactory;
1415
import java.util.concurrent.atomic.AtomicReference;
1516
import java.util.function.Supplier;
1617
import java.util.stream.Collectors;
1718
import java.util.stream.Stream;
1819

1920
import com.google.common.base.Preconditions;
21+
import com.google.common.util.concurrent.ThreadFactoryBuilder;
2022
import org.slf4j.Logger;
2123
import org.slf4j.LoggerFactory;
2224

@@ -35,9 +37,27 @@
3537
import tech.ydb.core.Status;
3638
import tech.ydb.core.StatusCode;
3739

38-
// TODO: документцаия / логгирование / рекомендации по коду
40+
/**
41+
* A distributed leader election implementation using coordination services.
42+
* This class provides a mechanism for multiple instances to compete for leadership
43+
* of a named resource, with exactly one instance becoming the leader at any time.
44+
*
45+
* <p>The election process uses a semaphore-based approach where:
46+
* <ul>
47+
* <li>The leader holds the semaphore lock</li>
48+
* <li>Other participants wait in a queue</li>
49+
* <li>Leadership can be voluntarily released or lost due to session issues</li>
50+
* </ul>
51+
*
52+
* <p>Thread safety: This class is thread-safe. All public methods can be called
53+
* from multiple threads concurrently.
54+
*/
3955
public class LeaderElection implements Closeable, SessionListenableProvider {
4056
private static final Logger logger = LoggerFactory.getLogger(LeaderElection.class);
57+
private static final ThreadFactory threadFactory = new ThreadFactoryBuilder()
58+
.setNameFormat("ydb-leader-election-%d")
59+
.setDaemon(true)
60+
.build();
4161
private static final long MAX_LEASE = 1L;
4262

4363
private final LeaderElectionListener leaderElectionListener;
@@ -68,6 +88,15 @@ private enum State {
6888
CLOSED
6989
}
7090

91+
/**
92+
* Creates a new LeaderElection instance with default settings.
93+
*
94+
* @param client the coordination client to use
95+
* @param coordinationNodePath path to the coordination node
96+
* @param electionName name of the election (must be unique per coordination node)
97+
* @param data optional data to associate with the leader (visible to all participants)
98+
* @param leaderElectionListener callback for leadership events
99+
*/
71100
public LeaderElection(
72101
CoordinationClient client,
73102
String coordinationNodePath,
@@ -86,6 +115,17 @@ public LeaderElection(
86115
);
87116
}
88117

118+
/**
119+
* Creates a new LeaderElection instance with custom settings.
120+
*
121+
* @param client the coordination client to use
122+
* @param coordinationNodePath path to the coordination node
123+
* @param electionName name of the election (must be unique per coordination node)
124+
* @param data optional data to associate with the leader (visible to all participants)
125+
* @param leaderElectionListener callback for leadership events
126+
* @param settings configuration settings for the election process
127+
* @throws NullPointerException if any required parameter is null
128+
*/
89129
public LeaderElection(
90130
CoordinationClient client,
91131
String coordinationNodePath,
@@ -94,21 +134,28 @@ public LeaderElection(
94134
LeaderElectionListener leaderElectionListener,
95135
LeaderElectionSettings settings
96136
) {
137+
Preconditions.checkNotNull(client, "CoordinationClient cannot be null");
138+
Preconditions.checkNotNull(coordinationNodePath, "Coordination node path cannot be null");
139+
Preconditions.checkNotNull(electionName, "Election name cannot be null");
140+
Preconditions.checkNotNull(leaderElectionListener, "LeaderElectionListener cannot be null");
141+
Preconditions.checkNotNull(settings, "LeaderElectionSettings cannot be null");
142+
97143
this.coordinationNodePath = coordinationNodePath;
98144
this.electionName = electionName;
99145
this.data = data;
100146
this.leaderElectionListener = leaderElectionListener;
101147
this.scheduledExecutor = settings.getScheduledExecutor();
102-
this.blockingExecutor = Executors.newSingleThreadExecutor(); // TODO: thread factory
148+
this.blockingExecutor = Executors.newSingleThreadExecutor(threadFactory);
103149
this.retryPolicy = settings.getRetryPolicy();
104150

105151
this.coordinationSession = client.createSession(coordinationNodePath);
106152
this.sessionListenable = new ListenableContainer<>();
107153
coordinationSession.addStateListener(sessionState -> {
108-
if (sessionState == CoordinationSession.State.LOST || sessionState == CoordinationSession.State.CLOSED) {
154+
if (!state.get().equals(State.CLOSED) && (sessionState == CoordinationSession.State.LOST ||
155+
sessionState == CoordinationSession.State.CLOSED)) {
109156
logger.error("Coordination session unexpectedly changed to {} state, marking election as FAILED",
110157
sessionState);
111-
state.set(State.FAILED);
158+
stopInternal(State.FAILED);
112159
}
113160
sessionListenable.notifyListeners(sessionState);
114161
});
@@ -127,6 +174,11 @@ public LeaderElection(
127174
);
128175
}
129176

177+
/**
178+
* Starts the leader election process.
179+
*
180+
* @throws IllegalStateException if the election is already started or closed
181+
*/
130182
public void start() {
131183
Preconditions.checkState(
132184
state.compareAndSet(State.INITIAL, State.STARTING),
@@ -159,9 +211,7 @@ public void start() {
159211
return semaphoreStatus;
160212
}).exceptionally(ex -> {
161213
logger.error("Leader election initializing task failed", ex);
162-
state.set(State.FAILED);
163-
semaphoreObserver.close();
164-
startingLatch.countDown();
214+
stopInternal(State.FAILED);
165215
return Status.of(StatusCode.CLIENT_INTERNAL_ERROR);
166216
});
167217

@@ -176,20 +226,30 @@ private CompletableFuture<Status> executeWithRetry(Supplier<CompletableFuture<St
176226
return new RetryableTask("leaderElectionInitialize", taskSupplier, scheduledExecutor, retryPolicy).execute();
177227
}
178228

229+
/**
230+
* Enables automatic requeueing when leadership is lost.
231+
* If called before start, the election will be started immediately.
232+
*/
179233
public void autoRequeue() {
180234
autoRequeue = true;
181235
}
182236

237+
/**
238+
* Checks if this instance is currently the leader.
239+
*
240+
* @return true if this instance is the leader, false otherwise
241+
*/
183242
public boolean isLeader() {
184243
return isLeader;
185244
}
186245

187246
/**
188247
* Re-queue an attempt for leadership. If this instance is already queued, nothing
189248
* happens and false is returned. If the instance was not queued, it is re-queued and true
190-
* is returned
249+
* is returned.
191250
*
192-
* @return true if re-enqueue was successful
251+
* @return true if re-queueing was successful
252+
* @throws IllegalStateException if the election is not in STARTED or STARTING state
193253
*/
194254
public boolean requeue() {
195255
State localState = state.get();
@@ -201,6 +261,11 @@ public boolean requeue() {
201261
return enqueueElection();
202262
}
203263

264+
/**
265+
* Interrupts the current leadership attempt if one is in progress.
266+
*
267+
* @return true if leadership was interrupted, false if no attempt was in progress
268+
*/
204269
public synchronized boolean interruptLeadership() {
205270
Future<?> localTask = electionTask;
206271
if (localTask != null) {
@@ -231,11 +296,16 @@ public Void call() throws Exception {
231296
return false;
232297
}
233298

299+
/**
300+
* Main work loop for leadership acquisition and maintenance.
301+
*
302+
* @throws Exception if the leadership attempt fails
303+
*/
234304
private void doWork() throws Exception {
235305
isLeader = false;
236306

237307
try {
238-
waitStartedState();
308+
waitStartedStateOrFail();
239309
lock.tryAcquire(
240310
null,
241311
true,
@@ -248,7 +318,7 @@ private void doWork() throws Exception {
248318
Thread.currentThread().interrupt();
249319
throw e;
250320
} catch (Throwable e) {
251-
logger.debug("takeLeadership exception", e);
321+
logger.error("Unexpected error in takeLeadership", e);
252322
}
253323
} catch (InterruptedException e) {
254324
Thread.currentThread().interrupt();
@@ -270,7 +340,7 @@ private void doWork() throws Exception {
270340
}
271341
}
272342

273-
private void waitStartedState() throws InterruptedException {
343+
private void waitStartedStateOrFail() throws InterruptedException {
274344
State localState = state.get();
275345
if (localState == State.STARTING) {
276346
startingLatch.await();
@@ -295,9 +365,10 @@ private boolean isQueued() {
295365
}
296366

297367
/**
298-
* Не гарантированы все, кроме лидера
368+
* Gets all participants in the election.
369+
* Note: Due to observer limitations, waiters may be visible only eventually (after lease changes).
299370
*
300-
* @return
371+
* @return list of election participants (owners and visible waiters)
301372
*/
302373
public List<ElectionParticipant> getParticipants() {
303374
SemaphoreDescription semaphoreDescription = semaphoreObserver.getCachedData();
@@ -313,6 +384,11 @@ public List<ElectionParticipant> getParticipants() {
313384
).collect(Collectors.toList());
314385
}
315386

387+
/**
388+
* Gets the current leader if one exists.
389+
*
390+
* @return Optional containing the current leader, or empty if no leader exists
391+
*/
316392
public Optional<ElectionParticipant> getCurrentLeader() {
317393
SemaphoreDescription semaphoreDescription = semaphoreObserver.getCachedData();
318394
if (semaphoreDescription == null) {
@@ -336,18 +412,59 @@ public Listenable<CoordinationSession.State> getSessionListenable() {
336412
return sessionListenable;
337413
}
338414

415+
/**
416+
* Closes the leader election and releases all resources.
417+
* After closing, the instance cannot be reused.
418+
*/
339419
@Override
340420
public synchronized void close() {
341-
// TODO: Учесть все стейты
342-
Preconditions.checkState(state.compareAndSet(State.STARTED, State.CLOSED), "Already closed");
421+
stopInternal(State.CLOSED);
422+
}
423+
424+
/**
425+
* Internal method to stop the election with the specified termination state.
426+
*
427+
* @param terminationState the state to transition to (FAILED or CLOSED)
428+
* @return true if the state was changed, false if already terminated
429+
*/
430+
private synchronized boolean stopInternal(State terminationState) {
431+
State localState = state.get();
432+
if (localState == State.FAILED || localState == State.CLOSED) {
433+
logger.warn("Already stopped leader election {} with status: {}", electionName, localState);
434+
return false;
435+
}
436+
logger.debug("Transitioning leader election {} from {} to {}", electionName, localState, terminationState);
437+
438+
// change state
439+
state.set(terminationState);
343440

441+
// unblock starting latch if not yet
442+
startingLatch.countDown();
443+
444+
// stop tasks
445+
Future<Status> localInitializingTask = initializingTask.get();
446+
if (localInitializingTask != null) {
447+
localInitializingTask.cancel(true);
448+
initializingTask.set(null);
449+
}
344450
Future<Void> localTask = electionTask;
345451
if (localTask != null) {
346452
localTask.cancel(true);
347453
electionTask = null;
348454
}
349455

350-
blockingExecutor.shutdown();
351-
semaphoreObserver.close();
456+
// Clean up resources
457+
try {
458+
semaphoreObserver.close();
459+
} catch (Exception e) {
460+
logger.warn("Error closing semaphore observer for {}: {}", electionName, e.getMessage());
461+
}
462+
463+
try {
464+
blockingExecutor.shutdown();
465+
} catch (Exception e) {
466+
logger.warn("Error shutting down executor for {}: {}", electionName, e.getMessage());
467+
}
468+
return true;
352469
}
353470
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,66 @@
11
package tech.ydb.coordination.recipes.election;
22

3+
/**
4+
* A listener interface for receiving leadership election events in a distributed system.
5+
*
6+
* <p>Implementations of this interface are notified when the current process becomes
7+
* the leader in a leader election scenario.</p>
8+
*
9+
* <h3>Leadership Lifecycle:</h3>
10+
* <ol>
11+
* <li><b>Election:</b> The distributed system selects a leader</li>
12+
* <li><b>Takeover:</b> {@code takeLeadership()} is invoked on the elected leader</li>
13+
* <li><b>Execution:</b> The leader performs its duties while maintaining leadership</li>
14+
* <li><b>Termination:</b> When {@code takeLeadership()} completes (either normally or exceptionally),
15+
* the leadership is automatically relinquished and new elections begin</li>
16+
* </ol>
17+
*
18+
* <h3>Usage Example:</h3>
19+
* <pre>{@code
20+
* LeaderElectionListener listener = new LeaderElectionListener() {
21+
* public void takeLeadership() throws Exception {
22+
* startServices();
23+
*
24+
* // Main leadership work
25+
* while (shouldContinueLeadership()) {
26+
* performLeaderDuties();
27+
* }
28+
*
29+
* // Cleanup will trigger automatically when method exits
30+
* }
31+
* };
32+
* }</pre>
33+
*
34+
* <p><b>Important Implementation Notes:</b></p>
35+
* <ul>
36+
* <li>The leadership is maintained only while {@code takeLeadership()} is executing</li>
37+
* <li>When the method completes (either normally or by throwing an exception), the leadership
38+
* is automatically released and new elections begin immediately</li>
39+
* <li>For long-running leadership, the method should not return until leadership should end</li>
40+
* <li>To voluntarily relinquish leadership early, return from the method or throw an exception</li>
41+
* </ul>
42+
*
43+
* <p><b>Error Handling:</b> If the implementation throws an exception, the leadership will be
44+
* released and new elections will be triggered, just as with normal completion.</p>
45+
*/
346
public interface LeaderElectionListener {
47+
/**
48+
* Called when leadership is acquired by the current process.
49+
*
50+
* <p>The leadership period lasts exactly as long as this method's execution. When the method
51+
* returns (either normally or exceptionally), the leadership is automatically relinquished
52+
* and new elections begin immediately.
53+
*
54+
* <p>For continuous leadership, implementations should:
55+
* <ul>
56+
* <li>Perform all initialization at start</li>
57+
* <li>Enter the main leadership loop</li>
58+
* <li>Only return when leadership should end</li>
59+
* </ul>
60+
*
61+
* @throws Exception if leadership cannot be maintained or should be terminated early.
62+
* The leadership will be released and new elections will begin when any
63+
* exception is thrown.
64+
*/
465
void takeLeadership() throws Exception;
566
}

0 commit comments

Comments
 (0)