Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking
import kotlinx.coroutines.withTimeoutOrNull
import org.lightningdevkit.ldknode.Event
import to.bitkit.App
import to.bitkit.R
Expand Down Expand Up @@ -144,21 +146,34 @@ class LightningNodeService : Service() {
}

override fun onDestroy() {
Logger.debug("onDestroy", context = TAG)
serviceScope.launch {
lightningRepo.stop()
serviceScope.cancel()
Logger.debug("onDestroy started", context = TAG)
runBlocking {
withTimeoutOrNull(NODE_STOP_TIMEOUT_MS) {
lightningRepo.stop()
}.let { result ->
if (result == null || result.isFailure) {
Logger.warn("Node stop timed out or failed during onDestroy", context = TAG)
}
}
}
serviceScope.cancel()
Logger.debug("onDestroy completed", context = TAG)
super.onDestroy()
}

@RequiresApi(Build.VERSION_CODES.VANILLA_ICE_CREAM)
override fun onTimeout(startId: Int, fgsType: Int) {
Logger.warn("Foreground service timeout reached", context = TAG)
serviceScope.launch {
lightningRepo.stop()
stopSelf()
runBlocking {
withTimeoutOrNull(FORCE_STOP_TIMEOUT_MS) {
lightningRepo.stop()
}.let { result ->
if (result == null || result.isFailure) {
Logger.warn("Node stop timed out or failed during onTimeout", context = TAG)
}
}
}
stopSelf()
super.onTimeout(startId, fgsType)
}

Expand All @@ -168,5 +183,7 @@ class LightningNodeService : Service() {
const val CHANNEL_ID_NODE = "bitkit_notification_channel_node"
const val TAG = "LightningNodeService"
const val ACTION_STOP_SERVICE_AND_APP = "to.bitkit.androidServices.action.STOP_SERVICE_AND_APP"
private const val NODE_STOP_TIMEOUT_MS = 5_000L
private const val FORCE_STOP_TIMEOUT_MS = 2_000L
}
}
14 changes: 14 additions & 0 deletions app/src/main/java/to/bitkit/repositories/LightningRepo.kt
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,14 @@ class LightningRepo @Inject constructor(

val result = lifecycleMutex.withLock {
initialLifecycleState = _lightningState.value.nodeLifecycleState

// Recovery: if stuck at Stopping (e.g., previous shutdown interrupted), reset to Stopped
if (initialLifecycleState == NodeLifecycleState.Stopping) {
Logger.warn("Found stuck Stopping state, resetting to Stopped", context = TAG)
_lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Stopped) }
initialLifecycleState = NodeLifecycleState.Stopped
}

if (initialLifecycleState.isRunningOrStarting()) {
Logger.info("LDK node start skipped, lifecycle state: $initialLifecycleState", context = TAG)
return@withLock Result.success(Unit)
Expand Down Expand Up @@ -396,6 +404,12 @@ class LightningRepo @Inject constructor(
lightningService.stop()
_lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) }
}.onFailure {
// On cancellation (e.g., timeout), ensure state is recoverable
if (it is CancellationException) {
Logger.warn("Node stop cancelled, forcing Stopped state for recovery", context = TAG)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: Setting state to Stopped when JNI operations may still be running

When a timeout-induced CancellationException occurs, this code forces the state to NodeLifecycleState.Stopped. However, the native node.syncWallets() and node.stop() JNI calls may still be executing on the ServiceQueue.LDK single-threaded dispatcher. JNI calls are not interruptible by coroutine cancellation - they will continue running until completion.

Consequences:

  1. State machine corruption: The lifecycle state says Stopped but the node is still actively performing I/O, persisting state, etc.

  2. Mutex release allows concurrent operations: When return@withLock executes, the lifecycleMutex is released. If start() is called next (e.g., on app relaunch), it will:

    • Acquire the mutex
    • See state Stopped
    • Proceed with startup
    • Find lightningService.node is non-null (since line 259 in LightningService.kt hasn't executed yet)
    • Skip setup() and dispatch node.start() to the LDK queue
    • The still-running stop() block eventually completes and sets [email protected] = null
    • Result: Repo thinks node is Running, but service's node reference is null
  3. Recovery mechanism doesn't help: The stuck-Stopping recovery at lines 282-286 is never triggered in the timeout scenario because state was already forced to Stopped.

Possible solutions:

  1. Leave state as Stopping instead of forcing to Stopped - this allows the stuck-state recovery mechanism to handle it on next start()
  2. Add a flag in LightningService to track whether native operations are truly complete, independent of coroutine cancellation
  3. Rethink the timeout approach - consider whether the timeout should apply to the entire operation or just serve as a best-effort mechanism with acknowledgment that native cleanup continues in background

_lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) }
return@withLock Result.failure(it)
Comment on lines +407 to +411
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: Catching CancellationException without rethrowing breaks coroutine cancellation contract

This code catches CancellationException via runCatching and wraps it in Result.failure() without rethrowing. This violates Kotlin's coroutine cancellation contract, which requires CancellationException to always be rethrown for proper structured concurrency.

The same file demonstrates the correct pattern in multiple places:

  • Line 223: // Cancellation is expected during pull-to-refresh, rethrow per Kotlin best practices followed by if (it is CancellationException) throw it
  • Line 861: Same pattern

Impact: While this happens to work for the primary onDestroy call site (since withContext(bgDispatcher) may detect parent cancellation independently), stop() is also called from:

  • wipeStorage() (line 543)
  • restartWithElectrumServer() (line 562)
  • restartWithRgsServer() (line 588)
  • restartWithPreviousConfig() (line 609)
  • restartNode() (line 1076)

If any of these call sites' coroutines are cancelled while stop() is running, the CancellationException will be silently swallowed, and subsequent code will execute when it should not.

Suggested fix: Rethrow the CancellationException after performing the state recovery:

Suggested change
// On cancellation (e.g., timeout), ensure state is recoverable
if (it is CancellationException) {
Logger.warn("Node stop cancelled, forcing Stopped state for recovery", context = TAG)
_lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) }
return@withLock Result.failure(it)
// On cancellation (e.g., timeout), ensure state is recoverable
if (it is CancellationException) {
Logger.warn("Node stop cancelled, forcing Stopped state for recovery", context = TAG)
_lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) }
throw it // Rethrow to properly propagate cancellation
}

}
Logger.error("Node stop error", it, context = TAG)
// On failure, check actual node state and update accordingly
// If node is still running, revert to Running state to allow retry
Expand Down
6 changes: 6 additions & 0 deletions app/src/main/java/to/bitkit/services/LightningService.kt
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,12 @@ class LightningService @Inject constructor(
return
}

runCatching {
Logger.debug("Performing final sync before shutdown…", context = TAG)
ServiceQueue.LDK.background { node.syncWallets() }
Logger.debug("Final sync completed", context = TAG)
}.onFailure { Logger.warn("Final sync failed, proceeding with shutdown", it, context = TAG) }

Logger.debug("Stopping node…", context = TAG)
ServiceQueue.LDK.background {
runCatching { node.stop() }
Expand Down
Loading