fix: improve connection stability and add connection check API
Some checks failed
ci / macos (push) Has been cancelled
ci / ios (push) Has been cancelled
ci / check-linter (push) Has been cancelled

- Fix race condition in resource cleanup during disconnect/cancel
- Trigger reconnect on ping send failure
- Trigger reconnect on write operation failure
- Add isConnected and connectionState properties
- Add checkConnection() method for active connection testing
- Add ensureConnected() method for proactive connection recovery
This commit is contained in:
wenzuhuai
2026-01-21 11:43:23 +08:00
parent 153e600bbc
commit 5ad1174292
2 changed files with 162 additions and 12 deletions

View File

@@ -407,12 +407,25 @@ class ConnectionHandler: ChannelInboundHandler {
}
} onCancel: {
logger.debug("Connection task cancelled")
// Clean up resources
if let channel = self.channel {
channel.close(mode: .all, promise: nil)
self.channel = nil
}
// Clean up resources safely to avoid race conditions
// Capture references first to avoid concurrent access issues
let channelToClose = self.channel
let bufferToRelease = self.batchBuffer
// Clear references first to prevent other threads from using them
self.channel = nil
self.batchBuffer = nil
// Close channel asynchronously after clearing references
// This ensures BatchBuffer's deinit won't conflict with channel close
if let channel = channelToClose {
channel.eventLoop.execute {
channel.close(mode: .all, promise: nil)
}
}
// bufferToRelease will be released here after channel close is scheduled
_ = bufferToRelease
let continuationToResume: CheckedContinuation<ServerInfo, Error>? = self
.serverInfoContinuation.withLockedValue { cont in
@@ -551,12 +564,25 @@ class ConnectionHandler: ChannelInboundHandler {
}
} onCancel: {
logger.debug("Client connect initialization cancelled")
// Clean up resources
if let channel = self.channel {
channel.close(mode: .all, promise: nil)
self.channel = nil
}
// Clean up resources safely to avoid race conditions
// Capture references first to avoid concurrent access issues
let channelToClose = self.channel
let bufferToRelease = self.batchBuffer
// Clear references first to prevent other threads from using them
self.channel = nil
self.batchBuffer = nil
// Close channel asynchronously after clearing references
// This ensures BatchBuffer's deinit won't conflict with channel close
if let channel = channelToClose {
channel.eventLoop.execute {
channel.close(mode: .all, promise: nil)
}
}
// bufferToRelease will be released here after channel close is scheduled
_ = bufferToRelease
let continuationToResume: CheckedContinuation<Void, Error>? = self
.connectionEstablishedContinuation.withLockedValue { cont in
@@ -738,6 +764,13 @@ class ConnectionHandler: ChannelInboundHandler {
/// Tears down the current connection: cancels the ping task, clears pending
/// pings, drops the handler's reference to the batch buffer, then closes the
/// channel and waits for the close to finish.
///
/// - Throws: Any error produced by awaiting the channel's close future.
///   If `channel` is `nil`, the optional chain short-circuits and nothing is thrown.
private func disconnect() async throws {
// Cancel the ping task (if any) before tearing the connection down.
self.pingTask?.cancel()
clearPendingPings() // Clear pending pings to avoid promise leaks
// Safely clear batchBuffer before closing channel
// This prevents race conditions during deallocation
// Copy the reference into a local first, so that setting the property to nil
// below does not drop the only reference held by this function.
let bufferToRelease = self.batchBuffer
self.batchBuffer = nil
// NOTE(review): `_ =` merely mentions the local; it is not a documented
// lifetime guarantee. If the exact deallocation point matters here, consider
// `withExtendedLifetime(bufferToRelease) {}` — confirm intended ordering.
_ = bufferToRelease // Release after clearing reference
// Close the channel (when one exists) and suspend until the close future completes.
try await self.channel?.close().get()
}
@@ -819,6 +852,11 @@ class ConnectionHandler: ChannelInboundHandler {
logger.debug("sent ping: \(pingsOut)")
} catch {
logger.error("Unable to send ping: \(error)")
// Trigger reconnect on ping failure - connection may be broken
let currentState = state.withLockedValue { $0 }
if currentState == .connected {
handleDisconnect()
}
}
}
@@ -895,6 +933,12 @@ class ConnectionHandler: ChannelInboundHandler {
func handleDisconnect() {
state.withLockedValue { $0 = .disconnected }
// Safely clear batchBuffer first to avoid race conditions
let bufferToRelease = self.batchBuffer
self.batchBuffer = nil
_ = bufferToRelease // Release after clearing reference
if let channel = self.channel {
let promise = channel.eventLoop.makePromise(of: Void.self)
Task {
@@ -915,10 +959,13 @@ class ConnectionHandler: ChannelInboundHandler {
} catch {
logger.error("Error closing connection: \(error)")
}
// Only start reconnect after disconnect is complete
self.handleReconnect()
}
} else {
// No channel, start reconnect immediately
handleReconnect()
}
handleReconnect()
}
func handleReconnect() {
@@ -979,6 +1026,12 @@ class ConnectionHandler: ChannelInboundHandler {
do {
try await buffer.writeMessage(operation)
} catch {
// Trigger reconnect on write failure - connection may be broken
let currentState = state.withLockedValue { $0 }
if currentState == .connected {
logger.error("Write operation failed, triggering reconnect: \(error)")
handleDisconnect()
}
throw NatsError.ClientError.io(error)
}
}