Skip to content

Commit 0e58789

Browse files
committed
fix(websocket): Fix race conditions and memory leak
- Add state check in abort_connection to prevent double-close - Fix memory leak: free errormsg_buffer on disconnect - Reset connection state on reconnect to prevent stale data - Implement lock ordering for separate TX lock mode - Added sdkconfig.ci.tx_lock config
1 parent 732cd29 commit 0e58789

File tree

2 files changed

+94
-7
lines changed

2 files changed

+94
-7
lines changed

components/esp_websocket_client/esp_websocket_client.c

Lines changed: 79 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,16 @@ static esp_err_t esp_websocket_client_dispatch_event(esp_websocket_client_handle
244244
static esp_err_t esp_websocket_client_abort_connection(esp_websocket_client_handle_t client, esp_websocket_error_type_t error_type)
245245
{
246246
ESP_WS_CLIENT_STATE_CHECK(TAG, client, return ESP_FAIL);
247+
248+
// Note: This function must be called with client->lock already held
249+
// The caller is responsible for acquiring the lock before calling
250+
251+
// CRITICAL: Check if already closing/closed to prevent double-close
252+
if (client->state == WEBSOCKET_STATE_CLOSING || client->state == WEBSOCKET_STATE_UNKNOW) {
253+
ESP_LOGW(TAG, "Connection already closing/closed, skipping abort");
254+
return ESP_OK;
255+
}
256+
247257
esp_transport_close(client->transport);
248258

249259
if (!client->config->auto_reconnect) {
@@ -256,6 +266,17 @@ static esp_err_t esp_websocket_client_abort_connection(esp_websocket_client_hand
256266
}
257267
client->error_handle.error_type = error_type;
258268
esp_websocket_client_dispatch_event(client, WEBSOCKET_EVENT_DISCONNECTED, NULL, 0);
269+
270+
if (client->errormsg_buffer) {
271+
ESP_LOGI(TAG, "Freeing error buffer (%d bytes) - Free heap: %" PRIu32 " bytes",
272+
client->errormsg_size, esp_get_free_heap_size());
273+
free(client->errormsg_buffer);
274+
client->errormsg_buffer = NULL;
275+
client->errormsg_size = 0;
276+
} else {
277+
ESP_LOGI(TAG, "Disconnect - Free heap: %" PRIu32 " bytes", esp_get_free_heap_size());
278+
}
279+
259280
return ESP_OK;
260281
}
261282

@@ -453,6 +474,8 @@ static void destroy_and_free_resources(esp_websocket_client_handle_t client)
453474
esp_websocket_client_destroy_config(client);
454475
if (client->transport_list) {
455476
esp_transport_list_destroy(client->transport_list);
477+
client->transport_list = NULL;
478+
client->transport = NULL;
456479
}
457480
vSemaphoreDelete(client->lock);
458481
#ifdef CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK
@@ -679,8 +702,18 @@ static int esp_websocket_client_send_with_exact_opcode(esp_websocket_client_hand
679702
} else {
680703
esp_websocket_client_error(client, "esp_transport_write() returned %d, errno=%d", ret, errno);
681704
}
705+
ESP_LOGD(TAG, "Calling abort_connection due to send error");
706+
#ifdef CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK
707+
xSemaphoreGiveRecursive(client->tx_lock);
708+
xSemaphoreTakeRecursive(client->lock, portMAX_DELAY);
709+
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
710+
xSemaphoreGiveRecursive(client->lock);
711+
return ret;
712+
#else
713+
// Already holding client->lock, safe to call
682714
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
683715
goto unlock_and_return;
716+
#endif
684717
}
685718
opcode = 0;
686719
widx += wlen;
@@ -1019,7 +1052,6 @@ static esp_err_t esp_websocket_client_recv(esp_websocket_client_handle_t client)
10191052
esp_websocket_free_buf(client, false);
10201053
return ESP_OK;
10211054
}
1022-
10231055
esp_websocket_client_dispatch_event(client, WEBSOCKET_EVENT_DATA, client->rx_buffer, rlen);
10241056

10251057
client->payload_offset += rlen;
@@ -1030,15 +1062,47 @@ static esp_err_t esp_websocket_client_recv(esp_websocket_client_handle_t client)
10301062
const char *data = (client->payload_len == 0) ? NULL : client->rx_buffer;
10311063
ESP_LOGD(TAG, "Sending PONG with payload len=%d", client->payload_len);
10321064
#ifdef CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK
1065+
// CRITICAL: To avoid deadlock, we must follow lock ordering: tx_lock BEFORE client->lock
1066+
// Current state: We hold client->lock (acquired by caller at line 1228)
1067+
// We need: tx_lock
1068+
// Deadlock risk: Send error path holds tx_lock and waits for client->lock = circular wait!
1069+
//
1070+
// Solution: Temporarily release client->lock, acquire both locks in correct order
1071+
// 1. Release client->lock
1072+
// 2. Acquire tx_lock (now safe - no deadlock)
1073+
// 3. Re-acquire client->lock (for consistency with caller expectations)
1074+
// 4. Send PONG
1075+
// 5. Release both locks in reverse order
1076+
1077+
xSemaphoreGiveRecursive(client->lock); // Release client->lock
1078+
1079+
// Now acquire tx_lock with timeout (consistent with PING/CLOSE handling)
10331080
if (xSemaphoreTakeRecursive(client->tx_lock, WEBSOCKET_TX_LOCK_TIMEOUT_MS) != pdPASS) {
1034-
ESP_LOGE(TAG, "Could not lock ws-client within %d timeout", WEBSOCKET_TX_LOCK_TIMEOUT_MS);
1035-
return ESP_FAIL;
1081+
ESP_LOGE(TAG, "Could not lock ws-client within %d timeout for PONG", WEBSOCKET_TX_LOCK_TIMEOUT_MS);
1082+
xSemaphoreTakeRecursive(client->lock, portMAX_DELAY); // Re-acquire client->lock before returning
1083+
return ESP_OK; // Return gracefully, caller expects client->lock to be held
10361084
}
1037-
#endif
1085+
1086+
// Re-acquire client->lock to maintain consistency
1087+
xSemaphoreTakeRecursive(client->lock, portMAX_DELAY);
1088+
1089+
// CRITICAL: Check if transport is still valid after re-acquiring lock
1090+
// Another thread may have closed it while we didn't hold client->lock
1091+
if (client->state == WEBSOCKET_STATE_CLOSING || client->state == WEBSOCKET_STATE_UNKNOW ||
1092+
client->state == WEBSOCKET_STATE_WAIT_TIMEOUT || client->transport == NULL) {
1093+
ESP_LOGW(TAG, "Transport closed while preparing PONG, skipping send");
1094+
xSemaphoreGiveRecursive(client->tx_lock);
1095+
return ESP_OK; // Caller expects client->lock to be held, which it is
1096+
}
1097+
10381098
esp_transport_ws_send_raw(client->transport, WS_TRANSPORT_OPCODES_PONG | WS_TRANSPORT_OPCODES_FIN, data, client->payload_len,
10391099
client->config->network_timeout_ms);
1040-
#ifdef CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK
10411100
xSemaphoreGiveRecursive(client->tx_lock);
1101+
#else
1102+
// When separate TX lock is not configured, we already hold client->lock
1103+
// which protects the transport, so we can send PONG directly
1104+
esp_transport_ws_send_raw(client->transport, WS_TRANSPORT_OPCODES_PONG | WS_TRANSPORT_OPCODES_FIN, data, client->payload_len,
1105+
client->config->network_timeout_ms);
10421106
#endif
10431107
} else if (client->last_opcode == WS_TRANSPORT_OPCODES_PONG) {
10441108
client->wait_for_pong_resp = false;
@@ -1136,6 +1200,11 @@ static void esp_websocket_client_task(void *pv)
11361200
client->state = WEBSOCKET_STATE_CONNECTED;
11371201
client->wait_for_pong_resp = false;
11381202
client->error_handle.error_type = WEBSOCKET_ERROR_TYPE_NONE;
1203+
client->payload_len = 0;
1204+
client->payload_offset = 0;
1205+
client->last_fin = false;
1206+
client->last_opcode = WS_TRANSPORT_OPCODES_NONE;
1207+
11391208
esp_websocket_client_dispatch_event(client, WEBSOCKET_EVENT_CONNECTED, NULL, 0);
11401209
break;
11411210
case WEBSOCKET_STATE_CONNECTED:
@@ -1221,12 +1290,15 @@ static void esp_websocket_client_task(void *pv)
12211290
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
12221291
xSemaphoreGiveRecursive(client->lock);
12231292
} else if (read_select > 0) {
1293+
// CRITICAL: Protect entire recv operation with client->lock
1294+
// This prevents transport from being closed while recv is in progress
1295+
xSemaphoreTakeRecursive(client->lock, lock_timeout);
12241296
if (esp_websocket_client_recv(client) == ESP_FAIL) {
12251297
ESP_LOGE(TAG, "Error receive data");
1226-
xSemaphoreTakeRecursive(client->lock, lock_timeout);
1298+
// Note: Already holding client->lock from line above
12271299
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
1228-
xSemaphoreGiveRecursive(client->lock);
12291300
}
1301+
xSemaphoreGiveRecursive(client->lock);
12301302
}
12311303
} else if (WEBSOCKET_STATE_WAIT_TIMEOUT == client->state) {
12321304
// waiting for reconnection or a request to stop the client...
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
CONFIG_IDF_TARGET="esp32"
2+
CONFIG_IDF_TARGET_LINUX=n
3+
CONFIG_WEBSOCKET_URI_FROM_STDIN=n
4+
CONFIG_WEBSOCKET_URI_FROM_STRING=y
5+
CONFIG_EXAMPLE_CONNECT_ETHERNET=y
6+
CONFIG_EXAMPLE_CONNECT_WIFI=n
7+
CONFIG_EXAMPLE_USE_INTERNAL_ETHERNET=y
8+
CONFIG_EXAMPLE_ETH_PHY_IP101=y
9+
CONFIG_EXAMPLE_ETH_MDC_GPIO=23
10+
CONFIG_EXAMPLE_ETH_MDIO_GPIO=18
11+
CONFIG_EXAMPLE_ETH_PHY_RST_GPIO=5
12+
CONFIG_EXAMPLE_ETH_PHY_ADDR=1
13+
CONFIG_EXAMPLE_CONNECT_IPV6=y
14+
CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK=y
15+
CONFIG_ESP_WS_CLIENT_TX_LOCK_TIMEOUT_MS=2000

0 commit comments

Comments
 (0)