Skip to content

Commit a859117

Browse files
authored
Merge branch 'main' into design/heartbeat-keep-alive
2 parents 1910332 + 6b708f0 commit a859117

9 files changed

Lines changed: 399 additions & 7 deletions

File tree

NEXT_CHANGELOG.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,28 @@
1919

2020
### Fixed
2121

22+
- Reclassify transient/mis-categorized server errors so callers can identify
23+
retryable failures. The remap is applied at all Thrift error sites
24+
(`checkResponseForErrors`, `executeAsync`, `verifySuccessStatus`, and the
25+
polling status handler) so the same server failure surfaces with the same
26+
SQL state regardless of which response carries it.
27+
- Unity Catalog unavailability (`[UC_CLIENT_EXCEPTION]`, previously `XXUCC`)
28+
and parquet read / connection-acquisition deadlines
29+
(`[PARQUET_FAILED_READ_FOOTER]`, `DEADLINE_EXCEEDED: acquiring connection`)
30+
are now reported with SQL state `08S01` (communication link failure).
31+
- Server-side `java.util.ConcurrentModificationException` is now reported
32+
with SQL state `40001` (serialization failure) instead of the misleading
33+
`42000`. The remap only applies when the original SQL state is `42000` so
34+
unrelated `42xxx` states (e.g. `42501` insufficient privilege) are
35+
preserved.
36+
Notes for callers and operators:
37+
- Callers branching on the legacy `XXUCC`/`42000` states for these failures
38+
must update to `08S01`/`40001`. The driver logs the original→remapped
39+
state at `INFO` level for traceability.
40+
- The driver's failure telemetry uses `sqlState` as the error-name field,
41+
so dashboards/alerts keyed on `XXUCC` or `42000` for these specific
42+
failure modes will need to be updated to the new states.
43+
2244
---
2345
*Note: When making changes, please add your change under the appropriate section
2446
with a brief description.*

src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,11 @@ public final class DatabricksJdbcConstants {
116116
/** Standard SQL state for communication link failure (SQLSTATE 08S01). */
117117
public static final String COMMUNICATION_LINK_FAILURE_SQLSTATE = "08S01";
118118

119+
/**
120+
* Standard SQL state for transaction rollback - serialization failure (SQLSTATE 40001). Used for
121+
* concurrent-modification errors where the operation is potentially retryable.
122+
*/
123+
public static final String SERIALIZATION_FAILURE_SQLSTATE = "40001";
119124
/** Standard SQL state for data exception (SQLSTATE 22000). */
120125
public static final String DATA_EXCEPTION_SQLSTATE = "22000";
121126

src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,16 @@ public static void verifySuccessStatus(TStatus status, String errorContext, Stri
110110
errorMessage, sqlState, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR);
111111
}
112112

113-
throw new DatabricksHttpException(errorMessage, sqlState);
113+
String remappedSqlState =
114+
SqlStateClassifier.classifyTransientSqlState(status.getErrorMessage(), sqlState);
115+
if (!Objects.equals(remappedSqlState, sqlState)) {
116+
LOGGER.info(
117+
"Remapped SQL state [{}] -> [{}] for transient error pattern in thrift status (context: {})",
118+
sqlState,
119+
remappedSqlState,
120+
errorContext);
121+
}
122+
throw new DatabricksHttpException(errorMessage, remappedSqlState);
114123
}
115124
}
116125

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package com.databricks.jdbc.common.util;
2+
3+
import static com.databricks.jdbc.common.DatabricksJdbcConstants.COMMUNICATION_LINK_FAILURE_SQLSTATE;
4+
import static com.databricks.jdbc.common.DatabricksJdbcConstants.SERIALIZATION_FAILURE_SQLSTATE;
5+
6+
/**
7+
* Reclassifies SQL states for known transient or mis-categorized server errors so callers can
8+
* programmatically identify retryable failures.
9+
*
10+
* <p>Patterns handled:
11+
*
12+
* <ul>
13+
* <li>Unity Catalog unavailability ({@code UC_CLIENT_EXCEPTION}) → {@code 08S01} (communication
14+
* link failure, retryable).
15+
* <li>Connection-acquisition / parquet read deadlines ({@code PARQUET_FAILED_READ_FOOTER}, {@code
16+
* DEADLINE_EXCEEDED: acquiring connection}) → {@code 08S01}.
17+
* <li>Server-side {@code java.util.ConcurrentModificationException} mis-mapped to {@code 42000}
18+
* (syntax/access violation) → {@code 40001} (serialization failure, retryable). Only applied
19+
* when the original SQL state is {@code 42000} so unrelated {@code 42xxx} states are
20+
* preserved.
21+
* </ul>
22+
*
23+
* <p>Patterns are anchored on stable server-emitted tokens (Spark error classes, fully-qualified
24+
* Java exception names) rather than English prose, so server message rewording does not silently
25+
* regress the classifier.
26+
*/
27+
public final class SqlStateClassifier {
28+
29+
private static final String SYNTAX_OR_ACCESS_VIOLATION_SQLSTATE = "42000";
30+
31+
private SqlStateClassifier() {}
32+
33+
/**
34+
* Returns a remapped SQL state if {@code errorMessage} matches a known transient pattern, or
35+
* {@code originalSqlState} otherwise. Pure function — callers should log the remap separately
36+
* with their own context (statement ID, response).
37+
*/
38+
public static String classifyTransientSqlState(String errorMessage, String originalSqlState) {
39+
if (errorMessage == null) {
40+
return originalSqlState;
41+
}
42+
// Spark error classes are the outer cause; check before nested-Java-exception patterns like
43+
// ConcurrentModificationException, which may appear inside a UC/Parquet wrapping.
44+
if (errorMessage.contains("UC_CLIENT_EXCEPTION")
45+
|| errorMessage.contains("PARQUET_FAILED_READ_FOOTER")
46+
|| errorMessage.contains("DEADLINE_EXCEEDED: acquiring connection")) {
47+
return COMMUNICATION_LINK_FAILURE_SQLSTATE;
48+
}
49+
if (SYNTAX_OR_ACCESS_VIOLATION_SQLSTATE.equals(originalSqlState)
50+
&& errorMessage.contains("java.util.ConcurrentModificationException")) {
51+
return SERIALIZATION_FAILURE_SQLSTATE;
52+
}
53+
return originalSqlState;
54+
}
55+
}

src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_RESULT_ROW_LIMIT;
88
import static com.databricks.jdbc.common.util.DatabricksTypeUtil.DECIMAL;
99
import static com.databricks.jdbc.common.util.DatabricksTypeUtil.getDecimalTypeString;
10+
import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState;
1011
import static com.databricks.jdbc.dbclient.impl.sqlexec.PathConstants.*;
1112
import static com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode.TEMPORARY_REDIRECT_EXCEPTION;
1213

@@ -790,8 +791,16 @@ void handleFailedExecution(
790791
throw new DatabricksTimeoutException(
791792
errorMessage, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR);
792793
}
794+
String remappedSqlState = classifyTransientSqlState(errorMessage, sqlState);
795+
if (!Objects.equals(remappedSqlState, sqlState)) {
796+
LOGGER.info(
797+
"Remapped SQL state [{}] -> [{}] for transient error pattern in SEA statement [{}]",
798+
sqlState,
799+
remappedSqlState,
800+
statementId);
801+
}
793802
throw new DatabricksSQLException(
794-
errorMessage, sqlState, DatabricksDriverErrorCode.EXECUTE_STATEMENT_FAILED);
803+
errorMessage, remappedSqlState, DatabricksDriverErrorCode.EXECUTE_STATEMENT_FAILED);
795804
}
796805

797806
private ExecuteStatementResponse wrapGetStatementResponse(

src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import static com.databricks.jdbc.common.DatabricksJdbcConstants.QUERY_EXECUTION_TIMEOUT_SQLSTATE;
66
import static com.databricks.jdbc.common.EnvironmentVariables.*;
77
import static com.databricks.jdbc.common.util.DatabricksThriftUtil.*;
8+
import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState;
89

910
import com.databricks.jdbc.api.impl.*;
1011
import com.databricks.jdbc.api.internal.IDatabricksConnectionContext;
@@ -29,6 +30,7 @@
2930
import com.databricks.sdk.service.sql.StatementState;
3031
import java.sql.SQLException;
3132
import java.util.Arrays;
33+
import java.util.Objects;
3234
import java.util.concurrent.TimeUnit;
3335
import org.apache.http.HttpException;
3436
import org.apache.thrift.TBase;
@@ -379,7 +381,16 @@ DatabricksResultSet executeAsync(
379381
"Received error response {} from Thrift Server for request {}",
380382
response,
381383
request.toString());
382-
throw new DatabricksSQLException(response.status.errorMessage, response.status.sqlState);
384+
String originalSqlState = response.status.sqlState;
385+
String remappedSqlState =
386+
classifyTransientSqlState(response.status.errorMessage, originalSqlState);
387+
if (!Objects.equals(remappedSqlState, originalSqlState)) {
388+
LOGGER.info(
389+
"Remapped SQL state [{}] -> [{}] for transient error pattern in async execute response",
390+
originalSqlState,
391+
remappedSqlState);
392+
}
393+
throw new DatabricksSQLException(response.status.errorMessage, remappedSqlState);
383394
}
384395
} catch (DatabricksSQLException | TException e) {
385396

@@ -819,7 +830,16 @@ private <T extends TBase<T, F>, F extends TFieldIdEnum> void checkResponseForErr
819830
if (!response.isSet(operationHandleField) || isErrorStatusCode(status)) {
820831
// if the operationHandle has not been set, it is an error from the server.
821832
LOGGER.error("Error thrift response {}", response);
822-
throw new DatabricksSQLException(status.getErrorMessage(), status.getSqlState());
833+
String originalSqlState = status.getSqlState();
834+
String remappedSqlState =
835+
classifyTransientSqlState(status.getErrorMessage(), originalSqlState);
836+
if (!Objects.equals(remappedSqlState, originalSqlState)) {
837+
LOGGER.info(
838+
"Remapped SQL state [{}] -> [{}] for transient error pattern in thrift response",
839+
originalSqlState,
840+
remappedSqlState);
841+
}
842+
throw new DatabricksSQLException(status.getErrorMessage(), remappedSqlState);
823843
}
824844
}
825845

@@ -840,8 +860,16 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S
840860
+ "error: [%s]",
841861
statusResp.getStatus().getStatusCode(), statementId, serverError);
842862
LOGGER.error(errorMsg);
843-
throw new DatabricksSQLException(
844-
errorMsg, statusResp.isSetSqlState() ? statusResp.getSqlState() : null);
863+
String originalSqlState = statusResp.isSetSqlState() ? statusResp.getSqlState() : null;
864+
String remappedSqlState = classifyTransientSqlState(serverError, originalSqlState);
865+
if (!Objects.equals(remappedSqlState, originalSqlState)) {
866+
LOGGER.info(
867+
"Remapped SQL state [{}] -> [{}] for transient error pattern in statement [{}]",
868+
originalSqlState,
869+
remappedSqlState,
870+
statementId);
871+
}
872+
throw new DatabricksSQLException(errorMsg, remappedSqlState);
845873
}
846874

847875
if (statusResp.isSetOperationState()
@@ -864,7 +892,15 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S
864892
errorMsg, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR);
865893
}
866894

867-
throw new DatabricksSQLException(errorMsg, sqlState);
895+
String remappedSqlState = classifyTransientSqlState(serverError, sqlState);
896+
if (!Objects.equals(remappedSqlState, sqlState)) {
897+
LOGGER.info(
898+
"Remapped SQL state [{}] -> [{}] for transient error pattern in statement [{}]",
899+
sqlState,
900+
remappedSqlState,
901+
statementId);
902+
}
903+
throw new DatabricksSQLException(errorMsg, remappedSqlState);
868904
}
869905
}
870906

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package com.databricks.jdbc.common.util;
2+
3+
import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState;
4+
import static org.junit.jupiter.api.Assertions.assertEquals;
5+
import static org.junit.jupiter.api.Assertions.assertNull;
6+
7+
import org.junit.jupiter.api.Test;
8+
9+
class SqlStateClassifierTest {
10+
11+
@Test
12+
void unityCatalogTokenRemapsTo08S01() {
13+
String ucMessage =
14+
"Error running query: [UC_CLIENT_EXCEPTION] "
15+
+ "com.databricks.sql.managedcatalog.UnityCatalogClientException: "
16+
+ "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. "
17+
+ "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED";
18+
assertEquals("08S01", classifyTransientSqlState(ucMessage, "XXUCC"));
19+
}
20+
21+
@Test
22+
void parquetReadFooterTokenRemapsTo08S01() {
23+
String message =
24+
"Error running query: [PARQUET_FAILED_READ_FOOTER] "
25+
+ "com.databricks.sql.io.parquet.ParquetFailedReadFooterException: "
26+
+ "DEADLINE_EXCEEDED: acquiring connection";
27+
assertEquals("08S01", classifyTransientSqlState(message, null));
28+
}
29+
30+
@Test
31+
void deadlineExceededAcquiringConnectionRemapsTo08S01() {
32+
assertEquals(
33+
"08S01",
34+
classifyTransientSqlState("DEADLINE_EXCEEDED: acquiring connection from pool", null));
35+
}
36+
37+
@Test
38+
void concurrentModificationGatedOn42000() {
39+
String message =
40+
"Error running query: java.util.ConcurrentModificationException: "
41+
+ "mutation occurred during iteration";
42+
assertEquals("40001", classifyTransientSqlState(message, "42000"));
43+
}
44+
45+
@Test
46+
void concurrentModificationDoesNotRemapWhenOriginalStateIsNot42000() {
47+
String message =
48+
"Error running query: java.util.ConcurrentModificationException: "
49+
+ "mutation occurred during iteration";
50+
assertEquals("42501", classifyTransientSqlState(message, "42501"));
51+
assertEquals("XXUCC", classifyTransientSqlState(message, "XXUCC"));
52+
assertNull(classifyTransientSqlState(message, null));
53+
}
54+
55+
@Test
56+
void bareConcurrentModificationExceptionWithoutFqnDoesNotRemap() {
57+
assertEquals(
58+
"42000",
59+
classifyTransientSqlState(
60+
"SELECT 'ConcurrentModificationException' FROM nonexistent_tbl", "42000"));
61+
}
62+
63+
@Test
64+
void ucProseWithoutTokenDoesNotTriggerRemap() {
65+
assertEquals(
66+
"42S02",
67+
classifyTransientSqlState(
68+
"User-supplied literal: Failed to contact the Unity Catalog server", "42S02"));
69+
}
70+
71+
@Test
72+
void unrelatedErrorPreservesOriginalState() {
73+
assertEquals(
74+
"42S02", classifyTransientSqlState("Table or view not found: foo.bar.baz", "42S02"));
75+
}
76+
77+
@Test
78+
void nullMessagePreservesOriginalState() {
79+
assertNull(classifyTransientSqlState(null, null));
80+
assertEquals("42S02", classifyTransientSqlState(null, "42S02"));
81+
}
82+
83+
@Test
84+
void emptyOriginalStateIsPreservedWhenUnrelated() {
85+
assertEquals("", classifyTransientSqlState("Some unrelated error", ""));
86+
}
87+
88+
@Test
89+
void caseSensitivityIsExplicit() {
90+
assertEquals(
91+
"42S02",
92+
classifyTransientSqlState("Lowercase [uc_client_exception] should not match", "42S02"));
93+
}
94+
95+
@Test
96+
void ucCheckRunsBeforeCmeWhenOriginalStateIs42000AndBothSubstringsPresent() {
97+
String message =
98+
"[UC_CLIENT_EXCEPTION] catalog server: caused by "
99+
+ "java.util.ConcurrentModificationException: nested";
100+
assertEquals("08S01", classifyTransientSqlState(message, "42000"));
101+
}
102+
}

src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,38 @@ public void testHandleFailedExecution_FailedState_ThrowsWithoutHY008() throws Ex
339339
assertTrue(exception.getMessage().contains("execution failed"));
340340
}
341341

342+
@Test
343+
public void testHandleFailedExecution_unityCatalogError_remapsToCommunicationLinkFailure()
344+
throws Exception {
345+
IDatabricksConnectionContext connectionContext =
346+
DatabricksConnectionContext.parse(JDBC_URL, new Properties());
347+
DatabricksSdkClient databricksSdkClient =
348+
new DatabricksSdkClient(connectionContext, statementExecutionService, apiClient);
349+
350+
StatementStatus failedStatus =
351+
new StatementStatus()
352+
.setState(StatementState.FAILED)
353+
.setSqlState("XXUCC")
354+
.setError(
355+
new ServiceError()
356+
.setMessage(
357+
"[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. "
358+
+ "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"));
359+
ExecuteStatementResponse response =
360+
new ExecuteStatementResponse()
361+
.setStatementId(STATEMENT_ID.toSQLExecStatementId())
362+
.setStatus(failedStatus);
363+
364+
DatabricksSQLException exception =
365+
assertThrows(
366+
DatabricksSQLException.class,
367+
() ->
368+
databricksSdkClient.handleFailedExecution(
369+
response, STATEMENT_ID.toSQLExecStatementId(), STATEMENT));
370+
371+
assertEquals("08S01", exception.getSQLState(), "Expected XXUCC to be remapped to 08S01");
372+
}
373+
342374
@Test
343375
public void testGetStatementResult_CancelledState_ThrowsWithHY008() throws Exception {
344376
IDatabricksConnectionContext connectionContext =

0 commit comments

Comments
 (0)