-
Notifications
You must be signed in to change notification settings - Fork 3.3k
HBASE-29265: Batch calls to overloaded cluster can cause meta hotspotting #6961
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: branch-2
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -219,13 +219,14 @@ public void run() { | |
} catch (IOException e) { | ||
// The service itself failed . It may be an error coming from the communication | ||
// layer, but, as well, a functional error raised by the server. | ||
receiveGlobalFailure(multiAction, server, numAttempt, e, true); | ||
|
||
receiveGlobalFailure(multiAction, server, numAttempt, e); | ||
return; | ||
} catch (Throwable t) { | ||
// This should not happen. Let's log & retry anyway. | ||
LOG.error("id=" + asyncProcess.id + ", caught throwable. Unexpected." | ||
+ " Retrying. Server=" + server + ", tableName=" + tableName, t); | ||
receiveGlobalFailure(multiAction, server, numAttempt, t, true); | ||
receiveGlobalFailure(multiAction, server, numAttempt, t); | ||
return; | ||
} | ||
if (res.type() == AbstractResponse.ResponseType.MULTI) { | ||
|
@@ -570,7 +571,6 @@ private RegionLocations findAllLocationsOrFail(Action action, boolean useCache) | |
*/ | ||
void sendMultiAction(Map<ServerName, MultiAction> actionsByServer, int numAttempt, | ||
List<Action> actionsForReplicaThread, boolean reuseThread) { | ||
boolean clearServerCache = true; | ||
// Run the last item on the same thread if we are already on a send thread. | ||
// We hope most of the time it will be the only item, so we can cut down on threads. | ||
int actionsRemaining = actionsByServer.size(); | ||
|
@@ -606,15 +606,14 @@ void sendMultiAction(Map<ServerName, MultiAction> actionsByServer, int numAttemp | |
LOG.warn("id=" + asyncProcess.id + ", task rejected by pool. Unexpected." + " Server=" | ||
+ server.getServerName(), t); | ||
// Do not update cache if exception is from failing to submit action to thread pool | ||
clearServerCache = false; | ||
} else { | ||
// see #HBASE-14359 for more details | ||
LOG.warn("Caught unexpected exception/error: ", t); | ||
} | ||
asyncProcess.decTaskCounters(multiAction.getRegions(), server); | ||
// We're likely to fail again, but this will increment the attempt counter, | ||
// so it will finish. | ||
receiveGlobalFailure(multiAction, server, numAttempt, t, clearServerCache); | ||
receiveGlobalFailure(multiAction, server, numAttempt, t); | ||
} | ||
} | ||
} | ||
|
@@ -764,13 +763,18 @@ private void failAll(MultiAction actions, ServerName server, int numAttempt, | |
* @param t the throwable (if any) that caused the resubmit | ||
*/ | ||
private void receiveGlobalFailure(MultiAction rsActions, ServerName server, int numAttempt, | ||
Throwable t, boolean clearServerCache) { | ||
Throwable t) { | ||
errorsByServer.reportServerError(server); | ||
Retry canRetry = errorsByServer.canTryMore(numAttempt) ? Retry.YES : Retry.NO_RETRIES_EXHAUSTED; | ||
boolean clearServerCache = ClientExceptionsUtil.isMetaClearingException(t); | ||
|
||
// Do not update cache if exception is from failing to submit action to thread pool | ||
if (clearServerCache) { | ||
cleanServerCache(server, t); | ||
|
||
if (LOG.isTraceEnabled()) { | ||
LOG.trace("Cleared meta cache for server {} due to global failure {}", server, t); | ||
} | ||
} | ||
|
||
int failed = 0; | ||
|
@@ -779,12 +783,8 @@ private void receiveGlobalFailure(MultiAction rsActions, ServerName server, int | |
for (Map.Entry<byte[], List<Action>> e : rsActions.actions.entrySet()) { | ||
byte[] regionName = e.getKey(); | ||
byte[] row = e.getValue().get(0).getAction().getRow(); | ||
// Do not use the exception for updating cache because it might be coming from | ||
// any of the regions in the MultiAction and do not update cache if exception is | ||
// from failing to submit action to thread pool | ||
if (clearServerCache) { | ||
updateCachedLocations(server, regionName, row, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hey @hgromer . From what I understand , the I may have misunderstand or have missed something, could you possibly add a test to show that on batch operation a non meta cache clearing exception is causing a meta cache clear through There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah you're right. This code path is really tricky to reason about. Between the UnknownExceptions in the meta cache clear metrics, and the lack of logging, it's really difficult to identify what is causing these meta cache clears. I think I'm going to re-purpose this PR to simply add some logging + avoid passing in @droudnitsky does that make sense to you? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah |
||
ClientExceptionsUtil.isMetaClearingException(t) ? null : t); | ||
updateCachedLocations(server, regionName, row, t); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This also solves the frustration of seeing "UnknownException" when inspecting meta cache clear exception metrics. This has made it quite difficult to track down what triggered the meta cache clear. I think it's always better to provide more context than less. Even if an exception is meta cache clearing (though it will be now), I'd still prefer to know the exact exception type that cleared the meta cache. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 would be good to preserve the exception for Since we currently pass null to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What we'll do is basically "mask" the cache clearing exception by report an |
||
} | ||
for (Action action : e.getValue()) { | ||
Retry retry = | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we tick a client-side metric here too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hernan looks to have that covered through
updateCachedLocations
- https://github.com/apache/hbase/pull/6961/files#r2096239861