Skip to content

Commit 5d557f7

Browse files
authored
[spicedb] Fix intermittent "Error: 4 DEADLINE_EXCEEDED...Waiting for LB pick" (#20637)
* [dev] Bump grpc/grpc-js 1.10.8 -> 1.12.6 and authzed/authzed-node 0.15.0 -> 1.2.2 Tool: gitpod/catfood.gitpod.cloud * [server] Streamline spicedb gRPC client usage and creation options - instead of doing retries on two levels, rely on the gRPC-level retries - to mitigate the loss of insights, introduce createDebugLogInterceptor - client options: use sane defaults derived from the documentation instead of the excessive ones we had in place before - use "waitForReady" option: it should a) make our calls for responsive on re-connects, while b) - because we keep re-trying on DEADLINE_EXCEEDED - should be as reliable as before Tool: gitpod/catfood.gitpod.cloud * [protocol] Centralize grpc.isConnectionAlive Tool: gitpod/catfood.gitpod.cloud * [server] SpiceDB client: retry with new client on "Waiting for LB pick" error Tool: gitpod/catfood.gitpod.cloud
1 parent ab11ec3 commit 5d557f7

File tree

15 files changed

+241
-119
lines changed

15 files changed

+241
-119
lines changed

components/content-service-api/typescript/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
"lib"
1212
],
1313
"dependencies": {
14-
"@grpc/grpc-js": "1.10.8",
14+
"@grpc/grpc-js": "1.12.6",
1515
"google-protobuf": "^3.19.1",
1616
"inversify": "^6.0.1",
1717
"opentracing": "^0.14.4"

components/gitpod-protocol/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"src"
1111
],
1212
"devDependencies": {
13-
"@grpc/grpc-js": "1.10.8",
13+
"@grpc/grpc-js": "1.12.6",
1414
"@testdeck/mocha": "^0.3.3",
1515
"@types/analytics-node": "^3.1.9",
1616
"@types/chai-subset": "^1.3.3",

components/gitpod-protocol/src/util/grpc.ts

+87
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import * as grpc from "@grpc/grpc-js";
88
import { Status } from "@grpc/grpc-js/build/src/constants";
9+
import { log } from "./logging";
10+
import { TrustedValue } from "./scrubbing";
911

1012
export const defaultGRPCOptions = {
1113
"grpc.keepalive_timeout_ms": 10000,
@@ -108,6 +110,91 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
108110
};
109111
}
110112

113+
export function createDebugLogInterceptor(additionalContextF: (() => object) | undefined): grpc.Interceptor {
114+
const FAILURE_STATUS_CODES = new Map([
115+
[Status.ABORTED, true],
116+
[Status.CANCELLED, true],
117+
[Status.DATA_LOSS, true],
118+
[Status.DEADLINE_EXCEEDED, true],
119+
[Status.FAILED_PRECONDITION, true],
120+
[Status.INTERNAL, true],
121+
[Status.PERMISSION_DENIED, true],
122+
[Status.RESOURCE_EXHAUSTED, true],
123+
[Status.UNAUTHENTICATED, true],
124+
[Status.UNAVAILABLE, true],
125+
[Status.UNIMPLEMENTED, true],
126+
[Status.UNKNOWN, true],
127+
]);
128+
129+
return (options, nextCall): grpc.InterceptingCall => {
130+
const methodDef = options.method_definition;
131+
const method = methodDef.path.substring(methodDef.path.lastIndexOf("/") + 1);
132+
const service = methodDef.path.substring(1, methodDef.path.length - method.length - 1);
133+
const labels = {
134+
service,
135+
method,
136+
type: getGrpcMethodType(options.method_definition.requestStream, options.method_definition.responseStream),
137+
};
138+
const requester = new grpc.RequesterBuilder()
139+
.withStart((metadata, listener, next) => {
140+
const newListener = new grpc.ListenerBuilder()
141+
.withOnReceiveStatus((status, next) => {
142+
// If given, call the additionalContext function and log the result
143+
let additionalContext = {};
144+
try {
145+
if (additionalContextF) {
146+
additionalContext = additionalContextF();
147+
}
148+
} catch (e) {}
149+
150+
try {
151+
const info = {
152+
labels: new TrustedValue(labels),
153+
metadata: new TrustedValue(metadata.toJSON()),
154+
code: Status[status.code],
155+
details: status.details,
156+
additionalContext: new TrustedValue(additionalContext),
157+
};
158+
if (FAILURE_STATUS_CODES.has(status.code)) {
159+
log.warn(`grpc call failed`, info);
160+
} else {
161+
log.debug(`grpc call status`, info);
162+
}
163+
} finally {
164+
next(status);
165+
}
166+
})
167+
.build();
168+
try {
169+
log.debug(`grpc call started`, {
170+
labels: new TrustedValue(labels),
171+
metadata: new TrustedValue(metadata.toJSON()),
172+
});
173+
} finally {
174+
next(metadata, newListener);
175+
}
176+
})
177+
.withCancel((next) => {
178+
try {
179+
log.debug(`grpc call cancelled`, { labels: new TrustedValue(labels) });
180+
} finally {
181+
next();
182+
}
183+
})
184+
.build();
185+
return new grpc.InterceptingCall(nextCall(options), requester);
186+
};
187+
}
188+
111189
export function isGrpcError(err: any): err is grpc.StatusObject {
112190
return err.code && err.details;
113191
}
192+
193+
export function isConnectionAlive(client: grpc.Client) {
194+
const cs = client.getChannel().getConnectivityState(false);
195+
return (
196+
cs == grpc.connectivityState.CONNECTING ||
197+
cs == grpc.connectivityState.IDLE ||
198+
cs == grpc.connectivityState.READY
199+
);
200+
}

components/ide-metrics-api/typescript-grpc/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"lib"
1010
],
1111
"dependencies": {
12-
"@grpc/grpc-js": "1.10.8",
12+
"@grpc/grpc-js": "1.12.6",
1313
"google-protobuf": "^3.19.1"
1414
},
1515
"devDependencies": {

components/image-builder-api/typescript/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
"dependencies": {
1414
"@gitpod/content-service": "0.1.5",
1515
"@gitpod/gitpod-protocol": "0.1.5",
16-
"@grpc/grpc-js": "1.10.8",
16+
"@grpc/grpc-js": "1.12.6",
1717
"google-protobuf": "^3.19.1",
1818
"inversify": "^6.0.1",
1919
"opentracing": "^0.14.4"

components/image-builder-api/typescript/src/sugar.ts

+2-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { ImageBuilderClient } from "./imgbuilder_grpc_pb";
88
import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing";
99
import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
1010
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
11-
import { createClientCallMetricsInterceptor, IClientCallMetrics } from "@gitpod/gitpod-protocol/lib/util/grpc";
11+
import { createClientCallMetricsInterceptor, IClientCallMetrics, isConnectionAlive } from "@gitpod/gitpod-protocol/lib/util/grpc";
1212
import * as opentracing from "opentracing";
1313
import { Metadata } from "@grpc/grpc-js";
1414
import {
@@ -130,12 +130,7 @@ export class PromisifiedImageBuilderClient {
130130
) {}
131131

132132
public isConnectionAlive() {
133-
const cs = this.client.getChannel().getConnectivityState(false);
134-
return (
135-
cs == grpc.connectivityState.CONNECTING ||
136-
cs == grpc.connectivityState.IDLE ||
137-
cs == grpc.connectivityState.READY
138-
);
133+
return isConnectionAlive(this.client);
139134
}
140135

141136
public resolveBaseImage(ctx: TraceContext, request: ResolveBaseImageRequest): Promise<ResolveBaseImageResponse> {

components/server/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
"/src"
4646
],
4747
"dependencies": {
48-
"@authzed/authzed-node": "^0.15.0",
48+
"@authzed/authzed-node": "^1.2.2",
4949
"@connectrpc/connect": "1.1.2",
5050
"@connectrpc/connect-express": "1.1.2",
5151
"@gitbeaker/rest": "^39.12.0",

components/server/src/authorization/spicedb-authorizer.ts

+62-56
Original file line numberDiff line numberDiff line change
@@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
1717
import { ApplicationError, ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error";
1818
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";
1919

20-
async function tryThree<T>(errMessage: string, code: (attempt: number) => Promise<T>): Promise<T> {
21-
let attempt = 0;
22-
// we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
23-
// last time we checked it was 15 times per day (check logs)
24-
while (attempt++ < 3) {
25-
try {
26-
return await code(attempt);
27-
} catch (err) {
28-
if (
29-
(err.code === grpc.status.INTERNAL ||
30-
err.code === grpc.status.DEADLINE_EXCEEDED ||
31-
err.code === grpc.status.UNAVAILABLE) &&
32-
attempt < 3
33-
) {
34-
let delay = 500 * attempt;
35-
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
36-
// we already waited for timeout, so let's try again immediately
37-
delay = 0;
38-
}
39-
40-
log.warn(errMessage, err, {
41-
attempt,
42-
delay,
43-
code: err.code,
44-
});
45-
await new Promise((resolve) => setTimeout(resolve, delay));
46-
continue;
47-
}
48-
49-
log.error(errMessage, err, {
50-
attempt,
51-
code: err.code,
52-
});
53-
// we don't try again on other errors
54-
throw err;
55-
}
56-
}
57-
throw new Error("unreachable");
58-
}
59-
6020
export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
6121
return new SpiceDBAuthorizer(clientProvider, new RequestLocalZedTokenCache());
6222
}
@@ -71,13 +31,11 @@ interface DeletionResult {
7131
deletedAt?: string;
7232
}
7333

34+
const GRPC_DEADLINE = 10_000;
35+
7436
export class SpiceDBAuthorizer {
7537
constructor(private readonly clientProvider: SpiceDBClientProvider, private readonly tokenCache: ZedTokenCache) {}
7638

77-
private get client(): v1.ZedPromiseClientInterface {
78-
return this.clientProvider.getClient();
79-
}
80-
8139
public async check(req: v1.CheckPermissionRequest, experimentsFields: { userId: string }): Promise<boolean> {
8240
req.consistency = await this.tokenCache.consistency(req.resource);
8341
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
@@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
9957
const timer = spicedbClientLatency.startTimer();
10058
let error: Error | undefined;
10159
try {
102-
const response = await tryThree("[spicedb] Failed to perform authorization check.", () =>
103-
this.client.checkPermission(req, this.callOptions),
60+
const response = await this.call("[spicedb] Failed to perform authorization check.", (client) =>
61+
client.checkPermission(req, this.callOptions),
10462
);
10563
const permitted = response.permissionship === v1.CheckPermissionResponse_Permissionship.HAS_PERMISSION;
10664
return { permitted, checkedAt: response.checkedAt?.token };
@@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
13997
const timer = spicedbClientLatency.startTimer();
14098
let error: Error | undefined;
14199
try {
142-
const response = await tryThree("[spicedb] Failed to write relationships.", () =>
143-
this.client.writeRelationships(
100+
const response = await this.call("[spicedb] Failed to write relationships.", (client) =>
101+
client.writeRelationships(
144102
v1.WriteRelationshipsRequest.create({
145103
updates,
146104
}),
@@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
175133
let error: Error | undefined;
176134
try {
177135
let deletedAt: string | undefined = undefined;
178-
const existing = await tryThree("readRelationships before deleteRelationships failed.", () =>
179-
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
136+
const existing = await this.call("readRelationships before deleteRelationships failed.", (client) =>
137+
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
180138
);
181139
if (existing.length > 0) {
182-
const response = await tryThree("deleteRelationships failed.", () =>
183-
this.client.deleteRelationships(req, this.callOptions),
140+
const response = await this.call("deleteRelationships failed.", (client) =>
141+
client.deleteRelationships(req, this.callOptions),
184142
);
185143
deletedAt = response.deletedAt?.token;
186-
const after = await tryThree("readRelationships failed.", () =>
187-
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
144+
const after = await this.call("readRelationships failed.", (client) =>
145+
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
188146
);
189147
if (after.length > 0) {
190148
log.error("[spicedb] Failed to delete relationships.", { existing, after, request: req });
@@ -213,7 +171,55 @@ export class SpiceDBAuthorizer {
213171
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
214172
req.consistency = await this.tokenCache.consistency(undefined);
215173
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
216-
return tryThree("readRelationships failed.", () => this.client.readRelationships(req, this.callOptions));
174+
return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
175+
}
176+
177+
/**
178+
* call retrieves a Spicedb client and executes the given code block.
179+
* In addition to the gRPC-level retry mechanisms, it retries on "Waiting for LB pick" errors.
180+
* This is required, because we seem to be running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect.
181+
* @param description
182+
* @param code
183+
* @returns
184+
*/
185+
private async call<T>(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
186+
const MAX_ATTEMPTS = 3;
187+
let attempt = 0;
188+
while (attempt++ < 3) {
189+
try {
190+
const checkClient = attempt > 1; // the last client error'd out, so check if we should get a new one
191+
const client = this.clientProvider.getClient(checkClient);
192+
return code(client);
193+
} catch (err) {
194+
// Check: Is this a "no connection to upstream" error? If yes, retry here, to work around grpc/grpc-js bugs introducing high latency for re-tries
195+
if (
196+
(err.code === grpc.status.DEADLINE_EXCEEDED || err.code === grpc.status.UNAVAILABLE) &&
197+
attempt < MAX_ATTEMPTS
198+
) {
199+
let delay = 500 * attempt;
200+
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
201+
// we already waited for timeout, so let's try again immediately
202+
delay = 0;
203+
}
204+
205+
log.warn(description, err, {
206+
attempt,
207+
delay,
208+
code: err.code,
209+
});
210+
await new Promise((resolve) => setTimeout(resolve, delay));
211+
continue;
212+
}
213+
214+
// Some other error: log and rethrow
215+
log.error(description, err, {
216+
attempt,
217+
code: err.code,
218+
});
219+
throw err;
220+
}
221+
}
222+
throw new Error("unreachable");
217223
}
218224

219225
/**
@@ -223,7 +229,7 @@ export class SpiceDBAuthorizer {
223229
*/
224230
private get callOptions(): grpc.Metadata {
225231
return (<grpc.CallOptions>{
226-
deadline: Date.now() + 8000,
232+
deadline: Date.now() + GRPC_DEADLINE,
227233
}) as any as grpc.Metadata;
228234
}
229235
}

0 commit comments

Comments
 (0)