@@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
17
17
import { ApplicationError , ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error" ;
18
18
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc" ;
19
19
20
- async function tryThree < T > ( errMessage : string , code : ( attempt : number ) => Promise < T > ) : Promise < T > {
21
- let attempt = 0 ;
22
- // we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
23
- // last time we checked it was 15 times per day (check logs)
24
- while ( attempt ++ < 3 ) {
25
- try {
26
- return await code ( attempt ) ;
27
- } catch ( err ) {
28
- if (
29
- ( err . code === grpc . status . INTERNAL ||
30
- err . code === grpc . status . DEADLINE_EXCEEDED ||
31
- err . code === grpc . status . UNAVAILABLE ) &&
32
- attempt < 3
33
- ) {
34
- let delay = 500 * attempt ;
35
- if ( err . code === grpc . status . DEADLINE_EXCEEDED ) {
36
- // we already waited for timeout, so let's try again immediately
37
- delay = 0 ;
38
- }
39
-
40
- log . warn ( errMessage , err , {
41
- attempt,
42
- delay,
43
- code : err . code ,
44
- } ) ;
45
- await new Promise ( ( resolve ) => setTimeout ( resolve , delay ) ) ;
46
- continue ;
47
- }
48
-
49
- log . error ( errMessage , err , {
50
- attempt,
51
- code : err . code ,
52
- } ) ;
53
- // we don't try again on other errors
54
- throw err ;
55
- }
56
- }
57
- throw new Error ( "unreachable" ) ;
58
- }
59
-
60
20
/**
 * Builds a SpiceDBAuthorizer on top of the given client provider,
 * pairing it with a freshly created RequestLocalZedTokenCache.
 *
 * @param clientProvider provider the authorizer obtains its SpiceDB client from
 * @returns a new SpiceDBAuthorizer instance
 */
export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
    const tokenCache = new RequestLocalZedTokenCache();
    return new SpiceDBAuthorizer(clientProvider, tokenCache);
}
@@ -71,13 +31,11 @@ interface DeletionResult {
71
31
deletedAt ?: string ;
72
32
}
73
33
34
+ const GRPC_DEADLINE = 10_000 ;
35
+
74
36
export class SpiceDBAuthorizer {
75
37
constructor ( private readonly clientProvider : SpiceDBClientProvider , private readonly tokenCache : ZedTokenCache ) { }
76
38
77
- private get client ( ) : v1 . ZedPromiseClientInterface {
78
- return this . clientProvider . getClient ( ) ;
79
- }
80
-
81
39
public async check ( req : v1 . CheckPermissionRequest , experimentsFields : { userId : string } ) : Promise < boolean > {
82
40
req . consistency = await this . tokenCache . consistency ( req . resource ) ;
83
41
incSpiceDBRequestsCheckTotal ( req . consistency ?. requirement ?. oneofKind || "undefined" ) ;
@@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
99
57
const timer = spicedbClientLatency . startTimer ( ) ;
100
58
let error : Error | undefined ;
101
59
try {
102
- const response = await tryThree ( "[spicedb] Failed to perform authorization check." , ( ) =>
103
- this . client . checkPermission ( req , this . callOptions ) ,
60
+ const response = await this . call ( "[spicedb] Failed to perform authorization check." , ( client ) =>
61
+ client . checkPermission ( req , this . callOptions ) ,
104
62
) ;
105
63
const permitted = response . permissionship === v1 . CheckPermissionResponse_Permissionship . HAS_PERMISSION ;
106
64
return { permitted, checkedAt : response . checkedAt ?. token } ;
@@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
139
97
const timer = spicedbClientLatency . startTimer ( ) ;
140
98
let error : Error | undefined ;
141
99
try {
142
- const response = await tryThree ( "[spicedb] Failed to write relationships." , ( ) =>
143
- this . client . writeRelationships (
100
+ const response = await this . call ( "[spicedb] Failed to write relationships." , ( client ) =>
101
+ client . writeRelationships (
144
102
v1 . WriteRelationshipsRequest . create ( {
145
103
updates,
146
104
} ) ,
@@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
175
133
let error : Error | undefined ;
176
134
try {
177
135
let deletedAt : string | undefined = undefined ;
178
- const existing = await tryThree ( "readRelationships before deleteRelationships failed." , ( ) =>
179
- this . client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
136
+ const existing = await this . call ( "readRelationships before deleteRelationships failed." , ( client ) =>
137
+ client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
180
138
) ;
181
139
if ( existing . length > 0 ) {
182
- const response = await tryThree ( "deleteRelationships failed." , ( ) =>
183
- this . client . deleteRelationships ( req , this . callOptions ) ,
140
+ const response = await this . call ( "deleteRelationships failed." , ( client ) =>
141
+ client . deleteRelationships ( req , this . callOptions ) ,
184
142
) ;
185
143
deletedAt = response . deletedAt ?. token ;
186
- const after = await tryThree ( "readRelationships failed." , ( ) =>
187
- this . client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
144
+ const after = await this . call ( "readRelationships failed." , ( client ) =>
145
+ client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
188
146
) ;
189
147
if ( after . length > 0 ) {
190
148
log . error ( "[spicedb] Failed to delete relationships." , { existing, after, request : req } ) ;
@@ -213,7 +171,55 @@ export class SpiceDBAuthorizer {
213
171
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
    // The consistency requirement comes from the token cache (no specific resource is passed).
    const consistency = await this.tokenCache.consistency(undefined);
    req.consistency = consistency;

    // Count the request by the kind of consistency requirement in use.
    const requirementKind = req.consistency?.requirement?.oneofKind || "undefined";
    incSpiceDBRequestsCheckTotal(requirementKind);

    return this.call("readRelationships failed.", (client) => {
        return client.readRelationships(req, this.callOptions);
    });
}
176
+
177
/**
 * Retrieves a SpiceDB client and executes the given operation against it, retrying on
 * connection-level failures.
 *
 * In addition to the gRPC-level retry mechanisms, this retries on DEADLINE_EXCEEDED and
 * UNAVAILABLE errors (e.g. "Waiting for LB pick"). This is required because we seem to be
 * running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect.
 *
 * @param description message logged whenever an attempt fails
 * @param code the operation to run; it receives the client to use for the current attempt
 * @returns the value produced by the first successful attempt
 * @throws the error of the failing attempt if it is not retryable, or if all attempts are used up
 */
private async call<T>(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
    const MAX_ATTEMPTS = 3;
    for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
        try {
            // the last client error'd out, so check if we should get a new one
            const checkClient = attempt > 1;
            const client = this.clientProvider.getClient(checkClient);
            // `await` is essential here: without it the promise is returned as-is and a rejection
            // bypasses the catch block below, which silently disables the whole retry logic.
            return await code(client);
        } catch (err) {
            // Check: Is this a "no connection to upstream" error? If yes, retry here, to work
            // around grpc/grpc-js bugs introducing high latency for re-tries
            const retryable =
                err.code === grpc.status.DEADLINE_EXCEEDED || err.code === grpc.status.UNAVAILABLE;
            if (retryable && attempt < MAX_ATTEMPTS) {
                // On DEADLINE_EXCEEDED we already waited for the timeout, so try again immediately;
                // otherwise back off linearly with the attempt number.
                const delay = err.code === grpc.status.DEADLINE_EXCEEDED ? 0 : 500 * attempt;

                log.warn(description, err, {
                    attempt,
                    delay,
                    code: err.code,
                });
                await new Promise((resolve) => setTimeout(resolve, delay));
                continue;
            }

            // Some other error (or attempts exhausted): log and rethrow
            log.error(description, err, {
                attempt,
                code: err.code,
            });
            throw err;
        }
    }
    throw new Error("unreachable");
}
218
224
219
225
/**
@@ -223,7 +229,7 @@ export class SpiceDBAuthorizer {
223
229
*/
224
230
private get callOptions(): grpc.Metadata {
    // Deadline is computed on every access: now + GRPC_DEADLINE.
    const options = <grpc.CallOptions>{
        deadline: Date.now() + GRPC_DEADLINE,
    };
    // The options object is handed out under the grpc.Metadata type via a double cast.
    return options as any as grpc.Metadata;
}
229
235
}
0 commit comments