@@ -35,6 +35,11 @@ const TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS =
35
35
// Retry limit read from the TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES env var.
// Falls back to 7 when the variable is unset, non-numeric, or parses to 0:
// parseInt then yields NaN or 0, both falsy, so the `|| 7` default applies.
const TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES =
36
36
parseInt ( process . env . TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES || "" ) || 7 ;
37
37
38
// Delay read from the WAIT_FOR_TASK_CHECKPOINT_DELAY_MS env var (presumably
// milliseconds, per the name -- confirm against checkpointAndPush).
// Unset or non-numeric input parses to NaN and falls back to 0.
// NOTE(review): negative values are truthy and pass through unchanged.
+ const WAIT_FOR_TASK_CHECKPOINT_DELAY_MS =
39
+ parseInt ( process . env . WAIT_FOR_TASK_CHECKPOINT_DELAY_MS || "" ) || 0 ;
40
// Same parsing and 0 fallback as above, read from the
// WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS env var.
+ const WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS =
41
+ parseInt ( process . env . WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS || "" ) || 0 ;
42
+
38
43
const logger = new SimpleStructuredLogger ( "coordinator" , undefined , { nodeName : NODE_NAME } ) ;
39
44
const chaosMonkey = new ChaosMonkey (
40
45
! ! process . env . CHAOS_MONKEY_ENABLED ,
@@ -143,6 +148,7 @@ class TaskCoordinator {
143
148
authToken : PLATFORM_SECRET ,
144
149
logHandlerPayloads : false ,
145
150
handlers : {
151
+ // This is used by resumeAttempt
146
152
RESUME_AFTER_DEPENDENCY : async ( message ) => {
147
153
const log = platformLogger . child ( {
148
154
eventName : "RESUME_AFTER_DEPENDENCY" ,
@@ -168,11 +174,12 @@ class TaskCoordinator {
168
174
169
175
await chaosMonkey . call ( ) ;
170
176
171
- // In case the task resumed faster than we could checkpoint
177
+ // In case the task resumes before the checkpoint is created
172
178
this . #cancelCheckpoint( message . runId ) ;
173
179
174
180
taskSocket . emit ( "RESUME_AFTER_DEPENDENCY" , message ) ;
175
181
} ,
182
+ // This is used by sharedQueueConsumer
176
183
RESUME_AFTER_DEPENDENCY_WITH_ACK : async ( message ) => {
177
184
const log = platformLogger . child ( {
178
185
eventName : "RESUME_AFTER_DEPENDENCY_WITH_ACK" ,
@@ -218,7 +225,7 @@ class TaskCoordinator {
218
225
219
226
await chaosMonkey . call ( ) ;
220
227
221
- // In case the task resumed faster than we could checkpoint
228
+ // In case the task resumes before the checkpoint is created
222
229
this . #cancelCheckpoint( message . runId ) ;
223
230
224
231
taskSocket . emit ( "RESUME_AFTER_DEPENDENCY" , message ) ;
@@ -1096,12 +1103,15 @@ class TaskCoordinator {
1096
1103
}
1097
1104
}
1098
1105
1099
- const checkpoint = await this . #checkpointer. checkpointAndPush ( {
1100
- runId : socket . data . runId ,
1101
- projectRef : socket . data . projectRef ,
1102
- deploymentVersion : socket . data . deploymentVersion ,
1103
- attemptNumber : getAttemptNumber ( ) ,
1104
- } ) ;
1106
+ const checkpoint = await this . #checkpointer. checkpointAndPush (
1107
+ {
1108
+ runId : socket . data . runId ,
1109
+ projectRef : socket . data . projectRef ,
1110
+ deploymentVersion : socket . data . deploymentVersion ,
1111
+ attemptNumber : getAttemptNumber ( ) ,
1112
+ } ,
1113
+ WAIT_FOR_TASK_CHECKPOINT_DELAY_MS
1114
+ ) ;
1105
1115
1106
1116
if ( ! checkpoint ) {
1107
1117
log . error ( "Failed to checkpoint" ) ;
@@ -1189,12 +1199,15 @@ class TaskCoordinator {
1189
1199
}
1190
1200
}
1191
1201
1192
- const checkpoint = await this . #checkpointer. checkpointAndPush ( {
1193
- runId : socket . data . runId ,
1194
- projectRef : socket . data . projectRef ,
1195
- deploymentVersion : socket . data . deploymentVersion ,
1196
- attemptNumber : getAttemptNumber ( ) ,
1197
- } ) ;
1202
+ const checkpoint = await this . #checkpointer. checkpointAndPush (
1203
+ {
1204
+ runId : socket . data . runId ,
1205
+ projectRef : socket . data . projectRef ,
1206
+ deploymentVersion : socket . data . deploymentVersion ,
1207
+ attemptNumber : getAttemptNumber ( ) ,
1208
+ } ,
1209
+ WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS
1210
+ ) ;
1198
1211
1199
1212
if ( ! checkpoint ) {
1200
1213
log . error ( "Failed to checkpoint" ) ;
0 commit comments