@@ -11,7 +11,7 @@ import {
11
11
} from "@trigger.dev/core/v3" ;
12
12
import { ZodNamespace } from "@trigger.dev/core/v3/zodNamespace" ;
13
13
import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket" ;
14
- import { HttpReply , getTextBody } from "@trigger.dev/core/v3/apps" ;
14
+ import { ExponentialBackoff , HttpReply , getTextBody } from "@trigger.dev/core/v3/apps" ;
15
15
import { ChaosMonkey } from "./chaosMonkey" ;
16
16
import { Checkpointer } from "./checkpointer" ;
17
17
import { boolFromEnv , numFromEnv , safeJsonParse } from "./util" ;
@@ -30,6 +30,11 @@ const PLATFORM_WS_PORT = process.env.PLATFORM_WS_PORT || 3030;
30
30
const PLATFORM_SECRET = process . env . PLATFORM_SECRET || "coordinator-secret" ;
31
31
const SECURE_CONNECTION = [ "1" , "true" ] . includes ( process . env . SECURE_CONNECTION ?? "false" ) ;
32
32
33
+ const TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS =
34
+ parseInt ( process . env . TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS || "" ) || 30_000 ;
35
+ const TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES =
36
+ parseInt ( process . env . TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES || "" ) || 7 ;
37
+
33
38
const logger = new SimpleStructuredLogger ( "coordinator" , undefined , { nodeName : NODE_NAME } ) ;
34
39
const chaosMonkey = new ChaosMonkey (
35
40
! ! process . env . CHAOS_MONKEY_ENABLED ,
@@ -720,37 +725,102 @@ class TaskCoordinator {
720
725
721
726
await chaosMonkey . call ( { throwErrors : false } ) ;
722
727
723
- const completeWithoutCheckpoint = ( shouldExit : boolean ) => {
728
+ const sendCompletionWithAck = async ( ) : Promise < boolean > => {
729
+ try {
730
+ const response = await this . #platformSocket?. sendWithAck (
731
+ "TASK_RUN_COMPLETED_WITH_ACK" ,
732
+ {
733
+ version : "v2" ,
734
+ execution,
735
+ completion,
736
+ } ,
737
+ TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS
738
+ ) ;
739
+
740
+ if ( ! response ) {
741
+ log . error ( "TASK_RUN_COMPLETED_WITH_ACK: no response" ) ;
742
+ return false ;
743
+ }
744
+
745
+ if ( ! response . success ) {
746
+ log . error ( "TASK_RUN_COMPLETED_WITH_ACK: error response" , {
747
+ error : response . error ,
748
+ } ) ;
749
+ return false ;
750
+ }
751
+
752
+ log . log ( "TASK_RUN_COMPLETED_WITH_ACK: successful response" ) ;
753
+ return true ;
754
+ } catch ( error ) {
755
+ log . error ( "TASK_RUN_COMPLETED_WITH_ACK: threw error" , { error } ) ;
756
+ return false ;
757
+ }
758
+ } ;
759
+
760
+ const completeWithoutCheckpoint = async ( shouldExit : boolean ) => {
724
761
const supportsRetryCheckpoints = message . version === "v1" ;
725
762
726
- this . #platformSocket?. send ( "TASK_RUN_COMPLETED" , {
727
- version : supportsRetryCheckpoints ? "v1" : "v2" ,
728
- execution,
729
- completion,
730
- } ) ;
731
763
callback ( { willCheckpointAndRestore : false , shouldExit } ) ;
764
+
765
+ if ( supportsRetryCheckpoints ) {
766
+ // This is only here for backwards compat
767
+ this . #platformSocket?. send ( "TASK_RUN_COMPLETED" , {
768
+ version : "v1" ,
769
+ execution,
770
+ completion,
771
+ } ) ;
772
+ } else {
773
+ // 99.99% of runs should end up here
774
+
775
+ const completedWithAckBackoff = new ExponentialBackoff ( "FullJitter" ) . maxRetries (
776
+ TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES
777
+ ) ;
778
+
779
+ const result = await completedWithAckBackoff . execute (
780
+ async ( { retry, delay, elapsedMs } ) => {
781
+ logger . log ( "TASK_RUN_COMPLETED_WITH_ACK: sending with backoff" , {
782
+ retry,
783
+ delay,
784
+ elapsedMs,
785
+ } ) ;
786
+
787
+ const success = await sendCompletionWithAck ( ) ;
788
+
789
+ if ( ! success ) {
790
+ throw new Error ( "Failed to send completion with ack" ) ;
791
+ }
792
+ }
793
+ ) ;
794
+
795
+ if ( ! result . success ) {
796
+ logger . error ( "TASK_RUN_COMPLETED_WITH_ACK: failed to send with backoff" , result ) ;
797
+ return ;
798
+ }
799
+
800
+ logger . log ( "TASK_RUN_COMPLETED_WITH_ACK: sent with backoff" , result ) ;
801
+ }
732
802
} ;
733
803
734
804
if ( completion . ok ) {
735
- completeWithoutCheckpoint ( true ) ;
805
+ await completeWithoutCheckpoint ( true ) ;
736
806
return ;
737
807
}
738
808
739
809
if (
740
810
completion . error . type === "INTERNAL_ERROR" &&
741
811
completion . error . code === "TASK_RUN_CANCELLED"
742
812
) {
743
- completeWithoutCheckpoint ( true ) ;
813
+ await completeWithoutCheckpoint ( true ) ;
744
814
return ;
745
815
}
746
816
747
817
if ( completion . retry === undefined ) {
748
- completeWithoutCheckpoint ( true ) ;
818
+ await completeWithoutCheckpoint ( true ) ;
749
819
return ;
750
820
}
751
821
752
822
if ( completion . retry . delay < this . #delayThresholdInMs) {
753
- completeWithoutCheckpoint ( false ) ;
823
+ await completeWithoutCheckpoint ( false ) ;
754
824
755
825
// Prevents runs that fail fast from never sending a heartbeat
756
826
this . #sendRunHeartbeat( socket . data . runId ) ;
@@ -759,7 +829,7 @@ class TaskCoordinator {
759
829
}
760
830
761
831
if ( message . version === "v2" ) {
762
- completeWithoutCheckpoint ( true ) ;
832
+ await completeWithoutCheckpoint ( true ) ;
763
833
return ;
764
834
}
765
835
@@ -768,7 +838,7 @@ class TaskCoordinator {
768
838
const willCheckpointAndRestore = canCheckpoint || willSimulate ;
769
839
770
840
if ( ! willCheckpointAndRestore ) {
771
- completeWithoutCheckpoint ( false ) ;
841
+ await completeWithoutCheckpoint ( false ) ;
772
842
return ;
773
843
}
774
844
@@ -792,7 +862,7 @@ class TaskCoordinator {
792
862
793
863
if ( ! checkpoint ) {
794
864
log . error ( "Failed to checkpoint" ) ;
795
- completeWithoutCheckpoint ( false ) ;
865
+ await completeWithoutCheckpoint ( false ) ;
796
866
return ;
797
867
}
798
868
0 commit comments