@@ -21,7 +21,7 @@ import {
21
21
TaskRunStatus ,
22
22
} from "@trigger.dev/database" ;
23
23
import { z } from "zod" ;
24
- import { prisma } from "~/db.server" ;
24
+ import { $replica , prisma } from "~/db.server" ;
25
25
import { findEnvironmentById } from "~/models/runtimeEnvironment.server" ;
26
26
import { logger } from "~/services/logger.server" ;
27
27
import { singleton } from "~/utils/singleton" ;
@@ -43,7 +43,12 @@ import { generateJWTTokenForEnvironment } from "~/services/apiAuth.server";
43
43
import { EnvironmentVariable } from "../environmentVariables/repository" ;
44
44
import { machinePresetFromConfig } from "../machinePresets.server" ;
45
45
import { env } from "~/env.server" ;
46
- import { isFinalAttemptStatus , isFinalRunStatus } from "../taskStatus" ;
46
+ import {
47
+ FINAL_ATTEMPT_STATUSES ,
48
+ FINAL_RUN_STATUSES ,
49
+ isFinalAttemptStatus ,
50
+ isFinalRunStatus ,
51
+ } from "../taskStatus" ;
47
52
import { getMaxDuration } from "../utils/maxDuration" ;
48
53
49
54
const WithTraceContext = z . object ( {
@@ -620,6 +625,9 @@ export class SharedQueueConsumer {
620
625
const resumableRun = await prisma . taskRun . findUnique ( {
621
626
where : {
622
627
id : message . messageId ,
628
+ status : {
629
+ notIn : FINAL_RUN_STATUSES ,
630
+ } ,
623
631
} ,
624
632
} ) ;
625
633
@@ -633,6 +641,14 @@ export class SharedQueueConsumer {
633
641
return ;
634
642
}
635
643
644
+ if ( resumableRun . status !== "EXECUTING" ) {
645
+ logger . warn ( "Run is not executing, will try to resume anyway" , {
646
+ queueMessage : message . data ,
647
+ messageId : message . messageId ,
648
+ runStatus : resumableRun . status ,
649
+ } ) ;
650
+ }
651
+
636
652
const resumableAttempt = await prisma . taskRunAttempt . findUnique ( {
637
653
where : {
638
654
id : messageBody . data . resumableAttemptId ,
@@ -740,7 +756,11 @@ export class SharedQueueConsumer {
740
756
executions,
741
757
} ;
742
758
743
- logger . debug ( "Broadcasting RESUME_AFTER_DEPENDENCY_WITH_ACK" , { resumeMessage, message } ) ;
759
+ logger . debug ( "Broadcasting RESUME_AFTER_DEPENDENCY_WITH_ACK" , {
760
+ resumeMessage,
761
+ message,
762
+ resumableRun,
763
+ } ) ;
744
764
745
765
// The attempt should still be running so we can broadcast to all coordinators to resume immediately
746
766
const responses = await socketIo . coordinatorNamespace
@@ -763,15 +783,91 @@ export class SharedQueueConsumer {
763
783
}
764
784
765
785
const hasSuccess = responses . some ( ( response ) => response . success ) ;
766
- if ( ! hasSuccess ) {
767
- logger . warn ( "RESUME_AFTER_DEPENDENCY_WITH_ACK failed" , {
768
- resumeMessage,
769
- responses,
770
- message,
771
- } ) ;
772
- await this . #nackAndDoMoreWork( message . messageId , this . _options . nextTickInterval , 5_000 ) ;
786
+
787
+ if ( hasSuccess ) {
788
+ this . #doMoreWork( ) ;
773
789
return ;
774
790
}
791
+
792
+ // No coordinator was able to resume the run
793
+ logger . warn ( "RESUME_AFTER_DEPENDENCY_WITH_ACK failed" , {
794
+ resumeMessage,
795
+ responses,
796
+ message,
797
+ } ) ;
798
+
799
+ // Let's check if the run is frozen
800
+ if ( resumableRun . status === "WAITING_TO_RESUME" ) {
801
+ logger . debug ( "RESUME_AFTER_DEPENDENCY_WITH_ACK run is waiting to be restored" , {
802
+ queueMessage : message . data ,
803
+ messageId : message . messageId ,
804
+ } ) ;
805
+
806
+ try {
807
+ const restoreService = new RestoreCheckpointService ( ) ;
808
+
809
+ const checkpointEvent = await restoreService . getLastCheckpointEventIfUnrestored (
810
+ resumableRun . id
811
+ ) ;
812
+
813
+ if ( checkpointEvent ) {
814
+ // The last checkpoint hasn't been restored yet, so restore it
815
+ const checkpoint = await restoreService . call ( {
816
+ eventId : checkpointEvent . id ,
817
+ } ) ;
818
+
819
+ if ( ! checkpoint ) {
820
+ logger . debug ( "RESUME_AFTER_DEPENDENCY_WITH_ACK failed to restore checkpoint" , {
821
+ queueMessage : message . data ,
822
+ messageId : message . messageId ,
823
+ } ) ;
824
+
825
+ await this . #ackAndDoMoreWork( message . messageId ) ;
826
+ return ;
827
+ }
828
+
829
+ logger . debug ( "RESUME_AFTER_DEPENDENCY_WITH_ACK restored checkpoint" , {
830
+ queueMessage : message . data ,
831
+ messageId : message . messageId ,
832
+ checkpoint,
833
+ } ) ;
834
+
835
+ this . #doMoreWork( ) ;
836
+ return ;
837
+ } else {
838
+ logger . debug (
839
+ "RESUME_AFTER_DEPENDENCY_WITH_ACK run is frozen without last checkpoint event" ,
840
+ {
841
+ queueMessage : message . data ,
842
+ messageId : message . messageId ,
843
+ }
844
+ ) ;
845
+ }
846
+ } catch ( e ) {
847
+ if ( e instanceof Error ) {
848
+ this . _currentSpan ?. recordException ( e ) ;
849
+ } else {
850
+ this . _currentSpan ?. recordException ( new Error ( String ( e ) ) ) ;
851
+ }
852
+
853
+ this . _endSpanInNextIteration = true ;
854
+
855
+ await this . #nackAndDoMoreWork(
856
+ message . messageId ,
857
+ this . _options . nextTickInterval ,
858
+ 5_000
859
+ ) ;
860
+ return ;
861
+ }
862
+ }
863
+
864
+ logger . debug ( "RESUME_AFTER_DEPENDENCY_WITH_ACK retrying" , {
865
+ queueMessage : message . data ,
866
+ messageId : message . messageId ,
867
+ } ) ;
868
+
869
+ await this . #nackAndDoMoreWork( message . messageId , this . _options . nextTickInterval , 5_000 ) ;
870
+ return ;
775
871
} catch ( e ) {
776
872
if ( e instanceof Error ) {
777
873
this . _currentSpan ?. recordException ( e ) ;
@@ -896,7 +992,7 @@ class SharedQueueTasks {
896
992
where : {
897
993
id,
898
994
status : {
899
- in : [ "COMPLETED" , "FAILED" ] ,
995
+ in : FINAL_ATTEMPT_STATUSES ,
900
996
} ,
901
997
} ,
902
998
include : {
@@ -1237,13 +1333,13 @@ class SharedQueueTasks {
1237
1333
return ;
1238
1334
}
1239
1335
1240
- await marqs ?. heartbeatMessage ( taskRunAttempt . taskRunId ) ;
1336
+ await this . #heartbeat ( taskRunAttempt . taskRunId ) ;
1241
1337
}
1242
1338
1243
1339
async taskRunHeartbeat ( runId : string ) {
1244
1340
logger . debug ( "[SharedQueueConsumer] taskRunHeartbeat()" , { runId } ) ;
1245
1341
1246
- await marqs ?. heartbeatMessage ( runId ) ;
1342
+ await this . #heartbeat ( runId ) ;
1247
1343
}
1248
1344
1249
1345
public async taskRunFailed ( completion : TaskRunFailedExecutionResult ) {
@@ -1254,6 +1350,66 @@ class SharedQueueTasks {
1254
1350
await service . call ( completion . id , completion ) ;
1255
1351
}
1256
1352
1353
+ async #heartbeat( runId : string ) {
1354
+ await marqs ?. heartbeatMessage ( runId ) ;
1355
+
1356
+ try {
1357
+ // There can be a lot of calls per minute and the data doesn't have to be accurate, so use the read replica
1358
+ const taskRun = await $replica . taskRun . findFirst ( {
1359
+ where : {
1360
+ id : runId ,
1361
+ } ,
1362
+ select : {
1363
+ id : true ,
1364
+ status : true ,
1365
+ runtimeEnvironment : {
1366
+ select : {
1367
+ type : true ,
1368
+ } ,
1369
+ } ,
1370
+ lockedToVersion : {
1371
+ select : {
1372
+ supportsLazyAttempts : true ,
1373
+ } ,
1374
+ } ,
1375
+ } ,
1376
+ } ) ;
1377
+
1378
+ if ( ! taskRun ) {
1379
+ logger . error ( "SharedQueueTasks.#heartbeat: Task run not found" , {
1380
+ runId,
1381
+ } ) ;
1382
+
1383
+ return ;
1384
+ }
1385
+
1386
+ if ( taskRun . runtimeEnvironment . type === "DEVELOPMENT" ) {
1387
+ return ;
1388
+ }
1389
+
1390
+ if ( isFinalRunStatus ( taskRun . status ) ) {
1391
+ logger . debug ( "SharedQueueTasks.#heartbeat: Task run is in final status" , {
1392
+ runId,
1393
+ status : taskRun . status ,
1394
+ } ) ;
1395
+
1396
+ // Signal to exit any leftover containers
1397
+ socketIo . coordinatorNamespace . emit ( "REQUEST_RUN_CANCELLATION" , {
1398
+ version : "v1" ,
1399
+ runId : taskRun . id ,
1400
+ // Give the run a few seconds to exit to complete any flushing etc
1401
+ delayInMs : taskRun . lockedToVersion ?. supportsLazyAttempts ? 5_000 : undefined ,
1402
+ } ) ;
1403
+ return ;
1404
+ }
1405
+ } catch ( error ) {
1406
+ logger . error ( "SharedQueueTasks.#heartbeat: Error signaling run cancellation" , {
1407
+ runId,
1408
+ error : error instanceof Error ? error . message : error ,
1409
+ } ) ;
1410
+ }
1411
+ }
1412
+
1257
1413
async #buildEnvironmentVariables(
1258
1414
environment : RuntimeEnvironment ,
1259
1415
runId : string ,
0 commit comments