@@ -97,6 +97,12 @@ class ProdWorker {
97
97
idempotencyKey : string ;
98
98
}
99
99
| undefined ;
100
+ private readyForResumeReplay :
101
+ | {
102
+ idempotencyKey : string ;
103
+ type : WaitReason ;
104
+ }
105
+ | undefined ;
100
106
101
107
#httpPort: number ;
102
108
#httpServer: ReturnType < typeof createServer > ;
@@ -365,10 +371,18 @@ class ProdWorker {
365
371
async #prepareForRetry( ) {
366
372
// Clear state for retrying
367
373
this . paused = false ;
374
+ this . nextResumeAfter = undefined ;
368
375
this . waitForPostStart = false ;
369
376
this . executing = false ;
370
377
this . attemptFriendlyId = undefined ;
371
378
this . attemptNumber = undefined ;
379
+
380
+ // Clear replay state
381
+ this . waitForTaskReplay = undefined ;
382
+ this . waitForBatchReplay = undefined ;
383
+ this . readyForLazyAttemptReplay = undefined ;
384
+ this . durationResumeFallback = undefined ;
385
+ this . readyForResumeReplay = undefined ;
372
386
}
373
387
374
388
// MARK: CHECKPOINT PREP
@@ -405,13 +419,16 @@ class ProdWorker {
405
419
this . waitForPostStart = false ;
406
420
407
421
this . durationResumeFallback = undefined ;
422
+ this . readyForResumeReplay = undefined ;
408
423
409
424
this . _taskRunProcess ?. waitCompletedNotification ( ) ;
410
425
}
411
426
412
427
async #readyForLazyAttempt( ) {
413
428
const idempotencyKey = randomUUID ( ) ;
414
429
430
+ logger . log ( "ready for lazy attempt" , { idempotencyKey } ) ;
431
+
415
432
this . readyForLazyAttemptReplay = {
416
433
idempotencyKey,
417
434
} ;
@@ -420,7 +437,7 @@ class ProdWorker {
420
437
// ..but we also have to be fast to avoid failing the task due to missing heartbeat
421
438
for await ( const { delay, retry } of defaultBackoff . min ( 10 ) . maxRetries ( 7 ) ) {
422
439
if ( retry > 0 ) {
423
- logger . log ( "retrying ready for lazy attempt" , { retry } ) ;
440
+ logger . log ( "retrying ready for lazy attempt" , { retry, idempotencyKey } ) ;
424
441
}
425
442
426
443
this . #coordinatorSocket. socket . emit ( "READY_FOR_LAZY_ATTEMPT" , {
@@ -453,6 +470,93 @@ class ProdWorker {
453
470
this . #failRun( this . runId , "Failed to receive execute request in a reasonable time" ) ;
454
471
}
455
472
473
+ async #readyForResume( ) {
474
+ const idempotencyKey = randomUUID ( ) ;
475
+
476
+ logger . log ( "readyForResume()" , {
477
+ nextResumeAfter : this . nextResumeAfter ,
478
+ attemptFriendlyId : this . attemptFriendlyId ,
479
+ attemptNumber : this . attemptNumber ,
480
+ idempotencyKey,
481
+ } ) ;
482
+
483
+ if ( ! this . nextResumeAfter ) {
484
+ logger . error ( "Missing next resume reason" , { status : this . #status } ) ;
485
+
486
+ this . #emitUnrecoverableError(
487
+ "NoNextResume" ,
488
+ "Next resume reason not set while resuming from paused state"
489
+ ) ;
490
+
491
+ return ;
492
+ }
493
+
494
+ if ( ! this . attemptFriendlyId ) {
495
+ logger . error ( "Missing attempt friendly ID" , { status : this . #status } ) ;
496
+
497
+ this . #emitUnrecoverableError(
498
+ "NoAttemptId" ,
499
+ "Attempt ID not set while resuming from paused state"
500
+ ) ;
501
+
502
+ return ;
503
+ }
504
+
505
+ if ( ! this . attemptNumber ) {
506
+ logger . error ( "Missing attempt number" , { status : this . #status } ) ;
507
+
508
+ this . #emitUnrecoverableError(
509
+ "NoAttemptNumber" ,
510
+ "Attempt number not set while resuming from paused state"
511
+ ) ;
512
+
513
+ return ;
514
+ }
515
+
516
+ this . readyForResumeReplay = {
517
+ idempotencyKey,
518
+ type : this . nextResumeAfter ,
519
+ } ;
520
+
521
+ const lockedMetadata = {
522
+ attemptFriendlyId : this . attemptFriendlyId ,
523
+ attemptNumber : this . attemptNumber ,
524
+ type : this . nextResumeAfter ,
525
+ } ;
526
+
527
+ // Retry if we don't receive RESUME_AFTER_DEPENDENCY or RESUME_AFTER_DURATION in a reasonable time
528
+ // ..but we also have to be fast to avoid failing the task due to missing heartbeat
529
+ for await ( const { delay, retry } of defaultBackoff . min ( 10 ) . maxRetries ( 7 ) ) {
530
+ if ( retry > 0 ) {
531
+ logger . log ( "retrying ready for resume" , { retry, idempotencyKey } ) ;
532
+ }
533
+
534
+ this . #coordinatorSocket. socket . emit ( "READY_FOR_RESUME" , {
535
+ version : "v2" ,
536
+ ...lockedMetadata ,
537
+ } ) ;
538
+
539
+ await timeout ( delay . milliseconds ) ;
540
+
541
+ if ( ! this . readyForResumeReplay ) {
542
+ logger . log ( "replay ready for resume cancelled, discarding" , {
543
+ idempotencyKey,
544
+ } ) ;
545
+
546
+ return ;
547
+ }
548
+
549
+ if ( idempotencyKey !== this . readyForResumeReplay . idempotencyKey ) {
550
+ logger . log ( "replay ready for resume idempotency key mismatch, discarding" , {
551
+ idempotencyKey,
552
+ newIdempotencyKey : this . readyForResumeReplay . idempotencyKey ,
553
+ } ) ;
554
+
555
+ return ;
556
+ }
557
+ }
558
+ }
559
+
456
560
#readyForCheckpoint( ) {
457
561
this . #coordinatorSocket. socket . emit ( "READY_FOR_CHECKPOINT" , { version : "v1" } ) ;
458
562
}
@@ -630,6 +734,7 @@ class ProdWorker {
630
734
this . paused = false ;
631
735
this . nextResumeAfter = undefined ;
632
736
this . waitForPostStart = false ;
737
+ this . readyForResumeReplay = undefined ;
633
738
634
739
for ( let i = 0 ; i < completions . length ; i ++ ) {
635
740
const completion = completions [ i ] ;
@@ -845,46 +950,7 @@ class ProdWorker {
845
950
}
846
951
847
952
if ( this . paused ) {
848
- if ( ! this . nextResumeAfter ) {
849
- logger . error ( "Missing next resume reason" , { status : this . #status } ) ;
850
-
851
- this . #emitUnrecoverableError(
852
- "NoNextResume" ,
853
- "Next resume reason not set while resuming from paused state"
854
- ) ;
855
-
856
- return ;
857
- }
858
-
859
- if ( ! this . attemptFriendlyId ) {
860
- logger . error ( "Missing attempt friendly ID" , { status : this . #status } ) ;
861
-
862
- this . #emitUnrecoverableError(
863
- "NoAttemptId" ,
864
- "Attempt ID not set while resuming from paused state"
865
- ) ;
866
-
867
- return ;
868
- }
869
-
870
- if ( ! this . attemptNumber ) {
871
- logger . error ( "Missing attempt number" , { status : this . #status } ) ;
872
-
873
- this . #emitUnrecoverableError(
874
- "NoAttemptNumber" ,
875
- "Attempt number not set while resuming from paused state"
876
- ) ;
877
-
878
- return ;
879
- }
880
-
881
- socket . emit ( "READY_FOR_RESUME" , {
882
- version : "v2" ,
883
- attemptFriendlyId : this . attemptFriendlyId ,
884
- attemptNumber : this . attemptNumber ,
885
- type : this . nextResumeAfter ,
886
- } ) ;
887
-
953
+ await this . #readyForResume( ) ;
888
954
return ;
889
955
}
890
956
@@ -1293,6 +1359,9 @@ class ProdWorker {
1293
1359
attemptNumber : this . attemptNumber ,
1294
1360
waitForTaskReplay : this . waitForTaskReplay ,
1295
1361
waitForBatchReplay : this . waitForBatchReplay ,
1362
+ readyForLazyAttemptReplay : this . readyForLazyAttemptReplay ,
1363
+ durationResumeFallback : this . durationResumeFallback ,
1364
+ readyForResumeReplay : this . readyForResumeReplay ,
1296
1365
} ;
1297
1366
}
1298
1367
0 commit comments