Skip to content

Commit d2c779e

Browse files
authored
Fixes for internal error reattempts (#1436)
* magic links on span event errors
* prevent task monitor from processing errors handled elsewhere
* exclusively use internal error code enum for completion data
* add complete attempt service opts
* reattempts need to go via the queue for task controllers that may have exited
* only infer retry config if completed via crash or system failure
* enhance error before deciding if retriable
* retry on SIGTERM
* enable retry config helper for latest sdk
* don't retry heartbeat timeouts for now
* enable task monitor to update fatal errors
* add missing service
* update retry config since package version
* don't alter completion time when updating existing error
1 parent cbe5170 commit d2c779e

File tree

14 files changed

+212
-95
lines changed

14 files changed

+212
-95
lines changed

apps/kubernetes-provider/src/taskMonitor.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,10 @@ export class TaskMonitor {
160160

161161
let reason = rawReason || "Unknown error";
162162
let logs = rawLogs || "";
163-
let overrideCompletion = false;
163+
164+
/** This will only override existing task errors. It will not crash the run. */
165+
let onlyOverrideExistingError = exitCode === EXIT_CODE_CHILD_NONZERO;
166+
164167
let errorCode: TaskRunInternalError["code"] = TaskRunErrorCodes.POD_UNKNOWN_ERROR;
165168

166169
switch (rawReason) {
@@ -185,10 +188,8 @@ export class TaskMonitor {
185188
}
186189
break;
187190
case "OOMKilled":
188-
overrideCompletion = true;
189-
reason = `${
190-
exitCode === EXIT_CODE_CHILD_NONZERO ? "Child process" : "Parent process"
191-
} ran out of memory! Try choosing a machine preset with more memory for this task.`;
191+
reason =
192+
"[TaskMonitor] Your task ran out of memory. Try increasing the machine specs. If this doesn't fix it there might be a memory leak.";
192193
errorCode = TaskRunErrorCodes.TASK_PROCESS_OOM_KILLED;
193194
break;
194195
default:
@@ -199,7 +200,7 @@ export class TaskMonitor {
199200
exitCode,
200201
reason,
201202
logs,
202-
overrideCompletion,
203+
overrideCompletion: onlyOverrideExistingError,
203204
errorCode,
204205
} satisfies FailureDetails;
205206

apps/webapp/app/components/runs/v3/SpanEvents.tsx

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
import { EnvelopeIcon } from "@heroicons/react/20/solid";
12
import {
23
exceptionEventEnhancer,
34
isExceptionSpanEvent,
45
type ExceptionEventProperties,
56
type SpanEvent as OtelSpanEvent,
67
} from "@trigger.dev/core/v3";
78
import { CodeBlock } from "~/components/code/CodeBlock";
9+
import { Feedback } from "~/components/Feedback";
10+
import { Button } from "~/components/primitives/Buttons";
811
import { Callout } from "~/components/primitives/Callout";
912
import { DateTimeAccurate } from "~/components/primitives/DateTime";
1013
import { Header2, Header3 } from "~/components/primitives/Headers";
@@ -75,11 +78,26 @@ export function SpanEventError({
7578
titleClassName="text-rose-500"
7679
/>
7780
{enhancedException.message && <Callout variant="error">{enhancedException.message}</Callout>}
78-
{enhancedException.link && (
79-
<Callout variant="docs" to={enhancedException.link.href}>
80-
{enhancedException.link.name}
81-
</Callout>
82-
)}
81+
{enhancedException.link &&
82+
(enhancedException.link.magic === "CONTACT_FORM" ? (
83+
<Feedback
84+
button={
85+
<Button
86+
variant="tertiary/medium"
87+
LeadingIcon={EnvelopeIcon}
88+
leadingIconClassName="text-blue-400"
89+
fullWidth
90+
textAlignLeft
91+
>
92+
{enhancedException.link.name}
93+
</Button>
94+
}
95+
/>
96+
) : (
97+
<Callout variant="docs" to={enhancedException.link.href}>
98+
{enhancedException.link.name}
99+
</Callout>
100+
))}
83101
{enhancedException.stacktrace && (
84102
<CodeBlock
85103
showCopyButton={false}

apps/webapp/app/models/taskRun.server.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import type {
33
TaskRunFailedExecutionResult,
44
TaskRunSuccessfulExecutionResult,
55
} from "@trigger.dev/core/v3";
6-
import { TaskRunError } from "@trigger.dev/core/v3";
6+
import { TaskRunError, TaskRunErrorCodes } from "@trigger.dev/core/v3";
77

88
import type {
99
TaskRun,
@@ -62,7 +62,7 @@ export function executionResultForTaskRun(
6262
id: taskRun.friendlyId,
6363
error: {
6464
type: "INTERNAL_ERROR",
65-
code: "TASK_RUN_CANCELLED",
65+
code: TaskRunErrorCodes.TASK_RUN_CANCELLED,
6666
},
6767
} satisfies TaskRunFailedExecutionResult;
6868
}
@@ -94,7 +94,7 @@ export function executionResultForTaskRun(
9494
id: taskRun.friendlyId,
9595
error: {
9696
type: "INTERNAL_ERROR",
97-
code: "CONFIGURED_INCORRECTLY",
97+
code: TaskRunErrorCodes.CONFIGURED_INCORRECTLY,
9898
},
9999
} satisfies TaskRunFailedExecutionResult;
100100
}

apps/webapp/app/v3/failedTaskRun.server.ts

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -112,23 +112,15 @@ export class FailedTaskRunRetryHelper extends BaseService {
112112

113113
logger.debug("[FailedTaskRunRetryHelper] Completing attempt", { taskRun, completion });
114114

115-
const executionRetry =
116-
completion.retry ??
117-
(await FailedTaskRunRetryHelper.getExecutionRetry({
118-
run: taskRun,
119-
execution: retriableExecution,
120-
}));
121-
122-
const completeAttempt = new CompleteAttemptService(this._prisma);
123-
const completeResult = await completeAttempt.call({
124-
completion: {
125-
...completion,
126-
retry: executionRetry,
127-
},
128-
execution: retriableExecution,
115+
const completeAttempt = new CompleteAttemptService({
116+
prisma: this._prisma,
129117
isSystemFailure: !isCrash,
130118
isCrash,
131119
});
120+
const completeResult = await completeAttempt.call({
121+
completion,
122+
execution: retriableExecution,
123+
});
132124

133125
return completeResult;
134126
}
@@ -280,6 +272,5 @@ export class FailedTaskRunRetryHelper extends BaseService {
280272
}
281273
}
282274

283-
// TODO: update this to the correct version
284-
static DEFAULT_RETRY_CONFIG_SINCE_VERSION = "3.0.14";
275+
static DEFAULT_RETRY_CONFIG_SINCE_VERSION = "3.1.0";
285276
}

apps/webapp/app/v3/handleSocketIo.server.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import { Redis } from "ioredis";
2424
import { createAdapter } from "@socket.io/redis-adapter";
2525
import { CrashTaskRunService } from "./services/crashTaskRun.server";
2626
import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server";
27+
import { UpdateFatalRunErrorService } from "./services/updateFatalRunError.server";
2728

2829
export const socketIo = singleton("socketIo", initalizeIoServer);
2930

@@ -123,12 +124,13 @@ function createCoordinatorNamespace(io: Server) {
123124
await resumeAttempt.call(message);
124125
},
125126
TASK_RUN_COMPLETED: async (message) => {
126-
const completeAttempt = new CompleteAttemptService();
127+
const completeAttempt = new CompleteAttemptService({
128+
supportsRetryCheckpoints: message.version === "v1",
129+
});
127130
await completeAttempt.call({
128131
completion: message.completion,
129132
execution: message.execution,
130133
checkpoint: message.checkpoint,
131-
supportsRetryCheckpoints: message.version === "v1",
132134
});
133135
},
134136
TASK_RUN_FAILED_TO_RUN: async (message) => {
@@ -301,11 +303,13 @@ function createProviderNamespace(io: Server) {
301303
handlers: {
302304
WORKER_CRASHED: async (message) => {
303305
try {
304-
const service = new CrashTaskRunService();
305-
306-
await service.call(message.runId, {
307-
...message,
308-
});
306+
if (message.overrideCompletion) {
307+
const updateErrorService = new UpdateFatalRunErrorService();
308+
await updateErrorService.call(message.runId, { ...message });
309+
} else {
310+
const crashRunService = new CrashTaskRunService();
311+
await crashRunService.call(message.runId, { ...message });
312+
}
309313
} catch (error) {
310314
logger.error("Error while handling crashed worker", { error });
311315
}

apps/webapp/app/v3/requeueTaskRun.server.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { BaseService } from "./services/baseService.server";
77
import { PrismaClientOrTransaction } from "~/db.server";
88
import { workerQueue } from "~/services/worker.server";
99
import { socketIo } from "./handleSocketIo.server";
10+
import { TaskRunErrorCodes } from "@trigger.dev/core/v3";
1011

1112
export class RequeueTaskRunService extends BaseService {
1213
public async call(runId: string) {
@@ -59,7 +60,7 @@ export class RequeueTaskRunService extends BaseService {
5960
retry: undefined,
6061
error: {
6162
type: "INTERNAL_ERROR",
62-
code: "TASK_RUN_HEARTBEAT_TIMEOUT",
63+
code: TaskRunErrorCodes.TASK_RUN_HEARTBEAT_TIMEOUT,
6364
message: "Did not receive a heartbeat from the worker in time",
6465
},
6566
});

0 commit comments

Comments
 (0)