diff --git a/.changeset/many-plants-destroy.md b/.changeset/many-plants-destroy.md
new file mode 100644
index 0000000000..a59a5f385c
--- /dev/null
+++ b/.changeset/many-plants-destroy.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/core": patch
+---
+
+SIGTERM detection and prettier error messages
diff --git a/apps/coordinator/src/checkpointer.ts b/apps/coordinator/src/checkpointer.ts
index f1b55fa066..bf82a6702c 100644
--- a/apps/coordinator/src/checkpointer.ts
+++ b/apps/coordinator/src/checkpointer.ts
@@ -436,7 +436,10 @@ export class Checkpointer {
         this.#logger.error("Error during cleanup", { ...metadata, error });
       }
 
-      this.#abortControllers.delete(runId);
+      // Ensure only the current controller is removed
+      if (this.#abortControllers.get(runId) === controller) {
+        this.#abortControllers.delete(runId);
+      }
 
      controller.signal.removeEventListener("abort", onAbort);
    };
diff --git a/apps/coordinator/src/exec.ts b/apps/coordinator/src/exec.ts
index d0c7745b0b..b905723c0f 100644
--- a/apps/coordinator/src/exec.ts
+++ b/apps/coordinator/src/exec.ts
@@ -64,7 +64,18 @@ export class Exec {
       command,
       argsRaw: args,
       argsTrimmed,
-      ...output,
+      globalOpts: {
+        trimArgs: this.trimArgs,
+        neverThrow: this.neverThrow,
+        hasAbortSignal: !!this.abortSignal,
+      },
+      localOpts: opts,
+      stdout: output.stdout,
+      stderr: output.stderr,
+      pid: result.pid,
+      exitCode: result.exitCode,
+      aborted: result.aborted,
+      killed: result.killed,
     };
 
     if (this.logOutput) {
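The checkpointer hunk above guards the cleanup so a stale callback can only remove the controller it registered, not a newer one registered later for the same run. A minimal standalone sketch of that pattern, with illustrative names rather than the actual Checkpointer internals:

```ts
// Guarded cleanup for a Map of AbortControllers keyed by run ID.
// If a newer controller has already replaced ours, deleting unconditionally
// would clobber it; comparing identities avoids that race.
class ControllerRegistry {
  private controllers = new Map<string, AbortController>();

  register(runId: string): AbortController {
    const controller = new AbortController();
    this.controllers.set(runId, controller);
    return controller;
  }

  cleanup(runId: string, controller: AbortController, onAbort: () => void) {
    // Ensure only the current controller is removed
    if (this.controllers.get(runId) === controller) {
      this.controllers.delete(runId);
    }
    controller.signal.removeEventListener("abort", onAbort);
  }
}
```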
diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx
index 9db878ea6c..b250a88017 100644
--- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx
+++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx
@@ -1,4 +1,10 @@
-import { CheckIcon, ClockIcon, CloudArrowDownIcon, QueueListIcon } from "@heroicons/react/20/solid";
+import {
+  CheckIcon,
+  ClockIcon,
+  CloudArrowDownIcon,
+  EnvelopeIcon,
+  QueueListIcon,
+} from "@heroicons/react/20/solid";
 import { Link } from "@remix-run/react";
 import { LoaderFunctionArgs } from "@remix-run/server-runtime";
 import {
@@ -13,6 +19,7 @@ import { typedjson, useTypedFetcher } from "remix-typedjson";
 import { ExitIcon } from "~/assets/icons/ExitIcon";
 import { CodeBlock } from "~/components/code/CodeBlock";
 import { EnvironmentLabel } from "~/components/environments/EnvironmentLabel";
+import { Feedback } from "~/components/Feedback";
 import { Button, LinkButton } from "~/components/primitives/Buttons";
 import { Callout } from "~/components/primitives/Callout";
 import { DateTime, DateTimeAccurate } from "~/components/primitives/DateTime";
@@ -963,11 +970,26 @@ function RunError({ error }: { error: TaskRunError }) {
       {name}
       {enhancedError.message && {enhancedError.message}}
-      {enhancedError.link && (
-
-          {enhancedError.link.name}
-
-      )}
+      {enhancedError.link &&
+        (enhancedError.link.magic === "CONTACT_FORM" ? (
+
+                {enhancedError.link.name}
+
+            }
+          />
+        ) : (
+
+            {enhancedError.link.name}
+
+        ))}
       {enhancedError.stackTrace && (
 {
+    const result = await $transaction(this.prismaClient, async (tx) => {
       const environmentVariable = await tx.environmentVariable.upsert({
         where: {
           projectId_key: {
diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts
index 065d2bc168..dd12d69bb4 100644
--- a/apps/webapp/app/v3/handleSocketIo.server.ts
+++ b/apps/webapp/app/v3/handleSocketIo.server.ts
@@ -195,7 +195,10 @@ function createCoordinatorNamespace(io: Server) {
       const service = new CreateTaskRunAttemptService();
       const { attempt } = await service.call(message.runId, environment, false);
 
-      const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt(attempt.id, true);
+      const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt({
+        id: attempt.id,
+        setToExecuting: true,
+      });
 
       if (!payload) {
         logger.error("Failed to retrieve payload after attempt creation", message);
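The call sites above migrate `getExecutionPayloadFromAttempt` from positional booleans to a single options object. A sketch of the trade-off with a hypothetical standalone function, so it can be read in isolation from the webapp services:

```ts
// Options-object refactor: every flag is named at the call site, and new flags
// (like skipStatusChecks later in this diff) can be added without touching existing callers.
type GetPayloadOptions = {
  id: string;
  setToExecuting?: boolean;
  isRetrying?: boolean;
  skipStatusChecks?: boolean;
};

async function getPayload({ id, setToExecuting, isRetrying, skipStatusChecks }: GetPayloadOptions) {
  // Stand-in body; the real method loads the attempt and builds the execution payload.
  return {
    id,
    setToExecuting: !!setToExecuting,
    isRetrying: !!isRetrying,
    skipStatusChecks: !!skipStatusChecks,
  };
}

// Before: getPayload("attempt_123", true) — what does `true` mean here?
// After: the intent is visible at the call site.
void getPayload({ id: "attempt_123", setToExecuting: true });
```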
diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts
index a663011e4a..19628db616 100644
--- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts
+++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts
@@ -21,7 +21,7 @@ import {
   TaskRunStatus,
 } from "@trigger.dev/database";
 import { z } from "zod";
-import { prisma } from "~/db.server";
+import { $replica, prisma } from "~/db.server";
 import { findEnvironmentById } from "~/models/runtimeEnvironment.server";
 import { logger } from "~/services/logger.server";
 import { singleton } from "~/utils/singleton";
@@ -43,7 +43,12 @@ import { generateJWTTokenForEnvironment } from "~/services/apiAuth.server";
 import { EnvironmentVariable } from "../environmentVariables/repository";
 import { machinePresetFromConfig } from "../machinePresets.server";
 import { env } from "~/env.server";
-import { isFinalAttemptStatus, isFinalRunStatus } from "../taskStatus";
+import {
+  FINAL_ATTEMPT_STATUSES,
+  FINAL_RUN_STATUSES,
+  isFinalAttemptStatus,
+  isFinalRunStatus,
+} from "../taskStatus";
 import { getMaxDuration } from "../utils/maxDuration";
 
 const WithTraceContext = z.object({
@@ -620,6 +625,9 @@ export class SharedQueueConsumer {
       const resumableRun = await prisma.taskRun.findUnique({
         where: {
           id: message.messageId,
+          status: {
+            notIn: FINAL_RUN_STATUSES,
+          },
         },
       });
 
@@ -633,6 +641,14 @@
         return;
       }
 
+      if (resumableRun.status !== "EXECUTING") {
+        logger.warn("Run is not executing, will try to resume anyway", {
+          queueMessage: message.data,
+          messageId: message.messageId,
+          runStatus: resumableRun.status,
+        });
+      }
+
      const resumableAttempt = await prisma.taskRunAttempt.findUnique({
        where: {
          id: messageBody.data.resumableAttemptId,
@@ -718,9 +734,9 @@
           completions.push(completion);
 
-          const executionPayload = await this._tasks.getExecutionPayloadFromAttempt(
-            completedAttempt.id
-          );
+          const executionPayload = await this._tasks.getExecutionPayloadFromAttempt({
+            id: completedAttempt.id,
+          });
 
           if (!executionPayload) {
             await this.#ackAndDoMoreWork(message.messageId);
@@ -740,7 +756,11 @@
             executions,
           };
 
-          logger.debug("Broadcasting RESUME_AFTER_DEPENDENCY_WITH_ACK", { resumeMessage, message });
+          logger.debug("Broadcasting RESUME_AFTER_DEPENDENCY_WITH_ACK", {
+            resumeMessage,
+            message,
+            resumableRun,
+          });
 
           // The attempt should still be running so we can broadcast to all coordinators to resume immediately
           const responses = await socketIo.coordinatorNamespace
@@ -763,15 +783,91 @@
           }
 
           const hasSuccess = responses.some((response) => response.success);
-          if (!hasSuccess) {
-            logger.warn("RESUME_AFTER_DEPENDENCY_WITH_ACK failed", {
-              resumeMessage,
-              responses,
-              message,
-            });
-            await this.#nackAndDoMoreWork(message.messageId, this._options.nextTickInterval, 5_000);
+
+          if (hasSuccess) {
+            this.#doMoreWork();
             return;
           }
+
+          // No coordinator was able to resume the run
+          logger.warn("RESUME_AFTER_DEPENDENCY_WITH_ACK failed", {
+            resumeMessage,
+            responses,
+            message,
+          });
+
+          // Let's check if the run is frozen
+          if (resumableRun.status === "WAITING_TO_RESUME") {
+            logger.debug("RESUME_AFTER_DEPENDENCY_WITH_ACK run is waiting to be restored", {
+              queueMessage: message.data,
+              messageId: message.messageId,
+            });
+
+            try {
+              const restoreService = new RestoreCheckpointService();
+
+              const checkpointEvent = await restoreService.getLastCheckpointEventIfUnrestored(
+                resumableRun.id
+              );
+
+              if (checkpointEvent) {
+                // The last checkpoint hasn't been restored yet, so restore it
+                const checkpoint = await restoreService.call({
+                  eventId: checkpointEvent.id,
+                });
+
+                if (!checkpoint) {
+                  logger.debug("RESUME_AFTER_DEPENDENCY_WITH_ACK failed to restore checkpoint", {
+                    queueMessage: message.data,
+                    messageId: message.messageId,
+                  });
+
+                  await this.#ackAndDoMoreWork(message.messageId);
+                  return;
+                }
+
+                logger.debug("RESUME_AFTER_DEPENDENCY_WITH_ACK restored checkpoint", {
+                  queueMessage: message.data,
+                  messageId: message.messageId,
+                  checkpoint,
+                });
+
+                this.#doMoreWork();
+                return;
+              } else {
+                logger.debug(
+                  "RESUME_AFTER_DEPENDENCY_WITH_ACK run is frozen without last checkpoint event",
+                  {
+                    queueMessage: message.data,
+                    messageId: message.messageId,
+                  }
+                );
+              }
+            } catch (e) {
+              if (e instanceof Error) {
+                this._currentSpan?.recordException(e);
+              } else {
+                this._currentSpan?.recordException(new Error(String(e)));
+              }
+
+              this._endSpanInNextIteration = true;
+
+              await this.#nackAndDoMoreWork(
+                message.messageId,
+                this._options.nextTickInterval,
+                5_000
+              );
+              return;
+            }
+          }
+
+          logger.debug("RESUME_AFTER_DEPENDENCY_WITH_ACK retrying", {
+            queueMessage: message.data,
+            messageId: message.messageId,
+          });
+
+          await this.#nackAndDoMoreWork(message.messageId, this._options.nextTickInterval, 5_000);
+          return;
         } catch (e) {
           if (e instanceof Error) {
             this._currentSpan?.recordException(e);
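The new fallback above decides what to do when no coordinator acknowledges the resume: if the run is frozen (WAITING_TO_RESUME) and its latest checkpoint was never restored, restore it; a failed restore acks the message, anything else nacks so it is retried on the next tick. A condensed sketch of that decision flow, with hypothetical stand-in types instead of the consumer's real dependencies:

```ts
// Hypothetical stand-ins for the run record and the restore service used above.
type ResumableRun = { id: string; status: string };

interface CheckpointRestorer {
  getLastCheckpointEventIfUnrestored(runId: string): Promise<{ id: string } | undefined>;
  call(args: { eventId: string }): Promise<unknown | undefined>;
}

async function handleFailedResume(
  run: ResumableRun,
  restorer: CheckpointRestorer,
  ack: () => Promise<void>,
  nack: () => Promise<void>
): Promise<void> {
  if (run.status === "WAITING_TO_RESUME") {
    const checkpointEvent = await restorer.getLastCheckpointEventIfUnrestored(run.id);

    if (checkpointEvent) {
      // The run is frozen and its latest checkpoint was never restored: restore it now.
      const checkpoint = await restorer.call({ eventId: checkpointEvent.id });

      // A failed restore acks the message so it is not retried forever.
      if (!checkpoint) return ack();
      return;
    }
  }

  // Otherwise leave the message on the queue and retry after the next tick.
  return nack();
}
```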
@@ -896,7 +992,7 @@
       where: {
         id,
         status: {
-          in: ["COMPLETED", "FAILED"],
+          in: FINAL_ATTEMPT_STATUSES,
         },
       },
       include: {
@@ -942,11 +1038,17 @@
     }
   }
 
-  async getExecutionPayloadFromAttempt(
-    id: string,
-    setToExecuting?: boolean,
-    isRetrying?: boolean
-  ): Promise {
+  async getExecutionPayloadFromAttempt({
+    id,
+    setToExecuting,
+    isRetrying,
+    skipStatusChecks,
+  }: {
+    id: string;
+    setToExecuting?: boolean;
+    isRetrying?: boolean;
+    skipStatusChecks?: boolean;
+  }): Promise {
     const attempt = await prisma.taskRunAttempt.findUnique({
       where: {
         id,
@@ -979,27 +1081,29 @@
       return;
     }
 
-    switch (attempt.status) {
-      case "CANCELED":
-      case "EXECUTING": {
-        logger.error("Invalid attempt status for execution payload retrieval", {
-          attemptId: id,
-          status: attempt.status,
-        });
-        return;
+    if (!skipStatusChecks) {
+      switch (attempt.status) {
+        case "CANCELED":
+        case "EXECUTING": {
+          logger.error("Invalid attempt status for execution payload retrieval", {
+            attemptId: id,
+            status: attempt.status,
+          });
+          return;
+        }
       }
-    }
 
-    switch (attempt.taskRun.status) {
-      case "CANCELED":
-      case "EXECUTING":
-      case "INTERRUPTED": {
-        logger.error("Invalid run status for execution payload retrieval", {
-          attemptId: id,
-          runId: attempt.taskRunId,
-          status: attempt.taskRun.status,
-        });
-        return;
+      switch (attempt.taskRun.status) {
+        case "CANCELED":
+        case "EXECUTING":
+        case "INTERRUPTED": {
+          logger.error("Invalid run status for execution payload retrieval", {
+            attemptId: id,
+            runId: attempt.taskRunId,
+            status: attempt.taskRun.status,
+          });
+          return;
+        }
       }
     }
@@ -1150,7 +1254,11 @@
       return;
     }
 
-    return this.getExecutionPayloadFromAttempt(latestAttempt.id, setToExecuting, isRetrying);
+    return this.getExecutionPayloadFromAttempt({
+      id: latestAttempt.id,
+      setToExecuting,
+      isRetrying,
+    });
   }
 
   async getLazyAttemptPayload(
@@ -1225,13 +1333,13 @@
       return;
     }
 
-    await marqs?.heartbeatMessage(taskRunAttempt.taskRunId);
+    await this.#heartbeat(taskRunAttempt.taskRunId);
   }
 
   async taskRunHeartbeat(runId: string) {
     logger.debug("[SharedQueueConsumer] taskRunHeartbeat()", { runId });
 
-    await marqs?.heartbeatMessage(runId);
+    await this.#heartbeat(runId);
   }
 
   public async taskRunFailed(completion: TaskRunFailedExecutionResult) {
@@ -1242,6 +1350,66 @@
     await service.call(completion.id, completion);
   }
 
+  async #heartbeat(runId: string) {
+    await marqs?.heartbeatMessage(runId);
+
+    try {
+      // There can be a lot of calls per minute and the data doesn't have to be accurate, so use the read replica
+      const taskRun = await $replica.taskRun.findFirst({
+        where: {
+          id: runId,
+        },
+        select: {
+          id: true,
+          status: true,
+          runtimeEnvironment: {
+            select: {
+              type: true,
+            },
+          },
+          lockedToVersion: {
+            select: {
+              supportsLazyAttempts: true,
+            },
+          },
+        },
+      });
+
+      if (!taskRun) {
+        logger.error("SharedQueueTasks.#heartbeat: Task run not found", {
+          runId,
+        });
+
+        return;
+      }
+
+      if (taskRun.runtimeEnvironment.type === "DEVELOPMENT") {
+        return;
+      }
+
+      if (isFinalRunStatus(taskRun.status)) {
+        logger.debug("SharedQueueTasks.#heartbeat: Task run is in final status", {
+          runId,
+          status: taskRun.status,
+        });
+
+        // Signal to exit any leftover containers
+        socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
+          version: "v1",
+          runId: taskRun.id,
+          // Give the run a few seconds to exit to complete any flushing etc
+          delayInMs: taskRun.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined,
+        });
+        return;
+      }
+    } catch (error) {
+      logger.error("SharedQueueTasks.#heartbeat: Error signaling run cancellation", {
+        runId,
+        error: error instanceof Error ? error.message : error,
+      });
+    }
+  }
+
   async #buildEnvironmentVariables(
     environment: RuntimeEnvironment,
     runId: string,
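The new `#heartbeat` helper piggybacks a cleanup check on every heartbeat: it reads the run from the read replica (staleness is acceptable at this call volume) and, if the run is already final and not a DEV run, broadcasts REQUEST_RUN_CANCELLATION so leftover containers exit. A rough sketch of just that decision, with hypothetical types and an illustrative (not exhaustive) set of final statuses:

```ts
// Hypothetical shape of the data the heartbeat check needs.
type HeartbeatRun = {
  id: string;
  status: string;
  environmentType: "DEVELOPMENT" | "PRODUCTION";
  supportsLazyAttempts: boolean;
};

// Illustrative subset only; the real check delegates to isFinalRunStatus().
const FINAL_STATUSES = ["COMPLETED_SUCCESSFULLY", "COMPLETED_WITH_ERRORS", "CANCELED"];

function cancellationForFinalRun(run: HeartbeatRun | undefined) {
  // Dev runs are never signalled, and unknown runs are left alone.
  if (!run || run.environmentType === "DEVELOPMENT") return undefined;
  if (!FINAL_STATUSES.includes(run.status)) return undefined;

  return {
    version: "v1" as const,
    runId: run.id,
    // Give the run a few seconds to finish flushing before it is asked to exit.
    delayInMs: run.supportsLazyAttempts ? 5_000 : undefined,
  };
}
```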
diff --git a/apps/webapp/app/v3/requeueTaskRun.server.ts b/apps/webapp/app/v3/requeueTaskRun.server.ts
index 8228829e87..005eadbf6a 100644
--- a/apps/webapp/app/v3/requeueTaskRun.server.ts
+++ b/apps/webapp/app/v3/requeueTaskRun.server.ts
@@ -6,11 +6,29 @@ import { FailedTaskRunService } from "./failedTaskRun.server";
 import { BaseService } from "./services/baseService.server";
 import { PrismaClientOrTransaction } from "~/db.server";
 import { workerQueue } from "~/services/worker.server";
+import { socketIo } from "./handleSocketIo.server";
 
 export class RequeueTaskRunService extends BaseService {
   public async call(runId: string) {
     const taskRun = await this._prisma.taskRun.findUnique({
-      where: { id: runId },
+      where: {
+        id: runId,
+      },
+      select: {
+        id: true,
+        friendlyId: true,
+        status: true,
+        runtimeEnvironment: {
+          select: {
+            type: true,
+          },
+        },
+        lockedToVersion: {
+          select: {
+            supportsLazyAttempts: true,
+          },
+        },
+      },
     });
 
     if (!taskRun) {
@@ -76,6 +94,25 @@ export class RequeueTaskRunService extends BaseService {
 
         await marqs?.acknowledgeMessage(taskRun.id);
 
+        try {
+          if (taskRun.runtimeEnvironment.type === "DEVELOPMENT") {
+            return;
+          }
+
+          // Signal to exit any leftover containers
+          socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
+            version: "v1",
+            runId: taskRun.id,
+            // Give the run a few seconds to exit to complete any flushing etc
+            delayInMs: taskRun.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined,
+          });
+        } catch (error) {
+          logger.error("[RequeueTaskRunService] Error signaling run cancellation", {
+            runId: taskRun.id,
+            error: error instanceof Error ? error.message : error,
+          });
+        }
+
         break;
       }
       default: {
diff --git a/apps/webapp/app/v3/services/restoreCheckpoint.server.ts b/apps/webapp/app/v3/services/restoreCheckpoint.server.ts
index 3ca0efbc33..9a9b82210a 100644
--- a/apps/webapp/app/v3/services/restoreCheckpoint.server.ts
+++ b/apps/webapp/app/v3/services/restoreCheckpoint.server.ts
@@ -112,4 +112,24 @@ export class RestoreCheckpointService extends BaseService {
 
     return checkpoint;
   }
+
+  async getLastCheckpointEventIfUnrestored(runId: string) {
+    const event = await this._prisma.checkpointRestoreEvent.findFirst({
+      where: {
+        runId,
+      },
+      take: 1,
+      orderBy: {
+        createdAt: "desc",
+      },
+    });
+
+    if (!event) {
+      return;
+    }
+
+    if (event.type === "CHECKPOINT") {
+      return event;
+    }
+  }
 }
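`getLastCheckpointEventIfUnrestored` relies on checkpoint/restore events alternating: if the newest event for a run is a CHECKPOINT, that checkpoint has not been restored yet. A sketch of the same logic over a hypothetical in-memory event list instead of Prisma:

```ts
// Hypothetical in-memory mirror of the CheckpointRestoreEvent table.
type CheckpointRestoreEvent = {
  id: string;
  runId: string;
  type: "CHECKPOINT" | "RESTORE";
  createdAt: Date;
};

function lastCheckpointEventIfUnrestored(
  events: CheckpointRestoreEvent[],
  runId: string
): CheckpointRestoreEvent | undefined {
  // Newest event for this run, mirroring the orderBy createdAt desc query above.
  const latest = events
    .filter((event) => event.runId === runId)
    .sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime())[0];

  if (!latest) return undefined;

  // A trailing RESTORE means the checkpoint was already restored, so there is nothing to do.
  return latest.type === "CHECKPOINT" ? latest : undefined;
}
```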
diff --git a/apps/webapp/app/v3/services/resumeAttempt.server.ts b/apps/webapp/app/v3/services/resumeAttempt.server.ts
index f02d9e4ccc..0cba99e472 100644
--- a/apps/webapp/app/v3/services/resumeAttempt.server.ts
+++ b/apps/webapp/app/v3/services/resumeAttempt.server.ts
@@ -2,23 +2,24 @@ import {
   CoordinatorToPlatformMessages,
   TaskRunExecution,
   TaskRunExecutionResult,
-  WaitReason,
 } from "@trigger.dev/core/v3";
 import type { InferSocketMessageSchema } from "@trigger.dev/core/v3/zodSocket";
 import { $transaction, PrismaClientOrTransaction } from "~/db.server";
 import { logger } from "~/services/logger.server";
 import { marqs } from "~/v3/marqs/index.server";
 import { socketIo } from "../handleSocketIo.server";
-import { SharedQueueMessageBody, sharedQueueTasks } from "../marqs/sharedQueueConsumer.server";
+import { sharedQueueTasks } from "../marqs/sharedQueueConsumer.server";
 import { BaseService } from "./baseService.server";
 import { TaskRunAttempt } from "@trigger.dev/database";
 import { isFinalRunStatus } from "../taskStatus";
 
 export class ResumeAttemptService extends BaseService {
+  private _logger = logger;
+
   public async call(
     params: InferSocketMessageSchema
   ): Promise {
-    logger.debug(`ResumeAttemptService.call()`, params);
+    this._logger.debug(`ResumeAttemptService.call()`, params);
 
     await $transaction(this._prisma, async (tx) => {
       const attempt = await tx.taskRunAttempt.findUnique({
@@ -77,16 +78,18 @@ export class ResumeAttemptService extends BaseService {
       });
 
       if (!attempt) {
-        logger.error("Could not find attempt", { attemptFriendlyId: params.attemptFriendlyId });
+        this._logger.error("Could not find attempt", params);
         return;
       }
 
+      this._logger = logger.child({
+        attemptId: attempt.id,
+        attemptFriendlyId: attempt.friendlyId,
+        taskRun: attempt.taskRun,
+      });
+
       if (isFinalRunStatus(attempt.taskRun.status)) {
-        logger.error("Run is not resumable", {
-          attemptId: attempt.id,
-          runId: attempt.taskRunId,
-          status: attempt.taskRun.status,
-        });
+        this._logger.error("Run is not resumable");
         return;
       }
@@ -94,10 +97,7 @@
       switch (params.type) {
         case "WAIT_FOR_DURATION": {
-          logger.debug("Sending duration wait resume message", {
-            attemptId: attempt.id,
-            attemptFriendlyId: params.attemptFriendlyId,
-          });
+          this._logger.debug("Sending duration wait resume message");
 
           await this.#setPostResumeStatuses(attempt, tx);
@@ -114,13 +114,13 @@
             const dependentAttempt = attempt.dependencies[0].taskRun.attempts[0];
 
             if (!dependentAttempt) {
-              logger.error("No dependent attempt", { attemptId: attempt.id });
+              this._logger.error("No dependent attempt");
               return;
             }
 
             completedAttemptIds = [dependentAttempt.id];
           } else {
-            logger.error("No task dependency", { attemptId: attempt.id });
+            this._logger.error("No task dependency");
             return;
           }
@@ -134,13 +134,13 @@
             const dependentBatchItems = attempt.batchDependencies[0].items;
 
             if (!dependentBatchItems) {
-              logger.error("No dependent batch items", { attemptId: attempt.id });
+              this._logger.error("No dependent batch items");
               return;
             }
 
             completedAttemptIds = dependentBatchItems.map((item) => item.taskRun.attempts[0]?.id);
           } else {
-            logger.error("No batch dependency", { attemptId: attempt.id });
+            this._logger.error("No batch dependency");
             return;
           }
@@ -161,7 +161,7 @@
     tx: PrismaClientOrTransaction
   ) {
     if (completedAttemptIds.length === 0) {
-      logger.error("No completed attempt IDs", { attemptId: attempt.id });
+      this._logger.error("No completed attempt IDs");
       return;
     }
@@ -184,38 +184,36 @@
       });
 
       if (!completedAttempt) {
-        logger.error("Completed attempt not found", {
-          attemptId: attempt.id,
-          completedAttemptId,
-        });
+        this._logger.error("Completed attempt not found", { completedAttemptId });
 
         await marqs?.acknowledgeMessage(attempt.taskRunId);
         return;
       }
 
+      const logger = this._logger.child({
+        completedAttemptId: completedAttempt.id,
+        completedAttemptFriendlyId: completedAttempt.friendlyId,
+        completedRunId: completedAttempt.taskRunId,
+      });
+
       const completion = await sharedQueueTasks.getCompletionPayloadFromAttempt(
         completedAttempt.id
       );
 
       if (!completion) {
-        logger.error("Failed to get completion payload", {
-          attemptId: attempt.id,
-          completedAttemptId,
-        });
+        logger.error("Failed to get completion payload");
 
         await marqs?.acknowledgeMessage(attempt.taskRunId);
         return;
       }
 
       completions.push(completion);
 
-      const executionPayload = await sharedQueueTasks.getExecutionPayloadFromAttempt(
-        completedAttempt.id
-      );
+      const executionPayload = await sharedQueueTasks.getExecutionPayloadFromAttempt({
+        id: completedAttempt.id,
+        skipStatusChecks: true, // already checked when getting the completion
+      });
 
       if (!executionPayload) {
-        logger.error("Failed to get execution payload", {
-          attemptId: attempt.id,
-          completedAttemptId,
-        });
+        logger.error("Failed to get execution payload");
 
         await marqs?.acknowledgeMessage(attempt.taskRunId);
         return;
       }
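ResumeAttemptService now binds context to a child logger once it is known (the attempt, then the completed attempt), so later log lines stay short and never repeat those fields. A self-contained sketch of that pattern with a hypothetical logger interface, not the webapp's actual logger module:

```ts
// Minimal structured logger with child() for binding context, as an illustration only.
type LogFields = Record<string, unknown>;

interface ContextLogger {
  debug(message: string, fields?: LogFields): void;
  error(message: string, fields?: LogFields): void;
  child(fields: LogFields): ContextLogger;
}

function makeLogger(base: LogFields = {}): ContextLogger {
  return {
    debug: (message, fields) => console.debug(message, { ...base, ...fields }),
    error: (message, fields) => console.error(message, { ...base, ...fields }),
    // child() returns a new logger whose bound fields are merged into every call.
    child: (fields) => makeLogger({ ...base, ...fields }),
  };
}

// Usage mirroring the service: bind attempt context once, then log without repeating it.
let log = makeLogger();
log = log.child({ attemptId: "attempt_123", attemptFriendlyId: "attempt_abc" });
log.error("Run is not resumable"); // attempt fields are attached automatically
```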
diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts
index f63f48b99f..bc6fc20e47 100644
--- a/packages/core/src/v3/errors.ts
+++ b/packages/core/src/v3/errors.ts
@@ -312,8 +312,23 @@ export class GracefulExitTimeoutError extends Error {
   }
 }
 
+type ErrorLink = {
+  name: string;
+  href: string;
+  // This allows us to easily add more complex logic on the frontend, e.g. display a button to open a contact form modal
+  magic?: "CONTACT_FORM";
+};
+
+type EnhanceError = T & { link?: ErrorLink };
+
 const prettyInternalErrors: Partial<
-  Record
+  Record<
+    TaskRunInternalError["code"],
+    {
+      message: string;
+      link?: ErrorLink;
+    }
+  >
 > = {
   TASK_PROCESS_OOM_KILLED: {
     message:
@@ -331,10 +346,15 @@ const prettyInternalErrors: Partial<
       href: links.docs.machines.home,
     },
   },
-};
-
-type EnhanceError = T & {
-  link?: { name: string; href: string };
+  TASK_PROCESS_SIGTERM: {
+    message:
+      "Your task exited after receiving SIGTERM but we don't know why. If this keeps happening, please get in touch so we can investigate.",
+    link: {
+      name: "Contact us",
+      href: links.site.contact,
+      magic: "CONTACT_FORM",
+    },
+  },
 };
 
 export function taskRunErrorEnhancer(error: TaskRunError): EnhanceError {
@@ -342,6 +362,14 @@ export function taskRunErrorEnhancer(error: TaskRunError): EnhanceError;
-export const TaskRunErrorCodes = {
-  COULD_NOT_FIND_EXECUTOR: "COULD_NOT_FIND_EXECUTOR",
-  COULD_NOT_FIND_TASK: "COULD_NOT_FIND_TASK",
-  COULD_NOT_IMPORT_TASK: "COULD_NOT_IMPORT_TASK",
-  CONFIGURED_INCORRECTLY: "CONFIGURED_INCORRECTLY",
-  TASK_ALREADY_RUNNING: "TASK_ALREADY_RUNNING",
-  TASK_EXECUTION_FAILED: "TASK_EXECUTION_FAILED",
-  TASK_EXECUTION_ABORTED: "TASK_EXECUTION_ABORTED",
-  TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE",
-  TASK_PROCESS_SIGKILL_TIMEOUT: "TASK_PROCESS_SIGKILL_TIMEOUT",
-  TASK_PROCESS_OOM_KILLED: "TASK_PROCESS_OOM_KILLED",
-  TASK_PROCESS_MAYBE_OOM_KILLED: "TASK_PROCESS_MAYBE_OOM_KILLED",
-  TASK_RUN_CANCELLED: "TASK_RUN_CANCELLED",
-  TASK_OUTPUT_ERROR: "TASK_OUTPUT_ERROR",
-  HANDLE_ERROR_ERROR: "HANDLE_ERROR_ERROR",
-  GRACEFUL_EXIT_TIMEOUT: "GRACEFUL_EXIT_TIMEOUT",
-  TASK_RUN_CRASHED: "TASK_RUN_CRASHED",
-  MAX_DURATION_EXCEEDED: "MAX_DURATION_EXCEEDED",
-  DISK_SPACE_EXCEEDED: "DISK_SPACE_EXCEEDED",
-  POD_EVICTED: "POD_EVICTED",
-  POD_UNKNOWN_ERROR: "POD_UNKNOWN_ERROR",
-} as const;
 
 export const TaskRunInternalError = z.object({
   type: z.literal("INTERNAL_ERROR"),
   code: z.enum([
@@ -112,6 +89,7 @@
     "TASK_EXECUTION_ABORTED",
     "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE",
     "TASK_PROCESS_SIGKILL_TIMEOUT",
+    "TASK_PROCESS_SIGTERM",
     "TASK_PROCESS_OOM_KILLED",
     "TASK_PROCESS_MAYBE_OOM_KILLED",
     "TASK_RUN_CANCELLED",
@@ -131,6 +109,8 @@
 
 export type TaskRunInternalError = z.infer;
 
+export const TaskRunErrorCodes = TaskRunInternalError.shape.code.enum;
+
 export const TaskRunError = z.discriminatedUnion("type", [
   TaskRunBuiltInError,
TaskRunCustomErrorObject,
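Taken together, the errors.ts and route.tsx changes mean a run that died from SIGTERM gets a readable message plus an ErrorLink whose `magic: "CONTACT_FORM"` tells the webapp to render a contact/feedback button instead of a plain link, while TaskRunErrorCodes is now derived from the zod enum so the code list lives in one place. A small standalone sketch of that lookup-and-branch, using a placeholder URL and a hypothetical render helper in place of the RunError component:

```ts
// Mirrors the ErrorLink shape introduced in errors.ts.
type ErrorLink = {
  name: string;
  href: string;
  // Lets the frontend add richer behaviour, e.g. open a contact form modal instead of a plain link
  magic?: "CONTACT_FORM";
};

const prettyErrors: Record<string, { message: string; link?: ErrorLink }> = {
  TASK_PROCESS_SIGTERM: {
    message:
      "Your task exited after receiving SIGTERM but we don't know why. If this keeps happening, please get in touch so we can investigate.",
    // Placeholder URL; the real value comes from links.site.contact.
    link: { name: "Contact us", href: "https://example.com/contact", magic: "CONTACT_FORM" },
  },
};

// Hypothetical stand-in for the webapp's RunError rendering branch.
function describeError(code: string): string {
  const pretty = prettyErrors[code];
  if (!pretty) return code;

  if (pretty.link?.magic === "CONTACT_FORM") {
    // The webapp shows a Feedback/contact button here instead of a plain anchor.
    return `${pretty.message} [button: ${pretty.link.name}]`;
  }

  return pretty.link
    ? `${pretty.message} [link: ${pretty.link.name} -> ${pretty.link.href}]`
    : pretty.message;
}

console.log(describeError("TASK_PROCESS_SIGTERM"));
```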