Skip to content

Fix several restore and resume bugs #1418

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into main from fix/resume-restore-bugs
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
623f158
try to correct resume messages with missing checkpoint
nicktrn Oct 17, 2024
11066b4
prevent creating checkpoints for outdated task waits
nicktrn Oct 17, 2024
f2b5c2a
prevent creating checkpoints for outdated batch waits
nicktrn Oct 17, 2024
d756a16
use heartbeats to check for and clean up any leftover containers
nicktrn Oct 17, 2024
5364558
Merge remote-tracking branch 'origin/main' into fix/invalid-resume-me…
nicktrn Oct 17, 2024
df15d6a
lint
nicktrn Oct 17, 2024
e003d25
improve exec logging
nicktrn Oct 17, 2024
9af6018
improve resume attempt logs
nicktrn Oct 17, 2024
4c8618d
fix for resuming parents of canceled child runs
nicktrn Oct 17, 2024
12ad920
separate SIGTERM from maybe OOM errors
nicktrn Oct 17, 2024
13faa69
Merge remote-tracking branch 'origin/main' into fix/resume-restore-bugs
nicktrn Oct 17, 2024
a9928be
pretty errors can have magic dashboard links
nicktrn Oct 17, 2024
2d84b7c
prevent uncancellable checkpoints
nicktrn Oct 17, 2024
34d9759
simplify task run error code enum export
nicktrn Oct 18, 2024
89ec5c8
grab the last, not the first child run
nicktrn Oct 18, 2024
5c262fd
Revert "prevent creating checkpoints for outdated batch waits"
nicktrn Oct 18, 2024
e6afbb4
Revert "grab the last, not the first child run"
nicktrn Oct 18, 2024
40d80f9
Revert "prevent creating checkpoints for outdated task waits"
nicktrn Oct 18, 2024
59d375b
more logs for resume message handling
nicktrn Oct 18, 2024
3604d83
add magic error link comment
nicktrn Oct 18, 2024
cdbf5c6
add changeset
nicktrn Oct 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/many-plants-destroy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/core": patch
---

SIGTERM detection and prettier errors
5 changes: 4 additions & 1 deletion apps/coordinator/src/checkpointer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,10 @@ export class Checkpointer {
this.#logger.error("Error during cleanup", { ...metadata, error });
}

this.#abortControllers.delete(runId);
// Ensure only the current controller is removed
if (this.#abortControllers.get(runId) === controller) {
this.#abortControllers.delete(runId);
}
controller.signal.removeEventListener("abort", onAbort);
};

Expand Down
13 changes: 12 additions & 1 deletion apps/coordinator/src/exec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,18 @@ export class Exec {
command,
argsRaw: args,
argsTrimmed,
...output,
globalOpts: {
trimArgs: this.trimArgs,
neverThrow: this.neverThrow,
hasAbortSignal: !!this.abortSignal,
},
localOpts: opts,
stdout: output.stdout,
stderr: output.stderr,
pid: result.pid,
exitCode: result.exitCode,
aborted: result.aborted,
killed: result.killed,
};

if (this.logOutput) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import { CheckIcon, ClockIcon, CloudArrowDownIcon, QueueListIcon } from "@heroicons/react/20/solid";
import {
CheckIcon,
ClockIcon,
CloudArrowDownIcon,
EnvelopeIcon,
QueueListIcon,
} from "@heroicons/react/20/solid";
import { Link } from "@remix-run/react";
import { LoaderFunctionArgs } from "@remix-run/server-runtime";
import {
Expand All @@ -13,6 +19,7 @@ import { typedjson, useTypedFetcher } from "remix-typedjson";
import { ExitIcon } from "~/assets/icons/ExitIcon";
import { CodeBlock } from "~/components/code/CodeBlock";
import { EnvironmentLabel } from "~/components/environments/EnvironmentLabel";
import { Feedback } from "~/components/Feedback";
import { Button, LinkButton } from "~/components/primitives/Buttons";
import { Callout } from "~/components/primitives/Callout";
import { DateTime, DateTimeAccurate } from "~/components/primitives/DateTime";
Expand Down Expand Up @@ -963,11 +970,26 @@ function RunError({ error }: { error: TaskRunError }) {
<div className="flex flex-col gap-2 rounded-sm border border-rose-500/50 px-3 pb-3 pt-2">
<Header3 className="text-rose-500">{name}</Header3>
{enhancedError.message && <Callout variant="error">{enhancedError.message}</Callout>}
{enhancedError.link && (
<Callout variant="docs" to={enhancedError.link.href}>
{enhancedError.link.name}
</Callout>
)}
{enhancedError.link &&
(enhancedError.link.magic === "CONTACT_FORM" ? (
<Feedback
button={
<Button
variant="tertiary/medium"
LeadingIcon={EnvelopeIcon}
leadingIconClassName="text-blue-400"
fullWidth
textAlignLeft
>
{enhancedError.link.name}
</Button>
}
/>
) : (
<Callout variant="docs" to={enhancedError.link.href}>
{enhancedError.link.name}
</Callout>
))}
{enhancedError.stackTrace && (
<CodeBlock
showCopyButton={false}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ export class EnvironmentVariablesRepository implements Repository {

try {
for (const variable of values) {
const result = await $transaction(this.prismaClient, async (tx) => {
const result = await $transaction(this.prismaClient, async (tx) => {
const environmentVariable = await tx.environmentVariable.upsert({
where: {
projectId_key: {
Expand Down
5 changes: 4 additions & 1 deletion apps/webapp/app/v3/handleSocketIo.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,10 @@ function createCoordinatorNamespace(io: Server) {
const service = new CreateTaskRunAttemptService();
const { attempt } = await service.call(message.runId, environment, false);

const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt(attempt.id, true);
const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt({
id: attempt.id,
setToExecuting: true,
});

if (!payload) {
logger.error("Failed to retrieve payload after attempt creation", message);
Expand Down
Loading
Loading