fix(webapp): mollifier read-fallback version field + trip-threshold guard

d-cs · claude · d-cs · commit e56b937b3b26 · 2026-05-27T16:06:39.000+01:00
- readFallback: read snapshot.taskVersion (the key buildEngineTriggerInput writes) instead of the nonexistent snapshot.lockToVersion, so buffered version-locked runs report their locked version; test now uses the real key as a regression guard.

- env: restore TRIGGER_MOLLIFIER_TRIP_THRESHOLD to positive() (matching the sibling mollifier numeric settings) so that threshold=0 cannot silently mollify every trigger.

- idempotencyKeys: document why the resolved-but-unfindable fall-through is safe (PG-unique + accept SETNX dedup + ~30s claim TTL self-heal); add regression test pinning the fall-through and the resolved-and-findable cached-hit path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
@@ -1092,7 +1092,7 @@ const EnvironmentSchema = z
       .transform((v) => v ?? process.env.REDIS_PASSWORD),
     TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"),
     TRIGGER_MOLLIFIER_TRIP_WINDOW_MS: z.coerce.number().int().positive().default(200),
-    TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().nonnegative().default(100),
+    TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().positive().default(100),
     TRIGGER_MOLLIFIER_HOLD_MS: z.coerce.number().int().positive().default(500),
     TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY: z.coerce.number().int().positive().default(50),
     TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3),
diff --git a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts
@@ -266,9 +266,27 @@ export class IdempotencyKeyConcern {
         if (buffered) {
           return { isCached: true, run: buffered };
         }
-        // Claim resolved to a runId nothing can find — likely the
-        // claimant errored after publish, or the row TTL'd out. Log
-        // and fall through to a fresh trigger.
+        // Claim resolved to a runId nothing can find — the run was
+        // genuinely lost (claimant errored after publish, drain failed,
+        // or both the PG row and buffer entry TTL'd out). This is
+        // terminal, not transient: `lookupIdempotency` self-heals a
+        // dangling pointer, and `ack` keeps the entry hash as a
+        // read-fallback past the PG write, so re-polling cannot conjure
+        // a run that is gone. Falling through to a fresh trigger is the
+        // correct recovery.
+        //
+        // Why falling through claimless is safe (no duplicate runs):
+        // concurrent triggers that also fall through here converge on a
+        // single run via the same dedup backstops the claim layer relies
+        // on — the PG unique constraint on the idempotency key
+        // (RunDuplicateIdempotencyKeyError → retry resolves to the
+        // winner) for the pass-through path, and `accept`'s idempotency
+        // SETNX (`duplicate_idempotency`) for the mollify path. Once the
+        // first fall-through commits a run, later callers find it via the
+        // writer-PG / buffer lookups above despite the stale `resolved:`
+        // slot, which the slot's TTL clears within ~30s. The residual
+        // cost is a few redundant (deduped) trigger attempts in that
+        // window, not duplicate runs.
         logger.warn("idempotency claim resolved but runId not findable", {
           envId: request.environment.id,
           taskIdentifier: request.taskId,
diff --git a/apps/webapp/app/v3/mollifier/readFallback.server.ts b/apps/webapp/app/v3/mollifier/readFallback.server.ts
@@ -175,7 +175,7 @@ export async function findRunByIdWithMollifierFallback(
       ttl: asString(snapshot.ttl),
       tags,
       runTags: tags,
-      lockedToVersion: asString(snapshot.lockToVersion),
+      lockedToVersion: asString(snapshot.taskVersion),
       resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true,
       parentTaskRunId: asString(snapshot.parentTaskRunId),
 
diff --git a/apps/webapp/test/mollifierClaimResolution.test.ts b/apps/webapp/test/mollifierClaimResolution.test.ts
@@ -0,0 +1,93 @@
+import { describe, expect, it, vi } from "vitest";
+
+// Stub `~/db.server` before importing the concern — the real module
+// eagerly calls `prisma.$connect()` at singleton construction, which
+// would fail without a database. The concern under test receives its
+// prisma via the constructor, so the stub is never used by the code path.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+// The IdempotencyKeyConcern resolves the pre-gate claim through the
+// global mollifier buffer (`getMollifierBuffer`), shared by both
+// `claimOrAwait` and `findBufferedRunWithIdempotency`. Control it via a
+// hoisted handle so each test can script the claim/lookup responses.
+const h = vi.hoisted(() => ({ buffer: null as unknown }));
+vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({
+  getMollifierBuffer: () => h.buffer,
+}));
+
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { IdempotencyKeyConcern } from "~/runEngine/concerns/idempotencyKeys.server";
+import type { TriggerTaskRequest } from "~/runEngine/types";
+
+function makeConcern(prisma: { findFirst: () => Promise<unknown> }) {
+  return new IdempotencyKeyConcern(
+    { taskRun: { findFirst: prisma.findFirst } } as never,
+    {} as never, // engine — unused on this path
+    {} as never, // traceEventConcern — unused on this path
+  );
+}
+
+function makeRequest(): TriggerTaskRequest {
+  return {
+    taskId: "my-task",
+    environment: { id: "env_a", organizationId: "org_1" },
+    options: {},
+    body: { options: { idempotencyKey: "k-1" } },
+  } as unknown as TriggerTaskRequest;
+}
+
+describe("IdempotencyKeyConcern · claim resolution", () => {
+  it("resolved-but-unfindable falls through to a fresh trigger (no cached run, no claim held)", async () => {
+    // The claim slot holds a runId that is gone from both stores: the PG
+    // findFirst misses and the buffer lookup misses. Regression guard for
+    // the resolved-but-unfindable terminal case — the concern must fall
+    // through to a fresh trigger rather than throw, hand back a bogus
+    // cached run, or claim ownership it doesn't hold.
+    const lookupIdempotency = vi.fn(async () => null);
+    h.buffer = {
+      claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_gone" })),
+      lookupIdempotency,
+    } as unknown as MollifierBuffer;
+
+    const findFirst = vi.fn(async () => null); // PG misses on every call
+    const concern = makeConcern({ findFirst });
+
+    const result = await concern.handleTriggerRequest(makeRequest(), undefined);
+
+    expect(result.isCached).toBe(false);
+    if (result.isCached === false) {
+      // No claim held — we resolved someone else's (stale) claim, we did
+      // not win one. The caller must NOT publish/release on our behalf.
+      expect(result.claim).toBeUndefined();
+      expect(result.idempotencyKey).toBe("k-1");
+    }
+    // We attempted the buffer fallback before giving up.
+    expect(lookupIdempotency).toHaveBeenCalled();
+  });
+
+  it("resolved-and-findable returns the existing run as a cached hit", async () => {
+    // Guard the happy resolved path: when the claimed runId IS findable
+    // (writer-side PG), the fall-through change must not swallow it.
+    h.buffer = {
+      claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_winner" })),
+      lookupIdempotency: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    const winner = { id: "run_winner", friendlyId: "run_winner" };
+    // First findFirst (initial existingRun check) misses so we enter the
+    // claim path; the second (writer-side re-resolve) finds the winner.
+    let calls = 0;
+    const findFirst = vi.fn(async () => {
+      calls += 1;
+      return calls >= 2 ? winner : null;
+    });
+    const concern = makeConcern({ findFirst });
+
+    const result = await concern.handleTriggerRequest(makeRequest(), undefined);
+
+    expect(result.isCached).toBe(true);
+    if (result.isCached === true) {
+      expect(result.run).toBe(winner);
+    }
+  });
+});
diff --git a/apps/webapp/test/mollifierReadFallback.test.ts b/apps/webapp/test/mollifierReadFallback.test.ts
@@ -141,7 +141,9 @@ describe("findRunByIdWithMollifierFallback", () => {
         depth: 2,
         ttl: "1h",
         tags: ["tag-a", "tag-b"],
-        lockToVersion: "20260511.1",
+        // The engine.trigger snapshot stores the locked version string under
+        // `taskVersion` (see triggerTask.server.ts#buildEngineTriggerInput).
+        taskVersion: "20260511.1",
         resumeParentOnCompletion: false,
         parentTaskRunId: "run_parent",
       }),