Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .changeset/classify-run-error-codes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
"@workflow/errors": patch
"@workflow/core": patch
"@workflow/web": patch
"@workflow/world-local": patch
"@workflow/world-vercel": patch
---

Add error code classification (`USER_ERROR`, `RUNTIME_ERROR`) to `run_failed` events, improve queue and schema validation error logging
7 changes: 6 additions & 1 deletion packages/core/e2e/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,7 @@ describe('e2e', () => {
expect(WorkflowRunFailedError.is(error)).toBe(true);
assert(WorkflowRunFailedError.is(error));
expect(error.cause.message).toContain('Nested workflow error');
expect(error.cause.code).toBe('USER_ERROR');
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add one e2e test that actually causes + asserts a RUNTIME_ERROR

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Triggering RUNTIME_ERROR in e2e requires corrupting the event log (e.g., injecting an unexpected event type for a step). This is fragile and environment-dependent. We have unit test coverage for WorkflowRuntimeError classification in classify-error.test.ts and for the error itself in step.test.ts. Could add an e2e in a follow-up with a dedicated fault injection mechanism.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

human: I tried this, but it's not possible without chaos testing or a bigger change that is out of scope for this PR. tl;dr: we can't easily inject failures into runs (we do for steps, by injecting 500s into the world), but for workflow/run code we would either need to expose things into the VM so it can inject them (i.e. changing runtime code just for a fault-injection test), or — another idea — put a proxy in front of workflow-server and the queue to inject these failures

at that point I'm thinking we just do this in the chaos testing @TooTallNate is setting up and we can have validation that it's working once that's up


// Workflow source maps are not properly supported everywhere. Check the definition
// of hasWorkflowSourceMaps() to see where they are supported
Expand All @@ -747,8 +748,11 @@ describe('e2e', () => {
expect(error.cause.stack).not.toContain('evalmachine');
}

const { json: runData } = await cliInspectJson(`runs ${run.runId}`);
const { json: runData } = await cliInspectJson(
`runs ${run.runId} --withData`
);
expect(runData.status).toBe('failed');
expect(runData.error.code).toBe('USER_ERROR');
}
);

Expand Down Expand Up @@ -921,6 +925,7 @@ describe('e2e', () => {
expect(WorkflowRunFailedError.is(error)).toBe(true);
assert(WorkflowRunFailedError.is(error));
expect(error.cause.message).toContain('Fatal step error');
expect(error.cause.code).toBe('USER_ERROR');

const { json: steps } = await cliInspectJson(
`steps --runId ${run.runId}`
Expand Down
47 changes: 47 additions & 0 deletions packages/core/src/classify-error.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import {
RUN_ERROR_CODES,
WorkflowAPIError,
WorkflowRuntimeError,
} from '@workflow/errors';
import { describe, expect, it } from 'vitest';
import { classifyRunError } from './classify-error.js';

// Unit tests for classifyRunError: a WorkflowRuntimeError maps to
// RUNTIME_ERROR, and every other thrown value (Error subclasses, API errors,
// and non-Error throws) maps to USER_ERROR.
describe('classifyRunError', () => {
  const { RUNTIME_ERROR, USER_ERROR } = RUN_ERROR_CODES;

  it('classifies WorkflowRuntimeError as RUNTIME_ERROR', () => {
    const thrown = new WorkflowRuntimeError('corrupted event log');
    const code = classifyRunError(thrown);
    expect(code).toBe(RUNTIME_ERROR);
  });

  it('classifies plain Error as USER_ERROR', () => {
    const thrown = new Error('user code broke');
    const code = classifyRunError(thrown);
    expect(code).toBe(USER_ERROR);
  });

  it('classifies TypeError as USER_ERROR', () => {
    const thrown = new TypeError('cannot read property');
    const code = classifyRunError(thrown);
    expect(code).toBe(USER_ERROR);
  });

  it('classifies WorkflowAPIError as USER_ERROR (from user code fetch)', () => {
    const thrown = new WorkflowAPIError('Internal Server Error', {
      status: 500,
    });
    const code = classifyRunError(thrown);
    expect(code).toBe(USER_ERROR);
  });

  it('classifies string throw as USER_ERROR', () => {
    const code = classifyRunError('string error');
    expect(code).toBe(USER_ERROR);
  });

  it('classifies null throw as USER_ERROR', () => {
    const code = classifyRunError(null);
    expect(code).toBe(USER_ERROR);
  });

  it('classifies undefined throw as USER_ERROR', () => {
    const code = classifyRunError(undefined);
    expect(code).toBe(USER_ERROR);
  });
});
20 changes: 20 additions & 0 deletions packages/core/src/classify-error.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import {
type RunErrorCode,
RUN_ERROR_CODES,
WorkflowRuntimeError,
} from '@workflow/errors';

/**
 * Map the error that caused a workflow run to fail onto a run error code.
 *
 * Per the structural separation of infrastructure vs user code error
 * handling, the errors expected to reach the `run_failed` try/catch are:
 * - user code errors (throws from workflow functions, propagated step failures)
 * - `WorkflowRuntimeError` (corrupted event log, missing timestamps, etc.)
 *
 * @param err - The caught value; may be any thrown value, not only an `Error`.
 * @returns `RUN_ERROR_CODES.RUNTIME_ERROR` when `WorkflowRuntimeError.is(err)`
 *   holds, otherwise `RUN_ERROR_CODES.USER_ERROR`.
 */
export function classifyRunError(err: unknown): RunErrorCode {
  const isRuntimeFailure = WorkflowRuntimeError.is(err);
  return isRuntimeFailure
    ? RUN_ERROR_CODES.RUNTIME_ERROR
    : RUN_ERROR_CODES.USER_ERROR;
}
18 changes: 16 additions & 2 deletions packages/core/src/runtime.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import { WorkflowAPIError, WorkflowRuntimeError } from '@workflow/errors';
import {
RUN_ERROR_CODES,
WorkflowAPIError,
WorkflowRuntimeError,
} from '@workflow/errors';
import { classifyRunError } from './classify-error.js';
import { parseWorkflowName } from '@workflow/utils/parse-name';
import {
type Event,
Expand Down Expand Up @@ -195,6 +200,7 @@ export function workflowEntrypoint(
message: err.message,
stack: err.stack,
},
errorCode: RUN_ERROR_CODES.RUNTIME_ERROR,
},
},
{ requestId }
Expand Down Expand Up @@ -368,8 +374,14 @@ export function workflowEntrypoint(
);
}

// Classify the error: WorkflowRuntimeError indicates an
// internal issue (corrupted event log, missing data);
// everything else is a user code error.
const errorCode = classifyRunError(err);

runtimeLogger.error('Error while running workflow', {
workflowRunId: runId,
errorCode,
errorName,
errorStack,
});
Expand All @@ -386,7 +398,7 @@ export function workflowEntrypoint(
message: errorMessage,
stack: errorStack,
},
// TODO: include error codes when we define them
errorCode,
},
},
Comment on lines 377 to 403
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed — the setup failure path now includes errorCode: RUN_ERROR_CODES.RUNTIME_ERROR in the run_failed event data. Both run_failed emission sites are now consistent.

{ requestId }
Expand All @@ -404,6 +416,7 @@ export function workflowEntrypoint(
}
);
span?.setAttributes({
...Attribute.WorkflowErrorCode(errorCode),
...Attribute.WorkflowErrorName(errorName),
...Attribute.WorkflowErrorMessage(errorMessage),
...Attribute.ErrorType(errorName),
Expand All @@ -416,6 +429,7 @@ export function workflowEntrypoint(

span?.setAttributes({
...Attribute.WorkflowRunStatus('failed'),
...Attribute.WorkflowErrorCode(errorCode),
...Attribute.WorkflowErrorName(errorName),
...Attribute.WorkflowErrorMessage(errorMessage),
...Attribute.ErrorType(errorName),
Expand Down
5 changes: 5 additions & 0 deletions packages/core/src/telemetry/semantic-conventions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ export const WorkflowErrorMessage = SemanticConvention<string>(
'workflow.error.message'
);

/**
 * Error classification code for a failed workflow run
 * (USER_ERROR, RUNTIME_ERROR, etc.), keyed as `workflow.error.code`.
 */
export const WorkflowErrorCode = SemanticConvention<string>(
'workflow.error.code'
);

/** Number of steps created during workflow execution */
export const WorkflowStepsCreated = SemanticConvention<number>(
'workflow.steps.created'
Expand Down
14 changes: 14 additions & 0 deletions packages/errors/src/error-codes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
* Error codes for classifying run failures.
* These are populated in the `errorCode` field of `run_failed` events
* and flow through to `StructuredError.code` on the run entity.
*/
export const RUN_ERROR_CODES = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If these are also user-facing, should add to docs

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call — will add docs in a follow-up. These are user-facing via WorkflowRunFailedError.cause.code.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added docs in #1445 — documents error codes in the errors & retries guide.

/** Error thrown in user workflow or step code */
USER_ERROR: 'USER_ERROR',
/** Internal runtime error (corrupted event log, missing timestamps) */
RUNTIME_ERROR: 'RUNTIME_ERROR',
} as const;

/**
 * Union of the string values declared in `RUN_ERROR_CODES`
 * (currently `'USER_ERROR' | 'RUNTIME_ERROR'`). Derived from the object so
 * the type stays in sync when codes are added.
 */
export type RunErrorCode =
(typeof RUN_ERROR_CODES)[keyof typeof RUN_ERROR_CODES];
2 changes: 2 additions & 0 deletions packages/errors/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -337,3 +337,5 @@ export class RetryableError extends Error {

export const VERCEL_403_ERROR_MESSAGE =
'Your current vercel account does not have access to this resource. Use `vercel login` or `vercel switch` to ensure you are linked to the right account.';

export { RUN_ERROR_CODES, type RunErrorCode } from './error-codes.js';
57 changes: 32 additions & 25 deletions packages/web/app/components/display-utils/status-badge.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@ import {
} from '~/components/ui/tooltip';
import { cn, formatDuration } from '~/lib/utils';

/**
 * Read the `code` field off an arbitrary error value (StructuredError shape).
 *
 * @param error - Any value; only non-null objects carrying a `code` property
 *   are inspected.
 * @returns The `code` when it is a string, otherwise `undefined`.
 */
function getErrorCode(error: unknown): string | undefined {
  // Primitives, null and undefined cannot carry a code.
  if (typeof error !== 'object' || error === null) {
    return undefined;
  }
  if (!('code' in error)) {
    return undefined;
  }
  const candidate: unknown = (error as { code: unknown }).code;
  return typeof candidate === 'string' ? candidate : undefined;
}

interface StatusBadgeProps {
status: WorkflowRun['status'] | Step['status'];
context?: { error?: unknown };
Expand Down Expand Up @@ -58,49 +71,48 @@ export function StatusBadge({
</span>
);

// Show error tooltip if status is failed and error exists
if (status === 'failed' && context?.error) {
return <ErrorStatusBadge content={content} error={context.error} />;
// Show error code tooltip if status is failed and error has a code
const errorCode =
status === 'failed' ? getErrorCode(context?.error) : undefined;
if (errorCode) {
return <ErrorCodeBadge content={content} errorCode={errorCode} />;
}

return content;
}

function ErrorStatusBadge({
function ErrorCodeBadge({
content,
error,
errorCode,
}: {
content: React.ReactNode;
error: unknown;
errorCode: string;
}) {
const [copied, setCopied] = useState(false);

const errorMessage =
typeof error === 'string'
? error
: error instanceof Error
? error.message
: JSON.stringify(error);

const handleCopy = async (e: React.MouseEvent) => {
e.stopPropagation();
await navigator.clipboard.writeText(errorMessage);
setCopied(true);
setTimeout(() => setCopied(false), 2000);
try {
await navigator.clipboard.writeText(errorCode);
setCopied(true);
setTimeout(() => setCopied(false), 2000);
} catch {
// Clipboard API can fail on unfocused pages or non-HTTPS contexts
}
};

return (
<Tooltip>
<TooltipTrigger asChild>
<span className="cursor-help">{content}</span>
</TooltipTrigger>
<TooltipContent className="max-w-md p-0">
<div className="flex items-start justify-between gap-2 p-1 border-b">
<span className="text-xs font-medium pl-1 pt-1">Error Details</span>
<TooltipContent className="p-0">
<div className="flex items-center gap-2 p-1.5">
<span className="text-xs font-mono">{errorCode}</span>
<Button
variant="ghost"
size="icon"
className="h-6 w-6 shrink-0"
className="h-5 w-5 shrink-0"
onClick={handleCopy}
>
{copied ? (
Expand All @@ -110,11 +122,6 @@ function ErrorStatusBadge({
)}
</Button>
</div>
<div className="max-h-48 overflow-auto p-2">
<p className="text-xs whitespace-pre-wrap break-words font-mono">
{errorMessage}
</p>
</div>
</TooltipContent>
</Tooltip>
);
Expand Down
19 changes: 9 additions & 10 deletions packages/world-local/src/queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ export function createQueue(config: Partial<Config>): LocalQueue {
}
try {
let defaultRetriesLeft = 3;
const maxAttempts = defaultRetriesLeft;
for (let attempt = 0; defaultRetriesLeft > 0; attempt++) {
defaultRetriesLeft--;

Expand Down Expand Up @@ -169,18 +170,16 @@ export function createQueue(config: Partial<Config>): LocalQueue {
return;
}

console.error(`[local world] Failed to queue message`, {
queueName,
text,
status: response.status,
headers: Object.fromEntries(response.headers.entries()),
body: body.toString(),
});
console.error(
`[world-local] Queue message failed (attempt ${attempt + 1}/${maxAttempts}, status ${response.status}): ${text}`,
{ queueName, messageId }
);
}

console.error(
`[local world] Reached max retries of local world queue implementation`
);
console.error(`[world-local] Queue message exhausted all retries`, {
queueName,
messageId,
});
} finally {
semaphore.release();
}
Expand Down
3 changes: 3 additions & 0 deletions packages/world-vercel/src/runs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ export const WorkflowRunWireBaseSchema = WorkflowRunBaseSchema.omit({
}).extend({
// Backend returns error as either a JSON string or structured object
error: z.union([z.string(), StructuredErrorSchema]).optional(),
// errorCode is stored inline on the run entity (not inside errorRef).
// It's merged into StructuredError.code by deserializeError().
errorCode: z.string().optional(),
// Not part of the World interface, but passed through for direct consumers and debugging
blobStorageBytes: z.number().optional(),
streamStorageBytes: z.number().optional(),
Expand Down
Loading
Loading