diff --git a/.changeset/classify-run-error-codes.md b/.changeset/classify-run-error-codes.md new file mode 100644 index 0000000000..fb004f4eb0 --- /dev/null +++ b/.changeset/classify-run-error-codes.md @@ -0,0 +1,9 @@ +--- +"@workflow/errors": patch +"@workflow/core": patch +"@workflow/web": patch +"@workflow/world-local": patch +"@workflow/world-vercel": patch +--- + +Add error code classification (`USER_ERROR`, `RUNTIME_ERROR`) to `run_failed` events, improve queue and schema validation error logging diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 48b209dc4b..1c9eeb8451 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -734,6 +734,7 @@ describe('e2e', () => { expect(WorkflowRunFailedError.is(error)).toBe(true); assert(WorkflowRunFailedError.is(error)); expect(error.cause.message).toContain('Nested workflow error'); + expect(error.cause.code).toBe('USER_ERROR'); // Workflow source maps are not properly supported everywhere. Check the definition // of hasWorkflowSourceMaps() to see where they are supported @@ -747,8 +748,11 @@ describe('e2e', () => { expect(error.cause.stack).not.toContain('evalmachine'); } - const { json: runData } = await cliInspectJson(`runs ${run.runId}`); + const { json: runData } = await cliInspectJson( + `runs ${run.runId} --withData` + ); expect(runData.status).toBe('failed'); + expect(runData.error.code).toBe('USER_ERROR'); } ); @@ -921,6 +925,7 @@ describe('e2e', () => { expect(WorkflowRunFailedError.is(error)).toBe(true); assert(WorkflowRunFailedError.is(error)); expect(error.cause.message).toContain('Fatal step error'); + expect(error.cause.code).toBe('USER_ERROR'); const { json: steps } = await cliInspectJson( `steps --runId ${run.runId}` diff --git a/packages/core/src/classify-error.test.ts b/packages/core/src/classify-error.test.ts new file mode 100644 index 0000000000..120020d4e5 --- /dev/null +++ b/packages/core/src/classify-error.test.ts @@ -0,0 +1,47 @@ +import { + RUN_ERROR_CODES, + WorkflowAPIError, + WorkflowRuntimeError, +} from '@workflow/errors'; +import { describe, expect, it } from 'vitest'; +import { classifyRunError } from './classify-error.js'; + +describe('classifyRunError', () => { + it('classifies WorkflowRuntimeError as RUNTIME_ERROR', () => { + expect( + classifyRunError(new WorkflowRuntimeError('corrupted event log')) + ).toBe(RUN_ERROR_CODES.RUNTIME_ERROR); + }); + + it('classifies plain Error as USER_ERROR', () => { + expect(classifyRunError(new Error('user code broke'))).toBe( + RUN_ERROR_CODES.USER_ERROR + ); + }); + + it('classifies TypeError as USER_ERROR', () => { + expect(classifyRunError(new TypeError('cannot read property'))).toBe( + RUN_ERROR_CODES.USER_ERROR + ); + }); + + it('classifies WorkflowAPIError as USER_ERROR (from user code fetch)', () => { + expect( + classifyRunError( + new WorkflowAPIError('Internal Server Error', { status: 500 }) + ) + ).toBe(RUN_ERROR_CODES.USER_ERROR); + }); + + it('classifies string throw as USER_ERROR', () => { + expect(classifyRunError('string error')).toBe(RUN_ERROR_CODES.USER_ERROR); + }); + + it('classifies null throw as USER_ERROR', () => { + expect(classifyRunError(null)).toBe(RUN_ERROR_CODES.USER_ERROR); + }); + + it('classifies undefined throw as USER_ERROR', () => { + expect(classifyRunError(undefined)).toBe(RUN_ERROR_CODES.USER_ERROR); + }); +}); diff --git a/packages/core/src/classify-error.ts b/packages/core/src/classify-error.ts new file mode 100644 index 0000000000..fe4a960b7e --- /dev/null +++ b/packages/core/src/classify-error.ts @@ -0,0 +1,20 @@ +import { + type RunErrorCode, + RUN_ERROR_CODES, + WorkflowRuntimeError, +} from '@workflow/errors'; + +/** + * Classify an error that caused a workflow run to fail. + * + * After the structural separation of infrastructure vs user code error + * handling, the only errors that reach the `run_failed` try/catch are: + * - User code errors (throws from workflow functions, propagated step failures) + * - WorkflowRuntimeError (corrupted event log, missing timestamps, etc.) + */ +export function classifyRunError(err: unknown): RunErrorCode { + if (WorkflowRuntimeError.is(err)) { + return RUN_ERROR_CODES.RUNTIME_ERROR; + } + return RUN_ERROR_CODES.USER_ERROR; +} diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts index 582aab3daa..203c469d24 100644 --- a/packages/core/src/runtime.ts +++ b/packages/core/src/runtime.ts @@ -1,4 +1,9 @@ -import { WorkflowAPIError, WorkflowRuntimeError } from '@workflow/errors'; +import { + RUN_ERROR_CODES, + WorkflowAPIError, + WorkflowRuntimeError, +} from '@workflow/errors'; +import { classifyRunError } from './classify-error.js'; import { parseWorkflowName } from '@workflow/utils/parse-name'; import { type Event, @@ -195,6 +200,7 @@ export function workflowEntrypoint( message: err.message, stack: err.stack, }, + errorCode: RUN_ERROR_CODES.RUNTIME_ERROR, }, }, { requestId } @@ -368,8 +374,14 @@ export function workflowEntrypoint( ); } + // Classify the error: WorkflowRuntimeError indicates an + // internal issue (corrupted event log, missing data); + // everything else is a user code error. + const errorCode = classifyRunError(err); + runtimeLogger.error('Error while running workflow', { workflowRunId: runId, + errorCode, errorName, errorStack, }); @@ -386,7 +398,7 @@ export function workflowEntrypoint( message: errorMessage, stack: errorStack, }, - // TODO: include error codes when we define them + errorCode, }, }, { requestId } @@ -404,6 +416,7 @@ export function workflowEntrypoint( } ); span?.setAttributes({ + ...Attribute.WorkflowErrorCode(errorCode), ...Attribute.WorkflowErrorName(errorName), ...Attribute.WorkflowErrorMessage(errorMessage), ...Attribute.ErrorType(errorName), @@ -416,6 +429,7 @@ export function workflowEntrypoint( span?.setAttributes({ ...Attribute.WorkflowRunStatus('failed'), + ...Attribute.WorkflowErrorCode(errorCode), ...Attribute.WorkflowErrorName(errorName), ...Attribute.WorkflowErrorMessage(errorMessage), ...Attribute.ErrorType(errorName), diff --git a/packages/core/src/telemetry/semantic-conventions.ts b/packages/core/src/telemetry/semantic-conventions.ts index afe86b1ecb..fcc5d0694f 100644 --- a/packages/core/src/telemetry/semantic-conventions.ts +++ b/packages/core/src/telemetry/semantic-conventions.ts @@ -102,6 +102,11 @@ export const WorkflowErrorMessage = SemanticConvention( 'workflow.error.message' ); +/** Error classification code (USER_ERROR, RUNTIME_ERROR, etc.) */ +export const WorkflowErrorCode = SemanticConvention( + 'workflow.error.code' +); + /** Number of steps created during workflow execution */ export const WorkflowStepsCreated = SemanticConvention( 'workflow.steps.created' diff --git a/packages/errors/src/error-codes.ts b/packages/errors/src/error-codes.ts new file mode 100644 index 0000000000..b25a80ef77 --- /dev/null +++ b/packages/errors/src/error-codes.ts @@ -0,0 +1,14 @@ +/** + * Error codes for classifying run failures. + * These are populated in the `errorCode` field of `run_failed` events + * and flow through to `StructuredError.code` on the run entity. + */ +export const RUN_ERROR_CODES = { + /** Error thrown in user workflow or step code */ + USER_ERROR: 'USER_ERROR', + /** Internal runtime error (corrupted event log, missing timestamps) */ + RUNTIME_ERROR: 'RUNTIME_ERROR', +} as const; + +export type RunErrorCode = + (typeof RUN_ERROR_CODES)[keyof typeof RUN_ERROR_CODES]; diff --git a/packages/errors/src/index.ts b/packages/errors/src/index.ts index c8be6c406d..7c8bc21696 100644 --- a/packages/errors/src/index.ts +++ b/packages/errors/src/index.ts @@ -337,3 +337,5 @@ export class RetryableError extends Error { export const VERCEL_403_ERROR_MESSAGE = 'Your current vercel account does not have access to this resource. Use `vercel login` or `vercel switch` to ensure you are linked to the right account.'; + +export { RUN_ERROR_CODES, type RunErrorCode } from './error-codes.js'; diff --git a/packages/web/app/components/display-utils/status-badge.tsx b/packages/web/app/components/display-utils/status-badge.tsx index 85bd053dd5..3ed2e2a6b6 100644 --- a/packages/web/app/components/display-utils/status-badge.tsx +++ b/packages/web/app/components/display-utils/status-badge.tsx @@ -9,6 +9,19 @@ import { } from '~/components/ui/tooltip'; import { cn, formatDuration } from '~/lib/utils'; +/** Extract the error code from an unknown error value (StructuredError shape). */ +function getErrorCode(error: unknown): string | undefined { + if ( + error !== null && + typeof error === 'object' && + 'code' in error && + typeof (error as { code: unknown }).code === 'string' + ) { + return (error as { code: string }).code; + } + return undefined; +} + interface StatusBadgeProps { status: WorkflowRun['status'] | Step['status']; context?: { error?: unknown }; @@ -58,35 +71,34 @@ export function StatusBadge({ ); - // Show error tooltip if status is failed and error exists - if (status === 'failed' && context?.error) { - return ; + // Show error code tooltip if status is failed and error has a code + const errorCode = + status === 'failed' ? getErrorCode(context?.error) : undefined; + if (errorCode) { + return ; } return content; } -function ErrorStatusBadge({ +function ErrorCodeBadge({ content, - error, + errorCode, }: { content: React.ReactNode; - error: unknown; + errorCode: string; }) { const [copied, setCopied] = useState(false); - const errorMessage = - typeof error === 'string' - ? error - : error instanceof Error - ? error.message - : JSON.stringify(error); - const handleCopy = async (e: React.MouseEvent) => { e.stopPropagation(); - await navigator.clipboard.writeText(errorMessage); - setCopied(true); - setTimeout(() => setCopied(false), 2000); + try { + await navigator.clipboard.writeText(errorCode); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + } catch { + // Clipboard API can fail on unfocused pages or non-HTTPS contexts + } }; return ( @@ -94,13 +106,13 @@ function ErrorStatusBadge({ {content} - -
- Error Details + +
+ {errorCode}
-
-

- {errorMessage} -

-
); diff --git a/packages/world-local/src/queue.ts b/packages/world-local/src/queue.ts index 5d3bcc2185..50cccf18a0 100644 --- a/packages/world-local/src/queue.ts +++ b/packages/world-local/src/queue.ts @@ -108,6 +108,7 @@ export function createQueue(config: Partial): LocalQueue { } try { let defaultRetriesLeft = 3; + const maxAttempts = defaultRetriesLeft; for (let attempt = 0; defaultRetriesLeft > 0; attempt++) { defaultRetriesLeft--; @@ -169,18 +170,16 @@ export function createQueue(config: Partial): LocalQueue { return; } - console.error(`[local world] Failed to queue message`, { - queueName, - text, - status: response.status, - headers: Object.fromEntries(response.headers.entries()), - body: body.toString(), - }); + console.error( + `[world-local] Queue message failed (attempt ${attempt + 1}/${maxAttempts}, status ${response.status}): ${text}`, + { queueName, messageId } + ); } - console.error( - `[local world] Reached max retries of local world queue implementation` - ); + console.error(`[world-local] Queue message exhausted all retries`, { + queueName, + messageId, + }); } finally { semaphore.release(); } diff --git a/packages/world-vercel/src/runs.ts b/packages/world-vercel/src/runs.ts index f008f6f3ab..a102912a6e 100644 --- a/packages/world-vercel/src/runs.ts +++ b/packages/world-vercel/src/runs.ts @@ -33,6 +33,9 @@ export const WorkflowRunWireBaseSchema = WorkflowRunBaseSchema.omit({ }).extend({ // Backend returns error as either a JSON string or structured object error: z.union([z.string(), StructuredErrorSchema]).optional(), + // errorCode is stored inline on the run entity (not inside errorRef). + // It's merged into StructuredError.code by deserializeError(). + errorCode: z.string().optional(), // Not part of the World interface, but passed through for direct consumers and debugging blobStorageBytes: z.number().optional(), streamStorageBytes: z.number().optional(), diff --git a/packages/world-vercel/src/utils.ts b/packages/world-vercel/src/utils.ts index ebcb5811b7..e2ea236685 100644 --- a/packages/world-vercel/src/utils.ts +++ b/packages/world-vercel/src/utils.ts @@ -90,12 +90,16 @@ export function serializeError( * status), but the transformation preserves all other fields correctly. */ export function deserializeError>(obj: any): T { - const { error, ...rest } = obj; + const { error, errorCode, ...rest } = obj; if (!error) { return obj as T; } + // errorCode is stored as a separate inline field on the run entity (not + // inside errorRef). Merge it into StructuredError.code so consumers see it. + // If the error already has a code from the ref, errorCode takes precedence. + // If error is already an object (new format), validate and use directly if (typeof error === 'object' && error !== null) { const result = StructuredErrorSchema.safeParse(error); @@ -105,7 +109,7 @@ export function deserializeError>(obj: any): T { error: { message: result.data.message, stack: result.data.stack, - code: result.data.code, + code: errorCode ?? result.data.code, }, } as T; } @@ -121,7 +125,7 @@ export function deserializeError>(obj: any): T { error: { message: parsed.message, stack: parsed.stack, - code: parsed.code, + code: errorCode ?? parsed.code, }, } as T; } catch { @@ -130,6 +134,7 @@ export function deserializeError>(obj: any): T { ...rest, error: { message: error, + code: errorCode, }, } as T; } @@ -294,8 +299,9 @@ export async function makeRequest({ await parseResponseBody(response) .then((r) => r.data as { message?: string; code?: string }) .catch(() => ({})); - if (process.env.DEBUG === '1') { + if (process.env.DEBUG) { const stringifiedHeaders = Array.from(headers.entries()) + .filter(([key]) => key.toLowerCase() !== 'authorization') .map(([key, value]: [string, string]) => `-H "${key}: ${value}"`) .join(' '); console.error( @@ -358,8 +364,17 @@ export async function makeRequest({ const result = await trace('world.validate', async () => { const validationResult = schema.safeParse(parseResult.data); if (!validationResult.success) { + const issues = validationResult.error.issues + .map( + (i) => + ` ${i.path.length > 0 ? i.path.join('.') : ''}: ${i.message}` + ) + .join('\n'); + const debugContext = process.env.DEBUG + ? `\n\nResponse context: ${parseResult.getDebugContext()}` + : ''; throw new WorkflowAPIError( - `Schema validation failed for ${method} ${endpoint}:\n\n${validationResult.error}\n\nResponse context: ${parseResult.getDebugContext()}`, + `Schema validation failed for ${method} ${endpoint}:\n${issues}${debugContext}`, { url, cause: validationResult.error } ); } diff --git a/packages/world/src/shared.ts b/packages/world/src/shared.ts index 2586e9df22..d4a3337c2a 100644 --- a/packages/world/src/shared.ts +++ b/packages/world/src/shared.ts @@ -53,7 +53,7 @@ export type ResolveData = 'none' | 'all'; export const StructuredErrorSchema = z.object({ message: z.string(), stack: z.string().optional(), - code: z.string().optional(), // TODO: currently unused. make this an enum maybe + code: z.string().optional(), // Populated with RunErrorCode values (USER_ERROR, RUNTIME_ERROR) for run_failed events }); export type StructuredError = z.infer;