Skip to content

Commit f349656

Browse files
authored
Merge pull request #921 from conveyal/job-error-limiting
Prevent runaway error reporting
2 parents ef9638b + 9df41b6 commit f349656

File tree

5 files changed

+64
-32
lines changed

5 files changed

+64
-32
lines changed

src/main/java/com/conveyal/analysis/components/broker/Broker.java

+25-13
Original file line numberDiff line numberDiff line change
@@ -177,19 +177,23 @@ public synchronized void enqueueTasksForRegionalJob (RegionalAnalysis regionalAn
177177
LOG.error("Someone tried to enqueue job {} but it already exists.", templateTask.jobId);
178178
throw new RuntimeException("Enqueued duplicate job " + templateTask.jobId);
179179
}
180+
// Create the Job object to share with the MultiOriginAssembler, but defer adding this job to the Multimap of
181+
// active jobs until we're sure the result assembler was constructed without any errors. Always add and remove
182+
// the Job and corresponding MultiOriginAssembler as a unit in the same synchronized block of code (see #887).
180183
WorkerTags workerTags = WorkerTags.fromRegionalAnalysis(regionalAnalysis);
181184
Job job = new Job(templateTask, workerTags);
182-
jobs.put(job.workerCategory, job);
183185

184186
// Register the regional job so results received from multiple workers can be assembled into one file.
187+
// If any parameters fail checks here, an exception may cause this method to exit early.
185188
// TODO encapsulate MultiOriginAssemblers in a new Component
186-
// Note: if this fails with an exception we'll have a job enqueued, possibly being processed, with no assembler.
187-
// That is not catastrophic, but the user may need to recognize and delete the stalled regional job.
188189
MultiOriginAssembler assembler = new MultiOriginAssembler(regionalAnalysis, job, fileStorage);
189190
resultAssemblers.put(templateTask.jobId, assembler);
190191

192+
// A MultiOriginAssembler was successfully put in place. It's now safe to register and start the Job.
193+
jobs.put(job.workerCategory, job);
194+
195+
// If this is a fake job for testing, don't confuse the worker startup code below with its null graph ID.
191196
if (config.testTaskRedelivery()) {
192-
// This is a fake job for testing, don't confuse the worker startup code below with null graph ID.
193197
return;
194198
}
195199

@@ -385,14 +389,20 @@ public synchronized void markTaskCompleted (Job job, int taskId) {
385389
}
386390

387391
/**
388-
* When job.errors is non-empty, job.isErrored() becomes true and job.isActive() becomes false.
392+
* Record an error that happened while a worker was processing a task on the given job. This method is tolerant
393+
* of job being null, because it's called on a code path where any number of things could be wrong or missing.
394+
* This method also ensures synchronization of writes to Jobs from any non-synchronized sections of an HTTP handler.
395+
* Once job.errors is non-empty, job.isErrored() becomes true and job.isActive() becomes false.
389396
* The Job will stop delivering tasks, allowing workers to shut down, but will continue to exist allowing the user
390397
* to see the error message. User will then need to manually delete it, which will remove the result assembler.
391-
* This method ensures synchronization of writes to Jobs from the unsynchronized worker poll HTTP handler.
392398
*/
393399
private synchronized void recordJobError (Job job, String error) {
394400
if (job != null) {
395-
job.errors.add(error);
401+
// Limit the number of errors recorded to one.
402+
// Still using a Set<String> instead of just String since the set of errors is exposed in a UI-facing API.
403+
if (job.errors.isEmpty()) {
404+
job.errors.add(error);
405+
}
396406
}
397407
}
398408

@@ -488,21 +498,23 @@ public void handleRegionalWorkResult(RegionalWorkResult workResult) {
488498
// Once the job is retrieved, it can be used below to requestExtraWorkersIfAppropriate without synchronization,
489499
// because that method only uses final fields of the job.
490500
Job job = null;
491-
MultiOriginAssembler assembler;
492501
try {
502+
MultiOriginAssembler assembler;
493503
synchronized (this) {
494504
job = findJob(workResult.jobId);
505+
// Record any error reported by the worker and don't pass bad results on to regional result assembly.
506+
// This will mark the job as errored and not-active, stopping distribution of tasks to workers.
507+
// To ensure that happens, record errors before any other conditional that could exit this method.
508+
if (workResult.error != null) {
509+
recordJobError(job, workResult.error);
510+
return;
511+
}
495512
assembler = resultAssemblers.get(workResult.jobId);
496513
if (job == null || assembler == null || !job.isActive()) {
497514
// This will happen naturally for all delivered tasks after a job is deleted or it errors out.
498515
LOG.debug("Ignoring result for unrecognized, deleted, or inactive job ID {}.", workResult.jobId);
499516
return;
500517
}
501-
if (workResult.error != null) {
502-
// Record any error reported by the worker and don't pass bad results on to regional result assembly.
503-
recordJobError(job, workResult.error);
504-
return;
505-
}
506518
// Mark tasks completed first before passing results to the assembler. On the final result received,
507519
// this will minimize the risk of race conditions by quickly making the job invisible to incoming stray
508520
// results from spurious redeliveries, before the assembler is busy finalizing and uploading results.

src/main/java/com/conveyal/analysis/components/broker/Job.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,11 @@ private RegionalTask makeOneTask (int taskNumber) {
123123
public int deliveryPass = 0;
124124

125125
/**
126-
* If any error compromises the usabilty or quality of results from any origin, it is recorded here.
126+
* If any error compromises the usability or quality of results from any origin, it is recorded here.
127127
* This is a Set because identical errors are likely to be reported from many workers or individual tasks.
128+
* The presence of an error here causes the job to be considered "errored" and "inactive" and stop delivering tasks.
129+
* There is some risk here of accumulating unbounded amounts of large error messages (see #919).
130+
* The field type could be changed to a single String instead of Set, but it's exposed on a UI-facing API as a Set.
128131
*/
129132
public final Set<String> errors = new HashSet();
130133

src/main/java/com/conveyal/analysis/components/eventbus/ErrorEvent.java

+2-16
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import com.conveyal.r5.util.ExceptionUtils;
44

5+
import static com.conveyal.r5.util.ExceptionUtils.filterStackTrace;
6+
57
/**
68
* This Event is fired each time a Throwable (usually an Exception or Error) occurs on the backend. It can then be
79
* recorded or tracked in various places - the console logs, Slack, etc. This could eventually be used for errors on
@@ -59,20 +61,4 @@ public String traceWithContext (boolean verbose) {
5961
return builder.toString();
6062
}
6163

62-
private static String filterStackTrace (String stackTrace) {
63-
if (stackTrace == null) return null;
64-
final String unknownFrame = "Unknown stack frame, probably optimized out by JVM.";
65-
String error = stackTrace.lines().findFirst().get();
66-
String frame = stackTrace.lines()
67-
.map(String::strip)
68-
.filter(s -> s.startsWith("at "))
69-
.findFirst().orElse(unknownFrame);
70-
String conveyalFrame = stackTrace.lines()
71-
.map(String::strip)
72-
.filter(s -> s.startsWith("at com.conveyal."))
73-
.filter(s -> !frame.equals(s))
74-
.findFirst().orElse("");
75-
return String.join("\n", error, frame, conveyalFrame);
76-
}
77-
7864
}

src/main/java/com/conveyal/r5/analyst/cluster/RegionalWorkResult.java

+8-2
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,17 @@ public RegionalWorkResult(OneOriginResult result, RegionalTask task) {
6969
// TODO checkTravelTimeInvariants, checkAccessibilityInvariants to verify that values are monotonically increasing
7070
}
7171

72-
/** Constructor used when results for this origin are considered unusable due to an unhandled error. */
72+
/**
73+
* Constructor used when results for this origin are considered unusable due to an unhandled error. Besides the
74+
* short-form exception, most result fields are left null. There is no information to communicate, and because
75+
* errors are often produced faster than valid results, we don't want to flood the backend with unnecessarily
76+
* voluminous error reports. The short-form exception message is used for a similar reason, to limit the total size
77+
* of error messages.
78+
*/
7379
public RegionalWorkResult(Throwable t, RegionalTask task) {
7480
this.jobId = task.jobId;
7581
this.taskId = task.taskId;
76-
this.error = ExceptionUtils.shortAndLongString(t);
82+
this.error = ExceptionUtils.filterStackTrace(t);
7783
}
7884

7985
}

src/main/java/com/conveyal/r5/util/ExceptionUtils.java

+25
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,29 @@ public static String shortAndLongString (Throwable throwable) {
5050
return shortCauseString(throwable) + "\n[detail follows]\n" + stackTraceString(throwable);
5151
}
5252

53+
/**
54+
* Given a full stack trace string with one frame per line, keep only the exception name, the first stack frame,
55+
* and all additional frames that come from Conveyal packages. This yields a much shorter stack trace that still
56+
* shows where the exception was thrown and where the problem originates in our own code.
57+
*/
58+
public static String filterStackTrace (String stackTrace) {
59+
if (stackTrace == null) return null;
60+
final String unknownFrame = "Unknown stack frame, probably optimized out by JVM.";
61+
String error = stackTrace.lines().findFirst().get();
62+
String frame = stackTrace.lines()
63+
.map(String::strip)
64+
.filter(s -> s.startsWith("at "))
65+
.findFirst().orElse(unknownFrame);
66+
String conveyalFrame = stackTrace.lines()
67+
.map(String::strip)
68+
.filter(s -> s.startsWith("at com.conveyal."))
69+
.filter(s -> !frame.equals(s))
70+
.findFirst().orElse("");
71+
return String.join("\n", error, frame, conveyalFrame);
72+
}
73+
74+
public static String filterStackTrace (Throwable throwable) {
75+
return filterStackTrace(stackTraceString(throwable));
76+
}
77+
5378
}

0 commit comments

Comments
 (0)