Skip to content

Commit 2a632a9

Browse files
i#6471 sched idle: Add idle time (#6472)
Adds a new STATUS_IDLE return code, and a corresponding TRACE_MARKER_TYPE_CORE_IDLE record. Changes the scheduler behavior to no longer return STATUS_EOF for an output when the ready queue is empty: instead STATUS_IDLE is returned until every single input is at EOF. This results in a more realistic schedule where other cores can pick up work later rather than disappearing from the system. Augments the schedule_stats tool to count idle replies and compute a % cpu usage metric. Adds a unit test for counting idles. Augments the scheduler_launcher to also compute %cpu usage. Updates all the scheduler tests for the new change. Adding idle time due to blocking syscalls will be done separately. Issue: #6471
1 parent 34fbc25 commit 2a632a9

16 files changed

+348
-97
lines changed

clients/drcachesim/analyzer.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,17 @@ analyzer_t::create_wait_marker()
133133
return record;
134134
}
135135

136+
template <>
137+
memref_t
138+
analyzer_t::create_idle_marker()
139+
{
140+
memref_t record = {}; // Zero the other fields.
141+
record.marker.type = TRACE_TYPE_MARKER;
142+
record.marker.marker_type = TRACE_MARKER_TYPE_CORE_IDLE;
143+
record.marker.tid = INVALID_THREAD_ID;
144+
return record;
145+
}
146+
136147
/******************************************************************************
137148
* Specializations for analyzer_tmpl_t<record_reader_t>, aka record_analyzer_t.
138149
*/
@@ -182,6 +193,17 @@ record_analyzer_t::create_wait_marker()
182193
return record;
183194
}
184195

196+
template <>
197+
trace_entry_t
198+
record_analyzer_t::create_idle_marker()
199+
{
200+
trace_entry_t record;
201+
record.type = TRACE_TYPE_MARKER;
202+
record.size = TRACE_MARKER_TYPE_CORE_IDLE;
203+
record.addr = 0; // Marker value has no meaning so we zero it.
204+
return record;
205+
}
206+
185207
/********************************************************************
186208
* Other analyzer_tmpl_t routines that do not need to be specialized.
187209
*/
@@ -537,6 +559,12 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_tasks(analyzer_worker_data_t *w
537559
// We synthesize a record here. If we wanted this to count toward output
538560
// stream ordinals we would need to add a scheduler API to inject it.
539561
record = create_wait_marker();
562+
} else if (status == sched_type_t::STATUS_IDLE) {
563+
assert(shard_type_ == SHARD_BY_CORE);
564+
// We let tools know about idle time so they can analyze cpu usage.
565+
// We synthesize a record here. If we wanted this to count toward output
566+
// stream ordinals we would need to add a scheduler API to inject it.
567+
record = create_idle_marker();
540568
} else if (status != sched_type_t::STATUS_OK) {
541569
if (status == sched_type_t::STATUS_REGION_INVALID) {
542570
worker->error =
@@ -596,8 +624,10 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_tasks(analyzer_worker_data_t *w
596624
}
597625
}
598626
if (shard_type_ == SHARD_BY_CORE) {
599-
if (!process_shard_exit(worker, worker->index))
600-
return;
627+
if (worker->shard_data.find(worker->index) != worker->shard_data.end()) {
628+
if (!process_shard_exit(worker, worker->index))
629+
return;
630+
}
601631
}
602632
for (const auto &keyval : worker->shard_data) {
603633
if (!keyval.second.exited) {

clients/drcachesim/analyzer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,9 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
252252
RecordType
253253
create_wait_marker();
254254

255+
RecordType
256+
create_idle_marker();
257+
255258
// Invoked when the given interval finishes during serial or parallel
256259
// analysis of the trace. For parallel analysis, the shard_id
257260
// parameter should be set to the shard_id for which the interval

clients/drcachesim/common/options.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -809,15 +809,15 @@ droption_t<bool> op_core_sharded(
809809
"software threads. This option instead schedules those threads onto virtual cores "
810810
"and analyzes each core in parallel. Thus, each shard consists of pieces from "
811811
"many software threads. How the scheduling is performed is controlled by a set "
812-
"of options with the prefix \"sched_\" along with -num_cores.");
812+
"of options with the prefix \"sched_\" along with -cores.");
813813

814814
// The -core_serial option: schedules inputs onto virtual cores just like
// -core_sharded, but a single analysis thread consumes the resulting schedule.
// Each description fragment ends with a trailing space so that adjacent string
// literals do not run words together when concatenated.
droption_t<bool> op_core_serial(
    DROPTION_SCOPE_ALL, "core_serial", false, "Analyze per-core in serial.",
    "In this mode, scheduling is performed just like for -core_sharded. "
    "However, the resulting schedule is acted upon by a single analysis thread "
    "which walks the N cores in lockstep in round robin fashion. "
    "How the scheduling is performed is controlled by a set "
    "of options with the prefix \"sched_\" along with -cores.");
821821

822822
droption_t<int64_t>
823823
op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 1 * 1000 * 1000,

clients/drcachesim/common/trace_entry.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,19 @@ typedef enum {
583583
*/
584584
TRACE_MARKER_TYPE_CORE_WAIT,
585585

586+
/**
587+
* This marker is used for core-sharded analyses to indicate that the current
588+
* core has no available inputs to run (all inputs are on other cores or are
589+
* blocked waiting for kernel resources). A new marker is emitted each
590+
* time the tool analysis framework requests a new record from the scheduler and
591+
* is given an idle status. There are no units of time here but each repetition
592+
* is roughly the time where a regular record could have been read and passed
593+
* along. This idle marker indicates that a core actually had no work to do,
594+
* as opposed to #TRACE_MARKER_TYPE_CORE_WAIT which is an artifact of an
595+
* imposed re-created schedule.
596+
*/
597+
TRACE_MARKER_TYPE_CORE_IDLE,
598+
586599
// ...
587600
// These values are reserved for future built-in marker types.
588601
// ...

clients/drcachesim/reader/reader.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ class reader_t : public std::iterator<std::input_iterator_tag, memref_t>,
193193
is_record_synthetic() const override
194194
{
195195
if (cur_ref_.marker.type == TRACE_TYPE_MARKER &&
196-
cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) {
196+
(cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT ||
197+
cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE)) {
197198
// These are synthetic records not part of the input and not
198199
// counting toward ordinals.
199200
return true;

clients/drcachesim/scheduler/scheduler.cpp

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::init(
631631
}
632632
}
633633
}
634+
VPRINT(this, 1, "%zu inputs\n", inputs_.size());
635+
live_input_count_.store(static_cast<int>(inputs_.size()), std::memory_order_release);
634636
return set_initial_schedule(workload2inputs);
635637
}
636638

@@ -1313,7 +1315,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
13131315
input.cur_region);
13141316
if (input.cur_region >= static_cast<int>(input.regions_of_interest.size())) {
13151317
if (input.at_eof)
1316-
return sched_type_t::STATUS_EOF;
1318+
return eof_or_idle(output);
13171319
else {
13181320
// We let the user know we're done.
13191321
if (options_.schedule_record_ostream != nullptr) {
@@ -1329,7 +1331,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
13291331
return status;
13301332
}
13311333
input.queue.push_back(create_thread_exit(input.tid));
1332-
input.at_eof = true;
1334+
mark_input_eof(input);
13331335
return sched_type_t::STATUS_SKIPPED;
13341336
}
13351337
}
@@ -1408,7 +1410,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::skip_instructions(output_ordinal_t out
14081410
if (*input.reader == *input.reader_end) {
14091411
// Raise error because the input region is out of bounds.
14101412
VPRINT(this, 2, "skip_instructions: input=%d skip out of bounds\n", input.index);
1411-
input.at_eof = true;
1413+
mark_input_eof(input);
14121414
return sched_type_t::STATUS_REGION_INVALID;
14131415
}
14141416
input.in_cur_region = true;
@@ -1645,7 +1647,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
16451647
{
16461648
if (outputs_[output].record_index + 1 >=
16471649
static_cast<int>(outputs_[output].record.size()))
1648-
return sched_type_t::STATUS_EOF;
1650+
return eof_or_idle(output);
16491651
const schedule_record_t &segment =
16501652
outputs_[output].record[outputs_[output].record_index + 1];
16511653
index = segment.key.input;
@@ -1681,6 +1683,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
16811683
// XXX i#5843: We may want to provide a kernel-mediated wait
16821684
// feature so a multi-threaded simulator doesn't have to do a
16831685
// spinning poll loop.
1686+
// XXX i#5843: For replaying a schedule as it was traced with
1687+
// MAP_TO_RECORDED_OUTPUT there may have been true idle periods during
1688+
// tracing where some other process than the traced workload was
1689+
// scheduled on a core. If we could identify those, we should return
1690+
// STATUS_IDLE rather than STATUS_WAIT.
16841691
VPRINT(this, 3, "next_record[%d]: waiting for input %d instr #%" PRId64 "\n",
16851692
output, index, segment.start_instruction);
16861693
// Give up this input and go into a wait state.
@@ -1719,7 +1726,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
17191726
// queued candidate record, if any.
17201727
clear_input_queue(inputs_[index]);
17211728
inputs_[index].queue.push_back(create_thread_exit(inputs_[index].tid));
1722-
inputs_[index].at_eof = true;
1729+
mark_input_eof(inputs_[index]);
17231730
VPRINT(this, 2, "early end for input %d\n", index);
17241731
// We're done with this entry but we need the queued record to be read,
17251732
// so we do not move past the entry.
@@ -1773,7 +1780,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
17731780
const schedule_record_t &segment =
17741781
outputs_[output].record[outputs_[output].record_index];
17751782
int input = segment.key.input;
1776-
VPRINT(this, res == sched_type_t::STATUS_WAIT ? 3 : 2,
1783+
VPRINT(this,
1784+
(res == sched_type_t::STATUS_IDLE ||
1785+
res == sched_type_t::STATUS_WAIT)
1786+
? 3
1787+
: 2,
17771788
"next_record[%d]: replay segment in=%d (@%" PRId64
17781789
") type=%d start=%" PRId64 " end=%" PRId64 "\n",
17791790
output, input,
@@ -1819,10 +1830,10 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
18191830
// We found a direct switch target above.
18201831
} else if (ready_queue_empty()) {
18211832
if (prev_index == INVALID_INPUT_ORDINAL)
1822-
return sched_type_t::STATUS_EOF;
1833+
return eof_or_idle(output);
18231834
std::lock_guard<std::mutex> lock(*inputs_[prev_index].lock);
18241835
if (inputs_[prev_index].at_eof)
1825-
return sched_type_t::STATUS_EOF;
1836+
return eof_or_idle(output);
18261837
else
18271838
index = prev_index; // Go back to prior.
18281839
} else {
@@ -1836,7 +1847,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
18361847
}
18371848
input_info_t *queue_next = pop_from_ready_queue(output);
18381849
if (queue_next == nullptr)
1839-
return sched_type_t::STATUS_EOF;
1850+
return eof_or_idle(output);
18401851
index = queue_next->index;
18411852
}
18421853
} else if (options_.deps == DEPENDENCY_TIMESTAMPS) {
@@ -1850,7 +1861,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
18501861
}
18511862
}
18521863
if (index < 0)
1853-
return sched_type_t::STATUS_EOF;
1864+
return eof_or_idle(output);
18541865
VPRINT(this, 2,
18551866
"next_record[%d]: advancing to timestamp %" PRIu64
18561867
" == input #%d\n",
@@ -1883,14 +1894,15 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
18831894
std::lock_guard<std::mutex> lock(*inputs_[index].lock);
18841895
if (inputs_[index].at_eof ||
18851896
*inputs_[index].reader == *inputs_[index].reader_end) {
1886-
VPRINT(this, 2, "next_record[%d]: local index %d == input #%d at eof\n",
1887-
output, outputs_[output].input_indices_index, index);
1897+
VPRINT(this, 2, "next_record[%d]: input #%d at eof\n", output, index);
18881898
if (options_.schedule_record_ostream != nullptr &&
18891899
prev_index != INVALID_INPUT_ORDINAL)
18901900
close_schedule_segment(output, inputs_[prev_index]);
1891-
inputs_[index].at_eof = true;
1901+
if (!inputs_[index].at_eof)
1902+
mark_input_eof(inputs_[index]);
18921903
index = INVALID_INPUT_ORDINAL;
18931904
// Loop and pick next thread.
1905+
prev_index = INVALID_INPUT_ORDINAL;
18941906
continue;
18951907
}
18961908
break;
@@ -1911,7 +1923,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
19111923
// check for quantum end.
19121924
outputs_[output].cur_time = cur_time; // Invalid values are checked below.
19131925
if (!outputs_[output].active)
1914-
return sched_type_t::STATUS_WAIT;
1926+
return sched_type_t::STATUS_IDLE;
19151927
if (outputs_[output].waiting) {
19161928
VPRINT(this, 5, "next_record[%d]: need new input (cur=waiting)\n", output);
19171929
sched_type_t::stream_status_t res = pick_next_input(output, true);
@@ -1922,7 +1934,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
19221934
if (outputs_[output].cur_input < 0) {
19231935
// This happens with more outputs than inputs. For non-empty outputs we
19241936
// require cur_input to be set to >=0 during init().
1925-
return sched_type_t::STATUS_EOF;
1937+
return eof_or_idle(output);
19261938
}
19271939
input = &inputs_[outputs_[output].cur_input];
19281940
auto lock = std::unique_lock<std::mutex>(*input->lock);
@@ -1970,6 +1982,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
19701982
input->needs_advance = true;
19711983
}
19721984
if (input->at_eof || *input->reader == *input->reader_end) {
1985+
if (!input->at_eof)
1986+
mark_input_eof(*input);
19731987
lock.unlock();
19741988
VPRINT(this, 5, "next_record[%d]: need new input (cur=%d eof)\n", output,
19751989
input->index);
@@ -1998,6 +2012,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
19982012
if (outputs_[output].record_index >=
19992013
static_cast<int>(outputs_[output].record.size())) {
20002014
// We're on the last record.
2015+
VPRINT(this, 4, "next_record[%d]: on last record\n", output);
20012016
} else if (outputs_[output].record[outputs_[output].record_index].type ==
20022017
schedule_record_t::SKIP) {
20032018
VPRINT(this, 5, "next_record[%d]: need new input after skip\n", output);
@@ -2257,6 +2272,28 @@ scheduler_tmpl_t<RecordType, ReaderType>::stop_speculation(output_ordinal_t outp
22572272
return sched_type_t::STATUS_OK;
22582273
}
22592274

2275+
template <typename RecordType, typename ReaderType>
2276+
void
2277+
scheduler_tmpl_t<RecordType, ReaderType>::mark_input_eof(input_info_t &input)
2278+
{
2279+
input.at_eof = true;
2280+
assert(live_input_count_.load(std::memory_order_acquire) > 0);
2281+
live_input_count_.fetch_add(-1, std::memory_order_release);
2282+
}
2283+
2284+
template <typename RecordType, typename ReaderType>
2285+
typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
2286+
scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output)
2287+
{
2288+
if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT ||
2289+
live_input_count_.load(std::memory_order_acquire) == 0) {
2290+
return sched_type_t::STATUS_EOF;
2291+
} else {
2292+
outputs_[output].waiting = true;
2293+
return sched_type_t::STATUS_IDLE;
2294+
}
2295+
}
2296+
22602297
template <typename RecordType, typename ReaderType>
22612298
typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
22622299
scheduler_tmpl_t<RecordType, ReaderType>::set_output_active(output_ordinal_t output,

clients/drcachesim/scheduler/scheduler.h

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include <stddef.h>
4646
#include <stdint.h>
4747

48+
#include <atomic>
4849
#include <deque>
4950
#include <limits>
5051
#include <memory>
@@ -109,17 +110,28 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
109110
* For dynamic scheduling with cross-stream dependencies, the scheduler may pause
110111
* a stream if it gets ahead of another stream it should have a dependence on.
111112
* This value is also used for schedules following the recorded timestamps
112-
* (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another. For
113-
* replaying a schedule as it was traced with #MAP_TO_RECORDED_OUTPUT this can
114-
* indicate an idle period on a core where the traced workload was not currently
115-
* scheduled.
113+
* (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another.
114+
* #STATUS_WAIT should be treated as artificial, an artifact of enforcing a
115+
* recorded schedule on concurrent differently-timed output streams.
116+
* Simulators are suggested to not advance simulated time for #STATUS_WAIT while
117+
* they should advance time for #STATUS_IDLE as the latter indicates a true
118+
* lack of work.
116119
*/
117120
STATUS_WAIT,
118121
STATUS_INVALID, /**< Error condition. */
119122
STATUS_REGION_INVALID, /**< Input region is out of bounds. */
120123
STATUS_NOT_IMPLEMENTED, /**< Feature not implemented. */
121124
STATUS_SKIPPED, /**< Used for internal scheduler purposes. */
122125
STATUS_RECORD_FAILED, /**< Failed to record schedule for future replay. */
126+
/**
127+
* This code indicates that no input is currently available to run on this
* output: all inputs are on other outputs or are blocked waiting for kernel resources
128+
* (such as i/o). This is similar to #STATUS_WAIT, but #STATUS_WAIT indicates an
129+
* artificial pause due to imposing the original ordering while #STATUS_IDLE
130+
* indicates actual idle time in the application. Simulators are suggested
131+
* to not advance simulated time for #STATUS_WAIT while they should advance
132+
* time for #STATUS_IDLE.
133+
*/
134+
STATUS_IDLE,
123135
};
124136

125137
/** Identifies an input stream by its index. */
@@ -629,7 +641,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
629641
/**
630642
* Disables or re-enables this output stream. If "active" is false, this
631643
* stream becomes inactive and its currently assigned input is moved to the
632-
* ready queue to be scheduled on other outputs. The #STATUS_WAIT code is
644+
* ready queue to be scheduled on other outputs. The #STATUS_IDLE code is
633645
* returned to next_record() for inactive streams. If "active" is true,
634646
* this stream becomes active again.
635647
* This is only supported for #MAP_TO_ANY_OUTPUT.
@@ -1076,7 +1088,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
10761088
// sched_lock_.
10771089
std::vector<schedule_record_t> record;
10781090
int record_index = 0;
1079-
bool waiting = false;
1091+
bool waiting = false; // Waiting or idling.
10801092
bool active = true;
10811093
// Used for time-based quanta.
10821094
uint64_t cur_time = 0;
@@ -1259,6 +1271,13 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
12591271
stream_status_t
12601272
set_output_active(output_ordinal_t output, bool active);
12611273

1274+
// Caller must hold the input's lock.
1275+
void
1276+
mark_input_eof(input_info_t &input);
1277+
1278+
stream_status_t
1279+
eof_or_idle(output_ordinal_t output);
1280+
12621281
///////////////////////////////////////////////////////////////////////////
12631282
// Support for ready queues for who to schedule next:
12641283

@@ -1325,6 +1344,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
13251344
flexible_queue_t<input_info_t *, InputTimestampComparator> ready_priority_;
13261345
// Global ready queue counter used to provide FIFO for same-priority inputs.
13271346
uint64_t ready_counter_ = 0;
1347+
// Count of inputs not yet at eof.
1348+
std::atomic<int> live_input_count_;
13281349
// Map from workload,tid pair to input.
13291350
struct workload_tid_t {
13301351
workload_tid_t(int wl, memref_tid_t tid)

0 commit comments

Comments
 (0)