diff --git a/HOWTO.rst b/HOWTO.rst index 2c6c6dbe5c..3cc1dcb49e 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -2890,6 +2890,19 @@ I/O depth if none of I/O has been completed yet, we will NOT wait and immediately exit the system call. In this example we simply do polling. +.. option:: iodepth_batch_complete_omit=int + + The number of in-flight IOs that need not be retrieved on quiescing. It + defaults to 0, which means all in-flight IOs will be retrieved until completion + so as not to skew the latency. After that, if in the rate-limiting context, the + next IO will be issued after a fixed delay. But in some cases, the in-flight IOs + may need to hang for a while to merge more subsequent IOs in order to get + a better bandwidth or reach the EC stripe length, and so on. And the hanging + time may delay the next IO for a long time if all in-flight IOs must be retrieved. + Therefore, this option will be used to skip quiescing partially (IOs in merge + waiting) for a smooth data flow, and as such, it is not suitable for + latency-sensitive scenarios. + +.. 
option:: iodepth_low=int The low water mark indicating when to start filling the queue diff --git a/cconv.c b/cconv.c index 6c36afb72d..365e33ee23 100644 --- a/cconv.c +++ b/cconv.c @@ -98,6 +98,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->iodepth_batch = le32_to_cpu(top->iodepth_batch); o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min); o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max); + o->iodepth_batch_complete_omit = le32_to_cpu(top->iodepth_batch_complete_omit); o->serialize_overlap = le32_to_cpu(top->serialize_overlap); o->size = le64_to_cpu(top->size); o->io_size = le64_to_cpu(top->io_size); @@ -379,6 +380,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->iodepth_batch = cpu_to_le32(o->iodepth_batch); top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min); top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max); + top->iodepth_batch_complete_omit = cpu_to_le32(o->iodepth_batch_complete_omit); top->serialize_overlap = cpu_to_le32(o->serialize_overlap); top->size_percent = cpu_to_le32(o->size_percent); top->io_size_percent = cpu_to_le32(o->io_size_percent); diff --git a/fio.1 b/fio.1 index 67d7c710fa..b5d22170d6 100644 --- a/fio.1 +++ b/fio.1 @@ -2650,6 +2650,18 @@ if none of I/O has been completed yet, we will NOT wait and immediately exit the system call. In this example we simply do polling. .RE .TP +.BI iodepth_batch_complete_omit \fR=\fPint +The number of in-flight IOs that need not be retrieved on quiescing. It +defaults to 0, which means all in-flight IOs will be retrieved until completion +so as not to skew the latency. After that, if in the rate-limiting context, the +next IO will be issued after a fixed delay. But in some cases, the in-flight IOs +may need to hang for a while to merge more subsequent IOs in order to get +a better bandwidth or reach the EC stripe length, and so on. 
And the hanging +time may delay the next IO for a long time if all in-flight IOs must be retrieved. +Therefore, this option will be used to skip quiescing partially (IOs in merge +waiting) for a smooth data flow, and as such, it is not suitable for +latency-sensitive scenarios. +.TP .BI iodepth_low \fR=\fPint The low water mark indicating when to start filling the queue again. Defaults to the same as \fBiodepth\fR, meaning that fio will diff --git a/io_u.c b/io_u.c index eec378ddc0..622626860a 100644 --- a/io_u.c +++ b/io_u.c @@ -622,7 +622,7 @@ int io_u_quiesce(struct thread_data *td) if (td->io_u_queued || td->cur_depth) td_io_commit(td); - while (td->io_u_in_flight) { + while (td->io_u_in_flight > td->o.iodepth_batch_complete_omit) { ret = io_u_queued_complete(td, 1); if (ret > 0) completed += ret; diff --git a/options.c b/options.c index 5d3daedf34..592266e078 100644 --- a/options.c +++ b/options.c @@ -2206,6 +2206,20 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BASIC, }, + { + .name = "iodepth_batch_complete_omit", + .lname = "Omit IO depth batch complete", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_batch_complete_omit), + .help = "The number of in-flight IOs that need not be retrieved", + .parent = "iodepth", + .hide = 1, + .minval = 0, + .interval = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, { .name = "iodepth_low", .lname = "IO Depth batch low", diff --git a/thread_options.h b/thread_options.h index 634070af00..8687d209d4 100644 --- a/thread_options.h +++ b/thread_options.h @@ -96,6 +96,7 @@ struct thread_options { unsigned int iodepth_batch; unsigned int iodepth_batch_complete_min; unsigned int iodepth_batch_complete_max; + unsigned int iodepth_batch_complete_omit; unsigned int serialize_overlap; unsigned int unique_filename; @@ -414,8 +415,8 @@ struct thread_options_pack { uint32_t iodepth_batch; uint32_t iodepth_batch_complete_min; 
uint32_t iodepth_batch_complete_max; + uint32_t iodepth_batch_complete_omit; uint32_t serialize_overlap; - uint32_t pad; uint64_t size; uint64_t io_size;