diff --git a/HOWTO.rst b/HOWTO.rst index 2c6c6dbe5c..3cc1dcb49e 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -2890,6 +2890,19 @@ I/O depth if none of I/O has been completed yet, we will NOT wait and immediately exit the system call. In this example we simply do polling. +.. option:: iodepth_batch_complete_omit=int + + The number of in-flight IOs that need not be retrieved on quiescing. It + defaults to 0, which means all in-flight IOs will be retrieved until completion + so as not to skew the latency. After that, if in the rate-limiting context, the + next IO will be issued after a fixed delay. But in some cases, the in-flight IOs + may need to hang for a while to merge more subsequent IOs in order to get + a better bandwidth or reach the EC stripe length, and so on. And the hanging + time may delay the next IO for a long time if all in-flight IOs must be retrieved. + Therefore, this option will be used to skip quiescing partially (IOs in merge + waiting) for a smooth data flow, and as such, it is not suitable for + latency-sensitive scenarios. + +.. 
option:: iodepth_low=int The low water mark indicating when to start filling the queue diff --git a/cconv.c b/cconv.c index 6c36afb72d..365e33ee23 100644 --- a/cconv.c +++ b/cconv.c @@ -98,6 +98,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->iodepth_batch = le32_to_cpu(top->iodepth_batch); o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min); o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max); + o->iodepth_batch_complete_omit = le32_to_cpu(top->iodepth_batch_complete_omit); o->serialize_overlap = le32_to_cpu(top->serialize_overlap); o->size = le64_to_cpu(top->size); o->io_size = le64_to_cpu(top->io_size); @@ -379,6 +380,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->iodepth_batch = cpu_to_le32(o->iodepth_batch); top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min); top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max); + top->iodepth_batch_complete_omit = cpu_to_le32(o->iodepth_batch_complete_omit); top->serialize_overlap = cpu_to_le32(o->serialize_overlap); top->size_percent = cpu_to_le32(o->size_percent); top->io_size_percent = cpu_to_le32(o->io_size_percent); diff --git a/fio.1 b/fio.1 index 67d7c710fa..b5d22170d6 100644 --- a/fio.1 +++ b/fio.1 @@ -2650,6 +2650,18 @@ if none of I/O has been completed yet, we will NOT wait and immediately exit the system call. In this example we simply do polling. .RE .TP +.BI iodepth_batch_complete_omit \fR=\fPint +The number of in-flight IOs that need not be retrieved on quiescing. It +defaults to 0, which means all in-flight IOs will be retrieved until completion +so as not to skew the latency. After that, if in the rate-limiting context, the +next IO will be issued after a fixed delay. But in some cases, the in-flight IOs +may need to hang for a while to merge more subsequent IOs in order to get +a better bandwidth or reach the EC stripe length, and so on. 
And the hanging +time may delay the next IO for a long time if all in-flight IOs must be retrieved. +Therefore, this option will be used to skip quiescing partially (IOs in merge +waiting) for a smooth data flow, and as such, it is not suitable for +latency-sensitive scenarios. +.TP .BI iodepth_low \fR=\fPint The low water mark indicating when to start filling the queue again. Defaults to the same as \fBiodepth\fR, meaning that fio will diff --git a/io_u.c b/io_u.c index eec378ddc0..622626860a 100644 --- a/io_u.c +++ b/io_u.c @@ -622,7 +622,7 @@ int io_u_quiesce(struct thread_data *td) if (td->io_u_queued || td->cur_depth) td_io_commit(td); - while (td->io_u_in_flight) { + while (td->io_u_in_flight > td->o.iodepth_batch_complete_omit) { ret = io_u_queued_complete(td, 1); if (ret > 0) completed += ret; diff --git a/options.c b/options.c index 5d3daedf34..592266e078 100644 --- a/options.c +++ b/options.c @@ -2206,6 +2206,20 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BASIC, }, + { + .name = "iodepth_batch_complete_omit", + .lname = "Omit IO depth batch complete", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_batch_complete_omit), + .help = "The number of in-flight IOs that need not be retrieved", + .parent = "iodepth", + .hide = 1, + .minval = 0, + .interval = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, { .name = "iodepth_low", .lname = "IO Depth batch low", diff --git a/thread_options.h b/thread_options.h index 634070af00..8687d209d4 100644 --- a/thread_options.h +++ b/thread_options.h @@ -96,6 +96,7 @@ struct thread_options { unsigned int iodepth_batch; unsigned int iodepth_batch_complete_min; unsigned int iodepth_batch_complete_max; + unsigned int iodepth_batch_complete_omit; unsigned int serialize_overlap; unsigned int unique_filename; @@ -414,8 +415,8 @@ struct thread_options_pack { uint32_t iodepth_batch; uint32_t iodepth_batch_complete_min; 
uint32_t iodepth_batch_complete_max; + uint32_t iodepth_batch_complete_omit; uint32_t serialize_overlap; - uint32_t pad; uint64_t size; uint64_t io_size;