[PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion

Tal Zussman posted 3 patches 1 week, 1 day ago
[PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Tal Zussman 1 week, 1 day ago
Some bio completion handlers need to run in task context but bio_endio()
can be called from IRQ context (e.g. buffer_head writeback). Add a
BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
task-context completion of their bi_end_io callback.

When bio_endio() sees this flag and is running in non-task context, it
queues the bio to a per-cpu list and schedules a work item to call
bi_end_io() from task context. A CPU hotplug dead callback drains any
remaining bios from the departing CPU's batch.

This will be used to enable RWF_DONTCACHE for block devices, and could
be used for other subsystems like fscrypt that need task-context bio
completion.

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
 block/bio.c               | 84 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |  1 +
 2 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 8203bb7455a9..69ee0d93041f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/blk-crypto.h>
 #include <linux/xarray.h>
+#include <linux/local_lock.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
@@ -1714,6 +1715,60 @@ void bio_check_pages_dirty(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
+struct bio_complete_batch {
+	local_lock_t lock;
+	struct bio_list list;
+	struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
+
+static void bio_complete_work_fn(struct work_struct *w)
+{
+	struct bio_complete_batch *batch;
+	struct bio_list list;
+
+again:
+	local_lock_irq(&bio_complete_batch.lock);
+	batch = this_cpu_ptr(&bio_complete_batch);
+	list = batch->list;
+	bio_list_init(&batch->list);
+	local_unlock_irq(&bio_complete_batch.lock);
+
+	while (!bio_list_empty(&list)) {
+		struct bio *bio = bio_list_pop(&list);
+		bio->bi_end_io(bio);
+	}
+
+	local_lock_irq(&bio_complete_batch.lock);
+	batch = this_cpu_ptr(&bio_complete_batch);
+	if (!bio_list_empty(&batch->list)) {
+		local_unlock_irq(&bio_complete_batch.lock);
+
+		if (!need_resched())
+			goto again;
+
+		schedule_work_on(smp_processor_id(), &batch->work);
+		return;
+	}
+	local_unlock_irq(&bio_complete_batch.lock);
+}
+
+static void bio_queue_completion(struct bio *bio)
+{
+	struct bio_complete_batch *batch;
+	unsigned long flags;
+
+	local_lock_irqsave(&bio_complete_batch.lock, flags);
+	batch = this_cpu_ptr(&bio_complete_batch);
+	bio_list_add(&batch->list, bio);
+	local_unlock_irqrestore(&bio_complete_batch.lock, flags);
+
+	schedule_work_on(smp_processor_id(), &batch->work);
+}
+
 static inline bool bio_remaining_done(struct bio *bio)
 {
 	/*
@@ -1788,7 +1843,9 @@ void bio_endio(struct bio *bio)
 	}
 #endif
 
-	if (bio->bi_end_io)
+	if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
+		bio_queue_completion(bio);
+	else if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
 EXPORT_SYMBOL(bio_endio);
@@ -1974,6 +2031,21 @@ int bioset_init(struct bio_set *bs,
 }
 EXPORT_SYMBOL(bioset_init);
 
+/*
+ * Drain a dead CPU's deferred bio completions. The CPU is dead so no locking
+ * is needed -- no new bios will be queued to it.
+ */
+static int bio_complete_batch_cpu_dead(unsigned int cpu)
+{
+	struct bio_complete_batch *batch = per_cpu_ptr(&bio_complete_batch, cpu);
+	struct bio *bio;
+
+	while ((bio = bio_list_pop(&batch->list)))
+		bio->bi_end_io(bio);
+
+	return 0;
+}
+
 static int __init init_bio(void)
 {
 	int i;
@@ -1988,6 +2060,16 @@ static int __init init_bio(void)
 				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 	}
 
+	for_each_possible_cpu(i) {
+		struct bio_complete_batch *batch =
+			per_cpu_ptr(&bio_complete_batch, i);
+
+		bio_list_init(&batch->list);
+		INIT_WORK(&batch->work, bio_complete_work_fn);
+	}
+
+	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
+				NULL, bio_complete_batch_cpu_dead);
 	cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
 					bio_cpu_dead);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..d49d97a050d0 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -322,6 +322,7 @@ enum {
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
 	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
+	BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
 	BIO_FLAG_LAST
 };
 

-- 
2.39.5
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Christoph Hellwig 6 days, 17 hours ago
On Wed, Mar 25, 2026 at 02:43:00PM -0400, Tal Zussman wrote:
> Some bio completion handlers need to run in task context but bio_endio()
> can be called from IRQ context (e.g. buffer_head writeback). Add a
> BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
> task-context completion of their bi_end_io callback.
> 
> When bio_endio() sees this flag and is running in non-task context, it
> queues the bio to a per-cpu list and schedules a work item to call
> bi_end_io() from task context. A CPU hotplug dead callback drains any
> remaining bios from the departing CPU's batch.
> 
> This will be used to enable RWF_DONTCACHE for block devices, and could
> be used for other subsystems like fscrypt that need task-context bio
> completion.
> 
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Signed-off-by: Tal Zussman <tz2294@columbia.edu>
> ---
>  block/bio.c               | 84 ++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/blk_types.h |  1 +
>  2 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 8203bb7455a9..69ee0d93041f 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -18,6 +18,7 @@
>  #include <linux/highmem.h>
>  #include <linux/blk-crypto.h>
>  #include <linux/xarray.h>
> +#include <linux/local_lock.h>
>  
>  #include <trace/events/block.h>
>  #include "blk.h"
> @@ -1714,6 +1715,60 @@ void bio_check_pages_dirty(struct bio *bio)
>  }
>  EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
>  
> +struct bio_complete_batch {
> +	local_lock_t lock;
> +	struct bio_list list;
> +	struct work_struct work;
> +};
> +
> +static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch) = {
> +	.lock = INIT_LOCAL_LOCK(lock),
> +};
> +
> +static void bio_complete_work_fn(struct work_struct *w)
> +{
> +	struct bio_complete_batch *batch;
> +	struct bio_list list;
> +
> +again:
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	list = batch->list;
> +	bio_list_init(&batch->list);
> +	local_unlock_irq(&bio_complete_batch.lock);
> +
> +	while (!bio_list_empty(&list)) {
> +		struct bio *bio = bio_list_pop(&list);
> +		bio->bi_end_io(bio);
> +	}

bio_list_pop already does a NULL check, so this could be:

	while ((bio = bio_list_pop(&list)))
		bio->bi_end_io(bio);

In fact that same pattern is repeated later, so maybe just add a helper
for it?  But I think Dave's idea of just using a llist (and adding a
new llist member to the bio for this) seems sensible.  Just don't forget
the llist_reverse_order call to avoid reordering.

> +
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	if (!bio_list_empty(&batch->list)) {
> +		local_unlock_irq(&bio_complete_batch.lock);
> +
> +		if (!need_resched())
> +			goto again;
> +
> +		schedule_work_on(smp_processor_id(), &batch->work);
> +		return;
> +	}
> +	local_unlock_irq(&bio_complete_batch.lock);

I don't really understand this requeue logic.  Can you explain it?

> +	schedule_work_on(smp_processor_id(), &batch->work);

We'll probably want a dedicated workqueue here to avoid deadlocks
vs other system wq uses.

> +static int bio_complete_batch_cpu_dead(unsigned int cpu)
> +{
> +	struct bio_complete_batch *batch = per_cpu_ptr(&bio_complete_batch, cpu);

Overly long line.
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Bart Van Assche 1 week, 1 day ago
On 3/25/26 11:43 AM, Tal Zussman wrote:
> +	schedule_work_on(smp_processor_id(), &batch->work);

Since schedule_work_on() queues work on system_percpu_wq the above call
has the same effect as schedule_work(&batch->work), isn't it? From the
workqueue implementation:

	system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);

[ ... ]

	if (req_cpu == WORK_CPU_UNBOUND) {
		if (wq->flags & WQ_UNBOUND)
			cpu = wq_select_unbound_cpu(raw_smp_processor_id());
		else
			cpu = raw_smp_processor_id();

Thanks,

Bart.
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Dave Chinner 1 week ago
On Wed, Mar 25, 2026 at 02:03:40PM -0700, Bart Van Assche wrote:
> On 3/25/26 11:43 AM, Tal Zussman wrote:
> > +	schedule_work_on(smp_processor_id(), &batch->work);
> 
> Since schedule_work_on() queues work on system_percpu_wq the above call
> has the same effect as schedule_work(&batch->work), isn't it?

No. Two words: Task preemption.

And in saying this, I realise the originally proposed code is dodgy.
It might look ok because the common case is that interrupt
context processing can't be preempted. However, I don't think that
is true for PREEMPT_RT kernels (IIRC interrupt processing runs as a
task that can be preempted). Also, bio completion can naturally run
from task context because the submitter can hold the last reference
to the bio.

Hence the queueing function can be preempted and scheduled to a
different CPU like so:

local_lock_irq()
queue on CPU 0
local_unlock_irq()
<preempt>
<run on CPU 1>
schedule_work_on(smp_processor_id())

That results in bio completion being queued on CPU 0, but the
processing work is scheduled for CPU 1. Oops.

> From the
> workqueue implementation:
> 
> 	system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
> 
> [ ... ]
> 	if (req_cpu == WORK_CPU_UNBOUND) {
> 		if (wq->flags & WQ_UNBOUND)
> 			cpu = wq_select_unbound_cpu(raw_smp_processor_id());
> 		else
> 			cpu = raw_smp_processor_id();

Same preemption problem as above.


-Dave.
-- 
Dave Chinner
dgc@kernel.org
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Dave Chinner 1 week, 1 day ago
On Wed, Mar 25, 2026 at 02:43:00PM -0400, Tal Zussman wrote:
> Some bio completion handlers need to run in task context but bio_endio()
> can be called from IRQ context (e.g. buffer_head writeback). Add a
> BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
> task-context completion of their bi_end_io callback.
> 
> When bio_endio() sees this flag and is running in non-task context, it
> queues the bio to a per-cpu list and schedules a work item to call
> bi_end_io() from task context. A CPU hotplug dead callback drains any
> remaining bios from the departing CPU's batch.
> 
> This will be used to enable RWF_DONTCACHE for block devices, and could
> be used for other subsystems like fscrypt that need task-context bio
> completion.
> 
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Signed-off-by: Tal Zussman <tz2294@columbia.edu>
> ---
>  block/bio.c               | 84 ++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/blk_types.h |  1 +
>  2 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 8203bb7455a9..69ee0d93041f 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -18,6 +18,7 @@
>  #include <linux/highmem.h>
>  #include <linux/blk-crypto.h>
>  #include <linux/xarray.h>
> +#include <linux/local_lock.h>
>  
>  #include <trace/events/block.h>
>  #include "blk.h"
> @@ -1714,6 +1715,60 @@ void bio_check_pages_dirty(struct bio *bio)
>  }
>  EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
>  
> +struct bio_complete_batch {
> +	local_lock_t lock;
> +	struct bio_list list;
> +	struct work_struct work;
> +};
> +
> +static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch) = {
> +	.lock = INIT_LOCAL_LOCK(lock),
> +};
> +
> +static void bio_complete_work_fn(struct work_struct *w)
> +{
> +	struct bio_complete_batch *batch;
> +	struct bio_list list;
> +
> +again:
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	list = batch->list;
> +	bio_list_init(&batch->list);
> +	local_unlock_irq(&bio_complete_batch.lock);

This is just a FIFO processing queue, and it is so wanting to be a
struct llist for lockless queuing and dequeueing.

We do this lockless per-cpu queue + per-cpu workqueue in XFS for
background inode GC processing. See struct xfs_inodegc and all the
xfs_inodegc_*() functions - it may be useful to have a generic
lockless per-cpu queue processing so we don't keep open coding this
repeating pattern everywhere.

> +
> +	while (!bio_list_empty(&list)) {
> +		struct bio *bio = bio_list_pop(&list);
> +		bio->bi_end_io(bio);
> +	}
> +
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	if (!bio_list_empty(&batch->list)) {
> +		local_unlock_irq(&bio_complete_batch.lock);
> +
> +		if (!need_resched())
> +			goto again;
> +
> +		schedule_work_on(smp_processor_id(), &batch->work);

We've learnt that immediately scheduling per-cpu batch
processing work can cause context switch storms as the queue/dequeue
steps one work item at a time.

Hence we use a delayed work with a scheduling delay of a single
jiffy to allow batches of queue work from a single context to
complete before (potentially) being pre-empted by the per-cpu
kworker task that will process the queue...

> +		return;
> +	}
> +	local_unlock_irq(&bio_complete_batch.lock);
> +}
> +
> +static void bio_queue_completion(struct bio *bio)
> +{
> +	struct bio_complete_batch *batch;
> +	unsigned long flags;
> +
> +	local_lock_irqsave(&bio_complete_batch.lock, flags);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	bio_list_add(&batch->list, bio);
> +	local_unlock_irqrestore(&bio_complete_batch.lock, flags);
> +
> +	schedule_work_on(smp_processor_id(), &batch->work);
> +}

Yeah, we definitely want to queue all the pending bio completions
the interrupt is delivering before we run the batch processing...

> +
>  static inline bool bio_remaining_done(struct bio *bio)
>  {
>  	/*
> @@ -1788,7 +1843,9 @@ void bio_endio(struct bio *bio)
>  	}
>  #endif
>  
> -	if (bio->bi_end_io)
> +	if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
> +		bio_queue_completion(bio);
> +	else if (bio->bi_end_io)
>  		bio->bi_end_io(bio);
>  }
>  EXPORT_SYMBOL(bio_endio);
> @@ -1974,6 +2031,21 @@ int bioset_init(struct bio_set *bs,
>  }
>  EXPORT_SYMBOL(bioset_init);
>  
> +/*
> + * Drain a dead CPU's deferred bio completions. The CPU is dead so no locking
> + * is needed -- no new bios will be queued to it.
> + */
> +static int bio_complete_batch_cpu_dead(unsigned int cpu)
> +{
> +	struct bio_complete_batch *batch = per_cpu_ptr(&bio_complete_batch, cpu);
> +	struct bio *bio;
> +
> +	while ((bio = bio_list_pop(&batch->list)))
> +		bio->bi_end_io(bio);
> +
> +	return 0;
> +}

If you use a llist for the queue, this code is no different to the
normal processing work.

> +
>  static int __init init_bio(void)
>  {
>  	int i;
> @@ -1988,6 +2060,16 @@ static int __init init_bio(void)
>  				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
>  	}
>  
> +	for_each_possible_cpu(i) {
> +		struct bio_complete_batch *batch =
> +			per_cpu_ptr(&bio_complete_batch, i);
> +
> +		bio_list_init(&batch->list);
> +		INIT_WORK(&batch->work, bio_complete_work_fn);
> +	}
> +
> +	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
> +				NULL, bio_complete_batch_cpu_dead);

XFS inodegc tracks the CPUs with work queued via a cpumask and
iterates the CPU mask for "all CPU" iteration scans. This avoids the
need for CPU hotplug integration...

>  	cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
>  					bio_cpu_dead);
>  
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 8808ee76e73c..d49d97a050d0 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -322,6 +322,7 @@ enum {
>  	BIO_REMAPPED,
>  	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
>  	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
> +	BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */

Can anyone set this on a bio they submit? i.e. This needs a better
description. Who can use it, constraints, guarantees, etc.

I ask, because the higher filesystem layers often know at submission
time that we need task based IO completion. If we can tell the bio
we are submitting that it needs task completion and have the block
layer guarantee that the ->end_io completion only ever runs in task
context, then we can get rid of mulitple instances of IO completion
deferal to task context in filesystem code (e.g. iomap - for both
buffered and direct IO, xfs buffer cache write completions, etc).

-Dave.
-- 
Dave Chinner
dgc@kernel.org
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Matthew Wilcox 1 week, 1 day ago
On Thu, Mar 26, 2026 at 07:26:26AM +1100, Dave Chinner wrote:
> > @@ -1988,6 +2060,16 @@ static int __init init_bio(void)
> >  				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
> >  	}
> >  
> > +	for_each_possible_cpu(i) {
> > +		struct bio_complete_batch *batch =
> > +			per_cpu_ptr(&bio_complete_batch, i);
> > +
> > +		bio_list_init(&batch->list);
> > +		INIT_WORK(&batch->work, bio_complete_work_fn);
> > +	}
> > +
> > +	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
> > +				NULL, bio_complete_batch_cpu_dead);
> 
> XFS inodegc tracks the CPUs with work queued via a cpumask and
> iterates the CPU mask for "all CPU" iteration scans. This avoids the
> need for CPU hotplug integration...

Can you elaborate a bit on how this would work in this context?
I understand why inode garbage collection might do an "all CPU"
iteration, but I don't understand the circumstances under which
we'd iterate over all CPUs to complete deferred BIOs.

> > +++ b/include/linux/blk_types.h
> > @@ -322,6 +322,7 @@ enum {
> >  	BIO_REMAPPED,
> >  	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
> >  	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
> > +	BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
> 
> Can anyone set this on a bio they submit? i.e. This needs a better
> description. Who can use it, constraints, guarantees, etc.
> 
> I ask, because the higher filesystem layers often know at submission
> time that we need task based IO completion. If we can tell the bio
> we are submitting that it needs task completion and have the block
> layer guarantee that the ->end_io completion only ever runs in task
> context, then we can get rid of multiple instances of IO completion
> deferral to task context in filesystem code (e.g. iomap - for both
> buffered and direct IO, xfs buffer cache write completions, etc).

Right, that's the idea, this would be entirely general.  I want to do
it for all pagecache writeback so we can change i_pages.xa_lock from
being irq-safe to only taken in task context.
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Dave Chinner 1 week ago
On Wed, Mar 25, 2026 at 08:39:21PM +0000, Matthew Wilcox wrote:
> On Thu, Mar 26, 2026 at 07:26:26AM +1100, Dave Chinner wrote:
> > > @@ -1988,6 +2060,16 @@ static int __init init_bio(void)
> > >  				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
> > >  	}
> > >  
> > > +	for_each_possible_cpu(i) {
> > > +		struct bio_complete_batch *batch =
> > > +			per_cpu_ptr(&bio_complete_batch, i);
> > > +
> > > +		bio_list_init(&batch->list);
> > > +		INIT_WORK(&batch->work, bio_complete_work_fn);
> > > +	}
> > > +
> > > +	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
> > > +				NULL, bio_complete_batch_cpu_dead);
> > 
> > XFS inodegc tracks the CPUs with work queued via a cpumask and
> > iterates the CPU mask for "all CPU" iteration scans. This avoids the
> > need for CPU hotplug integration...
> 
> Can you elaborate a bit on how this would work in this context?

It may not even be relevant. I was just mentioning it because if
someone looks at the xfs_inodegc code (as I suggested) they might
wonder why there aren't hotplug hooks for a per-cpu queuing
algorithm and/or why it tracked CPUs with queued items via a CPU
mask...

-Dave.
-- 
Dave Chinner
dgc@kernel.org
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Jens Axboe 1 week, 1 day ago
On 3/25/26 12:43 PM, Tal Zussman wrote:
> diff --git a/block/bio.c b/block/bio.c
> index 8203bb7455a9..69ee0d93041f 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -18,6 +18,7 @@
>  #include <linux/highmem.h>
>  #include <linux/blk-crypto.h>
>  #include <linux/xarray.h>
> +#include <linux/local_lock.h>
>  
>  #include <trace/events/block.h>
>  #include "blk.h"
> @@ -1714,6 +1715,60 @@ void bio_check_pages_dirty(struct bio *bio)
>  }
>  EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
>  
> +struct bio_complete_batch {
> +	local_lock_t lock;
> +	struct bio_list list;
> +	struct work_struct work;
> +};
> +
> +static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch) = {
> +	.lock = INIT_LOCAL_LOCK(lock),
> +};
> +
> +static void bio_complete_work_fn(struct work_struct *w)
> +{
> +	struct bio_complete_batch *batch;
> +	struct bio_list list;
> +
> +again:
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	list = batch->list;
> +	bio_list_init(&batch->list);
> +	local_unlock_irq(&bio_complete_batch.lock);
> +
> +	while (!bio_list_empty(&list)) {
> +		struct bio *bio = bio_list_pop(&list);
> +		bio->bi_end_io(bio);
> +	}
> +
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	if (!bio_list_empty(&batch->list)) {
> +		local_unlock_irq(&bio_complete_batch.lock);
> +
> +		if (!need_resched())
> +			goto again;
> +
> +		schedule_work_on(smp_processor_id(), &batch->work);
> +		return;
> +	}
> +	local_unlock_irq(&bio_complete_batch.lock);
> +}

bool looped = false;

do {
	if (looped && need_resched()) {
    		schedule_work_on(smp_processor_id(), &batch->work);
		break;
	}

	local_lock_irq(&bio_complete_batch.lock);
	batch = this_cpu_ptr(&bio_complete_batch);
	list = batch->list;
	bio_list_init(&batch->list);
	local_unlock_irq(&bio_complete_batch.lock);

	if (bio_list_empty(&list))
		break;

	do {
		struct bio *bio = bio_list_pop(&list);
		bio->bi_end_io(bio);
	} while (!bio_list_empty(&list));
	looped = true;
} while (1);

would be a lot easier to read, and avoid needing the list manipulation
included twice.

> +static void bio_queue_completion(struct bio *bio)
> +{
> +	struct bio_complete_batch *batch;
> +	unsigned long flags;
> +
> +	local_lock_irqsave(&bio_complete_batch.lock, flags);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	bio_list_add(&batch->list, bio);
> +	local_unlock_irqrestore(&bio_complete_batch.lock, flags);
> +
> +	schedule_work_on(smp_processor_id(), &batch->work);
> +}

Maybe do something ala:

static void bio_queue_completion(struct bio *bio)
{
	struct bio_complete_batch *batch;
	unsigned long flags;
	bool was_empty;

	local_lock_irqsave(&bio_complete_batch.lock, flags);
	batch = this_cpu_ptr(&bio_complete_batch);
	was_empty = bio_list_empty(&batch->list);
	bio_list_add(&batch->list, bio);
	local_unlock_irqrestore(&bio_complete_batch.lock, flags);

	if (was_empty)
		schedule_work_on(smp_processor_id(), &batch->work);
}

Outside of these mostly nits, I like this approach. It avoids my main
worry with this, which was contention on the list locks. And on the
io_uring side, we'll never hit the !in_task() path anyway, as the
completions are run from the task always. The bio flag makes sense for
this.

-- 
Jens Axboe
Re: [PATCH RFC v4 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion
Posted by Matthew Wilcox 1 week, 1 day ago
On Wed, Mar 25, 2026 at 02:43:00PM -0400, Tal Zussman wrote:
> +static void bio_complete_work_fn(struct work_struct *w)
> +{
> +	struct bio_complete_batch *batch;
> +	struct bio_list list;
> +
> +again:
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	list = batch->list;
> +	bio_list_init(&batch->list);
> +	local_unlock_irq(&bio_complete_batch.lock);
> +
> +	while (!bio_list_empty(&list)) {
> +		struct bio *bio = bio_list_pop(&list);
> +		bio->bi_end_io(bio);
> +	}
> +
> +	local_lock_irq(&bio_complete_batch.lock);
> +	batch = this_cpu_ptr(&bio_complete_batch);
> +	if (!bio_list_empty(&batch->list)) {
> +		local_unlock_irq(&bio_complete_batch.lock);
> +
> +		if (!need_resched())
> +			goto again;
> +
> +		schedule_work_on(smp_processor_id(), &batch->work);
> +		return;
> +	}

I don't know how often we see this actually trigger, but wouldn't this
be slightly more efficient?

+	local_lock_irq(&bio_complete_batch.lock);
+	batch = this_cpu_ptr(&bio_complete_batch);
+	list = batch->list;
+again:
+	bio_list_init(&batch->list);
+	local_unlock_irq(&bio_complete_batch.lock);
+
+	while (!bio_list_empty(&list)) {
+		struct bio *bio = bio_list_pop(&list);
+		bio->bi_end_io(bio);
+	}
+
+	local_lock_irq(&bio_complete_batch.lock);
+	batch = this_cpu_ptr(&bio_complete_batch);
+	list = batch->list;
+	if (!bio_list_empty(&list)) {
+		if (!need_resched())
+			goto again;
+
+		local_unlock_irq(&bio_complete_batch.lock);
+		schedule_work_on(smp_processor_id(), &batch->work);
+		return;
+	}


Overall I like this.  I think this is a better approach than the earlier
patches, and I'm looking forward to the simplifications that it's going
to enable.