Currently, when IORING_SETUP_DEFER_TASKRUN is not used, the io worker
has to add a task work every time it frees a work item. This creates
contention on tctx->task_list. With this commit, the io worker queues
freed work on a local list and batches multiple freed works into a
single task work add once the number of entries on the local list
reaches IO_REQ_ALLOC_BATCH.
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
io_uring/io-wq.c | 62 +++++++++++++++++++++++++++++++++++++++++++--
io_uring/io-wq.h | 4 ++-
io_uring/io_uring.c | 23 ++++++++++++++---
io_uring/io_uring.h | 6 ++++-
4 files changed, 87 insertions(+), 8 deletions(-)
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 5d0928f37471..096711707db9 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -544,6 +544,20 @@ static void io_assign_current_work(struct io_worker *worker,
raw_spin_unlock(&worker->lock);
}
+static void flush_req_free_list(struct llist_head *free_list,
+ struct llist_node *tail)
+{
+ struct io_kiocb *first_req, *last_req;
+
+ first_req = container_of(free_list->first, struct io_kiocb,
+ io_task_work.node);
+ last_req = container_of(tail, struct io_kiocb,
+ io_task_work.node);
+
+ io_req_normal_work_add(first_req, last_req);
+ init_llist_head(free_list);
+}
+
/*
* Called with acct->lock held, drops it before returning
*/
@@ -553,6 +567,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
{
struct io_wq *wq = worker->wq;
bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
+ LLIST_HEAD(free_list);
+ int free_req = 0;
+ struct llist_node *tail = NULL;
+ struct io_ring_ctx *last_added_ctx = NULL;
do {
struct io_wq_work *work;
@@ -592,6 +610,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
do {
struct io_wq_work *next_hashed, *linked;
unsigned int hash = io_get_work_hash(work);
+ struct io_kiocb *req = container_of(work,
+ struct io_kiocb, work);
+ bool did_free = false;
next_hashed = wq_next_work(work);
@@ -601,7 +622,41 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
wq->do_work(work);
io_assign_current_work(worker, NULL);
- linked = wq->free_work(work);
+ /*
+ * All requests in free list must have the same
+ * io_ring_ctx.
+ */
+ if (last_added_ctx && last_added_ctx != req->ctx) {
+ flush_req_free_list(&free_list, tail);
+ tail = NULL;
+ last_added_ctx = NULL;
+ free_req = 0;
+ }
+
+ /*
+ * Try to batch free work when
+ * !IORING_SETUP_DEFER_TASKRUN to reduce contention
+ * on tctx->task_list.
+ */
+ if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+ linked = wq->free_work(work, NULL, NULL);
+ else
+ linked = wq->free_work(work, &free_list, &did_free);
+
+ if (did_free) {
+ if (!tail)
+ tail = free_list.first;
+
+ last_added_ctx = req->ctx;
+ free_req++;
+ if (free_req == IO_REQ_ALLOC_BATCH) {
+ flush_req_free_list(&free_list, tail);
+ tail = NULL;
+ last_added_ctx = NULL;
+ free_req = 0;
+ }
+ }
+
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@@ -626,6 +681,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
break;
raw_spin_lock(&acct->lock);
} while (1);
+
+ if (free_list.first)
+ flush_req_free_list(&free_list, tail);
}
static int io_wq_worker(void *data)
@@ -899,7 +957,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
- work = wq->free_work(work);
+ work = wq->free_work(work, NULL, NULL);
} while (work);
}
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index b3b004a7b625..4f1674d3d68e 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,7 +21,9 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *,
+ struct llist_head *,
+ bool *did_free);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_hash {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0c111f7d7832..0343c9ec7271 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -120,7 +120,6 @@
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
#define IO_COMPL_BATCH 32
-#define IO_REQ_ALLOC_BATCH 8
#define IO_LOCAL_TW_DEFAULT_MAX 20
struct io_defer_entry {
@@ -985,13 +984,18 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
return true;
}
-__cold void io_free_req(struct io_kiocb *req)
+static __cold void io_set_req_free(struct io_kiocb *req)
{
/* refs were already put, restore them for io_req_task_complete() */
req->flags &= ~REQ_F_REFCOUNT;
/* we only want to free it, don't post CQEs */
req->flags |= REQ_F_CQE_SKIP;
req->io_task_work.func = io_req_task_complete;
+}
+
+__cold void io_free_req(struct io_kiocb *req)
+{
+ io_set_req_free(req);
io_req_task_work_add(req);
}
@@ -1772,16 +1776,27 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
IO_URING_F_COMPLETE_DEFER);
}
-struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work,
+ struct llist_head *free_list,
+ bool *did_free)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
+ bool free_req = false;
if (req_ref_put_and_test(req)) {
if (req->flags & IO_REQ_LINK_FLAGS)
nxt = io_req_find_next(req);
- io_free_req(req);
+ io_set_req_free(req);
+ if (free_list)
+ __llist_add(&req->io_task_work.node, free_list);
+ else
+ io_req_task_work_add(req);
+ free_req = true;
}
+ if (did_free)
+ *did_free = free_req;
+
return nxt ? &nxt->work : NULL;
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index bdd6407c14d0..dc050bc44b65 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,6 +54,8 @@ struct io_wait_queue {
#endif
};
+#define IO_REQ_ALLOC_BATCH 8
+
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
@@ -111,7 +113,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);
-struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work,
+ struct llist_head *free_req,
+ bool *did_free);
void io_wq_submit_work(struct io_wq_work *work);
void io_free_req(struct io_kiocb *req);
--
2.43.0
On 2/21/25 04:19, Bui Quang Minh wrote:
> Currently, in case we don't use IORING_SETUP_DEFER_TASKRUN, when io
> worker frees work, it needs to add a task work. This creates contention
> on tctx->task_list. With this commit, io work queues free work on a
> local list and batch multiple free work in one call when the number of
> free work in local list exceeds IO_REQ_ALLOC_BATCH.
I see no relation to IO_REQ_ALLOC_BATCH, that should be
a separate macro.
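Something along these lines (the name is just a suggestion), so the free
batching knob isn't tied to the allocation batch size:

#define IO_WQ_FREE_WORK_BATCH	8

and use that in io_worker_handle_work() instead of IO_REQ_ALLOC_BATCH.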
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
> io_uring/io-wq.c | 62 +++++++++++++++++++++++++++++++++++++++++++--
> io_uring/io-wq.h | 4 ++-
> io_uring/io_uring.c | 23 ++++++++++++++---
> io_uring/io_uring.h | 6 ++++-
> 4 files changed, 87 insertions(+), 8 deletions(-)
>
> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
> index 5d0928f37471..096711707db9 100644
> --- a/io_uring/io-wq.c
> +++ b/io_uring/io-wq.c
...
> @@ -601,7 +622,41 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
> wq->do_work(work);
> io_assign_current_work(worker, NULL);
>
> - linked = wq->free_work(work);
> + /*
> + * All requests in free list must have the same
> + * io_ring_ctx.
> + */
> + if (last_added_ctx && last_added_ctx != req->ctx) {
> + flush_req_free_list(&free_list, tail);
> + tail = NULL;
> + last_added_ctx = NULL;
> + free_req = 0;
> + }
> +
> + /*
> + * Try to batch free work when
> + * !IORING_SETUP_DEFER_TASKRUN to reduce contention
> + * on tctx->task_list.
> + */
> + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
> + linked = wq->free_work(work, NULL, NULL);
> + else
> + linked = wq->free_work(work, &free_list, &did_free);
The problem here is that iowq is blocking and hence you lock up resources
of already completed requests for who knows how long. In case of unbound
requests (see IO_WQ_ACCT_UNBOUND) it's indefinite, and it absolutely
cannot be used without some kind of a timer. But even in the case of
bound work, it can be pretty long.
Maybe, for bound requests it can target N like here, but read jiffies
in between each request and flush if it has been too long. So in the
worst case the total delay is the last req execution time + DT. But even
then it feels wrong, especially with filesystems sometimes not even
honouring NOWAIT.
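Untested sketch of what I mean, on top of your loop (IO_WQ_FREE_WORK_BATCH
and IO_WQ_FREE_FLUSH_JIFFIES are made-up names):

	unsigned long flush_deadline = jiffies + IO_WQ_FREE_FLUSH_JIFFIES;
	...
	if (did_free) {
		if (!tail)
			tail = free_list.first;

		last_added_ctx = req->ctx;
		free_req++;
		/* flush on batch size or once reqs have been held too long */
		if (free_req >= IO_WQ_FREE_WORK_BATCH ||
		    time_after(jiffies, flush_deadline)) {
			flush_req_free_list(&free_list, tail);
			tail = NULL;
			last_added_ctx = NULL;
			free_req = 0;
			flush_deadline = jiffies + IO_WQ_FREE_FLUSH_JIFFIES;
		}
	}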
The question is, why do you force it into the worker pool with the
IOSQE_ASYNC flag? It's generally not recommended, and the name of the
flag is confusing as it should've been more like "WORKER_OFFLOAD".
> +
> + if (did_free) {
> + if (!tail)
> + tail = free_list.first;
> +
> + last_added_ctx = req->ctx;
> + free_req++;
> + if (free_req == IO_REQ_ALLOC_BATCH) {
> + flush_req_free_list(&free_list, tail);
> + tail = NULL;
> + last_added_ctx = NULL;
> + free_req = 0;
> + }
> + }
> +
> work = next_hashed;
> if (!work && linked && !io_wq_is_hashed(linked)) {
> work = linked;
> @@ -626,6 +681,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
> break;
> raw_spin_lock(&acct->lock);
> } while (1);
> +
> + if (free_list.first)
> + flush_req_free_list(&free_list, tail);
> }
>
...
--
Pavel Begunkov
On 2/21/25 19:44, Pavel Begunkov wrote:
> On 2/21/25 04:19, Bui Quang Minh wrote:
>> Currently, in case we don't use IORING_SETUP_DEFER_TASKRUN, when io
>> worker frees work, it needs to add a task work. This creates contention
>> on tctx->task_list. With this commit, io work queues free work on a
>> local list and batch multiple free work in one call when the number of
>> free work in local list exceeds IO_REQ_ALLOC_BATCH.
>
> I see no relation to IO_REQ_ALLOC_BATCH, that should be
> a separate macro.
>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
>> io_uring/io-wq.c | 62 +++++++++++++++++++++++++++++++++++++++++++--
>> io_uring/io-wq.h | 4 ++-
>> io_uring/io_uring.c | 23 ++++++++++++++---
>> io_uring/io_uring.h | 6 ++++-
>> 4 files changed, 87 insertions(+), 8 deletions(-)
>>
>> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
>> index 5d0928f37471..096711707db9 100644
>> --- a/io_uring/io-wq.c
>> +++ b/io_uring/io-wq.c
> ...
>> @@ -601,7 +622,41 @@ static void io_worker_handle_work(struct
>> io_wq_acct *acct,
>> wq->do_work(work);
>> io_assign_current_work(worker, NULL);
>> - linked = wq->free_work(work);
>> + /*
>> + * All requests in free list must have the same
>> + * io_ring_ctx.
>> + */
>> + if (last_added_ctx && last_added_ctx != req->ctx) {
>> + flush_req_free_list(&free_list, tail);
>> + tail = NULL;
>> + last_added_ctx = NULL;
>> + free_req = 0;
>> + }
>> +
>> + /*
>> + * Try to batch free work when
>> + * !IORING_SETUP_DEFER_TASKRUN to reduce contention
>> + * on tctx->task_list.
>> + */
>> + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>> + linked = wq->free_work(work, NULL, NULL);
>> + else
>> + linked = wq->free_work(work, &free_list, &did_free);
>
> The problem here is that iowq is blocking and hence you lock up resources
> of already completed request for who knows how long. In case of unbound
> requests (see IO_WQ_ACCT_UNBOUND) it's indefinite, and it's absolutely
> cannot be used without some kind of a timer. But even in case of bound
> work, it can be pretty long.
That's a good point, I've overlooked the fact that the work handler might
block indefinitely.
> Maybe, for bound requests it can target N like here, but read jiffies
> in between each request and flush if it has been too long. So in worst
> case the total delay is the last req execution time + DT. But even then
> it feels wrong, especially with filesystems sometimes not even
> honouring NOWAIT.
>
> The question is, why do you force it into the worker pool with the
> IOSQE_ASYNC flag? It's generally not recommended, and the name of the
> flag is confusing as it should've been more like "WORKER_OFFLOAD".
I launched more workers to parallelize the work handling, but as you said,
it seems like an incorrect use case.
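For reference, the submission side is roughly this (simplified liburing
snippet, the actual opcode doesn't matter much):

#include <liburing.h>

/*
 * Force the request into io-wq: IOSQE_ASYNC skips the inline
 * non-blocking attempt and hands the request straight to a worker.
 */
static int queue_forced_async_read(struct io_uring *ring, int fd,
				   void *buf, unsigned int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;

	io_uring_prep_read(sqe, fd, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
	return io_uring_submit(ring);
}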
However, the request free path seems heavy: we need to create a task
work so that we can hold the uring_lock to queue the request onto
ctx->submit_state->compl_reqs. Let me play around more to see if I can
find an optimization for this.
Sorry for messing up the previous reply, I've resent it for better
readability.
Quang Minh.
On 2/21/25 14:52, Bui Quang Minh wrote:
> On 2/21/25 19:44, Pavel Begunkov wrote:
>> On 2/21/25 04:19, Bui Quang Minh wrote:
>>> Currently, in case we don't use IORING_SETUP_DEFER_TASKRUN, when io
>>> worker frees work, it needs to add a task work. This creates contention
>>> on tctx->task_list. With this commit, io work queues free work on a
>>> local list and batch multiple free work in one call when the number of
>>> free work in local list exceeds IO_REQ_ALLOC_BATCH.
>>
>> I see no relation to IO_REQ_ALLOC_BATCH, that should be
>> a separate macro.
>>
>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>> ---
>>> io_uring/io-wq.c | 62 +++++++++++++++++++++++++++++++++++++++++++--
>>> io_uring/io-wq.h | 4 ++-
>>> io_uring/io_uring.c | 23 ++++++++++++++---
>>> io_uring/io_uring.h | 6 ++++-
>>> 4 files changed, 87 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
>>> index 5d0928f37471..096711707db9 100644
>>> --- a/io_uring/io-wq.c
>>> +++ b/io_uring/io-wq.c
>> ...
>>> @@ -601,7 +622,41 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
>>> wq->do_work(work);
>>> io_assign_current_work(worker, NULL);
>>> - linked = wq->free_work(work);
>>> + /*
>>> + * All requests in free list must have the same
>>> + * io_ring_ctx.
>>> + */
>>> + if (last_added_ctx && last_added_ctx != req->ctx) {
>>> + flush_req_free_list(&free_list, tail);
>>> + tail = NULL;
>>> + last_added_ctx = NULL;
>>> + free_req = 0;
>>> + }
>>> +
>>> + /*
>>> + * Try to batch free work when
>>> + * !IORING_SETUP_DEFER_TASKRUN to reduce contention
>>> + * on tctx->task_list.
>>> + */
>>> + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>> + linked = wq->free_work(work, NULL, NULL);
>>> + else
>>> + linked = wq->free_work(work, &free_list, &did_free);
>>
>> The problem here is that iowq is blocking and hence you lock up resources
>> of already completed request for who knows how long. In case of unbound
>> requests (see IO_WQ_ACCT_UNBOUND) it's indefinite, and it's absolutely
>> cannot be used without some kind of a timer. But even in case of bound
>> work, it can be pretty long.
> That's a good point, I've overlooked the fact that work handler might block indefinitely.
>> Maybe, for bound requests it can target N like here, but read jiffies
>> in between each request and flush if it has been too long. So in worst
>> case the total delay is the last req execution time + DT. But even then
>> it feels wrong, especially with filesystems sometimes not even
>> honouring NOWAIT.
>>
>> The question is, why do you force it into the worker pool with the
>> IOSQE_ASYNC flag? It's generally not recommended, and the name of the
>> flag is confusing as it should've been more like "WORKER_OFFLOAD".
>
>
> I launched more workers to parallel the work handler, but as you said, it seems like an incorrect use case.
Not so much incorrect as generally inefficient and not recommended,
especially not before trying it without the flag. People often fall into
the trap of "Oh, it's _async_, surely I have to set it", really
unfortunate naming.
> However, I think the request free seems heavy, we need to create a task work so that we can hold the uring_lock to queue the request to ctx->submit_state->compl_reqs. Let me play around more to see if I can find an optimization for this.
That's because it's a slow fallback path for cases that can't do
async for one reason or another, and ideally we wouldn't have it
at all. In reality it's used more than I'd wish for, but it's
still a path we don't heavily optimise.
Btw, if you're really spamming iowq threads, I'm surprised that's
the only hotspot you have. There should be some contention for
CQE posting (->completion_lock), and probably in the iowq queue
locking, and so on.
--
Pavel Begunkov
On 2/21/25 15:28, Pavel Begunkov wrote:
> On 2/21/25 14:52, Bui Quang Minh wrote:
>> On 2/21/25 19:44, Pavel Begunkov wrote:
>>> On 2/21/25 04:19, Bui Quang Minh wrote:
>>>> Currently, in case we don't use IORING_SETUP_DEFER_TASKRUN, when io
>>>> worker frees work, it needs to add a task work. This creates contention
>>>> on tctx->task_list. With this commit, io work queues free work on a
>>>> local list and batch multiple free work in one call when the number of
>>>> free work in local list exceeds IO_REQ_ALLOC_BATCH.
>>>
>>> I see no relation to IO_REQ_ALLOC_BATCH, that should be
>>> a separate macro.
>>>
>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>>> ---
>>>> io_uring/io-wq.c | 62 +++++++++++++++++++++++++++++++++++++++++++--
>>>> io_uring/io-wq.h | 4 ++-
>>>> io_uring/io_uring.c | 23 ++++++++++++++---
>>>> io_uring/io_uring.h | 6 ++++-
>>>> 4 files changed, 87 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
>>>> index 5d0928f37471..096711707db9 100644
>>>> --- a/io_uring/io-wq.c
>>>> +++ b/io_uring/io-wq.c
>>> ...
>>>> @@ -601,7 +622,41 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
>>>> wq->do_work(work);
>>>> io_assign_current_work(worker, NULL);
>>>> - linked = wq->free_work(work);
>>>> + /*
>>>> + * All requests in free list must have the same
>>>> + * io_ring_ctx.
>>>> + */
>>>> + if (last_added_ctx && last_added_ctx != req->ctx) {
>>>> + flush_req_free_list(&free_list, tail);
>>>> + tail = NULL;
>>>> + last_added_ctx = NULL;
>>>> + free_req = 0;
>>>> + }
>>>> +
>>>> + /*
>>>> + * Try to batch free work when
>>>> + * !IORING_SETUP_DEFER_TASKRUN to reduce contention
>>>> + * on tctx->task_list.
>>>> + */
>>>> + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>> + linked = wq->free_work(work, NULL, NULL);
>>>> + else
>>>> + linked = wq->free_work(work, &free_list, &did_free);
>>>
>>> The problem here is that iowq is blocking and hence you lock up resources
>>> of already completed request for who knows how long. In case of unbound
>>> requests (see IO_WQ_ACCT_UNBOUND) it's indefinite, and it's absolutely
>>> cannot be used without some kind of a timer. But even in case of bound
>>> work, it can be pretty long.
>> That's a good point, I've overlooked the fact that work handler might block indefinitely.
>>> Maybe, for bound requests it can target N like here, but read jiffies
>>> in between each request and flush if it has been too long. So in worst
>>> case the total delay is the last req execution time + DT. But even then
>>> it feels wrong, especially with filesystems sometimes not even
>>> honouring NOWAIT.
>>>
>>> The question is, why do you force it into the worker pool with the
>>> IOSQE_ASYNC flag? It's generally not recommended, and the name of the
>>> flag is confusing as it should've been more like "WORKER_OFFLOAD".
>>
>>
>> I launched more workers to parallel the work handler, but as you said, it seems like an incorrect use case.
>
> Not as much incorrect as generally inefficient and not recommended,
> especially not recommended before trying it without the flag. People
> often fall into the trap of "Oh, it's _async_, surely I have to set it",
> really unfortunate naming.
>
>> However, I think the request free seems heavy, we need to create a task work so that we can hold the uring_lock to queue the request to ctx->submit_state->compl_reqs. Let me play around more to see if I can find an optimization for this.
>
> That's because it's a slow fallback path for cases that can't do
> async for one reason or another, and ideally we wouldn't have it
> at all. In reality it's used more than I'd wish for, but it's
> still a path we don't heavily optimise.
Regardless of that, we can probably optimise bound requests
as above with jiffies, but sockets would fall into
the unbound bucket. Otherwise, someone would need to be able
to forcibly flush the list, e.g. on a timeout every N ms or
something similar.
> Btw, if you're really spamming iowq threads, I'm surprised that's
> the only hotspot you have. There should be some contention for
> CQE posting (->completion_lock), and probably in the iowq queue
> locking, and so on.
--
Pavel Begunkov