[PATCH] iomap: add allocation cache for iomap_dio

guzebing posted 1 patch 2 months, 2 weeks ago
There is a newer version of this series
[PATCH] iomap: add allocation cache for iomap_dio
Posted by guzebing 2 months, 2 weeks ago
From: guzebing <guzebing@bytedance.com>

Following the approach already used for the bio structure, add a
per-cpu cache for iomap_dio allocations, so that completed structures
can be recycled quickly instead of going back through the slab
allocator.

This reduces memory allocations on the direct I/O path, so direct I/O
is less likely to block when system memory is low. For direct I/O
reads issued through io_uring, read performance improves by about
2.6%.

Suggested-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: guzebing <guzebing@bytedance.com>
---
 fs/iomap/direct-io.c | 92 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d5d63efbd57..7a5c610ded7b 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -54,8 +54,84 @@ struct iomap_dio {
 			struct work_struct	work;
 		} aio;
 	};
+	struct iomap_dio		*dio_next;	/* percpu cache free list link */
 };
 
+#define DIO_ALLOC_CACHE_THRESHOLD	16
+#define DIO_ALLOC_CACHE_MAX		256
+struct dio_alloc_cache {
+	struct iomap_dio		*free_list;
+	struct iomap_dio		*free_list_irq;
+	int		nr;
+	int		nr_irq;
+};
+
+static struct dio_alloc_cache __percpu *dio_cache;
+
+static void dio_alloc_irq_cache_splice(struct dio_alloc_cache *cache)
+{
+	unsigned long flags;
+
+	/* cache->free_list must be empty */
+	if (WARN_ON_ONCE(cache->free_list))
+		return;
+
+	local_irq_save(flags);
+	cache->free_list = cache->free_list_irq;
+	cache->free_list_irq = NULL;
+	cache->nr += cache->nr_irq;
+	cache->nr_irq = 0;
+	local_irq_restore(flags);
+}
+
+static struct iomap_dio *dio_alloc_percpu_cache(void)
+{
+	struct dio_alloc_cache *cache;
+	struct iomap_dio *dio;
+
+	cache = per_cpu_ptr(dio_cache, get_cpu());
+	if (!cache->free_list) {
+		if (READ_ONCE(cache->nr_irq) >= DIO_ALLOC_CACHE_THRESHOLD)
+			dio_alloc_irq_cache_splice(cache);
+		if (!cache->free_list) {
+			put_cpu();
+			return NULL;
+		}
+	}
+	dio = cache->free_list;
+	cache->free_list = dio->dio_next;
+	cache->nr--;
+	put_cpu();
+	return dio;
+}
+
+static void dio_put_percpu_cache(struct iomap_dio *dio)
+{
+	struct dio_alloc_cache *cache;
+
+	cache = per_cpu_ptr(dio_cache, get_cpu());
+	if (READ_ONCE(cache->nr_irq) + cache->nr > DIO_ALLOC_CACHE_MAX)
+		goto out_free;
+
+	if (in_task()) {
+		dio->dio_next = cache->free_list;
+		cache->free_list = dio;
+		cache->nr++;
+	} else if (in_hardirq()) {
+		lockdep_assert_irqs_disabled();
+		dio->dio_next = cache->free_list_irq;
+		cache->free_list_irq = dio;
+		cache->nr_irq++;
+	} else {
+		goto out_free;
+	}
+	put_cpu();
+	return;
+out_free:
+	put_cpu();
+	kfree(dio);
+}
+
 static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
 		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf)
 {
@@ -135,7 +211,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 			ret += dio->done_before;
 	}
 	trace_iomap_dio_complete(iocb, dio->error, ret);
-	kfree(dio);
+	dio_put_percpu_cache(dio);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
@@ -620,9 +696,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (!iomi.len)
 		return NULL;
 
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return ERR_PTR(-ENOMEM);
+	dio = dio_alloc_percpu_cache();
+	if (!dio) {
+		dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+		if (!dio)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	dio->iocb = iocb;
 	atomic_set(&dio->ref, 1);
@@ -804,7 +883,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	return dio;
 
 out_free_dio:
-	kfree(dio);
+	dio_put_percpu_cache(dio);
 	if (ret)
 		return ERR_PTR(ret);
 	return NULL;
@@ -833,6 +912,9 @@ static int __init iomap_dio_init(void)
 
 	if (!zero_page)
 		return -ENOMEM;
+	dio_cache = alloc_percpu(struct dio_alloc_cache);
+	if (!dio_cache)
+		return -ENOMEM;
 
 	return 0;
 }
-- 
2.20.1
Re: [PATCH] iomap: add allocation cache for iomap_dio
Posted by Christoph Hellwig 2 months, 2 weeks ago
On Fri, Nov 21, 2025 at 05:00:52PM +0800, guzebing wrote:
> From: guzebing <guzebing@bytedance.com>
> 
> Following the approach already used for the bio structure, add a
> per-cpu cache for iomap_dio allocations, so that completed structures
> can be recycled quickly instead of going back through the slab
> allocator.
> 
> This reduces memory allocations on the direct I/O path, so direct I/O
> is less likely to block when system memory is low. For direct I/O
> reads issued through io_uring, read performance improves by about
> 2.6%.

Have you checked how much of that you'd get by using a dedicated
slab cache that should also do per-cpu allocations?  Note that even
if we had a dedicated per-cpu cache we'd probably still want that.

Also any chance you could factor this into common code?
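
For reference, the dedicated slab cache suggested here would amount to
roughly the following sketch; iomap_dio_cachep and the helper names are
illustrative and not part of the posted patch:

/* Sketch only: a dedicated slab cache for struct iomap_dio. */
static struct kmem_cache *iomap_dio_cachep;

static int __init iomap_dio_slab_init(void)
{
	iomap_dio_cachep = kmem_cache_create("iomap_dio",
			sizeof(struct iomap_dio), 0, SLAB_HWCACHE_ALIGN, NULL);
	return iomap_dio_cachep ? 0 : -ENOMEM;
}

/* Allocation and free paths, replacing the kmalloc()/kfree() pair. */
static struct iomap_dio *iomap_dio_alloc(void)
{
	return kmem_cache_alloc(iomap_dio_cachep, GFP_KERNEL);
}

static void iomap_dio_free(struct iomap_dio *dio)
{
	kmem_cache_free(iomap_dio_cachep, dio);
}

SLUB already keeps per-cpu active slabs, which likely explains why this
alone does not beat plain kmalloc() in the numbers reported in the
reply below.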
Re: [PATCH] iomap: add allocation cache for iomap_dio
Posted by guzebing 1 month ago

On 2025/11/21 18:22, Christoph Hellwig wrote:
> On Fri, Nov 21, 2025 at 05:00:52PM +0800, guzebing wrote:
>> From: guzebing <guzebing@bytedance.com>
>>
>> Following the approach already used for the bio structure, add a
>> per-cpu cache for iomap_dio allocations, so that completed structures
>> can be recycled quickly instead of going back through the slab
>> allocator.
>>
>> This reduces memory allocations on the direct I/O path, so direct I/O
>> is less likely to block when system memory is low. For direct I/O
>> reads issued through io_uring, read performance improves by about
>> 2.6%.
> 
> Have you checked how much of that you'd get by using a dedicated
> slab cache that should also do per-cpu allocations?  Note that even
> if we had a dedicated per-cpu cache we'd probably still want that.
I’m sorry for the long delay in replying; I was tied up with other
matters. I hope you still remember this patch. Thank you for your
response.

Yes, I tried using a dedicated kmem_cache for the iomap_dio structure.
However, when system memory is sufficient, kmalloc and the kmem_cache
deliver identical performance.

For direct I/O reads on the ext4 file system, the test command is:

./t/io_uring -p0 -d128 -b4096 -s32 -c32 -F1 -B1 -R1 -X1 -n1 -P1 /mnt/004.txt

The measured performance is:

kmalloc: 750K IOPS
kmem cache: 750K IOPS
per-CPU cache: 770K IOPS
> 
> Also any chance you could factor this into common code?
> 
A mempool first allocates from kmalloc or a kmem_cache and only falls
back to its reserved elements as a last resort; it exists for
reliability under memory pressure, not for speed, so it is not a great
fit for our high-performance scenario.
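
For illustration, a minimal mempool setup would look roughly like the
following; the pool of 16 reserved elements is an arbitrary choice:

/* Comparison sketch only; not proposed for merging. */
static mempool_t *iomap_dio_pool;

static int __init iomap_dio_mempool_init(void)
{
	iomap_dio_pool = mempool_create_kmalloc_pool(16,
			sizeof(struct iomap_dio));
	return iomap_dio_pool ? 0 : -ENOMEM;
}

/*
 * mempool_alloc() still tries the underlying allocator first and only
 * dips into the reserved elements under memory pressure, so the common
 * fast path pays the same cost as plain kmalloc().
 */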

Additionally, this kind of frequent allocation and freeing (hundreds of
thousands to millions of times per second) mainly applies to the bio
and dio structures; beyond those, I’m not sure whether similar
scenarios exist.

If we were to extract a generic implementation solely for this, would it 
yield significant benefits? Do you have any good suggestions?

I’d appreciate your review.
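
If the per-cpu cache were eventually factored into common code, the
interface might look something like the sketch below; the
percpu_obj_cache name and functions are purely hypothetical and do not
exist in the kernel today:

/* Hypothetical generic per-cpu object cache interface (sketch only). */
struct percpu_obj_cache;

struct percpu_obj_cache *percpu_obj_cache_create(size_t obj_size,
		unsigned int max_per_cpu);
void percpu_obj_cache_destroy(struct percpu_obj_cache *cache);

/* Take from the local CPU's free list, falling back to kmalloc(gfp). */
void *percpu_obj_cache_alloc(struct percpu_obj_cache *cache, gfp_t gfp);

/* Usable from task and hardirq context, like the iomap_dio cache. */
void percpu_obj_cache_free(struct percpu_obj_cache *cache, void *obj);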

Re: [PATCH] iomap: add allocation cache for iomap_dio
Posted by Christoph Hellwig 1 month ago
On Mon, Jan 05, 2026 at 05:27:54PM +0800, guzebing wrote:
> Yes, I tried using a dedicated kmem_cache for the iomap_dio structure.
> However, when system memory is sufficient, kmalloc and the kmem_cache
> deliver identical performance.

Thanks for benchmarking this.

> > Also any chance you could factor this into common code?
> > 
> A mempool first allocates from kmalloc or a kmem_cache and only falls back
> to its reserved elements as a last resort; it exists for reliability under
> memory pressure, not for speed, so it is not a great fit for our
> high-performance scenario.
> 
> Additionally, this kind of frequent allocation and freeing (hundreds of
> thousands to millions of times per second) mainly applies to the bio and
> dio structures; beyond those, I’m not sure whether similar scenarios
> exist.
> 
> If we were to extract a generic implementation solely for this, would it
> yield significant benefits? Do you have any good suggestions?


Factoring means the percpu cache.  But given that it's been so long
since I looked at the code, all the details have been paged out from my
brain.  I can take another look when you resend it.