From nobody Fri Dec 19 20:39:23 2025
Received: from mout-p-103.mailbox.org (mout-p-103.mailbox.org [80.241.56.161])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5BBA08F58;
Fri, 16 May 2025 10:11:19 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
arc=none smtp.client-ip=80.241.56.161
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1747390283; cv=none;
b=QQ9Mj6egeR4SEXlt+ANw3YAWXIN3AoplVl1Ho7WPVrbfCUB1o09Xy7WaK5ADs/BvqEmur6WurDtixoK/m8oP8GrTFprdExNQ45PZsGtcZ/kl6vXmYZquT6Im0QoKvGx2ApbJACjFcks/3c11Dj2csgL6ITMDiLIMQDJLbT2VWGQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1747390283; c=relaxed/simple;
bh=8eKraBA6ag2WRjUmlRbTiPM4gQnifhxiQFHkkEMRhK4=;
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
MIME-Version;
b=HhqIEqJQnD/Qz38YChE3NqbIzT3xX3JEnMcstKp33+JXkYAYBF5TXQNKibatz/yBOX8D7u6lLby+VbCoGJ1v73WUNbMvZt4TPV0lIpizps/vQPFSyDST0+lnL/vrjdI4k4/HjfA0qn5rKyXUKMU+ukJESxUPxeGD76uAQMDKsKQ=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
dmarc=fail (p=none dis=none) header.from=samsung.com;
spf=pass smtp.mailfrom=pankajraghav.com;
arc=none smtp.client-ip=80.241.56.161
Authentication-Results: smtp.subspace.kernel.org;
dmarc=fail (p=none dis=none) header.from=samsung.com
Authentication-Results: smtp.subspace.kernel.org;
spf=pass smtp.mailfrom=pankajraghav.com
Received: from smtp1.mailbox.org (smtp1.mailbox.org [10.196.197.1])
(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest
SHA256)
(No client certificate requested)
by mout-p-103.mailbox.org (Postfix) with ESMTPS id 4ZzNC70M28z9tBW;
Fri, 16 May 2025 12:11:15 +0200 (CEST)
From: Pankaj Raghav
To: "Darrick J . Wong" ,
hch@lst.de,
willy@infradead.org
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
David Hildenbrand ,
linux-fsdevel@vger.kernel.org,
mcgrof@kernel.org,
gost.dev@samsung.com,
Andrew Morton ,
kernel@pankajraghav.com,
Pankaj Raghav
Subject: [RFC 1/3] mm: add large zero page for efficient zeroing of larger
segments
Date: Fri, 16 May 2025 12:10:52 +0200
Message-ID: <20250516101054.676046-2-p.raghav@samsung.com>
In-Reply-To: <20250516101054.676046-1-p.raghav@samsung.com>
References: <20250516101054.676046-1-p.raghav@samsung.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id:
List-Subscribe:
List-Unsubscribe:
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"
Introduce LARGE_ZERO_PAGE of size 2M as an alternative to ZERO_PAGE of
size PAGE_SIZE.
There are many places in the kernel where we need to zero out larger
chunks but the maximum segment we can zero out at a time is limited by
PAGE_SIZE.
This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in block layer, it is much more efficient to send out
larger zero pages as a part of single bvec.
While there are other options such as huge_zero_page, they can fail
based on the system memory pressure requiring a fallback to ZERO_PAGE[3].
This idea (but not the implementation) was suggested during the review of
adding LBS support to XFS[1][2].
LARGE_ZERO_PAGE is added behind a config option so that systems that are
constrained by memory are not forced to use it.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
[3] https://lore.kernel.org/linux-xfs/3pqmgrlewo6ctcwakdvbvjqixac5en6irlipe5aiz6vkylfyni@2luhrs36ke5r/
Suggested-by: Christoph Hellwig
Signed-off-by: Pankaj Raghav
---
arch/Kconfig | 8 ++++++++
arch/x86/include/asm/pgtable.h | 20 +++++++++++++++++++-
arch/x86/kernel/head_64.S | 9 ++++++++-
3 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index b0adb665041f..aefa519cb211 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -218,6 +218,14 @@ config USER_RETURN_NOTIFIER
Provide a kernel-internal notification when a cpu is about to
switch to user mode.
=20
+config LARGE_ZERO_PAGE
+ bool "Large zero pages"
+ default n
+ help
+ 2M sized zero pages for zeroing. This will reserve 2M sized
+ physical pages for zeroing. Not suitable for memory constrained
+ systems.
+
config HAVE_IOREMAP_PROT
bool
=20
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3f59d7a16010..78eb83f2da34 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -17,6 +17,7 @@
=20
#ifndef __ASSEMBLER__
#include
+#include
#include
#include
#include
@@ -47,14 +48,31 @@ void ptdump_walk_user_pgd_level_checkwx(void);
#define debug_checkwx_user() do { } while (0)
#endif
=20
+#ifdef CONFIG_LARGE_ZERO_PAGE
+/*
+ * LARGE_ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_large_zero_page[(SZ_2M) / sizeof(unsigned long)]
+ __visible;
+#define ZERO_LARGE_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_large_zero_page))
+
+#define ZERO_PAGE(vaddr) ZERO_LARGE_PAGE(vaddr)
+#define ZERO_LARGE_PAGE_SIZE SZ_2M
+#else
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
-extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
+extern unsigned long empty_zero_page[(PAGE_SIZE) / sizeof(unsigned long)]
__visible;
#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))
=20
+#define ZERO_LARGE_PAGE(vaddr) ZERO_PAGE(vaddr)
+
+#define ZERO_LARGE_PAGE_SIZE PAGE_SIZE
+#endif
+
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
=20
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fefe2a25cf02..ebcd12f72966 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -14,6 +14,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -708,8 +709,14 @@ EXPORT_SYMBOL(phys_base)
#include "../xen/xen-head.S"
=20
__PAGE_ALIGNED_BSS
+#ifdef CONFIG_LARGE_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_large_zero_page)
+ .skip SZ_2M
+SYM_DATA_END(empty_large_zero_page)
+EXPORT_SYMBOL(empty_large_zero_page)
+#else
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
SYM_DATA_END(empty_zero_page)
EXPORT_SYMBOL(empty_zero_page)
-
+#endif
--=20
2.47.2
From nobody Fri Dec 19 20:39:23 2025
Received: from mout-p-103.mailbox.org (mout-p-103.mailbox.org [80.241.56.161])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9139E23370C;
Fri, 16 May 2025 10:11:22 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
arc=none smtp.client-ip=80.241.56.161
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1747390284; cv=none;
b=D35qDZ61KXJor04dfxPqQTW2h7etzL0ZxSy7vLyp6G4t8l/AtGglU4iQQqWq29eHZDdHWORgNZKcqtjd1Ba0fZWjRKbUaQ1jnyE+1RP5Y3NnSTIpbOd1P0e1h9chMcugkEZfidLVHmeac2cDH9DOMW614mkXc/7NWM3dwP9ZUsA=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1747390284; c=relaxed/simple;
bh=ajC8gbTODZgih3ebHEAqb5o/Ct5VekreC9TKQudISIQ=;
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
MIME-Version;
b=TVfyfk2zibJIZ1wE1p+t7fhb69vAQ8KaaaA2NVjjcYxwTOli72vVV21qcjGjBGPQ6IhqpbJvSByKXdfUgcSI3tPSo4SCBIB1Fftk91pCk0bKuOP2EoaGriLCRBaP99HZKUdwQ30+xSFT/w35WHymUMbM20X53dBZAEYkq1cMwLw=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
dmarc=fail (p=none dis=none) header.from=samsung.com;
spf=pass smtp.mailfrom=pankajraghav.com;
arc=none smtp.client-ip=80.241.56.161
Authentication-Results: smtp.subspace.kernel.org;
dmarc=fail (p=none dis=none) header.from=samsung.com
Authentication-Results: smtp.subspace.kernel.org;
spf=pass smtp.mailfrom=pankajraghav.com
Received: from smtp1.mailbox.org (smtp1.mailbox.org
[IPv6:2001:67c:2050:b231:465::1])
(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest
SHA256)
(No client certificate requested)
by mout-p-103.mailbox.org (Postfix) with ESMTPS id 4ZzNCC150Mz9ssM;
Fri, 16 May 2025 12:11:19 +0200 (CEST)
From: Pankaj Raghav
To: "Darrick J . Wong" ,
hch@lst.de,
willy@infradead.org
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
David Hildenbrand ,
linux-fsdevel@vger.kernel.org,
mcgrof@kernel.org,
gost.dev@samsung.com,
Andrew Morton ,
kernel@pankajraghav.com,
Pankaj Raghav
Subject: [RFC 2/3] block: use LARGE_ZERO_PAGE in __blkdev_issue_zero_pages()
Date: Fri, 16 May 2025 12:10:53 +0200
Message-ID: <20250516101054.676046-3-p.raghav@samsung.com>
In-Reply-To: <20250516101054.676046-1-p.raghav@samsung.com>
References: <20250516101054.676046-1-p.raghav@samsung.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id:
List-Subscribe:
List-Unsubscribe:
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Rspamd-Queue-Id: 4ZzNCC150Mz9ssM
Content-Type: text/plain; charset="utf-8"
Use LARGE_ZERO_PAGE in __blkdev_issue_zero_pages() instead of ZERO_PAGE.
On systems that support LARGE_ZERO_PAGE, we will end up sending larger
bvecs instead of multiple small ones.
Noticed a 4% increase in performance on a commercial NVMe SSD which does
not support OP_WRITE_ZEROES. The performance gains might be bigger if
the device supports larger MDTS.
Signed-off-by: Pankaj Raghav
---
block/blk-lib.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7..80dfc737d1f6 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -211,8 +211,8 @@ static void __blkdev_issue_zero_pages(struct block_devi=
ce *bdev,
unsigned int len, added;
=20
len =3D min_t(sector_t,
- PAGE_SIZE, nr_sects << SECTOR_SHIFT);
- added =3D bio_add_page(bio, ZERO_PAGE(0), len, 0);
+ ZERO_LARGE_PAGE_SIZE, nr_sects << SECTOR_SHIFT);
+ added =3D bio_add_page(bio, ZERO_LARGE_PAGE(0), len, 0);
if (added < len)
break;
nr_sects -=3D added >> SECTOR_SHIFT;
--=20
2.47.2
From nobody Fri Dec 19 20:39:23 2025
Received: from mout-p-101.mailbox.org (mout-p-101.mailbox.org [80.241.56.151])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8144723184F;
Fri, 16 May 2025 10:11:26 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
arc=none smtp.client-ip=80.241.56.151
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1747390288; cv=none;
b=oapsMz0o/77Un2mqhu1yx8KCWnEp7zIilhsZ7a9JEJiQw3g26DEOFqE1JDsdWxFI0gILuI0OJTlGwkGAGnXESnHIm8gh1hMSgwCQx/wSDe0pYudls04TYlSFTimYqjx4okC7NM1CdxYF/iSjS204ButSVOfZ77cHipfoDsj4hLQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1747390288; c=relaxed/simple;
bh=YurZ2FzxP3Y4no0Ti7m/BM4bZkV94wobNRj9pR79ess=;
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
MIME-Version;
b=hW4KkDBJgajYRIq0Q7cspNoL5bT31oiDFnghuT9uTk/1f3Z7lkSvh+4kdpDItg9minQd2QBDj3RTLVlazZGj/tr3pSUowsaxOrY/bE5jN4f9Yy2xP8JILteU2GRcoEuk5Sc6ZkVklOKHr81WP35ZUbkmy3otIOalFf1NanNCiy8=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
dmarc=fail (p=none dis=none) header.from=samsung.com;
spf=pass smtp.mailfrom=pankajraghav.com;
arc=none smtp.client-ip=80.241.56.151
Authentication-Results: smtp.subspace.kernel.org;
dmarc=fail (p=none dis=none) header.from=samsung.com
Authentication-Results: smtp.subspace.kernel.org;
spf=pass smtp.mailfrom=pankajraghav.com
Received: from smtp2.mailbox.org (smtp2.mailbox.org
[IPv6:2001:67c:2050:b231:465::2])
(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest
SHA256)
(No client certificate requested)
by mout-p-101.mailbox.org (Postfix) with ESMTPS id 4ZzNCG6SMtz9tCG;
Fri, 16 May 2025 12:11:22 +0200 (CEST)
From: Pankaj Raghav
To: "Darrick J . Wong" ,
hch@lst.de,
willy@infradead.org
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
David Hildenbrand ,
linux-fsdevel@vger.kernel.org,
mcgrof@kernel.org,
gost.dev@samsung.com,
Andrew Morton ,
kernel@pankajraghav.com,
Pankaj Raghav
Subject: [RFC 3/3] iomap: use LARGE_ZERO_PAGE in iomap_dio_zero()
Date: Fri, 16 May 2025 12:10:54 +0200
Message-ID: <20250516101054.676046-4-p.raghav@samsung.com>
In-Reply-To: <20250516101054.676046-1-p.raghav@samsung.com>
References: <20250516101054.676046-1-p.raghav@samsung.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id:
List-Subscribe:
List-Unsubscribe:
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Rspamd-Queue-Id: 4ZzNCG6SMtz9tCG
Content-Type: text/plain; charset="utf-8"
Use LARGE_ZERO_PAGE instead of custom allocated 64k zero pages. The
downside is we might end up using ZERO_PAGE on systems that do not
enable LARGE_ZERO_PAGE feature.
Signed-off-by: Pankaj Raghav
---
fs/iomap/direct-io.c | 31 +++++++++----------------------
1 file changed, 9 insertions(+), 22 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 844261a31156..6a2b6726a156 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -29,13 +29,6 @@
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
=20
-/*
- * Used for sub block zeroing in iomap_dio_zero()
- */
-#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
-#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
-static struct page *zero_page;
-
struct iomap_dio {
struct kiocb *iocb;
const struct iomap_dio_ops *dops;
@@ -290,23 +283,29 @@ static int iomap_dio_zero(const struct iomap_iter *it=
er, struct iomap_dio *dio,
{
struct inode *inode =3D file_inode(dio->iocb->ki_filp);
struct bio *bio;
+ int nr_vecs =3D max(1, i_blocksize(inode) / ZERO_LARGE_PAGE_SIZE);
=20
if (!len)
return 0;
/*
* Max block size supported is 64k
*/
- if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
+ if (WARN_ON_ONCE(len > SZ_64K))
return -EINVAL;
=20
- bio =3D iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_I=
DLE);
+ bio =3D iomap_dio_alloc_bio(iter, dio, nr_vecs, REQ_OP_WRITE | REQ_SYNC |=
REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector =3D iomap_sector(&iter->iomap, pos);
bio->bi_private =3D dio;
bio->bi_end_io =3D iomap_dio_bio_end_io;
=20
- __bio_add_page(bio, zero_page, len, 0);
+ while (len) {
+ unsigned int io_len =3D min_t(unsigned int, len, ZERO_LARGE_PAGE_SIZE);
+
+ __bio_add_page(bio, ZERO_LARGE_PAGE(0), io_len, 0);
+ len -=3D io_len;
+ }
iomap_dio_submit_bio(iter, dio, bio, pos);
return 0;
}
@@ -827,15 +826,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
-
-static int __init iomap_dio_init(void)
-{
- zero_page =3D alloc_pages(GFP_KERNEL | __GFP_ZERO,
- IOMAP_ZERO_PAGE_ORDER);
-
- if (!zero_page)
- return -ENOMEM;
-
- return 0;
-}
-fs_initcall(iomap_dio_init);
--=20
2.47.2