From: Kiryl Shutsemau <kas@kernel.org>
The protocol for page cache lookup is as follows:
1. Locate a folio in XArray.
2. Obtain a reference on the folio using folio_try_get().
3. If successful, verify that the folio still belongs to
the mapping and has not been truncated or reclaimed.
4. Perform operations on the folio, such as copying data
to userspace.
5. Release the reference.
For short reads, the overhead of atomic operations on reference
manipulation can be significant, particularly when multiple tasks access
the same folio, leading to cache line bouncing.
To address this issue, introduce i_pages_delete_seqcnt, which is incremented
each time a folio is deleted from the page cache, and implement a modified
page cache lookup protocol for short reads:
1. Locate a folio in XArray.
2. Take note of the i_pages_delete_seqcnt.
3. Copy the data to a local buffer on the stack.
4. Verify that the i_pages_delete_seqcnt has not changed.
5. Copy the data from the local buffer to the iterator.
If any issues arise in the fast path, fall back to the slow path, which
relies on the refcount to stabilize the folio.
The new approach requires a local buffer on the stack. The size of the
buffer determines which read requests are served by the fast path. Set
the buffer to 1k, which seems to be a reasonable amount of stack usage
for a function at the bottom of the call stack.
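For illustration, here is the same optimistic read pattern as a self-contained
user-space sketch (the struct, the function names and the C11 atomics are
invented for the example and are not the kernel API; in the kernel the
seqcount_t machinery packages this, and the plain memcpy glosses over strict
C11 data-race rules):

#include <stdatomic.h>
#include <stdbool.h>
#include <string.h>

struct slot {
	atomic_uint seq;	/* even: stable, odd: delete in progress */
	char data[64];
};

/* Writer side: roughly what page_cache_delete() does with the new seqcnt. */
static void slot_delete(struct slot *s)
{
	unsigned int seq = atomic_load_explicit(&s->seq, memory_order_relaxed);

	atomic_store_explicit(&s->seq, seq + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	memset(s->data, 0, sizeof(s->data));		/* the "delete" */
	atomic_store_explicit(&s->seq, seq + 2, memory_order_release);
}

/* Reader side: copy optimistically, trust the copy only if seq is unchanged. */
static bool slot_read_fast(struct slot *s, char *buf, size_t len)
{
	unsigned int seq = atomic_load_explicit(&s->seq, memory_order_acquire);

	if (seq & 1)			/* delete in progress: slow path */
		return false;

	memcpy(buf, s->data, len);	/* may race; verified below */

	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&s->seq, memory_order_relaxed) == seq;
}

The reader never publishes data from a copy that might have raced with a
delete; it either proves the sequence count did not move or falls back to
the slow path.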
The fast read approach demonstrates significant performance
improvements, particularly in contended cases.
16 threads, reads from 4k file(s), mean MiB/s (StdDev)
-------------------------------------------------------------
| Block | Baseline | Baseline | Patched | Patched |
| size | same file | diff files | same file | diff files |
-------------------------------------------------------------
| 1 | 10.96 | 27.56 | 30.42 | 30.4 |
| | (0.497) | (0.114) | (0.130) | (0.158) |
| 32 | 350.8 | 886.2 | 980.6 | 981.8 |
| | (13.64) | (2.863) | (3.361) | (1.303) |
| 256 | 2798 | 7009.6 | 7641.4 | 7653.6 |
| | (103.9) | (28.00) | (33.26) | (25.50) |
| 1024 | 10780 | 27040 | 29280 | 29320 |
| | (389.8) | (89.44) | (130.3) | (83.66) |
| 4096 | 43700 | 103800 | 48420 | 102000 |
| | (1953) | (447.2) | (2012) | (0) |
-------------------------------------------------------------
16 threads, reads from 1M file(s), mean MiB/s (StdDev)
--------------------------------------------------------------
| Block | Baseline | Baseline | Patched | Patched |
| size | same file | diff files | same file | diff files |
--------------------------------------------------------------
| 1 | 26.38 | 27.34 | 30.38 | 30.36 |
| | (0.998) | (0.114) | (0.083) | (0.089) |
| 32 | 824.4 | 877.2 | 977.8 | 975.8 |
| | (15.78) | (3.271) | (2.683) | (1.095) |
| 256 | 6494 | 6992.8 | 7619.8 | 7625 |
| | (116.0) | (32.39) | (10.66) | (28.19) |
| 1024 | 24960 | 26840 | 29100 | 29180 |
| | (606.6) | (151.6) | (122.4) | (83.66) |
| 4096 | 94420 | 100520 | 95260 | 99760 |
| | (3144) | (672.3) | (2874) | (134.1) |
| 32768 | 386000 | 402400 | 368600 | 397400 |
| | (36599) | (10526) | (47188) | (6107) |
--------------------------------------------------------------
There's also an improvement in kernel build time:
Baseline: 61.3462 +- 0.0597 seconds time elapsed ( +- 0.10% )
Patched: 60.6106 +- 0.0759 seconds time elapsed ( +- 0.13% )
Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
fs/inode.c | 2 +
include/linux/fs.h | 1 +
mm/filemap.c | 150 ++++++++++++++++++++++++++++++++++++++-------
3 files changed, 130 insertions(+), 23 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..52163d28d630 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+ seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
+ &mapping->i_pages.xa_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->i_private_list);
spin_lock_init(&mapping->i_private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..c9588d555f73 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -523,6 +523,7 @@ struct address_space {
struct list_head i_private_list;
struct rw_semaphore i_mmap_rwsem;
void * i_private_data;
+ seqcount_spinlock_t i_pages_delete_seqcnt;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..51689c4f3773 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
xas_store(&xas, shadow);
xas_init_marks(&xas);
+ write_seqcount_end(&mapping->i_pages_delete_seqcnt);
folio->mapping = NULL;
/* Leave folio->index set: truncation lookup relies upon it */
@@ -2695,21 +2697,98 @@ static void filemap_end_dropbehind_read(struct folio *folio)
}
}
-/**
- * filemap_read - Read data from the page cache.
- * @iocb: The iocb to read.
- * @iter: Destination for the data.
- * @already_read: Number of bytes already read by the caller.
- *
- * Copies data from the page cache. If the data is not currently present,
- * uses the readahead and read_folio address_space operations to fetch it.
- *
- * Return: Total number of bytes copied, including those already read by
- * the caller. If an error happens before any bytes are copied, returns
- * a negative error number.
- */
-ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
- ssize_t already_read)
+static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
+ loff_t pos, char *buffer,
+ size_t size)
+{
+ XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
+ struct folio *folio;
+ loff_t file_size;
+ unsigned int seq;
+
+ lockdep_assert_in_rcu_read_lock();
+
+ /* Give up and go to slow path if raced with page_cache_delete() */
+ if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
+ return false;
+
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
+ return 0;
+
+ if (!folio || xa_is_value(folio))
+ return 0;
+
+ if (!folio_test_uptodate(folio))
+ return 0;
+
+ /* No fast-case if readahead is supposed to started */
+ if (folio_test_readahead(folio))
+ return 0;
+ /* .. or mark it accessed */
+ if (!folio_test_referenced(folio))
+ return 0;
+
+ /* i_size check must be after folio_test_uptodate() */
+ file_size = i_size_read(mapping->host);
+ if (unlikely(pos >= file_size))
+ return 0;
+ if (size > file_size - pos)
+ size = file_size - pos;
+
+ /* Do the data copy */
+ size = memcpy_from_file_folio(buffer, folio, pos, size);
+ if (!size)
+ return 0;
+
+ /* Give up and go to slow path if raced with page_cache_delete() */
+ if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
+ return 0;
+
+ return size;
+}
+
+#define FAST_READ_BUF_SIZE 1024
+
+static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
+ ssize_t *already_read)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+ char buffer[FAST_READ_BUF_SIZE];
+ size_t count;
+
+ if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+ return false;
+
+ if (iov_iter_count(iter) > sizeof(buffer))
+ return false;
+
+ count = iov_iter_count(iter);
+
+ /* Let's see if we can just do the read under RCU */
+ rcu_read_lock();
+ count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
+ rcu_read_unlock();
+
+ if (!count)
+ return false;
+
+ count = copy_to_iter(buffer, count, iter);
+ if (unlikely(!count))
+ return false;
+
+ iocb->ki_pos += count;
+ ra->prev_pos = iocb->ki_pos;
+ file_accessed(iocb->ki_filp);
+ *already_read += count;
+
+ return !iov_iter_count(iter);
+}
+
+static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
+ struct iov_iter *iter,
+ ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
@@ -2721,14 +2800,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
loff_t isize, end_offset;
loff_t last_pos = ra->prev_pos;
- if (unlikely(iocb->ki_pos < 0))
- return -EINVAL;
- if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
- return 0;
- if (unlikely(!iov_iter_count(iter)))
- return 0;
-
- iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
folio_batch_init(&fbatch);
do {
@@ -2821,6 +2892,39 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
ra->prev_pos = last_pos;
return already_read ? already_read : error;
}
+
+/**
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
+ *
+ * Copies data from the page cache. If the data is not currently present,
+ * uses the readahead and read_folio address_space operations to fetch it.
+ *
+ * Return: Total number of bytes copied, including those already read by
+ * the caller. If an error happens before any bytes are copied, returns
+ * a negative error number.
+ */
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+ ssize_t already_read)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+ if (unlikely(iocb->ki_pos < 0))
+ return -EINVAL;
+ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
+ return 0;
+ if (unlikely(!iov_iter_count(iter)))
+ return 0;
+
+ iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
+
+ if (filemap_read_fast(iocb, iter, &already_read))
+ return already_read;
+
+ return filemap_read_slow(iocb, iter, already_read);
+}
EXPORT_SYMBOL_GPL(filemap_read);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
--
2.50.1
On 17.10.25 16:15, Kiryl Shutsemau wrote:
> From: Kiryl Shutsemau <kas@kernel.org>
>
> [...]
>
> +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> + loff_t pos, char *buffer,
> + size_t size)
> +{
> + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> + struct folio *folio;
> + loff_t file_size;
> + unsigned int seq;
> +
> + lockdep_assert_in_rcu_read_lock();
> +
> + /* Give up and go to slow path if raced with page_cache_delete() */
> + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> + return false;
> +
> + folio = xas_load(&xas);
> + if (xas_retry(&xas, folio))
> + return 0;
> +
> + if (!folio || xa_is_value(folio))
> + return 0;
> +
> + if (!folio_test_uptodate(folio))
> + return 0;
> +
> + /* No fast-case if readahead is supposed to started */
> + if (folio_test_readahead(folio))
> + return 0;
> + /* .. or mark it accessed */
> + if (!folio_test_referenced(folio))
> + return 0;
> +
> + /* i_size check must be after folio_test_uptodate() */
> + file_size = i_size_read(mapping->host);
> + if (unlikely(pos >= file_size))
> + return 0;
> + if (size > file_size - pos)
> + size = file_size - pos;
> +
> + /* Do the data copy */
> + size = memcpy_from_file_folio(buffer, folio, pos, size);
We can be racing with folio splitting or freeing+reallocation, right? So
our folio might actually be suddenly a tail page I assume? Or change
from small to large, or change from large to small. Or a large folio can
change its size?
In that case things like folio_test_uptodate() won't be happy, because
they will bail out if called on a tail page (see PF_NO_TAIL).
But also, are we sure memcpy_from_file_folio() is safe to be used in
that racy context? It ends up calling kmap_local_folio() with an offset
from offset_in_folio(), which depends on folio_size(). When racing with
concurrent folio reuse, folio_size()->folio_order() can be tricked into
returning a wrong number I guess, making the result of
offset_in_folio() rather bogus.
At least that's my understanding after taking a quick look.
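For reference, memcpy_from_file_folio() is roughly the following
(paraphrased, not verbatim; details vary between kernel versions):

/* Paraphrased from include/linux/highmem.h -- not verbatim. */
static inline size_t memcpy_from_file_folio(char *to, struct folio *folio,
					    loff_t pos, size_t len)
{
	size_t offset = offset_in_folio(folio, pos);	/* uses folio_size() */
	char *from = kmap_local_folio(folio, offset);

	if (folio_test_highmem(folio)) {
		offset = offset_in_page(offset);
		len = min_t(size_t, len, PAGE_SIZE - offset);
	} else {
		len = min(len, folio_size(folio) - offset);
	}
	memcpy(to, from, len);
	kunmap_local(from);
	return len;
}

Both the kmap offset and the length clamp depend on folio_size(), which
is exactly what is not stable without holding a reference.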
The concerns I would have in that case:
(A) the page pointer we calculate in kmap_local_folio() could be garbage
(B) the "from" pointer we return could be garbage
"garbage" as in pointing at something without a direct map, something
that's protected differently (MTE? weird CoCo protection?) or even worse
MMIO with undesired read-effects.
I'll note that some of these concerns would be gone once folios are
allocated separately: we would not suddenly see a tail page.
But concurrent folio splitting or size changes could still be an issue
IIUC, based on what Willy told me today when I discussed the plans about
"struct folio" freeing.
Maybe I'm wrong, just wanted to raise that these functions are not
prepared to be called when we are not holding a refcount or cannot
otherwise guarantee that the folio cannot concurrently be
freed/reallocated/merged/split.
--
Cheers
David / dhildenb
On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> "garbage" as in pointing at something without a direct map, something that's
> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> undesired read-effects.

Pedro already points to the problem with missing direct mapping.
_nofault() copy should help with this.

Can direct mapping ever be converted to MMIO? It can be converted to DMA
buffer (which is fine), but MMIO? I have not seen it even in virtualized
environments.

I cannot say for all CoCo protections, but TDX guest shared<->private
should be fine.

I am not sure about MTE. Is there a way to bypass MTE check for a load?
And how does it deal with stray reads from load_unaligned_zeropad()?

--
Kiryl Shutsemau / Kirill A. Shutemov
On Thu, Oct 23, 2025 at 3:34 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> > "garbage" as in pointing at something without a direct map, something that's
> > protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> > undesired read-effects.
>
> Pedro already points to the problem with missing direct mapping.
> _nofault() copy should help with this.
>
> Can direct mapping ever be converted to MMIO? It can be converted to DMA
> buffer (which is fine), but MMIO? I have not seen it even in virtualized
> environments.
>
> I cannot say for all CoCo protections, but TDX guest shared<->private
> should be fine.
>
> I am not sure about MTE. Is there a way to bypass MTE check for a load?
> And how does it deal with stray reads from load_unaligned_zeropad()?

If I remember correctly, _nofault() copy should skip tag check too.

Thanks,
Yang

>
> --
> Kiryl Shutsemau / Kirill A. Shutemov
>
On Thu, 23 Oct 2025, Yang Shi wrote:
> On Thu, Oct 23, 2025 at 3:34 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
> [...]
>
> If I remember correctly, _nofault() copy should skip tag check too.

Not a reply to Yang, I'm just tacking on to the latest mail...

I saw no mention of page migration: __folio_migrate_mapping() does not
use page_cache_delete(), so would surely need to do its own
write_seqcount_begin() and _end(), wouldn't it?

It is using folio_ref_freeze() and _unfreeze(), thinking that keeps
everyone else safely away: but not filemap_read_fast_rcu().

And all other users of folio_ref_freeze() (probably not many but I have
not searched) need to be checked too: maybe no others need changing,
but that has to be established.

This makes a fundamental change to speculative page cache assumptions.

My own opinion was expressed very much better than I could, by Dave
Chinner on Oct 21: cognitive load of niche fastpath.

Hugh
On Mon, 27 Oct 2025 at 03:49, Hugh Dickins <hughd@google.com> wrote:
>
> This makes a fundamental change to speculative page cache assumptions.
Yes, but I'm a bit surprised by people who find that scary.
The page cache does *much* more scary things elsewhere, particularly
the whole folio_try_get() dance (in filemap_get_entry() and other
places).
I suspect you ignore that just because it's been that way forever, so
you're comfortable with it.
I'd argue that that is much *much* more subtle because it means that
somebody may be incrementing the page count of a page that has already
been re-allocated by somebody else.
Talk about cognitive load: that code makes you think that "hey, the
tryget means that if it has been released, we don't get a ref to it",
because that's how many of our *other* speculative RCU accesses do in
fact work.
But that's not how the page cache works, exactly because freeing isn't
actually RCU-delayed.
So while the code visually follows the exact same pattern as some
other "look up speculatively under RCU, skip if it's not there any
more", it actually does exactly the same thing as the "copy data under
RCU, then check later if it was ok". Except it does "increment
refcount under RCU, then check later if it was actually valid".
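For anyone not staring at the code right now, the pattern is roughly this
(paraphrased from filemap_get_entry() and renamed here to make clear it is
not verbatim kernel code):

/* Paraphrased speculative lookup, not verbatim. */
static void *lookup_speculative(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	if (!folio || xa_is_value(folio))
		goto out;

	/* The folio may already have been freed -- and reused -- here. */
	if (!folio_try_get(folio))
		goto repeat;

	/* Only this re-check makes the reference meaningful. */
	if (unlikely(folio != xas_reload(&xas))) {
		folio_put(folio);
		goto repeat;
	}
out:
	rcu_read_unlock();
	return folio;
}

The folio_try_get() alone proves nothing; only the xas_reload() check
afterwards does.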
That said, I wonder if we might not consider making page cache freeing
be RCU-delayed. This has come up before (exactly *because* of that
"folio_try_get()").
Because while I am pretty sure that filemap_get_entry() is fine (and a
number of other core users), I'm not convinced that some of the other
users of folio_try_get() are necessarily aware of just how subtle that
thing is.
Anyway, I'm certainly not going to push that patch very hard.
But I do think that a "3x performance improvement on a case that is
known to be an issue for at least one real-world customer" shouldn't
be called "a niche case". I've seen *way* more niche than that.
(I do think RCU-freeing folios would potentially be an interesting
thing to look into, but I don't think the patch under discussion is
necessarily the reason to do so).
Linus
On 27.10.25 16:50, Linus Torvalds wrote:
> On Mon, 27 Oct 2025 at 03:49, Hugh Dickins <hughd@google.com> wrote:
> >
> > This makes a fundamental change to speculative page cache assumptions.
>
> [...]
>
> Anyway, I'm certainly not going to push that patch very hard.

I will sleep better at night if we can guarantee that we are reading
from a folio that has not been reused in the meantime -- or reading
random other memory as I raised in my other mail.

So I really wish that we can defer optimizing this to freeing folios
under RCU instead.

--
Cheers

David / dhildenb
On Mon, 27 Oct 2025 at 09:06, David Hildenbrand <david@redhat.com> wrote:
>
> So I really wish that we can defer optimizing this to freeing folios
> under RCU instead.
So just to see, I dug around when we started to do the rcu-protected
folio lookup (well, it was obviously not a folio at the time).
Mainly because we actually had a lot more of those subtle
folio_try_get() users than I expected us to have,
It goes back to July 2008 (commit e286781d5f2e: "mm: speculative page
references" being the first in the series).
I do have to say that the original naming was better: we used to call
the "try_get" operation "page_cache_get_speculative()", which made it
very clear that it was doing something speculative and different from
some of our other rcu patterns, where if it's successful it's all
good.
Because even when successful, the folio in folio_try_get() is still
speculative and needs checking.
Not all of our current users seem to re-verify the source of the folio
afterwards (deferred_split_scan() makes me go "Uhh - you seem to rely
on folio_try_get() as some kind of validity check" for example).
Oh well. This is all entirely unrelated to the suggested patch, just
musings from me looking at that other code that I think is a lot more
subtle and people don't seem to have issues with.
Linus
On 27.10.25 17:48, Linus Torvalds wrote:
> On Mon, 27 Oct 2025 at 09:06, David Hildenbrand <david@redhat.com> wrote:
> >
> > So I really wish that we can defer optimizing this to freeing folios
> > under RCU instead.
>
> [...]
>
> Because even when successful, the folio in folio_try_get() is still
> speculative and needs checking.

Right, and we only access its content after verifying that (a) it is
the one we wanted to access and (b) stabilizing it, so it will stay
that way.

>
> Not all of our current users seem to re-verify the source of the folio
> afterwards (deferred_split_scan() makes me go "Uhh - you seem to rely
> on folio_try_get() as some kind of validity check" for example).

Note that deferred_split_scan() only operates with anon folios and has
some nasty NASTY interaction with folio freeing. :)

--
Cheers

David / dhildenb
On 23.10.25 12:31, Kiryl Shutsemau wrote:
> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
>> "garbage" as in pointing at something without a direct map, something that's
>> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
>> undesired read-effects.
>
> Pedro already points to the problem with missing direct mapping.
> _nofault() copy should help with this.

Yeah, we do something similar when reading the kcore for that reason.

>
> Can direct mapping ever be converted to MMIO? It can be converted to DMA
> buffer (which is fine), but MMIO? I have not seen it even in virtualized
> environments.

I recall discussions in the context of PAT and the adjustment of caching
attributes of the direct map for MMIO purposes: so I suspect there are
ways that can happen, but I am not 100% sure.

Thinking about it, in VMs we have the direct map set on balloon inflated
pages that should not be touched, not even read, otherwise your
hypervisor might get very angry. That case we could likely handle by
checking whether the source page actually exists and doesn't have
PageOffline() set, before accessing it. A bit nasty.

A more obscure case would probably be reading a page that was poisoned
by hardware and is not expected to be used anymore. Could also be
checked by checking the page.

Essentially all cases where we try to avoid reading ordinary memory
already when creating memory dumps that might have a direct map.

Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.
On s390x, reading a "protected" page of a CoCo VM will trigger an
interrupt, I'd assume _nofault would take care of this.

--
Cheers

David / dhildenb
On 23.10.25 12:54, David Hildenbrand wrote:
> On 23.10.25 12:31, Kiryl Shutsemau wrote:
> [...]
>
> Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.

Looking into this, I'd assume the exception handler will take care of it.

load_unaligned_zeropad() is interesting if there is a direct map but the
memory should not be touched (especially regarding PageOffline and
memory errors).

I read drivers/firmware/efi/unaccepted_memory.c where there is a
lengthy discussion about guard pages and how that works for unaccepted
memory.

While it works for unaccepted memory, it wouldn't work for other random
On 23.10.25 13:10, David Hildenbrand wrote:
> On 23.10.25 12:54, David Hildenbrand wrote:
> [...]
>
> I read drivers/firmware/efi/unaccepted_memory.c where there is a
> lengthy discussion about guard pages and how that works for unaccepted
> memory.
>
> While it works for unaccepted memory, it wouldn't work for other random

Sorry I meant here "while that works for load_unaligned_zeropad()".

--
Cheers

David / dhildenb
On Thu, Oct 23, 2025 at 01:11:43PM +0200, David Hildenbrand wrote:
> On 23.10.25 13:10, David Hildenbrand wrote:
> [...]
> >
> > While it works for unaccepted memory, it wouldn't work for other random
>
> Sorry I meant here "while that works for load_unaligned_zeropad()".

Do we have other random reads?

For unaccepted memory, we care about touching memory that was never
allocated because accepting memory is one way road.

I only know about load_unaligned_zeropad() that does reads like this. Do
you know others?

--
Kiryl Shutsemau / Kirill A. Shutemov
On 23.10.25 13:40, Kiryl Shutsemau wrote:
> On Thu, Oct 23, 2025 at 01:11:43PM +0200, David Hildenbrand wrote:
> [...]
> > Sorry I meant here "while that works for load_unaligned_zeropad()".
>
> Do we have other random reads?
>
> For unaccepted memory, we care about touching memory that was never
> allocated because accepting memory is one way road.

Right, but I suspect if you get a random read (as the unaccepted memory
doc states) you'd be in trouble as well.

The "nice" thing about unaccepted memory is that it's a one way road
indeed, and at some point the system will not have unaccepted memory
anymore.

>
> I only know about load_unaligned_zeropad() that does reads like this. Do
> you know others?

No, I am not aware of others.

Most code that could read random memory (kcore, vmcore) was fixed to
exclude pages we know are unsafe to touch.

Code that might speculatively access the "struct page" after it might
already have been freed (speculative pagecache lookups, GUP-fast) will
just back off and never read page content.

We avoid such random memory reads as best we can, as it's just a pain
to deal with (like load_unaligned_zeropad(), which I would just wish we
could get rid of now that it's present again in my memory. :( ).

--
Cheers

David / dhildenb
On Thu, Oct 23, 2025 at 01:49:22PM +0200, David Hildenbrand wrote:
> On 23.10.25 13:40, Kiryl Shutsemau wrote:
> [...]
>
> Right, but I suspect if you get a random read (as the unaccepted memory
> doc states) you'd be in trouble as well.

Yes. Random read of unaccepted memory is unrecoverable exit to host for
TDX guest.

--
Kiryl Shutsemau / Kirill A. Shutemov
On Thu, Oct 23, 2025 at 12:54:59PM +0200, David Hildenbrand wrote:
> On 23.10.25 12:31, Kiryl Shutsemau wrote:
> [...]
>
> Thinking about it, in VMs we have the direct map set on balloon inflated
> pages that should not be touched, not even read, otherwise your hypervisor
> might get very angry. That case we could likely handle by checking whether
> the source page actually exists and doesn't have PageOffline() set, before
> accessing it. A bit nasty.
>
> A more obscure case would probably be reading a page that was poisoned by
> hardware and is not expected to be used anymore. Could also be checked by
> checking the page.

I don't think we can check the page. Since the page is not stabilized
with a reference, it is a TOCTOU race.

If there's some side effect that we cannot suppress on read (like
_nofault() does) we are screwed.

--
Kiryl Shutsemau / Kirill A. Shutemov
On 23.10.25 13:09, Kiryl Shutsemau wrote:
> On Thu, Oct 23, 2025 at 12:54:59PM +0200, David Hildenbrand wrote:
> [...]
>
> I don't think we can check the page. Since the page is not stabilized
> with a reference, it is a TOCTOU race.

Indeed :(

--
Cheers

David / dhildenb
On Fri, Oct 17, 2025 at 03:15:36PM +0100, Kiryl Shutsemau wrote:
> From: Kiryl Shutsemau <kas@kernel.org>
>
> The protocol for page cache lookup is as follows:
>
> 1. Locate a folio in XArray.
> 2. Obtain a reference on the folio using folio_try_get().
> 3. If successful, verify that the folio still belongs to
> the mapping and has not been truncated or reclaimed.
> 4. Perform operations on the folio, such as copying data
> to userspace.
> 5. Release the reference.
>
> For short reads, the overhead of atomic operations on reference
> manipulation can be significant, particularly when multiple tasks access
> the same folio, leading to cache line bouncing.
>
> <snip>
> +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> + loff_t pos, char *buffer,
> + size_t size)
> +{
> + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> + struct folio *folio;
> + loff_t file_size;
> + unsigned int seq;
> +
> + lockdep_assert_in_rcu_read_lock();
> +
> + /* Give up and go to slow path if raced with page_cache_delete() */
> + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> + return false;
> +
> + folio = xas_load(&xas);
> + if (xas_retry(&xas, folio))
> + return 0;
> +
> + if (!folio || xa_is_value(folio))
> + return 0;
> +
> + if (!folio_test_uptodate(folio))
> + return 0;
> +
> + /* No fast-case if readahead is supposed to started */
> + if (folio_test_readahead(folio))
> + return 0;
> + /* .. or mark it accessed */
> + if (!folio_test_referenced(folio))
> + return 0;
> +
> + /* i_size check must be after folio_test_uptodate() */
> + file_size = i_size_read(mapping->host);
> + if (unlikely(pos >= file_size))
> + return 0;
> + if (size > file_size - pos)
> + size = file_size - pos;
> +
> + /* Do the data copy */
> + size = memcpy_from_file_folio(buffer, folio, pos, size);
> + if (!size)
> + return 0;
> +
I think we may still have a problematic (rare, possibly theoretical) race here where:
T0                              T1                                   T3
filemap_read_fast_rcu()         |                                    |
  folio = xas_load(&xas);       |                                    |
  /* ... */                     | /* truncate or reclaim frees       |
                                |    folio, bumps delete seq */      | folio_alloc() from e.g secretmem
                                |                                    | set_direct_map_invalid_noflush(!!)
  memcpy_from_file_folio()      |                                    |
We may have to use copy_from_kernel_nofault() here? Or is something else stopping this from happening?
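Something along these lines, maybe (purely a sketch with names invented
here, not part of the patch; it only handles the fault, not the
folio_size()/offset race discussed elsewhere in the thread, and highmem
folios would need the same per-page clamping as memcpy_from_file_folio()):

/* Illustrative only: a nofault variant of the fast-path copy.
 * Returns 0 if the caller should fall back to the slow path. */
static size_t filemap_copy_folio_nofault(char *buffer, struct folio *folio,
					 loff_t pos, size_t size)
{
	size_t offset = offset_in_folio(folio, pos);
	void *kaddr = kmap_local_folio(folio, offset);
	long ret;

	size = min(size, folio_size(folio) - offset);
	ret = copy_from_kernel_nofault(buffer, kaddr, size);
	kunmap_local(kaddr);

	return ret ? 0 : size;
}

That way a zapped direct map entry turns into an error return instead of
an oops, and the refcount-based slow path takes over.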
--
Pedro
On Tue, 21 Oct 2025 at 21:08, Pedro Falcato <pfalcato@suse.de> wrote:
>
> I think we may still have a problematic (rare, possibly theoretical) race here where:
>
> T0                              T1                                   T3
> filemap_read_fast_rcu()         |                                    |
>   folio = xas_load(&xas);       |                                    |
>   /* ... */                     | /* truncate or reclaim frees       |
>                                 |    folio, bumps delete seq */      | folio_alloc() from e.g secretmem
>                                 |                                    | set_direct_map_invalid_noflush(!!)
>   memcpy_from_file_folio()      |                                    |
>
> We may have to use copy_from_kernel_nofault() here? Or is something else stopping this from happening?
Explain how the sequence count doesn't catch this?
We read the sequence count before we do the xas_load(), and we verify
it after we've done the memcpy_from_folio.
The whole *point* is that the copy itself is not race-free. That's
*why* we do the sequence count.
And only after the sequence count has been verified do we then copy
the result to user space.
So the "maybe this buffer content is garbage" happens, but it only
happens in the temporary kernel on-stack buffer, not visibly to the
user.
Linus
On Tue, Oct 21, 2025 at 09:13:28PM -1000, Linus Torvalds wrote:
> On Tue, 21 Oct 2025 at 21:08, Pedro Falcato <pfalcato@suse.de> wrote:
> [...]
>
> Explain how the sequence count doesn't catch this?
>
> We read the sequence count before we do the xas_load(), and we verify
> it after we've done the memcpy_from_folio.
>
> The whole *point* is that the copy itself is not race-free. That's
> *why* we do the sequence count.
>
> And only after the sequence count has been verified do we then copy
> the result to user space.
>
> So the "maybe this buffer content is garbage" happens, but it only
> happens in the temporary kernel on-stack buffer, not visibly to the
> user.

The problem isn't that the contents might be garbage, but that the direct map
may be swept from under us, as we don't have a reference to the folio. So the
folio can be transparently freed under us (as designed), but some user can
call fun stuff like set_direct_map_invalid_noflush() and we're not handling
any "oopsie we faulted reading the folio" here. The sequence count doesn't
help here, because we, uhh, faulted. Does this make sense?

TL;DR I don't think it's safe to touch the direct map of folios we don't own
without the seatbelt of a copy_from_kernel_nofault or so.

--
Pedro
On Wed, Oct 22, 2025 at 08:38:30AM +0100, Pedro Falcato wrote:
> On Tue, Oct 21, 2025 at 09:13:28PM -1000, Linus Torvalds wrote:
> [...]
>
> The problem isn't that the contents might be garbage, but that the direct map
> may be swept from under us, as we don't have a reference to the folio. So the
> folio can be transparently freed under us (as designed), but some user can
> call fun stuff like set_direct_map_invalid_noflush() and we're not handling
> any "oopsie we faulted reading the folio" here. The sequence count doesn't
> help here, because we, uhh, faulted. Does this make sense?
>
> TL;DR I don't think it's safe to touch the direct map of folios we don't own
> without the seatbelt of a copy_from_kernel_nofault or so.

Makes sense. Thanks for catching this!

--
Kiryl Shutsemau / Kirill A. Shutemov
On Fri, 17 Oct 2025 15:15:36 +0100 Kiryl Shutsemau <kirill@shutemov.name> wrote:
> From: Kiryl Shutsemau <kas@kernel.org>
>
> The protocol for page cache lookup is as follows:
>
> 1. Locate a folio in XArray.
> 2. Obtain a reference on the folio using folio_try_get().
> 3. If successful, verify that the folio still belongs to
> the mapping and has not been truncated or reclaimed.
> 4. Perform operations on the folio, such as copying data
> to userspace.
> 5. Release the reference.
>
> For short reads, the overhead of atomic operations on reference
> manipulation can be significant, particularly when multiple tasks access
> the same folio, leading to cache line bouncing.
>
> To address this issue, introduce i_pages_delete_seqcnt, which increments
> each time a folio is deleted from the page cache and implement a modified
> page cache lookup protocol for short reads:
>
> 1. Locate a folio in XArray.
> 2. Take note of the i_pages_delete_seqcnt.
> 3. Copy the data to a local buffer on the stack.
> 4. Verify that the i_pages_delete_seqcnt has not changed.
> 5. Copy the data from the local buffer to the iterator.
>
> If any issues arise in the fast path, fallback to the slow path that
> relies on the refcount to stabilize the folio.
Well this is a fun patch.
> The new approach requires a local buffer in the stack. The size of the
> buffer determines which read requests are served by the fast path. Set
> the buffer to 1k. This seems to be a reasonable amount of stack usage
> for the function at the bottom of the call stack.
A use case for alloca() or equiv. That would improve the average-case
stack depth but not the worst-case.
The 1k guess-or-giggle is crying out for histogramming - I bet 0.1k
covers the great majority. I suspect it wouldn't be hard to add a new
ad-hoc API into memory allocation profiling asking it to profile
something like this for us, given an explicit request to do so.
Is there really no way to copy the dang thing straight out to
userspace, skip the bouncing?
> The fast read approach demonstrates significant performance
> improvements, particularly in contended cases.
>
> 16 threads, reads from 4k file(s), mean MiB/s (StdDev)
>
> -------------------------------------------------------------
> | Block | Baseline | Baseline | Patched | Patched |
> | size | same file | diff files | same file | diff files |
> -------------------------------------------------------------
> | 1 | 10.96 | 27.56 | 30.42 | 30.4 |
> | | (0.497) | (0.114) | (0.130) | (0.158) |
> | 32 | 350.8 | 886.2 | 980.6 | 981.8 |
> | | (13.64) | (2.863) | (3.361) | (1.303) |
> | 256 | 2798 | 7009.6 | 7641.4 | 7653.6 |
> | | (103.9) | (28.00) | (33.26) | (25.50) |
tl;dr, 256-byte reads from the same file nearly tripled.
> | 1024 | 10780 | 27040 | 29280 | 29320 |
> | | (389.8) | (89.44) | (130.3) | (83.66) |
> | 4096 | 43700 | 103800 | 48420 | 102000 |
> | | (1953) | (447.2) | (2012) | (0) |
> -------------------------------------------------------------
>
> 16 threads, reads from 1M file(s), mean MiB/s (StdDev)
>
> --------------------------------------------------------------
> | Block | Baseline | Baseline | Patched | Patched |
> | size | same file | diff files | same file | diff files |
> ---------------------------------------------------------
> | 1 | 26.38 | 27.34 | 30.38 | 30.36 |
> | | (0.998) | (0.114) | (0.083) | (0.089) |
> | 32 | 824.4 | 877.2 | 977.8 | 975.8 |
> | | (15.78) | (3.271) | (2.683) | (1.095) |
> | 256 | 6494 | 6992.8 | 7619.8 | 7625 |
> | | (116.0) | (32.39) | (10.66) | (28.19) |
> | 1024 | 24960 | 26840 | 29100 | 29180 |
> | | (606.6) | (151.6) | (122.4) | (83.66) |
> | 4096 | 94420 | 100520 | 95260 | 99760 |
> | | (3144) | (672.3) | (2874) | (134.1) |
> | 32768 | 386000 | 402400 | 368600 | 397400 |
> | | (36599) | (10526) | (47188) | (6107) |
> --------------------------------------------------------------
>
> There's also improvement on kernel build:
>
> Base line: 61.3462 +- 0.0597 seconds time elapsed ( +- 0.10% )
> Patched: 60.6106 +- 0.0759 seconds time elapsed ( +- 0.13% )
>
> ...
>
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
>
> ...
>
> -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
> - ssize_t already_read)
> +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> + loff_t pos, char *buffer,
> + size_t size)
> +{
> + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> + struct folio *folio;
> + loff_t file_size;
> + unsigned int seq;
> +
> + lockdep_assert_in_rcu_read_lock();
> +
> + /* Give up and go to slow path if raced with page_cache_delete() */
> + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> + return false;
return 0;
> +
> + folio = xas_load(&xas);
> + if (xas_retry(&xas, folio))
> + return 0;
> +
> + if (!folio || xa_is_value(folio))
> + return 0;
> +
> + if (!folio_test_uptodate(folio))
> + return 0;
> +
> + /* No fast-case if readahead is supposed to started */
Please expand this comment. "explain why, not what". Why do we care
if it's under readahead? It's uptodate, so just grab it??
> + if (folio_test_readahead(folio))
> + return 0;
> + /* .. or mark it accessed */
This comment disagrees with the code which it is commenting.
> + if (!folio_test_referenced(folio))
> + return 0;
> +
> + /* i_size check must be after folio_test_uptodate() */
why?
> + file_size = i_size_read(mapping->host);
> + if (unlikely(pos >= file_size))
> + return 0;
> + if (size > file_size - pos)
> + size = file_size - pos;
min() is feeling all sad?
> + /* Do the data copy */
We can live without this comment ;)
> + size = memcpy_from_file_folio(buffer, folio, pos, size);
> + if (!size)
> + return 0;
> +
> + /* Give up and go to slow path if raced with page_cache_delete() */
I wonder if truncation is all we need to worry about here. For
example, direct-io does weird stuff.
> + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
> + return 0;
> +
> + return size;
> +}
> +
> +#define FAST_READ_BUF_SIZE 1024
> +
> +static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
> + ssize_t *already_read)
> +{
> + struct address_space *mapping = iocb->ki_filp->f_mapping;
> + struct file_ra_state *ra = &iocb->ki_filp->f_ra;
> + char buffer[FAST_READ_BUF_SIZE];
> + size_t count;
> +
> + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
> + return false;
why? (comment please)
> + if (iov_iter_count(iter) > sizeof(buffer))
> + return false;
> +
> + count = iov_iter_count(iter);
It'd be a tiny bit tidier to swap the above to avoid evaluating
iov_iter_count() twice. Yes, iov_iter_count() happens to be fast, but
we aren't supposed to know that here.
> + /* Let's see if we can just do the read under RCU */
> + rcu_read_lock();
> + count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
> + rcu_read_unlock();
> +
> + if (!count)
> + return false;
> +
> + count = copy_to_iter(buffer, count, iter);
> + if (unlikely(!count))
> + return false;
> +
> + iocb->ki_pos += count;
> + ra->prev_pos = iocb->ki_pos;
> + file_accessed(iocb->ki_filp);
> + *already_read += count;
> +
> + return !iov_iter_count(iter);
> +}
> +
> +static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
> + struct iov_iter *iter,
> + ssize_t already_read)
> {
> struct file *filp = iocb->ki_filp;
> struct file_ra_state *ra = &filp->f_ra;
On Sun, 19 Oct 2025 at 18:53, Andrew Morton <akpm@linux-foundation.org> wrote:
>
> Is there really no way to copy the dang thing straight out to
> userspace, skip the bouncing?
Sadly, no.
It's trivial to copy to user space in a RCU-protected region: just
disable page faults and it all works fine.
In fact, it works so fine that everything boots and it all looks
beautiful in profiles etc - ask me how I know.
But it's still wrong. The problem is that *after* you've copied things
away from the page cache, you need to check that the page cache
contents are still valid.
And it's not a problem to do that and just say "don't count the bytes
I just copied, and we'll copy over them later".
But while 99.999% of the time we *will* copy over them later, it's not
actually guaranteed. What might happen is that after we've filled in
user space with the optimistically copied data, we figure out that the
page cache is no longer valid, and we go to the slow case, and two
problems may have happened:
(a) the file got truncated in the meantime, and we just filled in
stale data (possibly zeroes) in a user space buffer, and we're
returning a smaller length than what we filled out.
Will user space care? Not realistically, no. But it's wrong, and some
user space *might* be using the buffer as a ring-buffer or something,
and assume that if we return 5 bytes from "read()", the subsequent
bytes are still valid from (previous) ring buffer fills.
But if we decide to ignore that issue (possibly with some "open()"
time flag to say "give me optimistic short reads, and I won't care"),
we still have
(b) the folio we copied from might have been released and re-used for
something else
and this is fatal. We might have optimistically copied things that are
now security-sensitive and even if we return a short read - or
overwrite it - later, user space should never have seen that data.
This (b) thing is solvable too, but requires that page cache releases
always be RCU-delayed, and they aren't.
So both are "solvable", but they are very big and very separate solutions.
Linus
On Sun, Oct 19, 2025 at 09:53:28PM -0700, Andrew Morton wrote:
> On Fri, 17 Oct 2025 15:15:36 +0100 Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> > From: Kiryl Shutsemau <kas@kernel.org>
> >
> > The protocol for page cache lookup is as follows:
> >
> > 1. Locate a folio in XArray.
> > 2. Obtain a reference on the folio using folio_try_get().
> > 3. If successful, verify that the folio still belongs to
> > the mapping and has not been truncated or reclaimed.
> > 4. Perform operations on the folio, such as copying data
> > to userspace.
> > 5. Release the reference.
> >
> > For short reads, the overhead of atomic operations on reference
> > manipulation can be significant, particularly when multiple tasks access
> > the same folio, leading to cache line bouncing.
> >
> > To address this issue, introduce i_pages_delete_seqcnt, which increments
> > each time a folio is deleted from the page cache and implement a modified
> > page cache lookup protocol for short reads:
> >
> > 1. Locate a folio in XArray.
> > 2. Take note of the i_pages_delete_seqcnt.
> > 3. Copy the data to a local buffer on the stack.
> > 4. Verify that the i_pages_delete_seqcnt has not changed.
> > 5. Copy the data from the local buffer to the iterator.
> >
> > If any issues arise in the fast path, fallback to the slow path that
> > relies on the refcount to stabilize the folio.
>
> Well this is a fun patch.
Yes! :P
> > The new approach requires a local buffer in the stack. The size of the
> > buffer determines which read requests are served by the fast path. Set
> > the buffer to 1k. This seems to be a reasonable amount of stack usage
> > for the function at the bottom of the call stack.
>
> A use case for alloca() or equiv. That would improve the average-case
> stack depth but not the worst-case.
__kstack_alloca()/__builtin_alloca() would work and it bypasses the
-Wframe-larger-than warning.
But I don't see any real users.
My understanding is that alloca() is similar in semantics to the VLAs that
we eliminated from the kernel. I am not sure it is a good idea.
The other option is to have a per-CPU buffer, but it is less cache-friendly.
> The 1k guess-or-giggle is crying out for histogramming - I bet 0.1k
> covers the great majority. I suspect it wouldn't be hard to add a new
> ad-hoc API into memory allocation profiling asking it to profile
> something like this for us, given an explicit request to do so.
Let me see what I can do.
> Is there really no way to copy the dang thing straight out to
> userspace, skip the bouncing?
I don't see a way to do it in a safe manner.
In case of a race with folio deletion we risk leaking data to userspace:
the folio we are reading from can be freed and re-allocated from under
us to some random other user. The bounce buffer lets us stabilize the data
before exposing it to userspace.
> > The fast read approach demonstrates significant performance
> > improvements, particularly in contended cases.
> >
> > 16 threads, reads from 4k file(s), mean MiB/s (StdDev)
> >
> > -------------------------------------------------------------
> > | Block | Baseline | Baseline | Patched | Patched |
> > | size | same file | diff files | same file | diff files |
> > -------------------------------------------------------------
> > | 1 | 10.96 | 27.56 | 30.42 | 30.4 |
> > | | (0.497) | (0.114) | (0.130) | (0.158) |
> > | 32 | 350.8 | 886.2 | 980.6 | 981.8 |
> > | | (13.64) | (2.863) | (3.361) | (1.303) |
> > | 256 | 2798 | 7009.6 | 7641.4 | 7653.6 |
> > | | (103.9) | (28.00) | (33.26) | (25.50) |
>
> tl;dr, 256-byte reads from the same file nearly tripled.
Yep.
> > | 1024 | 10780 | 27040 | 29280 | 29320 |
> > | | (389.8) | (89.44) | (130.3) | (83.66) |
> > | 4096 | 43700 | 103800 | 48420 | 102000 |
> > | | (1953) | (447.2) | (2012) | (0) |
> > -------------------------------------------------------------
> >
> > 16 threads, reads from 1M file(s), mean MiB/s (StdDev)
> >
> > --------------------------------------------------------------
> > | Block | Baseline | Baseline | Patched | Patched |
> > | size | same file | diff files | same file | diff files |
> > ---------------------------------------------------------
> > | 1 | 26.38 | 27.34 | 30.38 | 30.36 |
> > | | (0.998) | (0.114) | (0.083) | (0.089) |
> > | 32 | 824.4 | 877.2 | 977.8 | 975.8 |
> > | | (15.78) | (3.271) | (2.683) | (1.095) |
> > | 256 | 6494 | 6992.8 | 7619.8 | 7625 |
> > | | (116.0) | (32.39) | (10.66) | (28.19) |
> > | 1024 | 24960 | 26840 | 29100 | 29180 |
> > | | (606.6) | (151.6) | (122.4) | (83.66) |
> > | 4096 | 94420 | 100520 | 95260 | 99760 |
> > | | (3144) | (672.3) | (2874) | (134.1) |
> > | 32768 | 386000 | 402400 | 368600 | 397400 |
> > | | (36599) | (10526) | (47188) | (6107) |
> > --------------------------------------------------------------
> >
> > There's also improvement on kernel build:
> >
> > Base line: 61.3462 +- 0.0597 seconds time elapsed ( +- 0.10% )
> > Patched: 60.6106 +- 0.0759 seconds time elapsed ( +- 0.13% )
> >
> > ...
> >
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> >
> > ...
> >
> > -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
> > - ssize_t already_read)
> > +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> > + loff_t pos, char *buffer,
> > + size_t size)
> > +{
> > + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> > + struct folio *folio;
> > + loff_t file_size;
> > + unsigned int seq;
> > +
> > + lockdep_assert_in_rcu_read_lock();
> > +
> > + /* Give up and go to slow path if raced with page_cache_delete() */
> > + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> > + return false;
>
> return 0;
Ack.
> > +
> > + folio = xas_load(&xas);
> > + if (xas_retry(&xas, folio))
> > + return 0;
> > +
> > + if (!folio || xa_is_value(folio))
> > + return 0;
> > +
> > + if (!folio_test_uptodate(folio))
> > + return 0;
> > +
> > + /* No fast-case if readahead is supposed to started */
>
> Please expand this comment. "explain why, not what". Why do we care
> if it's under readahead? It's uptodate, so just grab it??
Readahead pages require kicking the readahead machinery with
filemap_readahead(), which is not appropriate for the fast path.
Will rewrite the comment.
> > + if (folio_test_readahead(folio))
> > + return 0;
> > + /* .. or mark it accessed */
>
> This comment disagrees with the code which it is commenting.
It is a continuation of the readahead comment above it. Will rewrite to make
it clear.
Similar to readahead, we don't want to call folio_mark_accessed() on the
fast path.
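Something along these lines, perhaps (sketch of the reworded comments
only, no functional change intended):

	/*
	 * A folio marked for readahead would need filemap_readahead() to
	 * kick the readahead machinery, and a folio without the referenced
	 * bit would need folio_mark_accessed(). Neither is appropriate
	 * under RCU in the fast path, so punt such folios to the slow path.
	 */
	if (folio_test_readahead(folio))
		return 0;
	if (!folio_test_referenced(folio))
		return 0;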
> > + if (!folio_test_referenced(folio))
> > + return 0;
> > +
> > + /* i_size check must be after folio_test_uptodate() */
>
> why?
There is a comment for i_size_read() in the slow path that indicates it
is required, but, to be honest, I don't fully understand the interaction
between uptodate and i_size here.
> > + file_size = i_size_read(mapping->host);
> > + if (unlikely(pos >= file_size))
> > + return 0;
> > + if (size > file_size - pos)
> > + size = file_size - pos;
>
> min() is feeling all sad?
Will make it happy. :)
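Something like this, presumably (sketch):

	file_size = i_size_read(mapping->host);
	if (unlikely(pos >= file_size))
		return 0;
	size = min_t(size_t, size, file_size - pos);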
> > + /* Do the data copy */
>
> We can live without this comment ;)
:P
> > + size = memcpy_from_file_folio(buffer, folio, pos, size);
> > + if (!size)
> > + return 0;
> > +
> > + /* Give up and go to slow path if raced with page_cache_delete() */
>
> I wonder if truncation is all we need to worry about here. For
> example, direct-io does weird stuff.
Direct I/O does page cache invalidation, which also goes via
page_cache_delete().
Reclaim also terminates in page_cache_delete() via
__filemap_remove_folio().
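For reference, the deletion side is assumed to look roughly like this
(sketch only, not the actual hunk from the patch):

/*
 * Rough sketch of the write side: bump the sequence count under the
 * xa_lock so that filemap_read_fast_rcu() can detect that a folio it
 * copied from may have been removed (by truncate, invalidation or
 * reclaim) while the copy was in flight.
 */
static void page_cache_delete(struct address_space *mapping,
			      struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);

	xas_set_order(&xas, folio->index, folio_order(folio));

	raw_write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
	xas_store(&xas, shadow);
	xas_init_marks(&xas);
	raw_write_seqcount_end(&mapping->i_pages_delete_seqcnt);

	folio->mapping = NULL;
	/* Leave folio->index set: truncation lookup relies upon it */
	mapping->nrpages -= folio_nr_pages(folio);
}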
> > + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
> > + return 0;
> > +
> > + return size;
> > +}
> > +
> > +#define FAST_READ_BUF_SIZE 1024
> > +
> > +static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
> > + ssize_t *already_read)
> > +{
> > + struct address_space *mapping = iocb->ki_filp->f_mapping;
> > + struct file_ra_state *ra = &iocb->ki_filp->f_ra;
> > + char buffer[FAST_READ_BUF_SIZE];
> > + size_t count;
> > +
> > + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
> > + return false;
>
> why? (comment please)
I don't understand the implications of flush_dcache_folio() on serialization
here. Will read up.
> > + if (iov_iter_count(iter) > sizeof(buffer))
> > + return false;
> > +
> > + count = iov_iter_count(iter);
>
> It'd be a tiny bit tidier to swap the above to avoid evaluating
> iov_iter_count() twice. Yes, iov_iter_count() happens to be fast, but
> we aren't supposed to know that here.
Okay.
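i.e. something like (sketch):

	count = iov_iter_count(iter);
	if (count > sizeof(buffer))
		return false;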
> > + /* Let's see if we can just do the read under RCU */
> > + rcu_read_lock();
> > + count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
> > + rcu_read_unlock();
> > +
> > + if (!count)
> > + return false;
> > +
> > + count = copy_to_iter(buffer, count, iter);
> > + if (unlikely(!count))
> > + return false;
> > +
> > + iocb->ki_pos += count;
> > + ra->prev_pos = iocb->ki_pos;
> > + file_accessed(iocb->ki_filp);
> > + *already_read += count;
> > +
> > + return !iov_iter_count(iter);
> > +}
> > +
> > +static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
> > + struct iov_iter *iter,
> > + ssize_t already_read)
> > {
> > struct file *filp = iocb->ki_filp;
> > struct file_ra_state *ra = &filp->f_ra;
>
--
Kiryl Shutsemau / Kirill A. Shutemov
On Mon, Oct 20, 2025 at 12:33:08PM +0100, Kiryl Shutsemau wrote:
> On Sun, Oct 19, 2025 at 09:53:28PM -0700, Andrew Morton wrote:
> > On Fri, 17 Oct 2025 15:15:36 +0100 Kiryl Shutsemau <kirill@shutemov.name> wrote:
> >
> > > From: Kiryl Shutsemau <kas@kernel.org>
> > >
> > > The protocol for page cache lookup is as follows:
> > >
> > > 1. Locate a folio in XArray.
> > > 2. Obtain a reference on the folio using folio_try_get().
> > > 3. If successful, verify that the folio still belongs to
> > >    the mapping and has not been truncated or reclaimed.

What about if it has been hole-punched?

The i_size checks after testing the folio is up to date catch races
with truncate down. This "works" because truncate_setsize() changes
i_size before we invalidate the mapping and so we don't try to access
folios that have pending invalidation.

It also catches the case where the invalidation is only a partial EOF
folio zero (e.g. truncate down within the same EOF folio). In this
case, the deletion sequence number won't catch the invalidation
because no pages are freed from the page cache. Hence reads need to
check i_size to detect this case.

However, fallocate() operations such as hole punching and extent
shifting have exactly the same partial folio invalidation problems as
truncate but they don't change i_size like truncate does (both at the
front and rear edges of the ranges being operated on).

Hence invalidation races with fallocate() operations cannot be
detected via i_size checks and we have to serialise them differently.
fallocate() also requires barriers to prevent new page cache
operations whilst the filesystem operation is in progress, so we
actually need the invalidation serialisation to also act as a page
cache instantiation barrier. This is what the mapping->invalidate_lock
provides, and I suspect that this new read fast path doesn't actually
work correctly w.r.t. fallocate() based invalidation because it can't
detect races with partial folio invalidations that are pending nor
does it take the mapping->invalidate_lock....

I also wonder if there might be other subtle races with
->remap_file_range based operations, because they also run
invalidations and need page cache instantiation barriers whilst the
operations run. At least with XFS, remap operations hold both the
inode->i_rwsem and the mapping->invalidate_lock so nobody can access
the page cache across the destination range being operated on whilst
the extent mapping underlying the file is in flux.

Given these potential issues, I really wonder if this niche fast path
is really worth the potential pain racing against these sorts of
operations could bring us. It also increases the cognitive load for
anyone trying to understand how buffered reads interact with
everything else (i.e. yet another set of race conditions we have to
worry about when thinking about truncate!), and it is not clear to me
that it is (or can be made) safe w.r.t. more complex invalidation
interactions that filesystems have to handle these days.

So: is the benefit for this niche workload really worth the
additional complexity it adds to what is already a very complex set
of behaviours and interactions?

> > > +	if (!folio_test_referenced(folio))
> > > +		return 0;
> > > +
> > > +	/* i_size check must be after folio_test_uptodate() */
> >
> > why?
>
> There is a comment for i_size_read() in the slow path that indicates it
> is required, but, to be honest, I don't fully understand the interaction
> between uptodate and i_size here.

As per above, it's detecting a race with a concurrent truncate that is
about to invalidate the folio but hasn't yet got to that folio in the
mapping. This is where we'd also need to detect pending fallocate() or
other invalidations that are in progress, but there's no way to do
that easily....

-Dave.

-- 
Dave Chinner
david@fromorbit.com
On Tue, 21 Oct 2025 at 13:39, Dave Chinner <david@fromorbit.com> wrote:
>
> > > > 1. Locate a folio in XArray.
> > > > 2. Obtain a reference on the folio using folio_try_get().
> > > > 3. If successful, verify that the folio still belongs to
> > > > the mapping and has not been truncated or reclaimed.
>
> What about if it has been hole-punched?
The sequence number check should take care of anything like that. Do
you have any reason to believe it doesn't?
Yes, you can get the "before or after or between" behavior, but you
can get that with perfectly regular reads that take the refcount on
the page.
Reads have never taken the page lock, and have never been serialized that way.
So the fast case changes absolutely nothing in this respect that I can see.
Linus
On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> On Tue, 21 Oct 2025 at 13:39, Dave Chinner <david@fromorbit.com> wrote:
> >
> > > > > 1. Locate a folio in XArray.
> > > > > 2. Obtain a reference on the folio using folio_try_get().
> > > > > 3. If successful, verify that the folio still belongs to
> > > > >    the mapping and has not been truncated or reclaimed.
> >
> > What about if it has been hole-punched?
>
> The sequence number check should take care of anything like that. Do
> you have any reason to believe it doesn't?

Invalidation doing partial folio zeroing isn't covered by the page
cache delete sequence number.

> Yes, you can get the "before or after or between" behavior, but you
> can get that with perfectly regular reads that take the refcount on
> the page.

Yes, and it is the "in between" behaviour that is the problem here.

Hole punching (and all the other fallocate() operations) are
supposed to be atomic w.r.t. user IO. i.e. you should see either the
non-punched data or the punched data, never a mix of the two. A mix
of the two is a transient data corruption event....

This invalidation race does not exist on XFS, even on this new fast
path. We protect all buffered reads with the inode->i_rwsem, so we
guarantee they can't race with fallocate() operations performing
invalidations because fallocate() takes the i_rwsem exclusively. This
IO exclusion model was baked into the XFS locking architecture over
30 years ago.

The problem is the other filesystems don't use this sort of IO
exclusion model (ext4, btrfs, etc) but instead use the page cache
folio locking only to avoid concurrent modification to the same file
range. Hence they are exposed to this transient state because they
rely on folio locks for arbitrating concurrent access to the page
cache and buffered reads run completely unlocked. i.e. because....

> Reads have never taken the page lock, and have never been serialized that way.

... they are exposed to transient data states in the page cache
during invalidation operations. The i_size checks avoid these
transient states for truncation, but there are no checks that can be
made to avoid them for other sources of invalidation operations.

The mapping->invalidate_lock only prevents page cache instantiation
from occurring, allowing filesystems to create a barrier that
prevents page cache repopulation after invalidation until the
invalidate lock is dropped. This allows them to complete the
fallocate() operation before exposing the result to users. However,
it does not prevent buffered read cache hits racing with overlapping
invalidation operations, and that's the problem I'm pointing out that
this new fast path will still hit, even though it tries to
bounce-buffer its way around transient states.

So, yes, you are right when you say:

> So the fast case changes absolutely nothing in this respect that I can see.

But that does not make the existing page cache behaviour desirable.
Reading corrupt data faster is still reading corrupt data :/

Really, though, I'm only mentioning this stuff because both the
author of the patch and the reviewer did not seem to know how i_size
is used in this code to avoid truncate races. Truncate races are the
simple part of the problem; hole punching is a lot harder to get
right. If the author hasn't thought about it, it is likely there are
subtle bugs lurking....

-Dave.

-- 
Dave Chinner
david@fromorbit.com
On Tue, 21 Oct 2025 at 22:00, Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> >
> > The sequence number check should take care of anything like that. Do
> > you have any reason to believe it doesn't?
>
> Invalidation doing partial folio zeroing isn't covered by the page
> cache delete sequence number.
Correct - but neither is it covered by anything else in the *regular* read path.
So the sequence number protects against the same case that the
reference count protects against: hole punching removing the whole
page.
Partial page hole-punching will fundamentally show half-way things.
> > Yes, you can get the "before or after or between" behavior, but you
> > can get that with perfectly regular reads that take the refcount on
> > the page.
>
> Yes, and it is the "in between" behaviour that is the problem here.
>
> Hole punching (and all the other fallocate() operations) are
> supposed to be atomic w.r.t. user IO. i.e. you should see either the
> non-punched data or the punched data, never a mix of the two. A mix
> of the two is a transient data corruption event....
That "supposed" comes from documentation that has never been true and
as such is just a bedtime story.
And no, I'd argue that it's not even documenting desirable behavior,
because that bedtime story has never been true because it's
prohibitively expensive.
In some cases the documentation may have been historically "more true"
than it is today just because the documentation was written so long
ago that people used a single lock for everything (not talking about
the Linux big kernel lock, but about old BSD model of "single inode
lock for all IO").
End result: you say it would be desirable, and that might be true in a
high-level way when you ignore other issues.
POSIX is full of these bedtime stories that depend on a simplified
version of the truth, where the simplifications means that the
documentation just approximates reality at a high level.
I think it would be much better to fix the documentation, but that's
generally out of our hands.
Linus
On Wed, Oct 22, 2025 at 05:31:12AM -1000, Linus Torvalds wrote:
> On Tue, 21 Oct 2025 at 22:00, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> > >
> > > The sequence number check should take care of anything like that. Do
> > > you have any reason to believe it doesn't?
> >
> > Invalidation doing partial folio zeroing isn't covered by the page
> > cache delete sequence number.
>
> Correct - but neither is it covered by anything else in the *regular* read path.
>
> So the sequence number protects against the same case that the
> reference count protects against: hole punching removing the whole
> page.
>
> Partial page hole-punching will fundamentally show half-way things.

Only when you have a busted implementation of the spec.

Think about it: if I said "partial page truncation will
fundamentally show half-way things", you would shout at me that
truncate must -never- expose half-way things to buffered reads.
This is how truncate is specified to behave, and we don't violate
the spec just because it is hard to implement it.

We've broken truncate repeatedly over the past 20+ years in ways
that have exposed stale data to users. This is always considered a
critical bug that needs to be fixed ASAP.

Hole punching is documented to require the same behaviour as
truncate and, like truncate, any breakage of that is a potential
stale kernel or physical data exposure event.

Why will we not tolerate data corruption bugs in truncate due to
page cache races, yet be told that we are supposed to just ignore
those same data corruption races during fallocate operations? This
seems like a double standard to me.

Also, keep in mind that XFS isn't exposed to these bugs, so I have
no real skin in the game here. People need to be made aware of data
integrity issues that are noticed, not have them swept under the rug
with dubious reasoning.

> > > Yes, you can get the "before or after or between" behavior, but you
> > > can get that with perfectly regular reads that take the refcount on
> > > the page.
> >
> > Yes, and it is the "in between" behaviour that is the problem here.
> >
> > Hole punching (and all the other fallocate() operations) are
> > supposed to be atomic w.r.t. user IO. i.e. you should see either the
> > non-punched data or the punched data, never a mix of the two. A mix
> > of the two is a transient data corruption event....
>
> That "supposed" comes from documentation that has never been true and
> as such is just a bedtime story.

I'll give you the benefit of the doubt here, because you are not an
expert in the field. That is, I think you are conflating POSIX
buffered read/write syscall atomicity with fallocate() requirements.
They are not the same thing.

The fallocate() atomicity requirements come from the low level data
coherency/integrity requirements of the operations being performed,
not from some POSIX spec.

e.g. FALLOC_FL_ZERO_RANGE turns a range of a file to zeros. The
implementation of this can vary. The filesystem can:

- write zeros through the page cache. Users have asked us not to do
  this but return -EOPNOTSUPP so they can choose the fallback
  mechanism themselves.
- use block device offloads like WRITE_SAME(0) to have the device
  physically zero the range.
- use DISCARD blockdev operations if the device guarantees discarded
  regions return zeros.
- punch out all the extents, leaving a hole.
- punch out all the extents, then preallocate new unwritten extents,
  thereby defragmenting the range at the same time.
- preallocate new unwritten extents over holes and convert existing
  written extents to unwritten.

All of these are valid implementations, and they are all multi-step
operations that could expose partial completion to userspace if
there is no IO serialisation against the ZERO_RANGE operation.

Hence there is really only one behaviour that is required: whilst
the low level operation is taking place, no external IO (read,
write, discard, etc) can be performed over that range of the file
being zeroed because the data and/or metadata is not stable until
the whole operation is completed by the filesystem.

Now, this doesn't obviously read on the initial invalidation races
that are the issue being discussed here because zeros written by
invalidation could be considered "valid" for hole punch, zero range,
etc.

However, consider COLLAPSE_RANGE. Page cache invalidation writing
zeros and reads racing with that is a problem, because the old data
at a given offset is non-zero, whilst the new data at the same
offset is also non-zero.

Hence if we allow the initial page cache invalidation to race with
buffered reads, there is the possibility of random zeros appearing
in the data being read. Because this is not old or new data, it is
-corrupt- data.

Put simply, these fallocate operations should *never* see partial
invalidation data, and so the "old or new data" rule *must* apply to
the initial page cache invalidation these fallocate() operations do.

Hence various fallocate() operations need to act as a full IO
barrier. Buffered IO, page faults and direct IO all must be blocked
and drained before the invalidation of the range begins, and must
not be allowed to start again until after the whole operation
completes.

IOWs, IO exclusion is a data integrity requirement for fallocate()
operations to prevent users from being exposed to transient data
corruption races whilst the fallocate() operation is being
performed. It has nothing to do with POSIX in any way...

Also, keep in mind that fallocate() is only one vector to this race
condition. Multiple filesystems have multiple invalidation vectors
to this race condition. There are similar issues with custom extent
swap ioctls (e.g. for online defragmentation), ensuring
->remap_file_range() and ->copy_file_range() implementations have
exclusive access to the file data and/or metadata they are
manipulating (e.g. offload to hardware copy engines), and so on.
fallocate() is just a convenient example to use because multiple
filesystems implement it, lots of userspace applications use it and
lots of people know about it. It is not niche functionality anymore,
so people should be paying close attention when potential data
corruption vectors are identified in these operations...

> And no, I'd argue that it's not even documenting desirable behavior,
> because that bedtime story has never been true because it's
> prohibitively expensive.

I disagree, and so do the numbers - IO path exclusion is pretty
cheap for the most part, and not a performance limiting requirement.
e.g. over a range of different benchmarks on 6.15:

https://www.phoronix.com/review/linux-615-filesystems/6

i.e. XFS is 27% faster overall than ext4 and btrfs on modern NVMe
SSDs. Numbers don't lie, and these numbers indicate that the IO path
exclusion that XFS implements for avoiding invalidation races
doesn't have any impact on typical production workloads...

That said, I am most definitely not suggesting that the XFS IO path
exclusion is the solution for the specific buffered read vs cache
invalidation race I pointed out. It's just one example of how it can
be solved with little in the way of runtime overhead.

I suspect the invalidation race could be detected at the page cache
layer with a mark-and-sweep invalidation algorithm using xarray tags
kinda like writeback already does. Those tags could be detected in
the read path at little extra cost (the xarray node is already hot
in cache) which can then punt the folio to a slow path that does the
right thing. If there's no invalidation tag present, then the fast
path can safely keep doing its fast path things (see the rough
read-side sketch below)...

> I think it would be much better to fix the documentation, but that's
> generally out of our hands.

We control the fallocate() specification and documentation
ourselves. But that's not the problem here; the problem is that the
cached data invalidation mechanism used by fallocate operations does
not prevent buffered reads from racing against invalidation and
exposing invalid transient data to users...

-Dave.

-- 
Dave Chinner
david@fromorbit.com
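To make that concrete, a very rough sketch of what the read-side check
could look like (PAGECACHE_TAG_INVALIDATE is hypothetical, no such mark
exists today, and the mark-and-sweep side is not shown):

	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		return 0;

	/*
	 * Hypothetical: an invalidation operation would set this mark
	 * over the range before it starts zeroing/removing folios and
	 * clear it when done; seeing it here punts the read to the
	 * slow path.
	 */
	if (xas_get_mark(&xas, PAGECACHE_TAG_INVALIDATE))
		return 0;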
On Thu 23-10-25 18:50:46, Dave Chinner wrote:
> On Wed, Oct 22, 2025 at 05:31:12AM -1000, Linus Torvalds wrote:
> > On Tue, 21 Oct 2025 at 22:00, Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> > > >
> > > > The sequence number check should take care of anything like that. Do
> > > > you have any reason to believe it doesn't?
> > >
> > > Invalidation doing partial folio zeroing isn't covered by the page
> > > cache delete sequence number.
> >
> > Correct - but neither is it covered by anything else in the *regular* read path.
> >
> > So the sequence number protects against the same case that the
> > reference count protects against: hole punching removing the whole
> > page.
> >
> > Partial page hole-punching will fundamentally show half-way things.
>
> Only when you have a busted implementation of the spec.
>
> Think about it: if I said "partial page truncation will
> fundamentally show half-way things", you would shout at me that
> truncate must -never- expose half-way things to buffered reads.
> This is how truncate is specified to behave, and we don't violate
> the spec just because it is hard to implement it.

Well, as a matter of fact we can expose part-way results of truncate for
ext4 and similar filesystems that do not serialize reads against truncate
with the inode lock. In particular for ext4 there's the i_size check in
filemap_read(), but if that passes before the truncate starts, the code
copying out data from the pages can race with truncate zeroing out the
tail of the last page.

> We've broken truncate repeatedly over the past 20+ years in ways
> that have exposed stale data to users. This is always considered a
> critical bug that needs to be fixed ASAP.

Exposing data that was never in the file is certainly a critical bug.
Showing a mix of old and new data is not great but less severe, and it
seems over the years userspace on Linux learned to live with it and reap
the performance benefit (e.g. for mixed read-write workloads to one
file)...

<snip>

> Hence there is really only one behaviour that is required: whilst
> the low level operation is taking place, no external IO (read,
> write, discard, etc) can be performed over that range of the file
> being zeroed because the data and/or metadata is not stable until
> the whole operation is completed by the filesystem.
>
> Now, this doesn't obviously read on the initial invalidation races
> that are the issue being discussed here because zeros written by
> invalidation could be considered "valid" for hole punch, zero range,
> etc.
>
> However, consider COLLAPSE_RANGE. Page cache invalidation writing
> zeros and reads racing with that is a problem, because the old data
> at a given offset is non-zero, whilst the new data at the same
> offset is also non-zero.
>
> Hence if we allow the initial page cache invalidation to race with
> buffered reads, there is the possibility of random zeros appearing
> in the data being read. Because this is not old or new data, it is
> -corrupt- data.

Well, reasons like this are why for operations like COLLAPSE_RANGE ext4
reclaims the whole interval of the page cache starting with the first
affected folio to the end. So again the user will either see old data (if
it managed to get the page before we invalidated the page cache) or the
new data (when it needs to read from the disk, which is properly
synchronized with COLLAPSE_RANGE through invalidate_lock). I don't see
these speculative accesses changing anything in this case either.

> Put simply, these fallocate operations should *never* see partial
> invalidation data, and so the "old or new data" rule *must* apply to
> the initial page cache invalidation these fallocate() operations do.
>
> Hence various fallocate() operations need to act as a full IO
> barrier. Buffered IO, page faults and direct IO all must be blocked
> and drained before the invalidation of the range begins, and must
> not be allowed to start again until after the whole operation
> completes.

Hum, I'm not sure I follow you correctly, but what you describe doesn't
seem like how ext4 works. There are two different things - zeroing out of
partial folios affected by truncate, hole punch, zero range (other
fallocate operations don't zero out) and invalidation of the page cache
folios. For ext4 it is actually the removal of folios from the page cache
during invalidation + holding invalidate_lock that synchronizes with
reads. As such, zeroing of partial folios *can* actually race with reads
within these partial folios and so you can get a mix of zeros and old
data from reads.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
On Mon, 20 Oct 2025 at 01:33, Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> On Sun, Oct 19, 2025 at 09:53:28PM -0700, Andrew Morton wrote:
> >
> > A use case for alloca() or equiv. That would improve the average-case
> > stack depth but not the worst-case.
>
> __kstack_alloca()/__builtin_alloca() would work and it bypasses the
> -Wframe-larger-than warning.
>
> But I don't see any real users.
Yes, and we've walked away from alloca() (and on-stack VLAs, which are
really exactly the same thing as far as a compiler is concerned),
because it makes static analysis much *MUCH* harder.
Let's not ever re-introduce dynamic stack use in the kernel.
Linus
On Fri, 17 Oct 2025 at 04:15, Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> To address this issue, introduce i_pages_delete_seqcnt, which increments
> each time a folio is deleted from the page cache and implement a modified
> page cache lookup protocol for short reads:
So this patch looks good to me, but to avoid the stack size warnings,
let's just make FAST_READ_BUF_SIZE be 768 bytes or something like
that, not the full 1k.
It really shouldn't make much of a difference, and we do have that
stack size limit check for a reason.
And obviously
> --- a/fs/inode.c
> +++ b/fs/inode.c
> + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
> + &mapping->i_pages->xa_lock);
will need to use '&mapping->i_pages.xa_lock', since mapping->i_pages
is the embedded xarray, not a pointer to it.
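i.e. something like (untested):

	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
	seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
			       &mapping->i_pages.xa_lock);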
But I do think the patch looks quite good.
Linus
On Sat, Oct 18, 2025 at 07:56:48AM -1000, Linus Torvalds wrote:
> On Fri, 17 Oct 2025 at 04:15, Kiryl Shutsemau <kirill@shutemov.name> wrote:
> >
> > To address this issue, introduce i_pages_delete_seqcnt, which increments
> > each time a folio is deleted from the page cache and implement a modified
> > page cache lookup protocol for short reads:
>
> So this patch looks good to me, but to avoid the stack size warnings,
> let's just make FAST_READ_BUF_SIZE be 768 bytes or something like
> that, not the full 1k.
>
> It really shouldn't make much of a difference, and we do have that
> stack size limit check for a reason.
My reasoning is that we are at the leaf of the call chain. The slow path
goes much deeper, down to I/O.
Reducing the buffer size would invalidate my benchmarking :/
It took time.
What about disabling the warning for the function?
@@ -2750,6 +2750,8 @@ static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
 
 #define FAST_READ_BUF_SIZE 1024
 
+__diag_push();
+__diag_ignore_all("-Wframe-larger-than=", "Allow on-stack buffer for fast read");
 static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
 				       ssize_t *already_read)
 {
@@ -2785,6 +2787,7 @@ static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter
 	return !iov_iter_count(iter);
 }
+__diag_pop();
 
 static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
 					  struct iov_iter *iter,
 					  ssize_t already_read)
> And obviously
>
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
> > + &mapping->i_pages->xa_lock);
>
> will need to use '&mapping->i_pages.xa_lock', since mapping->i_pages
> is the embedded xarray, not a pointer to it.
Doh!
--
Kiryl Shutsemau / Kirill A. Shutemov
Hi Kiryl,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Kiryl-Shutsemau/mm-filemap-Implement-fast-short-reads/20251017-221655
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251017141536.577466-1-kirill%40shutemov.name
patch subject: [PATCH] mm/filemap: Implement fast short reads
config: i386-defconfig (https://download.01.org/0day-ci/archive/20251018/202510181215.jcL2gJMQ-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251018/202510181215.jcL2gJMQ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510181215.jcL2gJMQ-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> mm/filemap.c:2753:22: warning: stack frame size (1096) exceeds limit (1024) in 'filemap_read_fast' [-Wframe-larger-than]
2753 | static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
| ^
1 warning generated.
vim +/filemap_read_fast +2753 mm/filemap.c
2752
> 2753 static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
2754 ssize_t *already_read)
2755 {
2756 struct address_space *mapping = iocb->ki_filp->f_mapping;
2757 struct file_ra_state *ra = &iocb->ki_filp->f_ra;
2758 char buffer[FAST_READ_BUF_SIZE];
2759 size_t count;
2760
2761 if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
2762 return false;
2763
2764 if (iov_iter_count(iter) > sizeof(buffer))
2765 return false;
2766
2767 count = iov_iter_count(iter);
2768
2769 /* Let's see if we can just do the read under RCU */
2770 rcu_read_lock();
2771 count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
2772 rcu_read_unlock();
2773
2774 if (!count)
2775 return false;
2776
2777 count = copy_to_iter(buffer, count, iter);
2778 if (unlikely(!count))
2779 return false;
2780
2781 iocb->ki_pos += count;
2782 ra->prev_pos = iocb->ki_pos;
2783 file_accessed(iocb->ki_filp);
2784 *already_read += count;
2785
2786 return !iov_iter_count(iter);
2787 }
2788
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Kiryl,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Kiryl-Shutsemau/mm-filemap-Implement-fast-short-reads/20251017-221655
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251017141536.577466-1-kirill%40shutemov.name
patch subject: [PATCH] mm/filemap: Implement fast short reads
config: i386-allmodconfig (https://download.01.org/0day-ci/archive/20251018/202510181107.YiEpGvU3-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251018/202510181107.YiEpGvU3-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510181107.YiEpGvU3-lkp@intel.com/
All warnings (new ones prefixed by >>):
mm/filemap.c: In function 'filemap_read_fast':
>> mm/filemap.c:2787:1: warning: the frame size of 1048 bytes is larger than 1024 bytes [-Wframe-larger-than=]
2787 | }
| ^
vim +2787 mm/filemap.c
2752
2753 static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
2754 ssize_t *already_read)
2755 {
2756 struct address_space *mapping = iocb->ki_filp->f_mapping;
2757 struct file_ra_state *ra = &iocb->ki_filp->f_ra;
2758 char buffer[FAST_READ_BUF_SIZE];
2759 size_t count;
2760
2761 if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
2762 return false;
2763
2764 if (iov_iter_count(iter) > sizeof(buffer))
2765 return false;
2766
2767 count = iov_iter_count(iter);
2768
2769 /* Let's see if we can just do the read under RCU */
2770 rcu_read_lock();
2771 count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
2772 rcu_read_unlock();
2773
2774 if (!count)
2775 return false;
2776
2777 count = copy_to_iter(buffer, count, iter);
2778 if (unlikely(!count))
2779 return false;
2780
2781 iocb->ki_pos += count;
2782 ra->prev_pos = iocb->ki_pos;
2783 file_accessed(iocb->ki_filp);
2784 *already_read += count;
2785
2786 return !iov_iter_count(iter);
> 2787 }
2788
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Kiryl,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Kiryl-Shutsemau/mm-filemap-Implement-fast-short-reads/20251017-221655
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251017141536.577466-1-kirill%40shutemov.name
patch subject: [PATCH] mm/filemap: Implement fast short reads
config: riscv-randconfig-001-20251018 (https://download.01.org/0day-ci/archive/20251018/202510181054.Fmf1S18u-lkp@intel.com/config)
compiler: riscv32-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251018/202510181054.Fmf1S18u-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510181054.Fmf1S18u-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from include/linux/sched.h:45,
from include/linux/rcupdate.h:27,
from include/linux/rculist.h:11,
from include/linux/dcache.h:8,
from include/linux/fs.h:9,
from fs/inode.c:7:
fs/inode.c: In function '__address_space_init_once':
>> fs/inode.c:486:28: error: invalid type argument of '->' (have 'struct xarray')
&mapping->i_pages->xa_lock);
^~
include/linux/seqlock_types.h:57:26: note: in definition of macro '__SEQ_LOCK'
#define __SEQ_LOCK(expr) expr
^~~~
include/linux/seqlock.h:131:42: note: in expansion of macro 'seqcount_LOCKNAME_init'
#define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock)
^~~~~~~~~~~~~~~~~~~~~~
fs/inode.c:485:2: note: in expansion of macro 'seqcount_spinlock_init'
seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
^~~~~~~~~~~~~~~~~~~~~~
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for ARCH_HAS_ELF_CORE_EFLAGS
Depends on [n]: BINFMT_ELF [=n] && ELF_CORE [=n]
Selected by [y]:
- RISCV [=y]
vim +486 fs/inode.c
481
482 static void __address_space_init_once(struct address_space *mapping)
483 {
484 xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
485 seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
> 486 &mapping->i_pages->xa_lock);
487 init_rwsem(&mapping->i_mmap_rwsem);
488 INIT_LIST_HEAD(&mapping->i_private_list);
489 spin_lock_init(&mapping->i_private_lock);
490 mapping->i_mmap = RB_ROOT_CACHED;
491 }
492
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki