mm/filemap: Implement fast short reads

[PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 3 weeks ago

From: Kiryl Shutsemau <kas@kernel.org>

The protocol for page cache lookup is as follows:

  1. Locate a folio in XArray.
  2. Obtain a reference on the folio using folio_try_get().
  3. If successful, verify that the folio still belongs to
     the mapping and has not been truncated or reclaimed.
  4. Perform operations on the folio, such as copying data
     to userspace.
  5. Release the reference.

For short reads, the overhead of atomic operations on reference
manipulation can be significant, particularly when multiple tasks access
the same folio, leading to cache line bouncing.

To address this issue, introduce i_pages_delete_seqcnt, which increments
each time a folio is deleted from the page cache and implement a modified
page cache lookup protocol for short reads:

  1. Locate a folio in XArray.
  2. Take note of the i_pages_delete_seqcnt.
  3. Copy the data to a local buffer on the stack.
  4. Verify that the i_pages_delete_seqcnt has not changed.
  5. Copy the data from the local buffer to the iterator.

If any issues arise in the fast path, fallback to the slow path that
relies on the refcount to stabilize the folio.

The new approach requires a local buffer in the stack. The size of the
buffer determines which read requests are served by the fast path. Set
the buffer to 1k. This seems to be a reasonable amount of stack usage
for the function at the bottom of the call stack.

The fast read approach demonstrates significant performance
improvements, particularly in contended cases.

16 threads, reads from 4k file(s), mean MiB/s (StdDev)

 -------------------------------------------------------------
| Block |  Baseline  |  Baseline   |  Patched   |  Patched    |
| size  |  same file |  diff files |  same file | diff files  |
 -------------------------------------------------------------
|     1 |    10.96   |     27.56   |    30.42   |     30.4    |
|       |    (0.497) |     (0.114) |    (0.130) |     (0.158) |
|    32 |   350.8    |    886.2    |   980.6    |    981.8    |
|       |   (13.64)  |     (2.863) |    (3.361) |     (1.303) |
|   256 |  2798      |   7009.6    |  7641.4    |   7653.6    |
|       |  (103.9)   |    (28.00)  |   (33.26)  |    (25.50)  |
|  1024 | 10780      |  27040      | 29280      |  29320      |
|       |  (389.8)   |    (89.44)  |  (130.3)   |    (83.66)  |
|  4096 | 43700      | 103800      | 48420      | 102000      |
|       | (1953)     |    (447.2)  | (2012)     |     (0)     |
 -------------------------------------------------------------

16 threads, reads from 1M file(s), mean MiB/s (StdDev)

 --------------------------------------------------------------
| Block |  Baseline   |  Baseline   |  Patched    |  Patched   |
| size  |  same file  |  diff files |  same file  | diff files |
 ---------------------------------------------------------
|     1 |     26.38   |     27.34   |     30.38   |    30.36   |
|       |     (0.998) |     (0.114) |     (0.083) |    (0.089) |
|    32 |    824.4    |    877.2    |    977.8    |   975.8    |
|       |    (15.78)  |     (3.271) |     (2.683) |    (1.095) |
|   256 |   6494      |   6992.8    |   7619.8    |   7625     |
|       |   (116.0)   |    (32.39)  |    (10.66)  |    (28.19) |
|  1024 |  24960      |  26840      |  29100      |  29180     |
|       |   (606.6)   |   (151.6)   |   (122.4)   |    (83.66) |
|  4096 |  94420      | 100520      |  95260      |  99760     |
|       |  (3144)     |   (672.3)   |  (2874)     |   (134.1)  |
| 32768 | 386000      | 402400      | 368600      | 397400     |
|       | (36599)     | (10526)     | (47188)     |  (6107)    |
 --------------------------------------------------------------

There's also improvement on kernel build:

Base line: 61.3462 +- 0.0597 seconds time elapsed  ( +-  0.10% )
Patched:   60.6106 +- 0.0759 seconds time elapsed  ( +-  0.13% )

Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
 fs/inode.c         |   2 +
 include/linux/fs.h |   1 +
 mm/filemap.c       | 150 ++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 130 insertions(+), 23 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..52163d28d630 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink);
 static void __address_space_init_once(struct address_space *mapping)
 {
 	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+	seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
+			       &mapping->i_pages->xa_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->i_private_list);
 	spin_lock_init(&mapping->i_private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..c9588d555f73 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -523,6 +523,7 @@ struct address_space {
 	struct list_head	i_private_list;
 	struct rw_semaphore	i_mmap_rwsem;
 	void *			i_private_data;
+	seqcount_spinlock_t	i_pages_delete_seqcnt;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..51689c4f3773 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping,
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
+	write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
 	xas_store(&xas, shadow);
 	xas_init_marks(&xas);
+	write_seqcount_end(&mapping->i_pages_delete_seqcnt);
 
 	folio->mapping = NULL;
 	/* Leave folio->index set: truncation lookup relies upon it */
@@ -2695,21 +2697,98 @@ static void filemap_end_dropbehind_read(struct folio *folio)
 	}
 }
 
-/**
- * filemap_read - Read data from the page cache.
- * @iocb: The iocb to read.
- * @iter: Destination for the data.
- * @already_read: Number of bytes already read by the caller.
- *
- * Copies data from the page cache.  If the data is not currently present,
- * uses the readahead and read_folio address_space operations to fetch it.
- *
- * Return: Total number of bytes copied, including those already read by
- * the caller.  If an error happens before any bytes are copied, returns
- * a negative error number.
- */
-ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
-		ssize_t already_read)
+static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
+						  loff_t pos, char *buffer,
+						  size_t size)
+{
+	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
+	struct folio *folio;
+	loff_t file_size;
+	unsigned int seq;
+
+	lockdep_assert_in_rcu_read_lock();
+
+	/* Give up and go to slow path if raced with page_cache_delete() */
+	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
+		return false;
+
+	folio = xas_load(&xas);
+	if (xas_retry(&xas, folio))
+		return 0;
+
+	if (!folio || xa_is_value(folio))
+		return 0;
+
+	if (!folio_test_uptodate(folio))
+		return 0;
+
+	/* No fast-case if readahead is supposed to started */
+	if (folio_test_readahead(folio))
+		return 0;
+	/* .. or mark it accessed */
+	if (!folio_test_referenced(folio))
+		return 0;
+
+	/* i_size check must be after folio_test_uptodate() */
+	file_size = i_size_read(mapping->host);
+	if (unlikely(pos >= file_size))
+		return 0;
+	if (size > file_size - pos)
+		size = file_size - pos;
+
+	/* Do the data copy */
+	size = memcpy_from_file_folio(buffer, folio, pos, size);
+	if (!size)
+		return 0;
+
+	/* Give up and go to slow path if raced with page_cache_delete() */
+	if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
+		return 0;
+
+	return size;
+}
+
+#define FAST_READ_BUF_SIZE 1024
+
+static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
+				       ssize_t *already_read)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+	char buffer[FAST_READ_BUF_SIZE];
+	size_t count;
+
+	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+		return false;
+
+	if (iov_iter_count(iter) > sizeof(buffer))
+		return false;
+
+	count = iov_iter_count(iter);
+
+	/* Let's see if we can just do the read under RCU */
+	rcu_read_lock();
+	count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
+	rcu_read_unlock();
+
+	if (!count)
+		return false;
+
+	count = copy_to_iter(buffer, count, iter);
+	if (unlikely(!count))
+		return false;
+
+	iocb->ki_pos += count;
+	ra->prev_pos = iocb->ki_pos;
+	file_accessed(iocb->ki_filp);
+	*already_read += count;
+
+	return !iov_iter_count(iter);
+}
+
+static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
+					  struct iov_iter *iter,
+					  ssize_t already_read)
 {
 	struct file *filp = iocb->ki_filp;
 	struct file_ra_state *ra = &filp->f_ra;
@@ -2721,14 +2800,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t isize, end_offset;
 	loff_t last_pos = ra->prev_pos;
 
-	if (unlikely(iocb->ki_pos < 0))
-		return -EINVAL;
-	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
-		return 0;
-	if (unlikely(!iov_iter_count(iter)))
-		return 0;
-
-	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
 	folio_batch_init(&fbatch);
 
 	do {
@@ -2821,6 +2892,39 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 	ra->prev_pos = last_pos;
 	return already_read ? already_read : error;
 }
+
+/**
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
+ *
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and read_folio address_space operations to fetch it.
+ *
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
+ */
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		     ssize_t already_read)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	if (unlikely(iocb->ki_pos < 0))
+		return -EINVAL;
+	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
+		return 0;
+	if (unlikely(!iov_iter_count(iter)))
+		return 0;
+
+	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
+
+	if (filemap_read_fast(iocb, iter, &already_read))
+		return already_read;
+
+	return filemap_read_slow(iocb, iter, already_read);
+}
 EXPORT_SYMBOL_GPL(filemap_read);
 
 int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
-- 
2.50.1

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 2 weeks ago

On 17.10.25 16:15, Kiryl Shutsemau wrote:
> From: Kiryl Shutsemau <kas@kernel.org>
> 
> The protocol for page cache lookup is as follows:
> 
>    1. Locate a folio in XArray.
>    2. Obtain a reference on the folio using folio_try_get().
>    3. If successful, verify that the folio still belongs to
>       the mapping and has not been truncated or reclaimed.
>    4. Perform operations on the folio, such as copying data
>       to userspace.
>    5. Release the reference.
> 
> For short reads, the overhead of atomic operations on reference
> manipulation can be significant, particularly when multiple tasks access
> the same folio, leading to cache line bouncing.
> 
> To address this issue, introduce i_pages_delete_seqcnt, which increments
> each time a folio is deleted from the page cache and implement a modified
> page cache lookup protocol for short reads:
> 
>    1. Locate a folio in XArray.
>    2. Take note of the i_pages_delete_seqcnt.
>    3. Copy the data to a local buffer on the stack.
>    4. Verify that the i_pages_delete_seqcnt has not changed.
>    5. Copy the data from the local buffer to the iterator.
> 
> If any issues arise in the fast path, fallback to the slow path that
> relies on the refcount to stabilize the folio.
> 
> The new approach requires a local buffer in the stack. The size of the
> buffer determines which read requests are served by the fast path. Set
> the buffer to 1k. This seems to be a reasonable amount of stack usage
> for the function at the bottom of the call stack.
> 
> The fast read approach demonstrates significant performance
> improvements, particularly in contended cases.
> 
> 16 threads, reads from 4k file(s), mean MiB/s (StdDev)
> 
>   -------------------------------------------------------------
> | Block |  Baseline  |  Baseline   |  Patched   |  Patched    |
> | size  |  same file |  diff files |  same file | diff files  |
>   -------------------------------------------------------------
> |     1 |    10.96   |     27.56   |    30.42   |     30.4    |
> |       |    (0.497) |     (0.114) |    (0.130) |     (0.158) |
> |    32 |   350.8    |    886.2    |   980.6    |    981.8    |
> |       |   (13.64)  |     (2.863) |    (3.361) |     (1.303) |
> |   256 |  2798      |   7009.6    |  7641.4    |   7653.6    |
> |       |  (103.9)   |    (28.00)  |   (33.26)  |    (25.50)  |
> |  1024 | 10780      |  27040      | 29280      |  29320      |
> |       |  (389.8)   |    (89.44)  |  (130.3)   |    (83.66)  |
> |  4096 | 43700      | 103800      | 48420      | 102000      |
> |       | (1953)     |    (447.2)  | (2012)     |     (0)     |
>   -------------------------------------------------------------
> 
> 16 threads, reads from 1M file(s), mean MiB/s (StdDev)
> 
>   --------------------------------------------------------------
> | Block |  Baseline   |  Baseline   |  Patched    |  Patched   |
> | size  |  same file  |  diff files |  same file  | diff files |
>   ---------------------------------------------------------
> |     1 |     26.38   |     27.34   |     30.38   |    30.36   |
> |       |     (0.998) |     (0.114) |     (0.083) |    (0.089) |
> |    32 |    824.4    |    877.2    |    977.8    |   975.8    |
> |       |    (15.78)  |     (3.271) |     (2.683) |    (1.095) |
> |   256 |   6494      |   6992.8    |   7619.8    |   7625     |
> |       |   (116.0)   |    (32.39)  |    (10.66)  |    (28.19) |
> |  1024 |  24960      |  26840      |  29100      |  29180     |
> |       |   (606.6)   |   (151.6)   |   (122.4)   |    (83.66) |
> |  4096 |  94420      | 100520      |  95260      |  99760     |
> |       |  (3144)     |   (672.3)   |  (2874)     |   (134.1)  |
> | 32768 | 386000      | 402400      | 368600      | 397400     |
> |       | (36599)     | (10526)     | (47188)     |  (6107)    |
>   --------------------------------------------------------------
> 
> There's also improvement on kernel build:
> 
> Base line: 61.3462 +- 0.0597 seconds time elapsed  ( +-  0.10% )
> Patched:   60.6106 +- 0.0759 seconds time elapsed  ( +-  0.13% )
> 
> Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
> ---
>   fs/inode.c         |   2 +
>   include/linux/fs.h |   1 +
>   mm/filemap.c       | 150 ++++++++++++++++++++++++++++++++++++++-------
>   3 files changed, 130 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index ec9339024ac3..52163d28d630 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink);
>   static void __address_space_init_once(struct address_space *mapping)
>   {
>   	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
> +	seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
> +			       &mapping->i_pages->xa_lock);
>   	init_rwsem(&mapping->i_mmap_rwsem);
>   	INIT_LIST_HEAD(&mapping->i_private_list);
>   	spin_lock_init(&mapping->i_private_lock);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c895146c1444..c9588d555f73 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -523,6 +523,7 @@ struct address_space {
>   	struct list_head	i_private_list;
>   	struct rw_semaphore	i_mmap_rwsem;
>   	void *			i_private_data;
> +	seqcount_spinlock_t	i_pages_delete_seqcnt;
>   } __attribute__((aligned(sizeof(long)))) __randomize_layout;
>   	/*
>   	 * On most architectures that alignment is already the case; but
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 13f0259d993c..51689c4f3773 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping,
>   
>   	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>   
> +	write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
>   	xas_store(&xas, shadow);
>   	xas_init_marks(&xas);
> +	write_seqcount_end(&mapping->i_pages_delete_seqcnt);
>   
>   	folio->mapping = NULL;
>   	/* Leave folio->index set: truncation lookup relies upon it */
> @@ -2695,21 +2697,98 @@ static void filemap_end_dropbehind_read(struct folio *folio)
>   	}
>   }
>   
> -/**
> - * filemap_read - Read data from the page cache.
> - * @iocb: The iocb to read.
> - * @iter: Destination for the data.
> - * @already_read: Number of bytes already read by the caller.
> - *
> - * Copies data from the page cache.  If the data is not currently present,
> - * uses the readahead and read_folio address_space operations to fetch it.
> - *
> - * Return: Total number of bytes copied, including those already read by
> - * the caller.  If an error happens before any bytes are copied, returns
> - * a negative error number.
> - */
> -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
> -		ssize_t already_read)
> +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> +						  loff_t pos, char *buffer,
> +						  size_t size)
> +{
> +	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> +	struct folio *folio;
> +	loff_t file_size;
> +	unsigned int seq;
> +
> +	lockdep_assert_in_rcu_read_lock();
> +
> +	/* Give up and go to slow path if raced with page_cache_delete() */
> +	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> +		return false;
> +
> +	folio = xas_load(&xas);
> +	if (xas_retry(&xas, folio))
> +		return 0;
> +
> +	if (!folio || xa_is_value(folio))
> +		return 0;
> +
> +	if (!folio_test_uptodate(folio))
> +		return 0;
> +
> +	/* No fast-case if readahead is supposed to started */
> +	if (folio_test_readahead(folio))
> +		return 0;
> +	/* .. or mark it accessed */
> +	if (!folio_test_referenced(folio))
> +		return 0;
> +
> +	/* i_size check must be after folio_test_uptodate() */
> +	file_size = i_size_read(mapping->host);
> +	if (unlikely(pos >= file_size))
> +		return 0;
> +	if (size > file_size - pos)
> +		size = file_size - pos;
> +
> +	/* Do the data copy */
> +	size = memcpy_from_file_folio(buffer, folio, pos, size);

We can be racing with folio splitting or freeing+reallocation, right? So 
our folio might actually be suddenly a tail page I assume? Or change 
from small to large, or change from large to small. Or a large folio can 
change its size?

In that case things like folio_test_uptodate() won't be happy, because 
they will bail out if called on a tail page (see PF_NO_TAIL).


But also, are we sure memcpy_from_file_folio() is save to be used in 
that racy context?


offset_in_folio() which depends on folio_size(). When racing with 
concurrent folio reuse, folio_size()->folio_order() can be tricked into 
returning a wrong number I guess and making the result of 
offset_in_folio() rather bogus.

At least that's my understanding after taking a quick look.

The concern I would have in that case
(A) the page pointer we calculate in kmap_local_folio() could be garbage
(B) the "from" pointer we return could be garbage

"garbage" as in pointing at something without a direct map, something 
that's protected differently (MTE? weird CoCo protection?) or even worse 
MMIO with undesired read-effects.


I'll note that some of these concerns would be gone once folios are 
allocated separately: we would not suddenly see a tail page.

But concurrent folio splitting or size changes could still be an issue 
IIUC Willy today once I discussed the plans about "struct folio" freeing.

Maybe I'm wrong, just wanted to raise that these functions are not 
prepared to be called when we are not holding a refcount or cannot 
otherwise guarantee that the folio cannot concurrently be 
freed/reallocated/merged/split.

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> "garbage" as in pointing at something without a direct map, something that's
> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> undesired read-effects.

Pedro already points to the problem with missing direct mapping.
_nofault() copy should help with this.

Can direct mapping ever be converted to MMIO? It can be converted to DMA
buffer (which is fine), but MMIO? I have not seen it even in virtualized
environments.

I cannot say for all CoCo protections, but TDX guest shared<->private
should be fine.

I am not sure about MTE. Is there a way to bypass MTE check for a load?
And how does it deal with stray reads from load_unaligned_zeropad()?

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Yang Shi 3 months, 2 weeks ago

On Thu, Oct 23, 2025 at 3:34 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> > "garbage" as in pointing at something without a direct map, something that's
> > protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> > undesired read-effects.
>
> Pedro already points to the problem with missing direct mapping.
> _nofault() copy should help with this.
>
> Can direct mapping ever be converted to MMIO? It can be converted to DMA
> buffer (which is fine), but MMIO? I have not seen it even in virtualized
> environments.
>
> I cannot say for all CoCo protections, but TDX guest shared<->private
> should be fine.
>
> I am not sure about MTE. Is there a way to bypass MTE check for a load?
> And how does it deal with stray reads from load_unaligned_zeropad()?

If I remember correctly, _nofault() copy should skip tag check too.

Thanks,
Yang

>
> --
>   Kiryl Shutsemau / Kirill A. Shutemov
>

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Hugh Dickins 3 months, 1 week ago

On Thu, 23 Oct 2025, Yang Shi wrote:
> On Thu, Oct 23, 2025 at 3:34 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
> >
> > On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> > > "garbage" as in pointing at something without a direct map, something that's
> > > protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> > > undesired read-effects.
> >
> > Pedro already points to the problem with missing direct mapping.
> > _nofault() copy should help with this.
> >
> > Can direct mapping ever be converted to MMIO? It can be converted to DMA
> > buffer (which is fine), but MMIO? I have not seen it even in virtualized
> > environments.
> >
> > I cannot say for all CoCo protections, but TDX guest shared<->private
> > should be fine.
> >
> > I am not sure about MTE. Is there a way to bypass MTE check for a load?
> > And how does it deal with stray reads from load_unaligned_zeropad()?
> 
> If I remember correctly, _nofault() copy should skip tag check too.
> 
> Thanks,
> Yang

Not a reply to Yang, I'm just tacking on to the latest mail...

I sew no mention of page migration: __folio_migrate_mapping() does
not use page_cache_delete(), so would surely need to do its own
write_seqcount_begin() and _end(), wouldn't it?

It is using folio_ref_freeze() and _unfreeze(), thinking that
keeps everyone else safely away: but not filemap_read_fast_rcu().

And all other users of folio_ref_freeze() (probably not many but
I have not searched) need to be checked too: maybe no others need
changing, but that has to be established.

This makes a fundamental change to speculative page cache assumptions.

My own opinion was expressed very much better than I could,
by Dave Chinner on Oct 21: cognitive load of niche fastpath.

Hugh

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 1 week ago

On Mon, 27 Oct 2025 at 03:49, Hugh Dickins <hughd@google.com> wrote:
>
> This makes a fundamental change to speculative page cache assumptions.

Yes, but I'm a bit surprised by people who find that scary.

The page cache does *much* more scary things elsewhere, particularly
the whole folio_try_get() dance (in filemap_get_entry() and other
places).

I suspect you ignore that just because it's been that way forever, so
you're comfortable with it.

I'd argue that that is much *much* more subtle because it means that
somebody may be incrementing the page count of a page that has already
been re-allocated by somebody else.

Talk about cognitive load: that code makes you think that "hey, the
tryget means that if it has been released, we don't get a ref to it",
because that's how many of our *other* speculative RCU accesses do in
fact work.

But that's not how the page cache works, exactly because freeing isn't
actually RCU-delayed.

So while the code visually follows the exact same pattern as some
other "look up speculatively under RCU, skip if it's not there any
more", it actually does exactly the same thing as the "copy data under
RCU, then check later if it was ok". Except it does "increment
refcount under RCU, then check later if it was actually valid".

That said, I wonder if we might not consider making page cache freeing
be RCU-delayed. This has come up before (exactly *because* of that
"folio_try_get()").

Because while I am pretty sure that filemap_get_entry() is fine (and a
number of other core users), I'm not convinced that some of the other
users of folio_try_get() are necessarily aware of just how subtle that
thing is.

Anyway, I'm certainly not going to push that patch very hard.

But I do think that a "3x performance improvement on a case that is
known to be an issue for at least one real-world customer"  shouldn't
be called "a niche case". I've seen *way* more niche than that.

(I do think RCU-freeing folios would potentially be an interesting
thing to look into, but I don't think the patch under discussion is
necessarily the reason to do so).

               Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 1 week ago

On 27.10.25 16:50, Linus Torvalds wrote:
> On Mon, 27 Oct 2025 at 03:49, Hugh Dickins <hughd@google.com> wrote:
>>
>> This makes a fundamental change to speculative page cache assumptions.
> 
> Yes, but I'm a bit surprised by people who find that scary.
> 
> The page cache does *much* more scary things elsewhere, particularly
> the whole folio_try_get() dance (in filemap_get_entry() and other
> places).
> 
> I suspect you ignore that just because it's been that way forever, so
> you're comfortable with it.
> 
> I'd argue that that is much *much* more subtle because it means that
> somebody may be incrementing the page count of a page that has already
> been re-allocated by somebody else.
> 
> Talk about cognitive load: that code makes you think that "hey, the
> tryget means that if it has been released, we don't get a ref to it",
> because that's how many of our *other* speculative RCU accesses do in
> fact work.
> 
> But that's not how the page cache works, exactly because freeing isn't
> actually RCU-delayed.
> 
> So while the code visually follows the exact same pattern as some
> other "look up speculatively under RCU, skip if it's not there any
> more", it actually does exactly the same thing as the "copy data under
> RCU, then check later if it was ok". Except it does "increment
> refcount under RCU, then check later if it was actually valid".
> 
> That said, I wonder if we might not consider making page cache freeing
> be RCU-delayed. This has come up before (exactly *because* of that
> "folio_try_get()").
> 
> Because while I am pretty sure that filemap_get_entry() is fine (and a
> number of other core users), I'm not convinced that some of the other
> users of folio_try_get() are necessarily aware of just how subtle that
> thing is.
> 
> Anyway, I'm certainly not going to push that patch very hard.

I will sleep better at night if we can guarantee that we are reading 
from a folio that has not been reused in the meantime -- or reading 
random other memory as I raised in my other mail.

So I really wish that we can defer optimizing this to freeing folios 
under RCU instead.

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 1 week ago

On Mon, 27 Oct 2025 at 09:06, David Hildenbrand <david@redhat.com> wrote:
>
> So I really wish that we can defer optimizing this to freeing folios
> under RCU instead.

So just to see, I dug around when we started to do the rcu-protected
folio lookup (well, it was obviously not a folio at the time).

Mainly because we actually had a lot more of those subtle
folio_try_get() users than I expected us to have,

It goes back to July 2008 (commit e286781d5f2e: "mm: speculative page
references" being the first in the series).

I do have to say that the original naming was better: we used to call
the "try_get" operation "page_cache_get_speculative()", which made it
very clear that it was doing something speculative and different from
some of our other rcu patterns, where if it's successful it's all
good.

Because even when successful, the folio in folio_try_get() is still
speculative and needs checking.

Not all of our current users seem to re-verify the source of the folio
afterwards (deferred_split_scan() makes me go "Uhh - you seem to rely
on folio_try_get() as some kind of validity check" for example).

Oh well. This is all entirely unrelated to the suggested patch, just
musings from me looking at that other code that I think is a lot more
subtle and people don't seem to have issues with.

                 Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 1 week ago

On 27.10.25 17:48, Linus Torvalds wrote:
> On Mon, 27 Oct 2025 at 09:06, David Hildenbrand <david@redhat.com> wrote:
>>
>> So I really wish that we can defer optimizing this to freeing folios
>> under RCU instead.
> 
> So just to see, I dug around when we started to do the rcu-protected
> folio lookup (well, it was obviously not a folio at the time).
> 
> Mainly because we actually had a lot more of those subtle
> folio_try_get() users than I expected us to have,
> 
> It goes back to July 2008 (commit e286781d5f2e: "mm: speculative page
> references" being the first in the series).
> 
> I do have to say that the original naming was better: we used to call
> the "try_get" operation "page_cache_get_speculative()", which made it
> very clear that it was doing something speculative and different from
> some of our other rcu patterns, where if it's successful it's all
> good.
> 
> Because even when successful, the folio in folio_try_get() is still
> speculative and needs checking.

Right, and we only access its content after verifying that (a) it is the 
one we wanted to access and (b) stabilizing it, so it will stay that way.

> 
> Not all of our current users seem to re-verify the source of the folio
> afterwards (deferred_split_scan() makes me go "Uhh - you seem to rely
> on folio_try_get() as some kind of validity check" for example).

Note that deferred_split_scan() only operates with anon folios and has 
some nasty NASTY interaction with folio freeing. :)

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 2 weeks ago

On 23.10.25 12:31, Kiryl Shutsemau wrote:
> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
>> "garbage" as in pointing at something without a direct map, something that's
>> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
>> undesired read-effects.
> 
> Pedro already points to the problem with missing direct mapping.
> _nofault() copy should help with this.

Yeah, we do something similar when reading the kcore for that reason.

> 
> Can direct mapping ever be converted to MMIO? It can be converted to DMA
> buffer (which is fine), but MMIO? I have not seen it even in virtualized
> environments.

I recall discussions in the context of PAT and the adjustment of caching 
attributes of the direct map for MMIO purposes: so I suspect there are 
ways that can happen, but I am not 100% sure.

Thinking about it, in VMs we have the direct map set on balloon inflated 
pages that should not be touched, not even read, otherwise your 
hypervisor might get very angry. That case we could likely handle by 
checking whether the source page actually exists and doesn't have 
PageOffline() set, before accessing it. A bit nasty.

A more obscure cases would probably be reading a page that was poisoned 
by hardware and is not expected to be used anymore. Could also be 
checked by checking the page.

Essentially all cases where we try to avoid reading ordinary memory 
already when creating memory dumps that might have a direct map.

Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.

On s390x, reading a "protected" page of a CoCo Vm will trigger an 
interrupt, I'd assume _nofault would take care of this.

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 2 weeks ago

On 23.10.25 12:54, David Hildenbrand wrote:
> On 23.10.25 12:31, Kiryl Shutsemau wrote:
>> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
>>> "garbage" as in pointing at something without a direct map, something that's
>>> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
>>> undesired read-effects.
>>
>> Pedro already points to the problem with missing direct mapping.
>> _nofault() copy should help with this.
> 
> Yeah, we do something similar when reading the kcore for that reason.
> 
>>
>> Can direct mapping ever be converted to MMIO? It can be converted to DMA
>> buffer (which is fine), but MMIO? I have not seen it even in virtualized
>> environments.
> 
> I recall discussions in the context of PAT and the adjustment of caching
> attributes of the direct map for MMIO purposes: so I suspect there are
> ways that can happen, but I am not 100% sure.
> 
> 
> Thinking about it, in VMs we have the direct map set on balloon inflated
> pages that should not be touched, not even read, otherwise your
> hypervisor might get very angry. That case we could likely handle by
> checking whether the source page actually exists and doesn't have
> PageOffline() set, before accessing it. A bit nasty.
> 
> A more obscure cases would probably be reading a page that was poisoned
> by hardware and is not expected to be used anymore. Could also be
> checked by checking the page.
> 
> Essentially all cases where we try to avoid reading ordinary memory
> already when creating memory dumps that might have a direct map.
> 
> 
> Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.

Looking into this, I'd assume the exception handler will take care of it.

load_unaligned_zeropad() is interesting if there is a direct map but the 
memory should not be touched (especially regarding PageOffline and 
memory errors).

I read drivers/firmware/efi/unaccepted_memory.c where we there is a 
lengthy discussion about guard pages and how that works for unaccepted 
memory.

While it works for unaccepted memory, it wouldn't work for other random 
accesses as I suspect we could produce in this patch.

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 2 weeks ago

On 23.10.25 13:10, David Hildenbrand wrote:
> On 23.10.25 12:54, David Hildenbrand wrote:
>> On 23.10.25 12:31, Kiryl Shutsemau wrote:
>>> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
>>>> "garbage" as in pointing at something without a direct map, something that's
>>>> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
>>>> undesired read-effects.
>>>
>>> Pedro already points to the problem with missing direct mapping.
>>> _nofault() copy should help with this.
>>
>> Yeah, we do something similar when reading the kcore for that reason.
>>
>>>
>>> Can direct mapping ever be converted to MMIO? It can be converted to DMA
>>> buffer (which is fine), but MMIO? I have not seen it even in virtualized
>>> environments.
>>
>> I recall discussions in the context of PAT and the adjustment of caching
>> attributes of the direct map for MMIO purposes: so I suspect there are
>> ways that can happen, but I am not 100% sure.
>>
>>
>> Thinking about it, in VMs we have the direct map set on balloon inflated
>> pages that should not be touched, not even read, otherwise your
>> hypervisor might get very angry. That case we could likely handle by
>> checking whether the source page actually exists and doesn't have
>> PageOffline() set, before accessing it. A bit nasty.
>>
>> A more obscure cases would probably be reading a page that was poisoned
>> by hardware and is not expected to be used anymore. Could also be
>> checked by checking the page.
>>
>> Essentially all cases where we try to avoid reading ordinary memory
>> already when creating memory dumps that might have a direct map.
>>
>>
>> Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.
> 
> Looking into this, I'd assume the exception handler will take care of it.
> 
> load_unaligned_zeropad() is interesting if there is a direct map but the
> memory should not be touched (especially regarding PageOffline and
> memory errors).
> 
> I read drivers/firmware/efi/unaccepted_memory.c where we there is a
> lengthy discussion about guard pages and how that works for unaccepted
> memory.
> 
> While it works for unaccepted memory, it wouldn't work for other random

Sorry I meant here "while that works for load_unaligned_zeropad()".

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Thu, Oct 23, 2025 at 01:11:43PM +0200, David Hildenbrand wrote:
> On 23.10.25 13:10, David Hildenbrand wrote:
> > On 23.10.25 12:54, David Hildenbrand wrote:
> > > On 23.10.25 12:31, Kiryl Shutsemau wrote:
> > > > On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> > > > > "garbage" as in pointing at something without a direct map, something that's
> > > > > protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> > > > > undesired read-effects.
> > > > 
> > > > Pedro already points to the problem with missing direct mapping.
> > > > _nofault() copy should help with this.
> > > 
> > > Yeah, we do something similar when reading the kcore for that reason.
> > > 
> > > > 
> > > > Can direct mapping ever be converted to MMIO? It can be converted to DMA
> > > > buffer (which is fine), but MMIO? I have not seen it even in virtualized
> > > > environments.
> > > 
> > > I recall discussions in the context of PAT and the adjustment of caching
> > > attributes of the direct map for MMIO purposes: so I suspect there are
> > > ways that can happen, but I am not 100% sure.
> > > 
> > > 
> > > Thinking about it, in VMs we have the direct map set on balloon inflated
> > > pages that should not be touched, not even read, otherwise your
> > > hypervisor might get very angry. That case we could likely handle by
> > > checking whether the source page actually exists and doesn't have
> > > PageOffline() set, before accessing it. A bit nasty.
> > > 
> > > A more obscure cases would probably be reading a page that was poisoned
> > > by hardware and is not expected to be used anymore. Could also be
> > > checked by checking the page.
> > > 
> > > Essentially all cases where we try to avoid reading ordinary memory
> > > already when creating memory dumps that might have a direct map.
> > > 
> > > 
> > > Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.
> > 
> > Looking into this, I'd assume the exception handler will take care of it.
> > 
> > load_unaligned_zeropad() is interesting if there is a direct map but the
> > memory should not be touched (especially regarding PageOffline and
> > memory errors).
> > 
> > I read drivers/firmware/efi/unaccepted_memory.c where we there is a
> > lengthy discussion about guard pages and how that works for unaccepted
> > memory.
> > 
> > While it works for unaccepted memory, it wouldn't work for other random
> 
> Sorry I meant here "while that works for load_unaligned_zeropad()".

Do we have other random reads?

For unaccepted memory, we care about touching memory that was never
allocated because accepting memory is one way road.

I only know about load_unaligned_zeropad() that does reads like this. Do
you know others?

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 2 weeks ago

On 23.10.25 13:40, Kiryl Shutsemau wrote:
> On Thu, Oct 23, 2025 at 01:11:43PM +0200, David Hildenbrand wrote:
>> On 23.10.25 13:10, David Hildenbrand wrote:
>>> On 23.10.25 12:54, David Hildenbrand wrote:
>>>> On 23.10.25 12:31, Kiryl Shutsemau wrote:
>>>>> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
>>>>>> "garbage" as in pointing at something without a direct map, something that's
>>>>>> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
>>>>>> undesired read-effects.
>>>>>
>>>>> Pedro already points to the problem with missing direct mapping.
>>>>> _nofault() copy should help with this.
>>>>
>>>> Yeah, we do something similar when reading the kcore for that reason.
>>>>
>>>>>
>>>>> Can direct mapping ever be converted to MMIO? It can be converted to DMA
>>>>> buffer (which is fine), but MMIO? I have not seen it even in virtualized
>>>>> environments.
>>>>
>>>> I recall discussions in the context of PAT and the adjustment of caching
>>>> attributes of the direct map for MMIO purposes: so I suspect there are
>>>> ways that can happen, but I am not 100% sure.
>>>>
>>>>
>>>> Thinking about it, in VMs we have the direct map set on balloon inflated
>>>> pages that should not be touched, not even read, otherwise your
>>>> hypervisor might get very angry. That case we could likely handle by
>>>> checking whether the source page actually exists and doesn't have
>>>> PageOffline() set, before accessing it. A bit nasty.
>>>>
>>>> A more obscure cases would probably be reading a page that was poisoned
>>>> by hardware and is not expected to be used anymore. Could also be
>>>> checked by checking the page.
>>>>
>>>> Essentially all cases where we try to avoid reading ordinary memory
>>>> already when creating memory dumps that might have a direct map.
>>>>
>>>>
>>>> Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.
>>>
>>> Looking into this, I'd assume the exception handler will take care of it.
>>>
>>> load_unaligned_zeropad() is interesting if there is a direct map but the
>>> memory should not be touched (especially regarding PageOffline and
>>> memory errors).
>>>
>>> I read drivers/firmware/efi/unaccepted_memory.c where we there is a
>>> lengthy discussion about guard pages and how that works for unaccepted
>>> memory.
>>>
>>> While it works for unaccepted memory, it wouldn't work for other random
>>
>> Sorry I meant here "while that works for load_unaligned_zeropad()".
> 
> Do we have other random reads?
> 
> For unaccepted memory, we care about touching memory that was never
> allocated because accepting memory is one way road.

Right, but I suspect if you get a random read (as the unaccepted memory 
doc states) you'd be in trouble as well.

The "nice" thing about unaccepted memory is that it's a one way road 
indeed, and at some point the system will not have unaccepted memory 
anymore.

> 
> I only know about load_unaligned_zeropad() that does reads like this. Do
> you know others?

No, I am not aware of others. Most code that could read random memory 
(kcore, vmcore) was fixed to exclude pages we know are unsafe to touch.

Code where might speculatively access the "struct page" after it might 
already have been freed (speculative pagecache lookups, GUP-fast) will 
just back off and never read page content.

We avoid such random memory reads as best we can, as it's just a pain to 
deal with (like load_unaligned_zeropad(), which i would just wish we 
could get rid of now that it's present again in my memory. :( ).

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Thu, Oct 23, 2025 at 01:49:22PM +0200, David Hildenbrand wrote:
> On 23.10.25 13:40, Kiryl Shutsemau wrote:
> > On Thu, Oct 23, 2025 at 01:11:43PM +0200, David Hildenbrand wrote:
> > > On 23.10.25 13:10, David Hildenbrand wrote:
> > > > On 23.10.25 12:54, David Hildenbrand wrote:
> > > > > On 23.10.25 12:31, Kiryl Shutsemau wrote:
> > > > > > On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> > > > > > > "garbage" as in pointing at something without a direct map, something that's
> > > > > > > protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> > > > > > > undesired read-effects.
> > > > > > 
> > > > > > Pedro already points to the problem with missing direct mapping.
> > > > > > _nofault() copy should help with this.
> > > > > 
> > > > > Yeah, we do something similar when reading the kcore for that reason.
> > > > > 
> > > > > > 
> > > > > > Can direct mapping ever be converted to MMIO? It can be converted to DMA
> > > > > > buffer (which is fine), but MMIO? I have not seen it even in virtualized
> > > > > > environments.
> > > > > 
> > > > > I recall discussions in the context of PAT and the adjustment of caching
> > > > > attributes of the direct map for MMIO purposes: so I suspect there are
> > > > > ways that can happen, but I am not 100% sure.
> > > > > 
> > > > > 
> > > > > Thinking about it, in VMs we have the direct map set on balloon inflated
> > > > > pages that should not be touched, not even read, otherwise your
> > > > > hypervisor might get very angry. That case we could likely handle by
> > > > > checking whether the source page actually exists and doesn't have
> > > > > PageOffline() set, before accessing it. A bit nasty.
> > > > > 
> > > > > A more obscure cases would probably be reading a page that was poisoned
> > > > > by hardware and is not expected to be used anymore. Could also be
> > > > > checked by checking the page.
> > > > > 
> > > > > Essentially all cases where we try to avoid reading ordinary memory
> > > > > already when creating memory dumps that might have a direct map.
> > > > > 
> > > > > 
> > > > > Regarding MTE and load_unaligned_zeropad(): I don't know unfortunately.
> > > > 
> > > > Looking into this, I'd assume the exception handler will take care of it.
> > > > 
> > > > load_unaligned_zeropad() is interesting if there is a direct map but the
> > > > memory should not be touched (especially regarding PageOffline and
> > > > memory errors).
> > > > 
> > > > I read drivers/firmware/efi/unaccepted_memory.c where we there is a
> > > > lengthy discussion about guard pages and how that works for unaccepted
> > > > memory.
> > > > 
> > > > While it works for unaccepted memory, it wouldn't work for other random
> > > 
> > > Sorry I meant here "while that works for load_unaligned_zeropad()".
> > 
> > Do we have other random reads?
> > 
> > For unaccepted memory, we care about touching memory that was never
> > allocated because accepting memory is one way road.
> 
> Right, but I suspect if you get a random read (as the unaccepted memory doc
> states) you'd be in trouble as well.

Yes. Random read of unaccepted memory is unrecoverable exit to host for
TDX guest.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Thu, Oct 23, 2025 at 12:54:59PM +0200, David Hildenbrand wrote:
> On 23.10.25 12:31, Kiryl Shutsemau wrote:
> > On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
> > > "garbage" as in pointing at something without a direct map, something that's
> > > protected differently (MTE? weird CoCo protection?) or even worse MMIO with
> > > undesired read-effects.
> > 
> > Pedro already points to the problem with missing direct mapping.
> > _nofault() copy should help with this.
> 
> Yeah, we do something similar when reading the kcore for that reason.
> 
> > 
> > Can direct mapping ever be converted to MMIO? It can be converted to DMA
> > buffer (which is fine), but MMIO? I have not seen it even in virtualized
> > environments.
> 
> I recall discussions in the context of PAT and the adjustment of caching
> attributes of the direct map for MMIO purposes: so I suspect there are ways
> that can happen, but I am not 100% sure.
> 
> 
> Thinking about it, in VMs we have the direct map set on balloon inflated
> pages that should not be touched, not even read, otherwise your hypervisor
> might get very angry. That case we could likely handle by checking whether
> the source page actually exists and doesn't have PageOffline() set, before
> accessing it. A bit nasty.
> 
> A more obscure cases would probably be reading a page that was poisoned by
> hardware and is not expected to be used anymore. Could also be checked by
> checking the page.

I don't think we can check the page. Since the page is not stabilized
with a reference, it is TOCTOU race. If there's some side effect that
we cannot suppress on read (like _nofault() does) we are screwed.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by David Hildenbrand 3 months, 2 weeks ago

On 23.10.25 13:09, Kiryl Shutsemau wrote:
> On Thu, Oct 23, 2025 at 12:54:59PM +0200, David Hildenbrand wrote:
>> On 23.10.25 12:31, Kiryl Shutsemau wrote:
>>> On Wed, Oct 22, 2025 at 07:28:27PM +0200, David Hildenbrand wrote:
>>>> "garbage" as in pointing at something without a direct map, something that's
>>>> protected differently (MTE? weird CoCo protection?) or even worse MMIO with
>>>> undesired read-effects.
>>>
>>> Pedro already points to the problem with missing direct mapping.
>>> _nofault() copy should help with this.
>>
>> Yeah, we do something similar when reading the kcore for that reason.
>>
>>>
>>> Can direct mapping ever be converted to MMIO? It can be converted to DMA
>>> buffer (which is fine), but MMIO? I have not seen it even in virtualized
>>> environments.
>>
>> I recall discussions in the context of PAT and the adjustment of caching
>> attributes of the direct map for MMIO purposes: so I suspect there are ways
>> that can happen, but I am not 100% sure.
>>
>>
>> Thinking about it, in VMs we have the direct map set on balloon inflated
>> pages that should not be touched, not even read, otherwise your hypervisor
>> might get very angry. That case we could likely handle by checking whether
>> the source page actually exists and doesn't have PageOffline() set, before
>> accessing it. A bit nasty.
>>
>> A more obscure cases would probably be reading a page that was poisoned by
>> hardware and is not expected to be used anymore. Could also be checked by
>> checking the page.
> 
> I don't think we can check the page. Since the page is not stabilized
> with a reference, it is TOCTOU race.

Indeed :(

-- 
Cheers

David / dhildenb

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Pedro Falcato 3 months, 2 weeks ago

On Fri, Oct 17, 2025 at 03:15:36PM +0100, Kiryl Shutsemau wrote:
> From: Kiryl Shutsemau <kas@kernel.org>
> 
> The protocol for page cache lookup is as follows:
> 
>   1. Locate a folio in XArray.
>   2. Obtain a reference on the folio using folio_try_get().
>   3. If successful, verify that the folio still belongs to
>      the mapping and has not been truncated or reclaimed.
>   4. Perform operations on the folio, such as copying data
>      to userspace.
>   5. Release the reference.
> 
> For short reads, the overhead of atomic operations on reference
> manipulation can be significant, particularly when multiple tasks access
> the same folio, leading to cache line bouncing.
> 
> <snip>
>+static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> +						  loff_t pos, char *buffer,
> +						  size_t size)
> +{
> +	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> +	struct folio *folio;
> +	loff_t file_size;
> +	unsigned int seq;
> +
> +	lockdep_assert_in_rcu_read_lock();
> +
> +	/* Give up and go to slow path if raced with page_cache_delete() */
> +	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> +		return false;
> +
> +	folio = xas_load(&xas);
> +	if (xas_retry(&xas, folio))
> +		return 0;
> +
> +	if (!folio || xa_is_value(folio))
> +		return 0;
> +
> +	if (!folio_test_uptodate(folio))
> +		return 0;
> +
> +	/* No fast-case if readahead is supposed to started */
> +	if (folio_test_readahead(folio))
> +		return 0;
> +	/* .. or mark it accessed */
> +	if (!folio_test_referenced(folio))
> +		return 0;
> +
> +	/* i_size check must be after folio_test_uptodate() */
> +	file_size = i_size_read(mapping->host);
> +	if (unlikely(pos >= file_size))
> +		return 0;
> +	if (size > file_size - pos)
> +		size = file_size - pos;
> +
> +	/* Do the data copy */
> +	size = memcpy_from_file_folio(buffer, folio, pos, size);
> +	if (!size)
> +		return 0;
> +

I think we may still have a problematic (rare, possibly theoretical) race here where:

   T0				  		T1						T3
filemap_read_fast_rcu()    |							|
  folio = xas_load(&xas);  |							|
  /* ... */                |  /* truncate or reclaim frees folio, bumps delete	|
                           |     seq */						|  	folio_alloc() from e.g secretmem
  			   |							|	set_direct_map_invalid_noflush(!!)
memcpy_from_file_folio()   |							|

We may have to use copy_from_kernel_nofault() here? Or is something else stopping this from happening?

-- 
Pedro

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 2 weeks ago

On Tue, 21 Oct 2025 at 21:08, Pedro Falcato <pfalcato@suse.de> wrote:
>
> I think we may still have a problematic (rare, possibly theoretical) race here where:
>
>    T0                                           T1                                              T3
> filemap_read_fast_rcu()    |                                                    |
>   folio = xas_load(&xas);  |                                                    |
>   /* ... */                |  /* truncate or reclaim frees folio, bumps delete  |
>                            |     seq */                                         |       folio_alloc() from e.g secretmem
>                            |                                                    |       set_direct_map_invalid_noflush(!!)
> memcpy_from_file_folio()   |                                                    |
>
> We may have to use copy_from_kernel_nofault() here? Or is something else stopping this from happening?

Explain how the sequence count doesn't catch this?

We read the sequence count before we do the xas_load(), and we verify
it after we've done the memcpy_from_folio.

The whole *point* is that the copy itself is not race-free. That's
*why* we do the sequence count.

And only after the sequence count has been verified do we then copy
the result to user space.

So the "maybe this buffer content is garbage" happens, but it only
happens in the temporary kernel on-stack buffer, not visibly to the
user.

             Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Pedro Falcato 3 months, 2 weeks ago

On Tue, Oct 21, 2025 at 09:13:28PM -1000, Linus Torvalds wrote:
> On Tue, 21 Oct 2025 at 21:08, Pedro Falcato <pfalcato@suse.de> wrote:
> >
> > I think we may still have a problematic (rare, possibly theoretical) race here where:
> >
> >    T0                                           T1                                              T3
> > filemap_read_fast_rcu()    |                                                    |
> >   folio = xas_load(&xas);  |                                                    |
> >   /* ... */                |  /* truncate or reclaim frees folio, bumps delete  |
> >                            |     seq */                                         |       folio_alloc() from e.g secretmem
> >                            |                                                    |       set_direct_map_invalid_noflush(!!)
> > memcpy_from_file_folio()   |                                                    |
> >
> > We may have to use copy_from_kernel_nofault() here? Or is something else stopping this from happening?
> 
> Explain how the sequence count doesn't catch this?
> 
> We read the sequence count before we do the xas_load(), and we verify
> it after we've done the memcpy_from_folio.
> 
> The whole *point* is that the copy itself is not race-free. That's
> *why* we do the sequence count.
> 
> And only after the sequence count has been verified do we then copy
> the result to user space.
> 
> So the "maybe this buffer content is garbage" happens, but it only
> happens in the temporary kernel on-stack buffer, not visibly to the
> user.

The problem isn't that the contents might be garbage, but that the direct map
may be swept from under us, as we don't have a reference to the folio. So the
folio can be transparently freed under us (as designed), but some user can
call fun stuff like set_direct_map_invalid_noflush() and we're not handling
any "oopsie we faulted reading the folio" here. The sequence count doesn't
help here, because we, uhh, faulted. Does this make sense?

TL;DR I don't think it's safe to touch the direct map of folios we don't own
without the seatbelt of a copy_from_kernel_nofault or so.

-- 
Pedro

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Wed, Oct 22, 2025 at 08:38:30AM +0100, Pedro Falcato wrote:
> On Tue, Oct 21, 2025 at 09:13:28PM -1000, Linus Torvalds wrote:
> > On Tue, 21 Oct 2025 at 21:08, Pedro Falcato <pfalcato@suse.de> wrote:
> > >
> > > I think we may still have a problematic (rare, possibly theoretical) race here where:
> > >
> > >    T0                                           T1                                              T3
> > > filemap_read_fast_rcu()    |                                                    |
> > >   folio = xas_load(&xas);  |                                                    |
> > >   /* ... */                |  /* truncate or reclaim frees folio, bumps delete  |
> > >                            |     seq */                                         |       folio_alloc() from e.g secretmem
> > >                            |                                                    |       set_direct_map_invalid_noflush(!!)
> > > memcpy_from_file_folio()   |                                                    |
> > >
> > > We may have to use copy_from_kernel_nofault() here? Or is something else stopping this from happening?
> > 
> > Explain how the sequence count doesn't catch this?
> > 
> > We read the sequence count before we do the xas_load(), and we verify
> > it after we've done the memcpy_from_folio.
> > 
> > The whole *point* is that the copy itself is not race-free. That's
> > *why* we do the sequence count.
> > 
> > And only after the sequence count has been verified do we then copy
> > the result to user space.
> > 
> > So the "maybe this buffer content is garbage" happens, but it only
> > happens in the temporary kernel on-stack buffer, not visibly to the
> > user.
> 
> The problem isn't that the contents might be garbage, but that the direct map
> may be swept from under us, as we don't have a reference to the folio. So the
> folio can be transparently freed under us (as designed), but some user can
> call fun stuff like set_direct_map_invalid_noflush() and we're not handling
> any "oopsie we faulted reading the folio" here. The sequence count doesn't
> help here, because we, uhh, faulted. Does this make sense?
> 
> TL;DR I don't think it's safe to touch the direct map of folios we don't own
> without the seatbelt of a copy_from_kernel_nofault or so.

Makes sense. Thanks for catching this!

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Andrew Morton 3 months, 2 weeks ago

On Fri, 17 Oct 2025 15:15:36 +0100 Kiryl Shutsemau <kirill@shutemov.name> wrote:

> From: Kiryl Shutsemau <kas@kernel.org>
> 
> The protocol for page cache lookup is as follows:
> 
>   1. Locate a folio in XArray.
>   2. Obtain a reference on the folio using folio_try_get().
>   3. If successful, verify that the folio still belongs to
>      the mapping and has not been truncated or reclaimed.
>   4. Perform operations on the folio, such as copying data
>      to userspace.
>   5. Release the reference.
> 
> For short reads, the overhead of atomic operations on reference
> manipulation can be significant, particularly when multiple tasks access
> the same folio, leading to cache line bouncing.
> 
> To address this issue, introduce i_pages_delete_seqcnt, which increments
> each time a folio is deleted from the page cache and implement a modified
> page cache lookup protocol for short reads:
> 
>   1. Locate a folio in XArray.
>   2. Take note of the i_pages_delete_seqcnt.
>   3. Copy the data to a local buffer on the stack.
>   4. Verify that the i_pages_delete_seqcnt has not changed.
>   5. Copy the data from the local buffer to the iterator.
> 
> If any issues arise in the fast path, fallback to the slow path that
> relies on the refcount to stabilize the folio.

Well this is a fun patch.

> The new approach requires a local buffer in the stack. The size of the
> buffer determines which read requests are served by the fast path. Set
> the buffer to 1k. This seems to be a reasonable amount of stack usage
> for the function at the bottom of the call stack.

A use case for alloca() or equiv.  That would improve the average-case
stack depth but not the worst-case.

The 1k guess-or-giggle is crying out for histogramming - I bet 0.1k
covers the great majority.  I suspect it wouldn't be hard to add a new
ad-hoc API into memory allocation profiling asking it to profile
something like this for us, given an explicit request to to do.

Is there really no way to copy the dang thing straight out to
userspace, skip the bouncing?

> The fast read approach demonstrates significant performance
> improvements, particularly in contended cases.
> 
> 16 threads, reads from 4k file(s), mean MiB/s (StdDev)
> 
>  -------------------------------------------------------------
> | Block |  Baseline  |  Baseline   |  Patched   |  Patched    |
> | size  |  same file |  diff files |  same file | diff files  |
>  -------------------------------------------------------------
> |     1 |    10.96   |     27.56   |    30.42   |     30.4    |
> |       |    (0.497) |     (0.114) |    (0.130) |     (0.158) |
> |    32 |   350.8    |    886.2    |   980.6    |    981.8    |
> |       |   (13.64)  |     (2.863) |    (3.361) |     (1.303) |
> |   256 |  2798      |   7009.6    |  7641.4    |   7653.6    |
> |       |  (103.9)   |    (28.00)  |   (33.26)  |    (25.50)  |

tl;dr, 256-byte reads from the same file nearly tripled.

> |  1024 | 10780      |  27040      | 29280      |  29320      |
> |       |  (389.8)   |    (89.44)  |  (130.3)   |    (83.66)  |
> |  4096 | 43700      | 103800      | 48420      | 102000      |
> |       | (1953)     |    (447.2)  | (2012)     |     (0)     |
>  -------------------------------------------------------------
> 
> 16 threads, reads from 1M file(s), mean MiB/s (StdDev)
> 
>  --------------------------------------------------------------
> | Block |  Baseline   |  Baseline   |  Patched    |  Patched   |
> | size  |  same file  |  diff files |  same file  | diff files |
>  ---------------------------------------------------------
> |     1 |     26.38   |     27.34   |     30.38   |    30.36   |
> |       |     (0.998) |     (0.114) |     (0.083) |    (0.089) |
> |    32 |    824.4    |    877.2    |    977.8    |   975.8    |
> |       |    (15.78)  |     (3.271) |     (2.683) |    (1.095) |
> |   256 |   6494      |   6992.8    |   7619.8    |   7625     |
> |       |   (116.0)   |    (32.39)  |    (10.66)  |    (28.19) |
> |  1024 |  24960      |  26840      |  29100      |  29180     |
> |       |   (606.6)   |   (151.6)   |   (122.4)   |    (83.66) |
> |  4096 |  94420      | 100520      |  95260      |  99760     |
> |       |  (3144)     |   (672.3)   |  (2874)     |   (134.1)  |
> | 32768 | 386000      | 402400      | 368600      | 397400     |
> |       | (36599)     | (10526)     | (47188)     |  (6107)    |
>  --------------------------------------------------------------
> 
> There's also improvement on kernel build:
> 
> Base line: 61.3462 +- 0.0597 seconds time elapsed  ( +-  0.10% )
> Patched:   60.6106 +- 0.0759 seconds time elapsed  ( +-  0.13% )
> 
> ...
>
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
>
> ...
>
> -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
> -		ssize_t already_read)
> +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> +						  loff_t pos, char *buffer,
> +						  size_t size)
> +{
> +	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> +	struct folio *folio;
> +	loff_t file_size;
> +	unsigned int seq;
> +
> +	lockdep_assert_in_rcu_read_lock();
> +
> +	/* Give up and go to slow path if raced with page_cache_delete() */
> +	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> +		return false;

	return 0;

> +
> +	folio = xas_load(&xas);
> +	if (xas_retry(&xas, folio))
> +		return 0;
> +
> +	if (!folio || xa_is_value(folio))
> +		return 0;
> +
> +	if (!folio_test_uptodate(folio))
> +		return 0;
> +
> +	/* No fast-case if readahead is supposed to started */

Please expand this comment.  "explain why, not what".  Why do we care
if it's under readahead?  It's uptodate, so just grab it??

> +	if (folio_test_readahead(folio))
> +		return 0;
> +	/* .. or mark it accessed */

This comment disagrees with the code which it is commenting.

> +	if (!folio_test_referenced(folio))
> +		return 0;
> +
> +	/* i_size check must be after folio_test_uptodate() */

why?

> +	file_size = i_size_read(mapping->host);
> +	if (unlikely(pos >= file_size))
> +		return 0;
> +	if (size > file_size - pos)
> +		size = file_size - pos;

min() is feeling all sad?

> +	/* Do the data copy */

We can live without this comment ;)

> +	size = memcpy_from_file_folio(buffer, folio, pos, size);
> +	if (!size)
> +		return 0;
> +
> +	/* Give up and go to slow path if raced with page_cache_delete() */

I wonder if truncation is all we need to worry about here.  For
example, direct-io does weird stuff.

> +	if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
> +		return 0;
> +
> +	return size;
> +}
> +
> +#define FAST_READ_BUF_SIZE 1024
> +
> +static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
> +				       ssize_t *already_read)
> +{
> +	struct address_space *mapping = iocb->ki_filp->f_mapping;
> +	struct file_ra_state *ra = &iocb->ki_filp->f_ra;
> +	char buffer[FAST_READ_BUF_SIZE];
> +	size_t count;
> +
> +	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
> +		return false;

why?  (comment please)

> +	if (iov_iter_count(iter) > sizeof(buffer))
> +		return false;
> +
> +	count = iov_iter_count(iter);

It'd be a tinier bit tidier to swap the above to avoid evaluating
iov_iter_count() twice.  Yes, iov_iter_count() happens to be fast, but
we aren't supposed to know that here.

> +	/* Let's see if we can just do the read under RCU */
> +	rcu_read_lock();
> +	count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
> +	rcu_read_unlock();
> +
> +	if (!count)
> +		return false;
> +
> +	count = copy_to_iter(buffer, count, iter);
> +	if (unlikely(!count))
> +		return false;
> +
> +	iocb->ki_pos += count;
> +	ra->prev_pos = iocb->ki_pos;
> +	file_accessed(iocb->ki_filp);
> +	*already_read += count;
> +
> +	return !iov_iter_count(iter);
> +}
> +
> +static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
> +					  struct iov_iter *iter,
> +					  ssize_t already_read)
>  {
>  	struct file *filp = iocb->ki_filp;
>  	struct file_ra_state *ra = &filp->f_ra;

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 2 weeks ago

On Sun, 19 Oct 2025 at 18:53, Andrew Morton <akpm@linux-foundation.org> wrote:
>
> Is there really no way to copy the dang thing straight out to
> userspace, skip the bouncing?

Sadly, no.

It's trivial to copy to user space in a RCU-protected region: just
disable page faults and it all works fine.

In fact, it works so fine that everything boots and it all looks
beautiful in profiles etc - ask me how I know.

But it's still wrong. The problem is that *after* you've copies things
away from the page cache, you need to check that the page cache
contents are still valid.

And it's not a problem to do that and just say "don't count the bytes
I just copied, and we'll copy over them later".

But while 99.999% of the time we *will* copy over them later, it's not
actually guaranteed. What migth happen is that after we've filled in
user space with the optimistically copied data, we figure out that the
page cache is no longer valid, and we go to the slow case, and two
problems may have happened:

 (a) the file got truncated in the meantime, and we just filled in
stale data (possibly zeroes) in a user space buffer, and we're
returning a smaller length than what we filled out.

Will user space care? Not realistically, no. But it's wrong, and some
user space *might* be using the buffer as a ring-buffer or something,
and assume that if we return 5 bytes from "read()", the subsequent
bytes are still valid from (previous) ring buffer fills.

But if we decide to ignore that issue (possibly with some "open()"
time flag to say "give me optimistic short reads, and I won't care),
we still have

 (b) the folio we copied from migth have been released and re-used for
something else

and this is fatal. We might have optimistically copied things that are
now security-sensitive and even if we return a short read - or
overwrite it - layer, user space should never have seen that data.

This (b) thing is solvable too, but requires that page cache releases
always would be RCU-delayed, and they aren't.

So both are "solvable", but they are very big and very separate solutions.

               Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Sun, Oct 19, 2025 at 09:53:28PM -0700, Andrew Morton wrote:
> On Fri, 17 Oct 2025 15:15:36 +0100 Kiryl Shutsemau <kirill@shutemov.name> wrote:
> 
> > From: Kiryl Shutsemau <kas@kernel.org>
> > 
> > The protocol for page cache lookup is as follows:
> > 
> >   1. Locate a folio in XArray.
> >   2. Obtain a reference on the folio using folio_try_get().
> >   3. If successful, verify that the folio still belongs to
> >      the mapping and has not been truncated or reclaimed.
> >   4. Perform operations on the folio, such as copying data
> >      to userspace.
> >   5. Release the reference.
> > 
> > For short reads, the overhead of atomic operations on reference
> > manipulation can be significant, particularly when multiple tasks access
> > the same folio, leading to cache line bouncing.
> > 
> > To address this issue, introduce i_pages_delete_seqcnt, which increments
> > each time a folio is deleted from the page cache and implement a modified
> > page cache lookup protocol for short reads:
> > 
> >   1. Locate a folio in XArray.
> >   2. Take note of the i_pages_delete_seqcnt.
> >   3. Copy the data to a local buffer on the stack.
> >   4. Verify that the i_pages_delete_seqcnt has not changed.
> >   5. Copy the data from the local buffer to the iterator.
> > 
> > If any issues arise in the fast path, fallback to the slow path that
> > relies on the refcount to stabilize the folio.
> 
> Well this is a fun patch.

Yes! :P

> > The new approach requires a local buffer in the stack. The size of the
> > buffer determines which read requests are served by the fast path. Set
> > the buffer to 1k. This seems to be a reasonable amount of stack usage
> > for the function at the bottom of the call stack.
> 
> A use case for alloca() or equiv.  That would improve the average-case
> stack depth but not the worst-case.

__kstack_alloca()/__builtin_alloca() would work and it bypassed
-Wframe-larger-than warning.

But I don't see any real users.

My understanding is that alloca() is similar in semantics with VLAs that
we eliminated from the kernel. I am not sure it is a good idea.

Other option is to have a per-CPU buffer. But it is less cache friendly.

> The 1k guess-or-giggle is crying out for histogramming - I bet 0.1k
> covers the great majority.  I suspect it wouldn't be hard to add a new
> ad-hoc API into memory allocation profiling asking it to profile
> something like this for us, given an explicit request to to do.

Let me see what I can do.

> Is there really no way to copy the dang thing straight out to
> userspace, skip the bouncing?

I don't see a way to make it in a safe manner.

In case of a race with folio deletion we risk leaking data to userspace:
the folio we are reading from can be freed and re-allocated from under
us to random other user. Bounce buffer let's us stabilize the data
before exposing it to userspace.

> > The fast read approach demonstrates significant performance
> > improvements, particularly in contended cases.
> > 
> > 16 threads, reads from 4k file(s), mean MiB/s (StdDev)
> > 
> >  -------------------------------------------------------------
> > | Block |  Baseline  |  Baseline   |  Patched   |  Patched    |
> > | size  |  same file |  diff files |  same file | diff files  |
> >  -------------------------------------------------------------
> > |     1 |    10.96   |     27.56   |    30.42   |     30.4    |
> > |       |    (0.497) |     (0.114) |    (0.130) |     (0.158) |
> > |    32 |   350.8    |    886.2    |   980.6    |    981.8    |
> > |       |   (13.64)  |     (2.863) |    (3.361) |     (1.303) |
> > |   256 |  2798      |   7009.6    |  7641.4    |   7653.6    |
> > |       |  (103.9)   |    (28.00)  |   (33.26)  |    (25.50)  |
> 
> tl;dr, 256-byte reads from the same file nearly tripled.

Yep.

> > |  1024 | 10780      |  27040      | 29280      |  29320      |
> > |       |  (389.8)   |    (89.44)  |  (130.3)   |    (83.66)  |
> > |  4096 | 43700      | 103800      | 48420      | 102000      |
> > |       | (1953)     |    (447.2)  | (2012)     |     (0)     |
> >  -------------------------------------------------------------
> > 
> > 16 threads, reads from 1M file(s), mean MiB/s (StdDev)
> > 
> >  --------------------------------------------------------------
> > | Block |  Baseline   |  Baseline   |  Patched    |  Patched   |
> > | size  |  same file  |  diff files |  same file  | diff files |
> >  ---------------------------------------------------------
> > |     1 |     26.38   |     27.34   |     30.38   |    30.36   |
> > |       |     (0.998) |     (0.114) |     (0.083) |    (0.089) |
> > |    32 |    824.4    |    877.2    |    977.8    |   975.8    |
> > |       |    (15.78)  |     (3.271) |     (2.683) |    (1.095) |
> > |   256 |   6494      |   6992.8    |   7619.8    |   7625     |
> > |       |   (116.0)   |    (32.39)  |    (10.66)  |    (28.19) |
> > |  1024 |  24960      |  26840      |  29100      |  29180     |
> > |       |   (606.6)   |   (151.6)   |   (122.4)   |    (83.66) |
> > |  4096 |  94420      | 100520      |  95260      |  99760     |
> > |       |  (3144)     |   (672.3)   |  (2874)     |   (134.1)  |
> > | 32768 | 386000      | 402400      | 368600      | 397400     |
> > |       | (36599)     | (10526)     | (47188)     |  (6107)    |
> >  --------------------------------------------------------------
> > 
> > There's also improvement on kernel build:
> > 
> > Base line: 61.3462 +- 0.0597 seconds time elapsed  ( +-  0.10% )
> > Patched:   60.6106 +- 0.0759 seconds time elapsed  ( +-  0.13% )
> > 
> > ...
> >
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> >
> > ...
> >
> > -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
> > -		ssize_t already_read)
> > +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
> > +						  loff_t pos, char *buffer,
> > +						  size_t size)
> > +{
> > +	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
> > +	struct folio *folio;
> > +	loff_t file_size;
> > +	unsigned int seq;
> > +
> > +	lockdep_assert_in_rcu_read_lock();
> > +
> > +	/* Give up and go to slow path if raced with page_cache_delete() */
> > +	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
> > +		return false;
> 
> 	return 0;

Ack.

> > +
> > +	folio = xas_load(&xas);
> > +	if (xas_retry(&xas, folio))
> > +		return 0;
> > +
> > +	if (!folio || xa_is_value(folio))
> > +		return 0;
> > +
> > +	if (!folio_test_uptodate(folio))
> > +		return 0;
> > +
> > +	/* No fast-case if readahead is supposed to started */
> 
> Please expand this comment.  "explain why, not what".  Why do we care
> if it's under readahead?  It's uptodate, so just grab it??

Readahead pages require kicking rickahead machinery with
filemap_readahead(). It is not appropriate for the fast path.

Will rewrite the comment.

> > +	if (folio_test_readahead(folio))
> > +		return 0;
> > +	/* .. or mark it accessed */
> 
> This comment disagrees with the code which it is commenting.

It is continuation of the comment for the readahead. Will rewrite to make
it clear.

Similar to readahead, we don't want to go for folio_mark_accessed().

> > +	if (!folio_test_referenced(folio))
> > +		return 0;
> > +
> > +	/* i_size check must be after folio_test_uptodate() */
> 
> why?

There is comment for i_size_read() in slow path that inidicates that it
is required, but, to be honest, I don't fully understand interaction
uptodate vs i_size here.

> > +	file_size = i_size_read(mapping->host);
> > +	if (unlikely(pos >= file_size))
> > +		return 0;
> > +	if (size > file_size - pos)
> > +		size = file_size - pos;
> 
> min() is feeling all sad?

Will make it happy. :)

> > +	/* Do the data copy */
> 
> We can live without this comment ;)

:P

> > +	size = memcpy_from_file_folio(buffer, folio, pos, size);
> > +	if (!size)
> > +		return 0;
> > +
> > +	/* Give up and go to slow path if raced with page_cache_delete() */
> 
> I wonder if truncation is all we need to worry about here.  For
> example, direct-io does weird stuff.

Direct I/O does page cache invalidation which is also goes via
page_cache_delete().

Reclaim also terminates in page_cache_delete() via
__filemap_remove_folio().

> > +	if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
> > +		return 0;
> > +
> > +	return size;
> > +}
> > +
> > +#define FAST_READ_BUF_SIZE 1024
> > +
> > +static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
> > +				       ssize_t *already_read)
> > +{
> > +	struct address_space *mapping = iocb->ki_filp->f_mapping;
> > +	struct file_ra_state *ra = &iocb->ki_filp->f_ra;
> > +	char buffer[FAST_READ_BUF_SIZE];
> > +	size_t count;
> > +
> > +	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
> > +		return false;
> 
> why?  (comment please)

I don't understand implication of flush_dcache_folio() on serialization
here. Will read up.

> > +	if (iov_iter_count(iter) > sizeof(buffer))
> > +		return false;
> > +
> > +	count = iov_iter_count(iter);
> 
> It'd be a tinier bit tidier to swap the above to avoid evaluating
> iov_iter_count() twice.  Yes, iov_iter_count() happens to be fast, but
> we aren't supposed to know that here.

Okay.

> > +	/* Let's see if we can just do the read under RCU */
> > +	rcu_read_lock();
> > +	count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
> > +	rcu_read_unlock();
> > +
> > +	if (!count)
> > +		return false;
> > +
> > +	count = copy_to_iter(buffer, count, iter);
> > +	if (unlikely(!count))
> > +		return false;
> > +
> > +	iocb->ki_pos += count;
> > +	ra->prev_pos = iocb->ki_pos;
> > +	file_accessed(iocb->ki_filp);
> > +	*already_read += count;
> > +
> > +	return !iov_iter_count(iter);
> > +}
> > +
> > +static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
> > +					  struct iov_iter *iter,
> > +					  ssize_t already_read)
> >  {
> >  	struct file *filp = iocb->ki_filp;
> >  	struct file_ra_state *ra = &filp->f_ra;
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Dave Chinner 3 months, 2 weeks ago

On Mon, Oct 20, 2025 at 12:33:08PM +0100, Kiryl Shutsemau wrote:
> On Sun, Oct 19, 2025 at 09:53:28PM -0700, Andrew Morton wrote:
> > On Fri, 17 Oct 2025 15:15:36 +0100 Kiryl Shutsemau <kirill@shutemov.name> wrote:
> > 
> > > From: Kiryl Shutsemau <kas@kernel.org>
> > > 
> > > The protocol for page cache lookup is as follows:
> > > 
> > >   1. Locate a folio in XArray.
> > >   2. Obtain a reference on the folio using folio_try_get().
> > >   3. If successful, verify that the folio still belongs to
> > >      the mapping and has not been truncated or reclaimed.

What about if it has been hole-punched?

The i_size checks after testing the folio is up to date catch races
with truncate down.  This "works" because truncate_setsize() changes
i_size before we invalidate the mapping and so we don't try to
access folios that have pending invalidation.

It also catches the case where the invalidation is only a partial
EOF folio zero (e.g. truncate down within the same EOF folio). In
this case, the deletion sequence number won't catch the invalidation
because no pages are freed from the page cache. Hence reads need to
check i_size to detect this case.

However, fallocate() operations such as hole punching and extent
shifting have exactly the same partial folio invalidation problems
as truncate but they don't change i_size like truncate does (both at
the front and rear edges of the ranges being operated on)

Hence invalidation races with fallocate() operations cannot be
detected via i_size checks and we have to serialise them differently.
fallocate() also requires barriers prevent new page cache operations
whilst the filesystem operation is in progress, so we actually need
the invalidation serialisation to also act as a page cache instantiation
barrier. This is what the mapping->invalidate_lock provides, and I
suspect that this new read fast path doesn't actually work correctly
w.r.t. fallocate() based invalidation because it can't detect races
with partial folio invalidations that are pending nor does it take
the mapping->invalidate_lock....

I also wonder if there might be other subtle races with
->remap_file_range based operations, because they also run
invalidations and need page cache instatiation barriers whilst the
operations run.  At least with XFS, remap operations hold both the
inode->i_rwsem and the mapping->invalidate_lock so nobody can access
the page cache across the destination range being operated on whilst
the extent mapping underlying the file is in flux.

Given these potential issues, I really wonder if this niche fast
path is really worth the potential pain racing against these sorts
of operations could bring us. It also increases the cognitive
load for anyone trying to understand how buffered reads interact
with everything else (i.e. yet another set of race conditions we
have to worry about when thinking about truncate!), and it is not
clear to me that it is (or can be made) safe w.r.t. more complex
invalidation interactions that filesystem have to handle these days.

So: is the benefit for this niche workload really worth the
additional complexity it adds to what is already a very complex set
of behaviours and interactions?

> > > +	if (!folio_test_referenced(folio))
> > > +		return 0;
> > > +
> > > +	/* i_size check must be after folio_test_uptodate() */
> > 
> > why?
> 
> There is comment for i_size_read() in slow path that inidicates that it
> is required, but, to be honest, I don't fully understand interaction
> uptodate vs i_size here.

As per above, it's detecting a race with a concurrent truncate that
is about to invalidate the folio but hasn't yet got to that folio in
the mapping.

This is where we'd also need to detect pending fallocate() or other
invalidations that are in progress, but there's no way to do that
easily....

-Dave.
-- 
Dave Chinner
david@fromorbit.com

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 2 weeks ago

On Tue, 21 Oct 2025 at 13:39, Dave Chinner <david@fromorbit.com> wrote:
>
> > > >   1. Locate a folio in XArray.
> > > >   2. Obtain a reference on the folio using folio_try_get().
> > > >   3. If successful, verify that the folio still belongs to
> > > >      the mapping and has not been truncated or reclaimed.
>
> What about if it has been hole-punched?

The sequence number check should take care of anything like that. Do
you have any reason to believe it doesn't?

Yes, you can get the "before or after or between" behavior, but you
can get that with perfectly regular reads that take the refcount on
the page.

Reads have never taken the page lock, and have never been serialized that way.

So the fast case changes absolutely nothing in this respect that I can see.

               Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Dave Chinner 3 months, 2 weeks ago

On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> On Tue, 21 Oct 2025 at 13:39, Dave Chinner <david@fromorbit.com> wrote:
> >
> > > > >   1. Locate a folio in XArray.
> > > > >   2. Obtain a reference on the folio using folio_try_get().
> > > > >   3. If successful, verify that the folio still belongs to
> > > > >      the mapping and has not been truncated or reclaimed.
> >
> > What about if it has been hole-punched?
> 
> The sequence number check should take care of anything like that. Do
> you have any reason to believe it doesn't?

Invalidation doing partial folio zeroing isn't covered by the page
cache delete sequence number.

> Yes, you can get the "before or after or between" behavior, but you
> can get that with perfectly regular reads that take the refcount on
> the page.

Yes, and it is the "in between" behaviour that is the problem here.

Hole punching (and all the other fallocate() operations) are
supposed to be atomic w.r.t. user IO. i.e. you should see either the
non-punched data or the punched data, never a mix of the two. A mix
of the two is a transient data corruption event....

This invalidation race does not exist on XFS, even on this
new fast path.  We protect all buffered reads with the
inode->i_rwsem, so we guarantee they can't race
with fallocate() operations performing invalidations because
fallocate() takes the i_rwsem exclusively. This IO exclusion model
was baked into the XFS locking architecture over 30 years ago.

The problem is the other filesystems don't use this sort of IO
exclusion model (ext4, btrfs, etc) but instead use the page cache
folio locking to only avoid concurrent modification to the same file
range.

Hence they are exposed to this transient state because they rely on
folio locks for arbitrating concurrent access to the page cache and
buffered reads run completely unlocked. i.e. because....

> Reads have never taken the page lock, and have never been serialized that way.

... they are exposed to transient data states in the page cache
during invalidation operations. The i_size checks avoid these
transient states for truncation, but there are no checks that can be
made to avoid them for other sources of invalidation operations.

The mapping->invalidate_lock only prevents page cache instantiation
from occurring, allowing filesystems to create a barrier that
prevents page cache repopulation after invalidation until the
invalidate lock is dropped. This allows them to complete the
fallocate() operation before exposing the result to users.

However, it does not prevent buffered read cache hits racing with
overlapping invalidation operations, and that's the problem I'm
pointing out that this new fast path will still hit, even though it
tries to bounce-buffer it's way around transient states.

So, yes, you are right when you say:

> So the fast case changes absolutely nothing in this respect that I can see.

But that does not make the existing page cache behaviour desirable.
Reading corrupt data faster is still reading corrupt data :/

Really, though, I'm only mentioning this stuff beacuse both the
author of the patch and the reviewer did not seem to know how i_size
is used in this code to avoid truncate races. truncate races are the
simple part of the problem, hole punching is a lot harder to get
right.  If the author hasn't thought about it, it is likely there
are subtle bugs lurking....

-Dave.
-- 
Dave Chinner
david@fromorbit.com

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 2 weeks ago

On Tue, 21 Oct 2025 at 22:00, Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> >
> > The sequence number check should take care of anything like that. Do
> > you have any reason to believe it doesn't?
>
> Invalidation doing partial folio zeroing isn't covered by the page
> cache delete sequence number.

Correct - but neither is it covered by anything else in the *regular* read path.

So the sequence number protects against the same case that the
reference count protects against: hole punching removing the whole
page.

Partial page hole-punching will fundamentally show half-way things.

> > Yes, you can get the "before or after or between" behavior, but you
> > can get that with perfectly regular reads that take the refcount on
> > the page.
>
> Yes, and it is the "in between" behaviour that is the problem here.
>
> Hole punching (and all the other fallocate() operations) are
> supposed to be atomic w.r.t. user IO. i.e. you should see either the
> non-punched data or the punched data, never a mix of the two. A mix
> of the two is a transient data corruption event....

That "supposed" comes from documentation that has never been true and
as such is just a bedtime story.

And no, iI'd argue that it's not even documenting desirable behavior,
because that bedtime story has never been true because it's
prohibitively expensive.

In some cases the documentation may have been historically "more true"
than it is today just because the documentation was written so long
ago that people used a single lock for everything (not talking about
the Linux big kernel lock, but about old BSD model of "single inode
lock for all IO").

End result: you say it would be desirable, and that might be true in a
high-level way when you ignore other issues.

POSIX is full of these bedtime stories that depend on a simplified
version of the truth, where the simplifications means that the
documentation just approximates reality at a high level.

I think it would be much better to fix the documentation, but that's
generally out of our hands.

            Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Dave Chinner 3 months, 2 weeks ago

On Wed, Oct 22, 2025 at 05:31:12AM -1000, Linus Torvalds wrote:
> On Tue, 21 Oct 2025 at 22:00, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> > >
> > > The sequence number check should take care of anything like that. Do
> > > you have any reason to believe it doesn't?
> >
> > Invalidation doing partial folio zeroing isn't covered by the page
> > cache delete sequence number.
> 
> Correct - but neither is it covered by anything else in the *regular* read path.
> 
> So the sequence number protects against the same case that the
> reference count protects against: hole punching removing the whole
> page.
> 
> Partial page hole-punching will fundamentally show half-way things.

Only when you have a busted implementation of the spec.

Think about it: if I said "partial page truncation will
fundamentally show half-way things", you would shout at me that
truncate must -never- expose half-way things to buffered reads.
This is how truncate is specified to behave, and we don't violate
the spec just because it is hard to implement it.

We've broken truncate repeatedly over the past 20+ years in ways
that have exposed stale data to users. This is always considered a
critical bug that needs to be fixed ASAP.

Hole punching is documented to require the same behaviour as
truncate and, like truncate, any breakage of that is a potential
stale kernel or physical data exposure event.

Why will we not tolerate data corruption bugs in truncate due to
page cache races, yet being told that we are supposed to just ignore
those same data corruption races during fallocate operations?

This seems like a double standard to me. 

Also, keep in mind that XFS isn't exposed to these bugs, so I have
no real skin in the game here. People need to be made aware of of a
data integrity issue that are noticed, not have them swept under the
rug with dubious reasoning.

> > > Yes, you can get the "before or after or between" behavior, but you
> > > can get that with perfectly regular reads that take the refcount on
> > > the page.
> >
> > Yes, and it is the "in between" behaviour that is the problem here.
> >
> > Hole punching (and all the other fallocate() operations) are
> > supposed to be atomic w.r.t. user IO. i.e. you should see either the
> > non-punched data or the punched data, never a mix of the two. A mix
> > of the two is a transient data corruption event....
> 
> That "supposed" comes from documentation that has never been true and
> as such is just a bedtime story.

I'll give you the benefit of the doubt here, because you are not
an expert in the field.

That is, I think you are conflating POSIX buffered read/write
syscall atomicity with fallocate() requirements. They are not the
same thing.  The fallocate() atomicity requirements come from the
low level data coherency/integrity requirements of the operations
being performed, not from some POSIX spec.

e.g. FALLOC_FL_ZERO_RANGE turns a range of a file to zeros. The
implementation of this can vary. The filesystem can:
	- write zeros through the page cache. Users have asked us
	  not to do this but return -EOPNOTSUPP so they can choose
	  the fallback mechanism themselves.
	- use block device offloads like WRITE_SAME(0) to have the
	  device physically zero the range.
	- use DISCARD blockdev operations if the device guarantees
	  discarded regiond return zeros.
	- punch out all the extents, leaving a hole.
	- punch out all the extents, then preallocate new unwritten
	  extents thereby defragmenting the range at the same time.
	- preallocate new unwritten extents over holes and convert
	  existing written extents to unwritten.

All of these are valid implementations, and they are all multi-step
operations that could expose partial completion to userspace if
there is no IO serialisation against the ZERO_RANGE operation.

Hence there is really only one behaviour that is required: whilst
the low level operation is taking place, no external IO (read,
write, discard, etc) can be performed over that range of the file
being zeroed because the data andor metadata is not stable until the
whole operation is completed by the filesystem.

Now, this doesn't obviously read on the initial invalidation races
that are the issue being discussed here because zero's written by
invalidation could be considered "valid" for hole punch, zero range,
etc.

However, consider COLLAPSE_RANGE.  Page cache invalidation
writing zeros and reads racing with that is a problem, because
the old data at a given offset is non-zero, whilst the new data at
the same offset is alos non-zero.

Hence if we allow the initial page cache invalidation to race with
buffered reads, there is the possibility of random zeros appearing
in the data being read. Because this is not old or new data, it is
-corrupt- data.

Put simply, these fallocate operations should *never* see partial
invalidation data, and so the "old or new data" rule *must* apply to
the initial page cache invalidation these fallocate() operations do.

Hence various fallocate() operations need to act as a full IO
barrier. Buffered IO, page faults and direct IO all must be blocked
and drained before the invalidation of the range begins, and must
not be allowed to start again until after the whole operation
completes.

IOWs, IO exclusion is a data integrity requirement for fallocate()
operations to prevent users from being exposed to transient data
corruption races whilst the fallocate() operation is being
performed. It has nothing to do with POSIX in any way...

Also, keep in mind that fallocate() is only one vector to this race
condition.  Multiple filesystems have multiple invalidation vectors
to this race condition.  There are similar issues with custom extent
swap ioctls (e.g. for online defragmentation), ensuring
->remap_file_range() and ->copy_file_range() implementations have
exclusive access to the file data and/or metadata they are
manipulating (e.g. offload to hardware copy engines), and so on.

fallocate() is just a convenient example to use because multiple
filesystems implement it, lots of userspace applications use it and
lots of people know about it. It is not niche functionality anymore,
so people should be paying close attention when potential data
corruption vectors are identified in these operations...

> And no, iI'd argue that it's not even documenting desirable behavior,
> because that bedtime story has never been true because it's
> prohibitively expensive.

I disagree, and so do the numbers - IO path exclusion is pretty
cheap for the most part, and not a performance limiting requirement.

e.g. over a range of different benchmarks on 6.15:

https://www.phoronix.com/review/linux-615-filesystems/6

i.e. XFS is 27% faster overall than ext4 and btrfs on a modern NVMe
SSDs.  Numbers don't lie, and these numbers indicate that the IO
path exclusion that XFS implementations for avoiding invalidation
races doesn't have any impact on typical production workloads...

That said, I am most definitely not suggesting that the XFS IO path
exclusion is the solution for the specific buffered read vs cache
invalidation race I pointed out. It's just one example of how it can
be solved with little in the way of runtime overhead.

I suspect the invalidation race could be detected at the page cache
layer with a mark-and-sweep invalidation algorithm using xarray tags
kinda like writeback already does. Those tags could be detected in
the read path at little extra cost (the xarray node is already hot
in cache) which can then punt the folio to a slow path that does the
right thing.  If there's no invalidation tag present, then the fast
path can safely keep doing it's fast path things...

> I think it would be much better to fix the documentation, but that's
> generally out of our hands.

We control the fallocate() specification and documentation
ourselves. But that's not the problem here; the problem is that the
cached data invalidation mechanism used by fallocate operations does
not prevent buffered reads from racing against invalidation and
exposing invalid transient data to users...

-Dave.
-- 
Dave Chinner
david@fromorbit.com

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Jan Kara 3 months, 2 weeks ago

On Thu 23-10-25 18:50:46, Dave Chinner wrote:
> On Wed, Oct 22, 2025 at 05:31:12AM -1000, Linus Torvalds wrote:
> > On Tue, 21 Oct 2025 at 22:00, Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Tue, Oct 21, 2025 at 06:25:30PM -1000, Linus Torvalds wrote:
> > > >
> > > > The sequence number check should take care of anything like that. Do
> > > > you have any reason to believe it doesn't?
> > >
> > > Invalidation doing partial folio zeroing isn't covered by the page
> > > cache delete sequence number.
> > 
> > Correct - but neither is it covered by anything else in the *regular* read path.
> > 
> > So the sequence number protects against the same case that the
> > reference count protects against: hole punching removing the whole
> > page.
> > 
> > Partial page hole-punching will fundamentally show half-way things.
> 
> Only when you have a busted implementation of the spec.
> 
> Think about it: if I said "partial page truncation will
> fundamentally show half-way things", you would shout at me that
> truncate must -never- expose half-way things to buffered reads.
> This is how truncate is specified to behave, and we don't violate
> the spec just because it is hard to implement it.

Well, as a matter of fact we can expose part-way results of truncate for
ext4 and similar filesystems not serializing reads to truncate with inode
lock. In particular for ext4 there's the i_size check in filemap_read() but
if that passes before the truncate starts, the code copying out data from
the pages can race with truncate zeroing out tail of the last page.

> We've broken truncate repeatedly over the past 20+ years in ways
> that have exposed stale data to users. This is always considered a
> critical bug that needs to be fixed ASAP.

Exposing data that was never in the file is certainly a critical bug.
Showing a mix of old and new data is not great but less severe and it seems
over the years userspace on Linux learned to live with it and reap the
performance benefit (e.g. for mixed read-write workloads to one file)...

<snip>

> Hence there is really only one behaviour that is required: whilst
> the low level operation is taking place, no external IO (read,
> write, discard, etc) can be performed over that range of the file
> being zeroed because the data andor metadata is not stable until the
> whole operation is completed by the filesystem.
> 
> Now, this doesn't obviously read on the initial invalidation races
> that are the issue being discussed here because zero's written by
> invalidation could be considered "valid" for hole punch, zero range,
> etc.
> 
> However, consider COLLAPSE_RANGE.  Page cache invalidation
> writing zeros and reads racing with that is a problem, because
> the old data at a given offset is non-zero, whilst the new data at
> the same offset is alos non-zero.
> 
> Hence if we allow the initial page cache invalidation to race with
> buffered reads, there is the possibility of random zeros appearing
> in the data being read. Because this is not old or new data, it is
> -corrupt- data.

Well, reasons like this are why for operations like COLLAPSE_RANGE ext4
reclaims the whole interval of the page cache starting with the first
affected folio to the end. So again user will either see old data (if it
managed to get the page before we invalidated the page cache) or the new
data (when it needs to read from the disk which is properly synchronized
with COLLAPSE_RANGE through invalidate_lock). I don't see these speculative
accesses changing anything in this case either.

> Put simply, these fallocate operations should *never* see partial
> invalidation data, and so the "old or new data" rule *must* apply to
> the initial page cache invalidation these fallocate() operations do.
> 
> Hence various fallocate() operations need to act as a full IO
> barrier. Buffered IO, page faults and direct IO all must be blocked
> and drained before the invalidation of the range begins, and must
> not be allowed to start again until after the whole operation
> completes.

Hum, I'm not sure I follow you correctly but what you describe doesn't seem
like how ext4 works. There are two different things - zeroing out of
partial folios affected by truncate, hole punch, zero range (other
fallocate operations don't zero out) and invalidation of the page cache
folios. For ext4 it is actually the removal of folios from the page cache
during invalidation + holding invalidate_lock that synchronizes with reads.
As such zeroing of partial folios *can* actually race with reads within
these partial folios and so you can get a mix of zeros and old data from
reads.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 2 weeks ago

On Mon, 20 Oct 2025 at 01:33, Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> On Sun, Oct 19, 2025 at 09:53:28PM -0700, Andrew Morton wrote:
> >
> > A use case for alloca() or equiv.  That would improve the average-case
> > stack depth but not the worst-case.
>
> __kstack_alloca()/__builtin_alloca() would work and it bypassed
> -Wframe-larger-than warning.
>
> But I don't see any real users.

Yes, and we've walked away from alloca() (and on-stack VLAs, which are
really exactly the same thing as far as a compiler is concerned),
because it makes static analysis much *MUCH* harder.

Let's not ever re-introduce dynamic stack use in the kernel.

                Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Linus Torvalds 3 months, 3 weeks ago

On Fri, 17 Oct 2025 at 04:15, Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> To address this issue, introduce i_pages_delete_seqcnt, which increments
> each time a folio is deleted from the page cache and implement a modified
> page cache lookup protocol for short reads:

So this patch looks good to me, but to avoid the stack size warnings,
let's just make FAST_READ_BUF_SIZE be 768 bytes or something like
that, not the full 1k.

It really shouldn't make much of a difference, and we do have that
stack size limit check for a reason.

And obviously

> --- a/fs/inode.c
> +++ b/fs/inode.c
> +       seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
> +                              &mapping->i_pages->xa_lock);

will need to use '&mapping->i_pages.xa_lock', since mapping->i_pages
is the embedded xarray, not a pointer to it.

But I do think the patch looks quite good.

               Linus

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by Kiryl Shutsemau 3 months, 2 weeks ago

On Sat, Oct 18, 2025 at 07:56:48AM -1000, Linus Torvalds wrote:
> On Fri, 17 Oct 2025 at 04:15, Kiryl Shutsemau <kirill@shutemov.name> wrote:
> >
> > To address this issue, introduce i_pages_delete_seqcnt, which increments
> > each time a folio is deleted from the page cache and implement a modified
> > page cache lookup protocol for short reads:
> 
> So this patch looks good to me, but to avoid the stack size warnings,
> let's just make FAST_READ_BUF_SIZE be 768 bytes or something like
> that, not the full 1k.
> 
> It really shouldn't make much of a difference, and we do have that
> stack size limit check for a reason.

My reasoning is that we are at the leaf of the call chain. Slow path
goes much deeper to I/O.

Reducing the buffer size would invalidate my benchmarking :/
It took time.

What about disabling the warning for the function?

@@ -2750,6 +2750,8 @@ static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,

 #define FAST_READ_BUF_SIZE 1024

+__diag_push();
+__diag_ignore_all("-Wframe-larger-than=", "Allow on-stack buffer for fast read");
 static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
                                       ssize_t *already_read)
 {
@@ -2785,6 +2787,7 @@ static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter

        return !iov_iter_count(iter);
 }
+__diag_pop();

 static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
                                          struct iov_iter *iter,

> 
> And obviously
> 
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > +       seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
> > +                              &mapping->i_pages->xa_lock);
> 
> will need to use '&mapping->i_pages.xa_lock', since mapping->i_pages
> is the embedded xarray, not a pointer to it.

Doh!

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by kernel test robot 3 months, 3 weeks ago

Hi Kiryl,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Kiryl-Shutsemau/mm-filemap-Implement-fast-short-reads/20251017-221655
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20251017141536.577466-1-kirill%40shutemov.name
patch subject: [PATCH] mm/filemap: Implement fast short reads
config: i386-defconfig (https://download.01.org/0day-ci/archive/20251018/202510181215.jcL2gJMQ-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251018/202510181215.jcL2gJMQ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510181215.jcL2gJMQ-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/filemap.c:2753:22: warning: stack frame size (1096) exceeds limit (1024) in 'filemap_read_fast' [-Wframe-larger-than]
    2753 | static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
         |                      ^
   1 warning generated.


vim +/filemap_read_fast +2753 mm/filemap.c

  2752	
> 2753	static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
  2754					       ssize_t *already_read)
  2755	{
  2756		struct address_space *mapping = iocb->ki_filp->f_mapping;
  2757		struct file_ra_state *ra = &iocb->ki_filp->f_ra;
  2758		char buffer[FAST_READ_BUF_SIZE];
  2759		size_t count;
  2760	
  2761		if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
  2762			return false;
  2763	
  2764		if (iov_iter_count(iter) > sizeof(buffer))
  2765			return false;
  2766	
  2767		count = iov_iter_count(iter);
  2768	
  2769		/* Let's see if we can just do the read under RCU */
  2770		rcu_read_lock();
  2771		count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
  2772		rcu_read_unlock();
  2773	
  2774		if (!count)
  2775			return false;
  2776	
  2777		count = copy_to_iter(buffer, count, iter);
  2778		if (unlikely(!count))
  2779			return false;
  2780	
  2781		iocb->ki_pos += count;
  2782		ra->prev_pos = iocb->ki_pos;
  2783		file_accessed(iocb->ki_filp);
  2784		*already_read += count;
  2785	
  2786		return !iov_iter_count(iter);
  2787	}
  2788	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by kernel test robot 3 months, 3 weeks ago

Hi Kiryl,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Kiryl-Shutsemau/mm-filemap-Implement-fast-short-reads/20251017-221655
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20251017141536.577466-1-kirill%40shutemov.name
patch subject: [PATCH] mm/filemap: Implement fast short reads
config: i386-allmodconfig (https://download.01.org/0day-ci/archive/20251018/202510181107.YiEpGvU3-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251018/202510181107.YiEpGvU3-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510181107.YiEpGvU3-lkp@intel.com/

All warnings (new ones prefixed by >>):

   mm/filemap.c: In function 'filemap_read_fast':
>> mm/filemap.c:2787:1: warning: the frame size of 1048 bytes is larger than 1024 bytes [-Wframe-larger-than=]
    2787 | }
         | ^


vim +2787 mm/filemap.c

  2752	
  2753	static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
  2754					       ssize_t *already_read)
  2755	{
  2756		struct address_space *mapping = iocb->ki_filp->f_mapping;
  2757		struct file_ra_state *ra = &iocb->ki_filp->f_ra;
  2758		char buffer[FAST_READ_BUF_SIZE];
  2759		size_t count;
  2760	
  2761		if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
  2762			return false;
  2763	
  2764		if (iov_iter_count(iter) > sizeof(buffer))
  2765			return false;
  2766	
  2767		count = iov_iter_count(iter);
  2768	
  2769		/* Let's see if we can just do the read under RCU */
  2770		rcu_read_lock();
  2771		count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
  2772		rcu_read_unlock();
  2773	
  2774		if (!count)
  2775			return false;
  2776	
  2777		count = copy_to_iter(buffer, count, iter);
  2778		if (unlikely(!count))
  2779			return false;
  2780	
  2781		iocb->ki_pos += count;
  2782		ra->prev_pos = iocb->ki_pos;
  2783		file_accessed(iocb->ki_filp);
  2784		*already_read += count;
  2785	
  2786		return !iov_iter_count(iter);
> 2787	}
  2788	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH] mm/filemap: Implement fast short reads

Posted by kernel test robot 3 months, 3 weeks ago

Hi Kiryl,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Kiryl-Shutsemau/mm-filemap-Implement-fast-short-reads/20251017-221655
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20251017141536.577466-1-kirill%40shutemov.name
patch subject: [PATCH] mm/filemap: Implement fast short reads
config: riscv-randconfig-001-20251018 (https://download.01.org/0day-ci/archive/20251018/202510181054.Fmf1S18u-lkp@intel.com/config)
compiler: riscv32-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251018/202510181054.Fmf1S18u-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510181054.Fmf1S18u-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from include/linux/sched.h:45,
                    from include/linux/rcupdate.h:27,
                    from include/linux/rculist.h:11,
                    from include/linux/dcache.h:8,
                    from include/linux/fs.h:9,
                    from fs/inode.c:7:
   fs/inode.c: In function '__address_space_init_once':
>> fs/inode.c:486:28: error: invalid type argument of '->' (have 'struct xarray')
              &mapping->i_pages->xa_lock);
                               ^~
   include/linux/seqlock_types.h:57:26: note: in definition of macro '__SEQ_LOCK'
    #define __SEQ_LOCK(expr) expr
                             ^~~~
   include/linux/seqlock.h:131:42: note: in expansion of macro 'seqcount_LOCKNAME_init'
    #define seqcount_spinlock_init(s, lock)  seqcount_LOCKNAME_init(s, lock, spinlock)
                                             ^~~~~~~~~~~~~~~~~~~~~~
   fs/inode.c:485:2: note: in expansion of macro 'seqcount_spinlock_init'
     seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
     ^~~~~~~~~~~~~~~~~~~~~~

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for ARCH_HAS_ELF_CORE_EFLAGS
   Depends on [n]: BINFMT_ELF [=n] && ELF_CORE [=n]
   Selected by [y]:
   - RISCV [=y]


vim +486 fs/inode.c

   481	
   482	static void __address_space_init_once(struct address_space *mapping)
   483	{
   484		xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
   485		seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
 > 486				       &mapping->i_pages->xa_lock);
   487		init_rwsem(&mapping->i_mmap_rwsem);
   488		INIT_LIST_HEAD(&mapping->i_private_list);
   489		spin_lock_init(&mapping->i_private_lock);
   490		mapping->i_mmap = RB_ROOT_CACHED;
   491	}
   492	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki