With the upcoming iov_iter_extract_pages() function, pages extracted from a
non-user-backed iterator such as ITER_PIPE aren't pinned.
__iomap_dio_rw(), however, calls iov_iter_revert() to shorten the iterator
to just the bufferage it is going to use - which has the side-effect of
freeing the excess pipe buffers, even though they're attached to a bio and
may get written to by DMA (thanks to Hillf Danton for spotting this[1]).
This then causes memory corruption that is particularly noticeable when the
syzbot test[2] is run. The test boils down to:
out = creat(argv[1], 0666);
ftruncate(out, 0x800);
lseek(out, 0x200, SEEK_SET);
in = open(argv[1], O_RDONLY | O_DIRECT | O_NOFOLLOW);
sendfile(out, in, NULL, 0x1dd00);
run repeatedly in parallel. What I think is happening is that ftruncate()
occasionally shortens the DIO read that's about to be made by sendfile's
splice core by reducing i_size.
Fix this by splitting the handling of a splice from an O_DIRECT file fd off
from that of non-DIO and in this case, replacing the use of an ITER_PIPE
iterator with an ITER_BVEC iterator for which reversion won't free the
buffers. The DIO-specific code bulk allocates all the buffers it thinks it
is going to use in advance, does the read synchronously and only then trims
the buffer down. The pages we did use get pushed into the pipe.
This should be more efficient for DIO read by virtue of doing a bulk page
allocation, but slightly less efficient by ignoring any partial page in the
pipe.
Fixes: 920756a3306a ("block: Convert bio_iov_iter_get_pages to use iov_iter_extract_pages")
Reported-by: syzbot+a440341a59e3b7142895@syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: David Hildenbrand <david@redhat.com>
cc: John Hubbard <jhubbard@nvidia.com>
cc: linux-mm@kvack.org
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20230207094731.1390-1-hdanton@sina.com/ [1]
Link: https://lore.kernel.org/r/000000000000b0b3c005f3a09383@google.com/ [2]
---
Notes:
ver #13)
- Don't completely replace generic_file_splice_read(), but rather only use
this if we're splicing from an O_DIRECT file fd.
fs/splice.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 96 insertions(+)
diff --git a/fs/splice.c b/fs/splice.c
index 5969b7a1d353..b4be6fc314a1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -282,6 +282,99 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
kfree(spd->partial);
}
+/*
+ * Splice data from an O_DIRECT file into pages and then add them to the output
+ * pipe.
+ */
+static ssize_t generic_file_direct_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ LIST_HEAD(pages);
+ struct iov_iter to;
+ struct bio_vec *bv;
+ struct kiocb kiocb;
+ struct page *page;
+ unsigned int head;
+ ssize_t ret;
+ size_t used, npages, chunk, remain, reclaim;
+ int i;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+ npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ bv = kmalloc(array_size(npages, sizeof(bv[0])), GFP_KERNEL);
+ if (!bv)
+ return -ENOMEM;
+
+ npages = alloc_pages_bulk_list(GFP_USER, npages, &pages);
+ if (!npages) {
+ kfree(bv);
+ return -ENOMEM;
+ }
+
+ remain = len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ for (i = 0; i < npages; i++) {
+ chunk = min_t(size_t, PAGE_SIZE, remain);
+ page = list_first_entry(&pages, struct page, lru);
+ list_del_init(&page->lru);
+ bv[i].bv_page = page;
+ bv[i].bv_offset = 0;
+ bv[i].bv_len = chunk;
+ remain -= chunk;
+ }
+
+ /* Do the I/O */
+ iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
+ init_sync_kiocb(&kiocb, in);
+ kiocb.ki_pos = *ppos;
+ ret = call_read_iter(in, &kiocb, &to);
+
+ reclaim = npages * PAGE_SIZE;
+ remain = 0;
+ if (ret > 0) {
+ reclaim -= ret;
+ remain = ret;
+ *ppos = kiocb.ki_pos;
+ file_accessed(in);
+ } else if (ret < 0) {
+ /*
+ * callers of ->splice_read() expect -EAGAIN on
+ * "can't put anything in there", rather than -EFAULT.
+ */
+ if (ret == -EFAULT)
+ ret = -EAGAIN;
+ }
+
+ /* Free any pages that didn't get touched at all. */
+ for (; reclaim >= PAGE_SIZE; reclaim -= PAGE_SIZE)
+ __free_page(bv[--npages].bv_page);
+
+ /* Push the remaining pages into the pipe. */
+ head = pipe->head;
+ for (i = 0; i < npages; i++) {
+ struct pipe_buffer *buf = &pipe->bufs[head & (pipe->ring_size - 1)];
+
+ chunk = min_t(size_t, remain, PAGE_SIZE);
+ *buf = (struct pipe_buffer) {
+ .ops = &default_pipe_buf_ops,
+ .page = bv[i].bv_page,
+ .offset = 0,
+ .len = chunk,
+ };
+ head++;
+ remain -= chunk;
+ }
+ pipe->head = head;
+
+ kfree(bv);
+ return ret;
+}
+
/**
* generic_file_splice_read - splice data from file to a pipe
* @in: file to splice from
@@ -303,6 +396,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
int ret;
+ if (in->f_flags & O_DIRECT)
+ return generic_file_direct_splice_read(in, ppos, pipe, len, flags);
+
iov_iter_pipe(&to, ITER_DEST, pipe, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
On Thu, Feb 09, 2023 at 10:29:43AM +0000, David Howells wrote: > + npages = alloc_pages_bulk_list(GFP_USER, npages, &pages); Please don't use alloc_pages_bulk_list(). If nobody uses it, it can go away again soon. Does alloc_pages_bulk_array() work for you? It's faster. > + /* Free any pages that didn't get touched at all. */ > + for (; reclaim >= PAGE_SIZE; reclaim -= PAGE_SIZE) > + __free_page(bv[--npages].bv_page); If you have that array, you can then use release_pages() to free them, which will be faster.
Matthew Wilcox <willy@infradead.org> wrote: > On Thu, Feb 09, 2023 at 10:29:43AM +0000, David Howells wrote: > > + npages = alloc_pages_bulk_list(GFP_USER, npages, &pages); > > Please don't use alloc_pages_bulk_list(). If nobody uses it, it can go > away again soon. Does alloc_pages_bulk_array() work for you? It's > faster. Sure. > > + /* Free any pages that didn't get touched at all. */ > > + for (; reclaim >= PAGE_SIZE; reclaim -= PAGE_SIZE) > > + __free_page(bv[--npages].bv_page); > > If you have that array, you can then use release_pages() to free > them, which will be faster. Um. I would normally overlay the array on end of the bvec[] so that I could save on an allocation (I have to fill in the bvec[] anyway) - which means I wouldn't still have the array at release time. But in this case I can make an exception, though I would've thought that the expectation would be that all the requested data would be fetched. David
Matthew Wilcox <willy@infradead.org> wrote:
> Please don't use alloc_pages_bulk_list().
...
> If you have that array, you can then use release_pages() ...
Done. See attached replacement patch.
David
---
splice: Fix O_DIRECT file read splice to avoid reversion of ITER_PIPE
With the upcoming iov_iter_extract_pages() function, pages extracted from a
non-user-backed iterator such as ITER_PIPE aren't pinned.
__iomap_dio_rw(), however, calls iov_iter_revert() to shorten the iterator
to just the bufferage it is going to use - which has the side-effect of
freeing the excess pipe buffers, even though they're attached to a bio and
may get written to by DMA (thanks to Hillf Danton for spotting this[1]).
This then causes memory corruption that is particularly noticeable when the
syzbot test[2] is run. The test boils down to:
out = creat(argv[1], 0666);
ftruncate(out, 0x800);
lseek(out, 0x200, SEEK_SET);
in = open(argv[1], O_RDONLY | O_DIRECT | O_NOFOLLOW);
sendfile(out, in, NULL, 0x1dd00);
run repeatedly in parallel. What I think is happening is that ftruncate()
occasionally shortens the DIO read that's about to be made by sendfile's
splice core by reducing i_size.
Fix this by splitting the handling of a splice from an O_DIRECT file fd off
from that of non-DIO and in this case, replacing the use of an ITER_PIPE
iterator with an ITER_BVEC iterator for which reversion won't free the
buffers. The DIO-specific code bulk allocates all the buffers it thinks it
is going to use in advance, does the read synchronously and only then trims
the buffer down. The pages we did use get pushed into the pipe.
This should be more efficient for DIO read by virtue of doing a bulk page
allocation, but slightly less efficient by ignoring any partial page in the
pipe.
Fixes: 920756a3306a ("block: Convert bio_iov_iter_get_pages to use iov_iter_extract_pages")
Reported-by: syzbot+a440341a59e3b7142895@syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: David Hildenbrand <david@redhat.com>
cc: John Hubbard <jhubbard@nvidia.com>
cc: linux-mm@kvack.org
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20230207094731.1390-1-hdanton@sina.com/ [1]
Link: https://lore.kernel.org/r/000000000000b0b3c005f3a09383@google.com/ [2]
---
Notes:
ver #14)
- Use alloc_pages_bulk_array() rather than alloc_pages_bulk_list().
- Use release_pages() rather than a loop calling __free_page().
ver #13)
- Don't completely replace generic_file_splice_read(), but rather only use
this if we're splicing from an O_DIRECT file fd.
fs/splice.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 98 insertions(+)
diff --git a/fs/splice.c b/fs/splice.c
index 5969b7a1d353..91244270b36e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -282,6 +282,101 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
kfree(spd->partial);
}
+/*
+ * Splice data from an O_DIRECT file into pages and then add them to the output
+ * pipe.
+ */
+static ssize_t generic_file_direct_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct iov_iter to;
+ struct bio_vec *bv;
+ struct kiocb kiocb;
+ struct page **pages;
+ unsigned int head;
+ ssize_t ret;
+ size_t used, npages, chunk, remain, reclaim;
+ int i;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+ npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ bv = kzalloc(array_size(npages, sizeof(bv[0])) +
+ array_size(npages, sizeof(struct page *)), GFP_KERNEL);
+ if (!bv)
+ return -ENOMEM;
+
+ pages = (void *)(bv + npages);
+ npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
+ if (!npages) {
+ kfree(bv);
+ return -ENOMEM;
+ }
+
+ remain = len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ for (i = 0; i < npages; i++) {
+ chunk = min_t(size_t, PAGE_SIZE, remain);
+ bv[i].bv_page = pages[i];
+ bv[i].bv_offset = 0;
+ bv[i].bv_len = chunk;
+ remain -= chunk;
+ }
+
+ /* Do the I/O */
+ iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
+ init_sync_kiocb(&kiocb, in);
+ kiocb.ki_pos = *ppos;
+ ret = call_read_iter(in, &kiocb, &to);
+
+ reclaim = npages * PAGE_SIZE;
+ remain = 0;
+ if (ret > 0) {
+ reclaim -= ret;
+ remain = ret;
+ *ppos = kiocb.ki_pos;
+ file_accessed(in);
+ } else if (ret < 0) {
+ /*
+ * callers of ->splice_read() expect -EAGAIN on
+ * "can't put anything in there", rather than -EFAULT.
+ */
+ if (ret == -EFAULT)
+ ret = -EAGAIN;
+ }
+
+ /* Free any pages that didn't get touched at all. */
+ reclaim /= PAGE_SIZE;
+ if (reclaim) {
+ npages -= reclaim;
+ release_pages(pages + npages, reclaim);
+ }
+
+ /* Push the remaining pages into the pipe. */
+ head = pipe->head;
+ for (i = 0; i < npages; i++) {
+ struct pipe_buffer *buf = &pipe->bufs[head & (pipe->ring_size - 1)];
+
+ chunk = min_t(size_t, remain, PAGE_SIZE);
+ *buf = (struct pipe_buffer) {
+ .ops = &default_pipe_buf_ops,
+ .page = bv[i].bv_page,
+ .offset = 0,
+ .len = chunk,
+ };
+ head++;
+ remain -= chunk;
+ }
+ pipe->head = head;
+
+ kfree(bv);
+ return ret;
+}
+
/**
* generic_file_splice_read - splice data from file to a pipe
* @in: file to splice from
@@ -303,6 +398,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
int ret;
+ if (in->f_flags & O_DIRECT)
+ return generic_file_direct_splice_read(in, ppos, pipe, len, flags);
+
iov_iter_pipe(&to, ITER_DEST, pipe, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
> + if (!bv) > + return -ENOMEM; > + > + pages = (void *)(bv + npages); I think this cast should be to struct page **, not void *. > + npages = alloc_pages_bulk_array(GFP_USER, npages, pages); > + if (!npages) { > + kfree(bv); > + return -ENOMEM; > + } > + reclaim = npages * PAGE_SIZE; > + remain = 0; > + if (ret > 0) { > + reclaim -= ret; > + remain = ret; ... > + /* Free any pages that didn't get touched at all. */ > + reclaim /= PAGE_SIZE; Any reason not to keep reclaim in PAGE_SIZE units to start with?
Christoph Hellwig <hch@infradead.org> wrote: > > + pages = (void *)(bv + npages); > > I think this cast should be to struct page **, not void *. Yeah. Doesn't change anything functionally, though, I think. > > + /* Free any pages that didn't get touched at all. */ > > + reclaim /= PAGE_SIZE; > > Any reason not to keep reclaim in PAGE_SIZE units to start with? Probably not, but I don't want to fiddle with that right now. I can send a follow up patch for it. David
On Wed, Feb 15, 2023 at 01:17:56PM +0000, David Howells wrote: > Probably not, but I don't want to fiddle with that right now. I can send a > follow up patch for it. Honestly, I think this rush for 6.3 inclusion is a really bad idea. This series fundamentally changes how splice reads work, and has only been out for about a week. It hasn't even been Cc'ed to Al and Linus which generally have a good knowledge of the splice code and an opinion on it. I think it is a good change, but I'd feel much more comfortable with it for the next merge window rather than rushing it.
Christoph Hellwig <hch@infradead.org> wrote: > On Wed, Feb 15, 2023 at 01:17:56PM +0000, David Howells wrote: > > Probably not, but I don't want to fiddle with that right now. I can send a > > follow up patch for it. > > Honestly, I think this rush for 6.3 inclusion is a really bad idea. > > This series fundamentally changes how splice reads work, and has only > been out for about a week. It hasn't even been Cc'ed to Al Sorry, what?! Al has been To'd or cc'd on every patch. > and Linus I don't know that it's necessary to cc Linus on everything. Jens is the splice maintainer, I thought. > which generally have a good knowledge of the splice code and an opinion > on it. > > I think it is a good change, but I'd feel much more comfortable with > it for the next merge window rather than rushing it. The lack of iov_iter_extract_pages() is blocking other things I want to work on - and will push those out another 3 months further beyond this. I'm fine with dropping the block layer changes and most of the splice changes, but I do want to try to get patches 1-3, 10 and 11: mm: Pass info, not iter, into filemap_get_pages() splice: Add a func to do a splice from a buffered file without ITER_PIPE splice: Add a func to do a splice from an O_DIRECT file without ITER_PIPE iov_iter: Add a function to extract a page list from an iterator iov_iter: Define flags to qualify page extraction. upstream through the cifs tree if you, Jens and Steve French have no objection, with my cifs iteratorisation patches on top. It shouldn't affect anything other than cifs in this merge window, barring the change to the flags to iov_iter_get_pages*(). David
How about the attached? I won't fold it down for the moment, but it could be
pushed along later.
David
---
splice: Clean up direct_splice_read() a bit
Do a couple of cleanups to direct_splice_read():
(1) Cast to struct page **, not void *.
(2) Simplify the calculation of the number of pages to keep/reclaim in
direct_splice_read().
Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Jens Axboe <axboe@kernel.dk>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: David Hildenbrand <david@redhat.com>
cc: John Hubbard <jhubbard@nvidia.com>
cc: linux-mm@kvack.org
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
diff --git a/fs/splice.c b/fs/splice.c
index 9e798c901087..572d3e2a669a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -295,7 +295,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
struct page **pages;
ssize_t ret;
- size_t used, npages, chunk, remain, reclaim;
+ size_t used, npages, chunk, remain, keep = 0;
int i;
/* Work out how much data we can actually add into the pipe */
@@ -332,11 +332,8 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
- reclaim = npages * PAGE_SIZE;
- remain = 0;
if (ret > 0) {
- reclaim -= ret;
- remain = ret;
+ keep = DIV_ROUND_UP(ret, PAGE_SIZE);
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
@@ -349,14 +346,12 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
}
/* Free any pages that didn't get touched at all. */
- reclaim /= PAGE_SIZE;
- if (reclaim) {
- npages -= reclaim;
- release_pages(pages + npages, reclaim);
- }
+ if (keep < npages)
+ release_pages(pages + keep, npages - keep);
/* Push the remaining pages into the pipe. */
- for (i = 0; i < npages; i++) {
+ remain = ret;
+ for (i = 0; i < keep; i++) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
chunk = min_t(size_t, remain, PAGE_SIZE);
I forgot to commit the cast change too. Try the attached instead. David --- splice: Clean up direct_splice_read() a bit Do a couple of cleanups to direct_splice_read(): (1) Cast to struct page **, not void *. (2) Simplify the calculation of the number of pages to keep/reclaim in direct_splice_read(). Suggested-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: David Howells <dhowells@redhat.com> cc: Jens Axboe <axboe@kernel.dk> cc: Christoph Hellwig <hch@lst.de> cc: Al Viro <viro@zeniv.linux.org.uk> cc: David Hildenbrand <david@redhat.com> cc: John Hubbard <jhubbard@nvidia.com> cc: linux-mm@kvack.org cc: linux-block@vger.kernel.org cc: linux-fsdevel@vger.kernel.org --- diff --git a/fs/splice.c b/fs/splice.c index 9e798c901087..e97f9aa30717 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -295,7 +295,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; struct page **pages; ssize_t ret; - size_t used, npages, chunk, remain, reclaim; + size_t used, npages, chunk, remain, keep = 0; int i; /* Work out how much data we can actually add into the pipe */ @@ -309,7 +309,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, if (!bv) return -ENOMEM; - pages = (void *)(bv + npages); + pages = (struct page **)(bv + npages); npages = alloc_pages_bulk_array(GFP_USER, npages, pages); if (!npages) { kfree(bv); @@ -332,11 +332,8 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); - reclaim = npages * PAGE_SIZE; - remain = 0; if (ret > 0) { - reclaim -= ret; - remain = ret; + keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; file_accessed(in); } else if (ret < 0) { @@ -349,14 +346,12 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, } /* Free any pages that didn't get touched at all. 
*/ - reclaim /= PAGE_SIZE; - if (reclaim) { - npages -= reclaim; - release_pages(pages + npages, reclaim); - } + if (keep < npages) + release_pages(pages + keep, npages - keep); /* Push the remaining pages into the pipe. */ - for (i = 0; i < npages; i++) { + remain = ret; + for (i = 0; i < keep; i++) { struct pipe_buffer *buf = pipe_head_buf(pipe); chunk = min_t(size_t, remain, PAGE_SIZE);
© 2016 - 2024 Red Hat, Inc.