ublk_copy_user_pages()/ublk_copy_io_pages() currently use
iov_iter_get_pages2() to extract the pages from the iov_iter and
memcpy() between the bvec_iter and the iov_iter's pages one page at a time.
Switch to using copy_to_iter()/copy_from_iter() instead. This avoids the
user page reference count increments and decrements and the need to split
the memcpy() at user page boundaries. It also simplifies the code
considerably.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 62 +++++++++-------------------------------
1 file changed, 14 insertions(+), 48 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c74a41a6753..852350e639d6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -912,58 +912,47 @@ static const struct block_device_operations ub_fops = {
.open = ublk_open,
.free_disk = ublk_free_disk,
.report_zones = ublk_report_zones,
};
-#define UBLK_MAX_PIN_PAGES 32
-
struct ublk_io_iter {
- struct page *pages[UBLK_MAX_PIN_PAGES];
struct bio *bio;
struct bvec_iter iter;
};
-/* return how many pages are copied */
-static void ublk_copy_io_pages(struct ublk_io_iter *data,
- size_t total, size_t pg_off, int dir)
+/* return how many bytes are copied */
+static size_t ublk_copy_io_pages(struct ublk_io_iter *data,
+ struct iov_iter *uiter, int dir)
{
- unsigned done = 0;
- unsigned pg_idx = 0;
+ size_t done = 0;
- while (done < total) {
+ for (;;) {
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
- (unsigned)(PAGE_SIZE - pg_off));
void *bv_buf = bvec_kmap_local(&bv);
- void *pg_buf = kmap_local_page(data->pages[pg_idx]);
+ size_t copied;
if (dir == ITER_DEST)
- memcpy(pg_buf + pg_off, bv_buf, bytes);
+ copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
else
- memcpy(bv_buf, pg_buf + pg_off, bytes);
+ copied = copy_from_iter(bv_buf, bv.bv_len, uiter);
- kunmap_local(pg_buf);
kunmap_local(bv_buf);
- /* advance page array */
- pg_off += bytes;
- if (pg_off == PAGE_SIZE) {
- pg_idx += 1;
- pg_off = 0;
- }
-
- done += bytes;
+ done += copied;
+ if (copied < bv.bv_len)
+ break;
/* advance bio */
- bio_advance_iter_single(data->bio, &data->iter, bytes);
+ bio_advance_iter_single(data->bio, &data->iter, copied);
if (!data->iter.bi_size) {
data->bio = data->bio->bi_next;
if (data->bio == NULL)
break;
data->iter = data->bio->bi_iter;
}
}
+ return done;
}
static bool ublk_advance_io_iter(const struct request *req,
struct ublk_io_iter *iter, unsigned int offset)
{
@@ -987,38 +976,15 @@ static bool ublk_advance_io_iter(const struct request *req,
*/
static size_t ublk_copy_user_pages(const struct request *req,
unsigned offset, struct iov_iter *uiter, int dir)
{
struct ublk_io_iter iter;
- size_t done = 0;
if (!ublk_advance_io_iter(req, &iter, offset))
return 0;
- while (iov_iter_count(uiter) && iter.bio) {
- unsigned nr_pages;
- ssize_t len;
- size_t off;
- int i;
-
- len = iov_iter_get_pages2(uiter, iter.pages,
- iov_iter_count(uiter),
- UBLK_MAX_PIN_PAGES, &off);
- if (len <= 0)
- return done;
-
- ublk_copy_io_pages(&iter, len, off, dir);
- nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
- for (i = 0; i < nr_pages; i++) {
- if (dir == ITER_DEST)
- set_page_dirty(iter.pages[i]);
- put_page(iter.pages[i]);
- }
- done += len;
- }
-
- return done;
+ return ublk_copy_io_pages(&iter, uiter, dir);
}
static inline bool ublk_need_map_req(const struct request *req)
{
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
--
2.45.2
On Thu, Oct 30, 2025 at 07:05:21PM -0600, Caleb Sander Mateos wrote:
> -#define UBLK_MAX_PIN_PAGES 32
> -
> struct ublk_io_iter {
> - struct page *pages[UBLK_MAX_PIN_PAGES];
> struct bio *bio;
> struct bvec_iter iter;
> };
->pages[] is actually for pinning user io pages in batch, so killing it may cause
a perf drop.
A similar trick is used in the direct io code path too.
Thanks
Ming
On Thu, Oct 30, 2025 at 8:45 PM Ming Lei <ming.lei@redhat.com> wrote:
> ->pages[] is actually for pinning user io pages in batch, so killing it may cause
> a perf drop.
As far as I can tell, copy_to_iter()/copy_from_iter() avoids the page
pinning entirely. It calls copy_to_user_iter() for each contiguous
user address range:
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(i->data_source))
                return 0;
        if (user_backed_iter(i))
                might_fault();
        return iterate_and_advance(i, bytes, (void *)addr,
                                   copy_to_user_iter, memcpy_to_iter);
}
Which just checks that the address range doesn't include any kernel
addresses and then memcpy()s directly via the userspace virtual
addresses:
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        if (should_fail_usercopy())
                return len;
        if (access_ok(iter_to, len)) {
                from += progress;
                instrument_copy_to_user(iter_to, from, len);
                len = raw_copy_to_user(iter_to, from, len);
        }
        return len;
}

static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        return copy_user_generic((__force void *)dst, src, size);
}

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
        stac();
        /*
         * If CPU has FSRM feature, use 'rep movs'.
         * Otherwise, use rep_movs_alternative.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep movsb",
                            "call rep_movs_alternative",
                            ALT_NOT(X86_FEATURE_FSRM))
                "2:\n"
                _ASM_EXTABLE_UA(1b, 2b)
                :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
                : : "memory", "rax");
        clac();
        return len;
}
Am I missing something?
Best,
Caleb
On Fri, Oct 31, 2025 at 09:02:48AM -0700, Caleb Sander Mateos wrote:
> Am I missing something?
page is allocated & mapped in page fault handler.
However, in typical cases, pages in io buffer shouldn't be swapped out
frequently, so this cleanup may be good, I will run some perf test.
Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
bvec_kmap_local(), and the two helper can handle one whole bvec instead
of single page.
Then rq_for_each_bvec() can be used directly, and `ublk_io_iter` may be
killed.
Thanks,
Ming
On Fri, Oct 31, 2025 at 4:04 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> page is allocated & mapped in page fault handler.
Right, physical pages certainly need to be allocated for the virtual
address range being copied to/from. But that would have happened
previously in iov_iter_get_pages2(), so this isn't a new cost. And as
you point out, in the common case that the virtual pages are already
mapped to physical pages, the copy won't cause any page faults.
>
> However, in typical cases, pages in io buffer shouldn't be swapped out
> frequently, so this cleanup may be good, I will run some perf test.
Thanks for testing.
>
> Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
> bvec_kmap_local(), and the two helper can handle one whole bvec instead
> of single page.
Yes, that's a good idea. Thanks, I didn't know about that.
>
> Then rq_for_each_bvec() can be used directly, and `ublk_io_iter` may be
> killed.
Hmm, we still need a way to offset into the request (i.e. what
ublk_advance_io_iter() does currently). Are you thinking of a single
rq_for_each_bvec() loop that would skip bvecs until the offset is
reached and then copy until reaching the end of the user iterator?
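Something like this rough sketch? (Completely untested, just to make sure I'm
picturing the same thing; the offset-skipping logic and the local variable
names are my own guesses.)

static size_t ublk_copy_user_pages(const struct request *req,
                unsigned offset, struct iov_iter *uiter, int dir)
{
        struct req_iterator rq_iter;
        struct bio_vec bv;
        size_t done = 0;

        rq_for_each_bvec(bv, req, rq_iter) {
                unsigned bv_off, bv_len;
                size_t copied;

                /* skip bvecs that lie entirely before the requested offset */
                if (offset >= bv.bv_len) {
                        offset -= bv.bv_len;
                        continue;
                }
                /* start partway into the first bvec overlapping the offset */
                bv_off = bv.bv_offset + offset;
                bv_len = bv.bv_len - offset;
                offset = 0;

                if (dir == ITER_DEST)
                        copied = copy_page_to_iter(bv.bv_page, bv_off,
                                                   bv_len, uiter);
                else
                        copied = copy_page_from_iter(bv.bv_page, bv_off,
                                                     bv_len, uiter);
                done += copied;
                /* stop once the user iterator is exhausted (or faults) */
                if (copied < bv_len)
                        break;
        }
        return done;
}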
Best,
Caleb
On Mon, Nov 3, 2025 at 8:40 AM Caleb Sander Mateos
<csander@purestorage.com> wrote:
> >
> > Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
> > bvec_kmap_local(), and the two helper can handle one whole bvec instead
> > of single page.
>
> Yes, that's a good idea. Thanks, I didn't know about that.
Looking into this further, copy_page_{to,from}_iter() doesn't seem
like an improvement. It still uses kmap_local_page() +
copy_{to,from}_iter() + kunmap_local() internally, and it splits the
copy at page boundaries instead of copying the whole bvec at once. I
don't think it's worth switching just to hide the bvec_kmap_local() +
kunmap_local() calls.
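For reference, the per-page loop inside copy_page_to_iter() looks roughly
like this (paraphrased from lib/iov_iter.c with the sanity checks dropped;
exact details vary by kernel version):

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t res = 0;

        /* start at the sub-page containing 'offset' */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;
        while (1) {
                void *kaddr = kmap_local_page(page);
                size_t n = min(bytes, (size_t)PAGE_SIZE - offset);

                n = _copy_to_iter(kaddr + offset, n, i);
                kunmap_local(kaddr);
                res += n;
                bytes -= n;
                if (!bytes || !n)
                        break;
                offset += n;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return res;
}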
Best,
Caleb
On Wed, Nov 5, 2025 at 8:16 AM Caleb Sander Mateos
<csander@purestorage.com> wrote:
> Looking into this further, copy_page_{to,from}_iter() doesn't seem
> like an improvement. It still uses kmap_local_page() +
> copy_{to,from}_iter() + kunmap_local() internally, and it splits the
> copy at page boundaries instead of copying the whole bvec at once. I
> don't think it's worth switching just to hide the bvec_kmap_local() +
> kunmap_local() calls.
Maybe that's because bvec_kmap_local() only maps the first page of the
bvec even if it spans multiple pages? Seems like it may be an existing
bug that ublk_copy_io_pages() can copy past the end of the page
returned from bvec_kmap_local(). But given that kmap_local_page() is a
no-op when CONFIG_HIGHMEM is disabled, it's probably not visible in
typical configurations.
Best,
Caleb
On Wed, Nov 5, 2025 at 9:10 AM Caleb Sander Mateos
<csander@purestorage.com> wrote:
> Maybe that's because bvec_kmap_local() only maps the first page of the
> bvec even if it spans multiple pages? Seems like it may be an existing
> bug that ublk_copy_io_pages() can copy past the end of the page
> returned from bvec_kmap_local(). But given that kmap_local_page() is a
> no-op when CONFIG_HIGHMEM is disabled, it's probably not visible in
> typical configurations.
I guess bio_iter_iovec() restricts the bvec to a single page, so
there's no existing issue. If we switch to rq_for_each_bvec(), we'll
need to use copy_page_{to,from}_iter(). Sorry for the thinking out
loud...
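The capping happens in bvec_iter_len(), if I'm reading include/linux/bvec.h
right: the single-page bvec returned by bio_iter_iovec() is clamped at the
next page boundary.

#define bvec_iter_len(bvec, iter)				\
	min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),	\
	      PAGE_SIZE - bvec_iter_offset((bvec), (iter)))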
Best,
Caleb
On Mon, Nov 03, 2025 at 08:40:30AM -0800, Caleb Sander Mateos wrote:
> > However, in typical cases, pages in io buffer shouldn't be swapped out
> > frequently, so this cleanup may be good, I will run some perf test.
>
> Thanks for testing.
`fio/t/io_uring` shows a 40% improvement on `./kublk -t null -q 2` with this
patch in my test VM, so this looks like a very nice improvement.
It also works well when forcing IOSQE_ASYNC on the ublk uring_cmd, and this
change is correct because the copy is guaranteed to be done in the ublk
daemon context.
>
> >
> > Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
> > bvec_kmap_local(), and the two helper can handle one whole bvec instead
> > of single page.
>
> Yes, that's a good idea. Thanks, I didn't know about that.
>
> >
> > Then rq_for_each_bvec() can be used directly, and `ublk_io_iter` may be
> > killed.
>
> Hmm, we still need a way to offset into the request (i.e. what
> ublk_advance_io_iter() does currently). Are you thinking of a single
> rq_for_each_bvec() loop that would skip bvecs until the offset is
> reached and then copy until reaching the end of the user iterator?
Yeah, that is basically what ublk_advance_io_iter() does.
Thanks,
Ming
On 11/4/25 6:48 PM, Ming Lei wrote:
>
> `fio/t/io_uring` shows a 40% improvement on `./kublk -t null -q 2` with this
> patch in my test VM, so this looks like a very nice improvement.
>
> It also works well when forcing IOSQE_ASYNC on the ublk uring_cmd, and this
> change is correct because the copy is guaranteed to be done in the ublk
> daemon context.
We good to queue this up then?
--
Jens Axboe
On Wed, Nov 5, 2025 at 7:26 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> We good to queue this up then?
Let me write a v2 implementing Ming's suggestions to use
copy_page_{to,from}_iter() and get rid of the open-coded bvec
iteration.
Thanks,
Caleb
On 11/5/25 8:37 AM, Caleb Sander Mateos wrote:
>> We good to queue this up then?
>
> Let me write a v2 implementing Ming's suggestions to use
> copy_page_{to,from}_iter() and get rid of the open-coded bvec
> iteration.
Sounds good.
--
Jens Axboe