ublk_copy_user_pages()/ublk_copy_io_pages() currently use
iov_iter_get_pages2() to extract the pages from the iov_iter and
memcpy() between the bvec_iter and the iov_iter's pages one page at a time.
Switch to using copy_to_iter()/copy_from_iter() instead. This avoids the
user page reference count increments and decrements and the need to split
the memcpy() at user page boundaries. It also simplifies the code
considerably.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 62 +++++++++-------------------------------
1 file changed, 14 insertions(+), 48 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c74a41a6753..852350e639d6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -912,58 +912,47 @@ static const struct block_device_operations ub_fops = {
.open = ublk_open,
.free_disk = ublk_free_disk,
.report_zones = ublk_report_zones,
};
-#define UBLK_MAX_PIN_PAGES 32
-
struct ublk_io_iter {
- struct page *pages[UBLK_MAX_PIN_PAGES];
struct bio *bio;
struct bvec_iter iter;
};
-/* return how many pages are copied */
-static void ublk_copy_io_pages(struct ublk_io_iter *data,
- size_t total, size_t pg_off, int dir)
+/* return how many bytes are copied */
+static size_t ublk_copy_io_pages(struct ublk_io_iter *data,
+ struct iov_iter *uiter, int dir)
{
- unsigned done = 0;
- unsigned pg_idx = 0;
+ size_t done = 0;
- while (done < total) {
+ for (;;) {
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
- (unsigned)(PAGE_SIZE - pg_off));
void *bv_buf = bvec_kmap_local(&bv);
- void *pg_buf = kmap_local_page(data->pages[pg_idx]);
+ size_t copied;
if (dir == ITER_DEST)
- memcpy(pg_buf + pg_off, bv_buf, bytes);
+ copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
else
- memcpy(bv_buf, pg_buf + pg_off, bytes);
+ copied = copy_from_iter(bv_buf, bv.bv_len, uiter);
- kunmap_local(pg_buf);
kunmap_local(bv_buf);
- /* advance page array */
- pg_off += bytes;
- if (pg_off == PAGE_SIZE) {
- pg_idx += 1;
- pg_off = 0;
- }
-
- done += bytes;
+ done += copied;
+ if (copied < bv.bv_len)
+ break;
/* advance bio */
- bio_advance_iter_single(data->bio, &data->iter, bytes);
+ bio_advance_iter_single(data->bio, &data->iter, copied);
if (!data->iter.bi_size) {
data->bio = data->bio->bi_next;
if (data->bio == NULL)
break;
data->iter = data->bio->bi_iter;
}
}
+ return done;
}
static bool ublk_advance_io_iter(const struct request *req,
struct ublk_io_iter *iter, unsigned int offset)
{
@@ -987,38 +976,15 @@ static bool ublk_advance_io_iter(const struct request *req,
*/
static size_t ublk_copy_user_pages(const struct request *req,
unsigned offset, struct iov_iter *uiter, int dir)
{
struct ublk_io_iter iter;
- size_t done = 0;
if (!ublk_advance_io_iter(req, &iter, offset))
return 0;
- while (iov_iter_count(uiter) && iter.bio) {
- unsigned nr_pages;
- ssize_t len;
- size_t off;
- int i;
-
- len = iov_iter_get_pages2(uiter, iter.pages,
- iov_iter_count(uiter),
- UBLK_MAX_PIN_PAGES, &off);
- if (len <= 0)
- return done;
-
- ublk_copy_io_pages(&iter, len, off, dir);
- nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
- for (i = 0; i < nr_pages; i++) {
- if (dir == ITER_DEST)
- set_page_dirty(iter.pages[i]);
- put_page(iter.pages[i]);
- }
- done += len;
- }
-
- return done;
+ return ublk_copy_io_pages(&iter, uiter, dir);
}
static inline bool ublk_need_map_req(const struct request *req)
{
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
--
2.45.2
On Thu, Oct 30, 2025 at 07:05:21PM -0600, Caleb Sander Mateos wrote:
> -#define UBLK_MAX_PIN_PAGES 32
> -
> struct ublk_io_iter {
> - struct page *pages[UBLK_MAX_PIN_PAGES];
> struct bio *bio;
> struct bvec_iter iter;
> };
->pages[] is actually for pinning user io pages in batch, so killing it may cause
a perf drop.
A similar trick is used in the direct io code path too.
Thanks
Ming
On Thu, Oct 30, 2025 at 8:45 PM Ming Lei <ming.lei@redhat.com> wrote:
> ->pages[] is actually for pinning user io pages in batch, so killing it may cause
> a perf drop.
As far as I can tell, copy_to_iter()/copy_from_iter() avoids the page
pinning entirely. It calls copy_to_user_iter() for each contiguous
user address range:
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(i->data_source))
                return 0;
        if (user_backed_iter(i))
                might_fault();
        return iterate_and_advance(i, bytes, (void *)addr,
                                   copy_to_user_iter, memcpy_to_iter);
}
Which just checks that the address range doesn't include any kernel
addresses and then memcpy()s directly via the userspace virtual
addresses:
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        if (should_fail_usercopy())
                return len;
        if (access_ok(iter_to, len)) {
                from += progress;
                instrument_copy_to_user(iter_to, from, len);
                len = raw_copy_to_user(iter_to, from, len);
        }
        return len;
}

static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        return copy_user_generic((__force void *)dst, src, size);
}

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
        stac();
        /*
         * If CPU has FSRM feature, use 'rep movs'.
         * Otherwise, use rep_movs_alternative.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep movsb",
                            "call rep_movs_alternative",
                            ALT_NOT(X86_FEATURE_FSRM))
                "2:\n"
                _ASM_EXTABLE_UA(1b, 2b)
                :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
                : : "memory", "rax");
        clac();
        return len;
}
Am I missing something?
Best,
Caleb
On Fri, Oct 31, 2025 at 09:02:48AM -0700, Caleb Sander Mateos wrote:
> Am I missing something?
page is allocated & mapped in page fault handler.
However, in typical cases, pages in io buffer shouldn't be swapped out
frequently, so this cleanup may be good, I will run some perf test.
Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
bvec_kmap_local(), and the two helper can handle one whole bvec instead
of single page.
Then rq_for_each_bvec() can be used directly, and `ublk_io_iter` may be
killed.
Thanks,
Ming
On Fri, Oct 31, 2025 at 4:04 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> page is allocated & mapped in page fault handler.
Right, physical pages certainly need to be allocated for the virtual
address range being copied to/from. But that would have happened
previously in iov_iter_get_pages2(), so this isn't a new cost. And as
you point out, in the common case that the virtual pages are already
mapped to physical pages, the copy won't cause any page faults.
>
> However, in typical cases, pages in io buffer shouldn't be swapped out
> frequently, so this cleanup may be good, I will run some perf test.
Thanks for testing.
>
> Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
> bvec_kmap_local(), and the two helper can handle one whole bvec instead
> of single page.
Yes, that's a good idea. Thanks, I didn't know about that.
>
> Then rq_for_each_bvec() can be used directly, and `ublk_io_iter` may be
> killed.
Hmm, we still need a way to offset into the request (i.e. what
ublk_advance_io_iter() does currently). Are you thinking of a single
rq_for_each_bvec() loop that would skip bvecs until the offset is
reached and then copy until reaching the end of the user iterator?
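Something like this rough sketch? (Completely untested, just to make sure I'm
picturing the same thing; the offset-skipping logic and the local variable
names are my own guesses.)

static size_t ublk_copy_user_pages(const struct request *req,
                unsigned offset, struct iov_iter *uiter, int dir)
{
        struct req_iterator rq_iter;
        struct bio_vec bv;
        size_t done = 0;

        rq_for_each_bvec(bv, req, rq_iter) {
                unsigned bv_off, bv_len;
                size_t copied;

                /* skip bvecs that lie entirely before the requested offset */
                if (offset >= bv.bv_len) {
                        offset -= bv.bv_len;
                        continue;
                }
                /* start partway into the first bvec overlapping the offset */
                bv_off = bv.bv_offset + offset;
                bv_len = bv.bv_len - offset;
                offset = 0;

                if (dir == ITER_DEST)
                        copied = copy_page_to_iter(bv.bv_page, bv_off,
                                                   bv_len, uiter);
                else
                        copied = copy_page_from_iter(bv.bv_page, bv_off,
                                                     bv_len, uiter);
                done += copied;
                /* stop once the user iterator is exhausted (or faults) */
                if (copied < bv_len)
                        break;
        }
        return done;
}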
Best,
Caleb
On Mon, Nov 3, 2025 at 8:40 AM Caleb Sander Mateos
<csander@purestorage.com> wrote:
> >
> > Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
> > bvec_kmap_local(), and the two helper can handle one whole bvec instead
> > of single page.
>
> Yes, that's a good idea. Thanks, I didn't know about that.
Looking into this further, copy_page_{to,from}_iter() doesn't seem
like an improvement. It still uses kmap_local_page() +
copy_{to,from}_iter() + kunmap_local() internally, and it splits the
copy at page boundaries instead of copying the whole bvec at once. I
don't think it's worth switching just to hide the bvec_kmap_local() +
kunmap_local() calls.
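For reference, the per-page loop inside copy_page_to_iter() looks roughly
like this (paraphrased from lib/iov_iter.c with the sanity checks dropped;
exact details vary by kernel version):

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t res = 0;

        /* start at the sub-page containing 'offset' */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;
        while (1) {
                void *kaddr = kmap_local_page(page);
                size_t n = min(bytes, (size_t)PAGE_SIZE - offset);

                n = _copy_to_iter(kaddr + offset, n, i);
                kunmap_local(kaddr);
                res += n;
                bytes -= n;
                if (!bytes || !n)
                        break;
                offset += n;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return res;
}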
Best,
Caleb
On Wed, Nov 5, 2025 at 8:16 AM Caleb Sander Mateos
<csander@purestorage.com> wrote:
> Looking into this further, copy_page_{to,from}_iter() doesn't seem
> like an improvement. It still uses kmap_local_page() +
> copy_{to,from}_iter() + kunmap_local() internally, and it splits the
> copy at page boundaries instead of copying the whole bvec at once. I
> don't think it's worth switching just to hide the bvec_kmap_local() +
> kunmap_local() calls.
Maybe that's because bvec_kmap_local() only maps the first page of the
bvec even if it spans multiple pages? Seems like it may be an existing
bug that ublk_copy_io_pages() can copy past the end of the page
returned from bvec_kmap_local(). But given that kmap_local_page() is a
no-op when CONFIG_HIGHMEM is disabled, it's probably not visible in
typical configurations.
Best,
Caleb
On Wed, Nov 5, 2025 at 9:10 AM Caleb Sander Mateos
<csander@purestorage.com> wrote:
> Maybe that's because bvec_kmap_local() only maps the first page of the
> bvec even if it spans multiple pages? Seems like it may be an existing
> bug that ublk_copy_io_pages() can copy past the end of the page
> returned from bvec_kmap_local(). But given that kmap_local_page() is a
> no-op when CONFIG_HIGHMEM is disabled, it's probably not visible in
> typical configurations.
I guess bio_iter_iovec() restricts the bvec to a single page, so
there's no existing issue. If we switch to rq_for_each_bvec(), we'll
need to use copy_page_{to,from}_iter(). Sorry for the thinking out
loud...
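The capping happens in bvec_iter_len(), if I'm reading include/linux/bvec.h
right: the single-page bvec returned by bio_iter_iovec() is clamped at the
next page boundary.

#define bvec_iter_len(bvec, iter)				\
	min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),	\
	      PAGE_SIZE - bvec_iter_offset((bvec), (iter)))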
Best,
Caleb
On Mon, Nov 03, 2025 at 08:40:30AM -0800, Caleb Sander Mateos wrote:
> > However, in typical cases, pages in io buffer shouldn't be swapped out
> > frequently, so this cleanup may be good, I will run some perf test.
>
> Thanks for testing.
`fio/t/io_uring` shows a 40% improvement on `./kublk -t null -q 2` with this
patch in my test VM, so this looks like a very nice improvement.
It also works well when forcing IOSQE_ASYNC on the ublk uring_cmd, and this
change is correct because the copy is guaranteed to be done in the ublk
daemon context.
>
> >
> > Also copy_page_from_iter()/copy_page_to_iter() can be used for avoiding
> > bvec_kmap_local(), and the two helper can handle one whole bvec instead
> > of single page.
>
> Yes, that's a good idea. Thanks, I didn't know about that.
>
> >
> > Then rq_for_each_bvec() can be used directly, and `ublk_io_iter` may be
> > killed.
>
> Hmm, we still need a way to offset into the request (i.e. what
> ublk_advance_io_iter() does currently). Are you thinking of a single
> rq_for_each_bvec() loop that would skip bvecs until the offset is
> reached and then copy until reaching the end of the user iterator?
Yeah, that is basically what ublk_advance_io_iter() does.
Thanks,
Ming
On 11/4/25 6:48 PM, Ming Lei wrote:
>
> `fio/t/io_uring` shows a 40% improvement on `./kublk -t null -q 2` with this
> patch in my test VM, so this looks like a very nice improvement.
>
> It also works well when forcing IOSQE_ASYNC on the ublk uring_cmd, and this
> change is correct because the copy is guaranteed to be done in the ublk
> daemon context.
We good to queue this up then?
--
Jens Axboe
On Wed, Nov 5, 2025 at 7:26 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> We good to queue this up then?
Let me write a v2 implementing Ming's suggestions to use
copy_page_{to,from}_iter() and get rid of the open-coded bvec
iteration.
Thanks,
Caleb
On 11/5/25 8:37 AM, Caleb Sander Mateos wrote:
>> We good to queue this up then?
>
> Let me write a v2 implementing Ming's suggestions to use
> copy_page_{to,from}_iter() and get rid of the open-coded bvec
> iteration.
Sounds good.
--
Jens Axboe