block/file-posix.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-)
Full Unit Access (FUA) is an optimization where a disk write with the
flag set will be persisted to disk immediately instead of potentially
remaining in the disk's write cache. This commit address the todo task
for using pwritev2() with RWF_DSYNC in the thread pool section of
raw_co_prw(), if pwritev2 with RWF_DSYNC is available in the host,
which is alway for Linux kernel >= 4.7. The intent for FUA is indicated
with the BDRV_REQ_FUA flag. The old code paths are preserved in case
BDRV_REQ_FUA is off or pwritev2() with RWF_DSYNC is not available.
During testing, I observed that the BDRV_REQ_FUA is always turned on
when blk->enable_write_cache is not set in block/block-backend.c, so
I commented this section off during testing:
https://gitlab.com/qemu-project/qemu/-/blob/master/block/block-backend.c?ref_type=heads#L1432-1434
Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>
---
block/file-posix.c | 42 ++++++++++++++++++++++++++++++++++--------
1 file changed, 34 insertions(+), 8 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 56d1972d15..34de816eab 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
unsigned long op;
} zone_mgmt;
};
+ BdrvRequestFlags flags;
} RawPosixAIOData;
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1674,6 +1675,16 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
return pwritev(fd, iov, nr_iov, offset);
}
+static ssize_t
+qemu_pwrite_fua(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+#ifdef RWF_DSYNC
+ return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
+#else
+ return pwritev2(fd, iov, nr_iov, offset, 0);
+#endif
+}
+
#else
static bool preadv_present = false;
@@ -1698,10 +1709,15 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
len = RETRY_ON_EINTR(
(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
- qemu_pwritev(aiocb->aio_fildes,
- aiocb->io.iov,
- aiocb->io.niov,
- aiocb->aio_offset) :
+ (aiocb->flags & BDRV_REQ_FUA) ?
+ qemu_pwrite_fua(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset) :
+ qemu_pwritev(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset) :
qemu_preadv(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1727,10 +1743,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
while (offset < aiocb->aio_nbytes) {
if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
- len = pwrite(aiocb->aio_fildes,
- (const char *)buf + offset,
- aiocb->aio_nbytes - offset,
- aiocb->aio_offset + offset);
+ if (aiocb->flags & BDRV_REQ_FUA) {
+ len = qemu_pwrite_fua(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset);
+ } else {
+ len = pwrite(aiocb->aio_fildes,
+ (const char *)buf + offset,
+ aiocb->aio_nbytes - offset,
+ aiocb->aio_offset + offset);
+ }
} else {
len = pread(aiocb->aio_fildes,
buf + offset,
@@ -2539,14 +2562,17 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
.iov = qiov->iov,
.niov = qiov->niov,
},
+ .flags = flags,
};
assert(qiov->size == bytes);
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+#ifndef RWD_DSYNC
if (ret == 0 && (flags & BDRV_REQ_FUA)) {
/* TODO Use pwritev2() instead if it's available */
ret = raw_co_flush_to_disk(bs);
}
+#endif
goto out; /* Avoid the compiler err of unused label */
out:
--
2.43.0
On Wed, Apr 02, 2025 at 02:31:01AM -0700, Pinku Deb Nath wrote:
> Full Unit Access (FUA) is an optimization where a disk write with the
> flag set will be persisted to disk immediately instead of potentially
> remaining in the disk's write cache. This commit address the todo task
> for using pwritev2() with RWF_DSYNC in the thread pool section of
> raw_co_prw(), if pwritev2 with RWF_DSYNC is available in the host,
> which is alway for Linux kernel >= 4.7. The intent for FUA is indicated
"alway" -> "always true"?
> with the BDRV_REQ_FUA flag. The old code paths are preserved in case
> BDRV_REQ_FUA is off or pwritev2() with RWF_DSYNC is not available.
>
> During testing, I observed that the BDRV_REQ_FUA is always turned on
> when blk->enable_write_cache is not set in block/block-backend.c, so
> I commented this section off during testing:
> https://gitlab.com/qemu-project/qemu/-/blob/master/block/block-backend.c?ref_type=heads#L1432-1434
You can also use the qemu-io "open -t writeback <filename>" command to
open the image file with enable_write_cache.
>
> Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>
> ---
> block/file-posix.c | 42 ++++++++++++++++++++++++++++++++++--------
> 1 file changed, 34 insertions(+), 8 deletions(-)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 56d1972d15..34de816eab 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
> unsigned long op;
> } zone_mgmt;
> };
> + BdrvRequestFlags flags;
> } RawPosixAIOData;
>
> #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -1674,6 +1675,16 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
> return pwritev(fd, iov, nr_iov, offset);
> }
>
> +static ssize_t
> +qemu_pwrite_fua(int fd, const struct iovec *iov, int nr_iov, off_t offset)
Please call this function qemu_pwritev_fua() since it takes an iovec
array.
> +{
> +#ifdef RWF_DSYNC
> + return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
> +#else
> + return pwritev2(fd, iov, nr_iov, offset, 0);
In the #else case the function is not doing what its name says - FUA is
skipped. Kevin mentioned that the flush could be moved into the handle
function. How about calling handle_aiocb_flush() so that
qemu_pwrite_fua() has the intended semantics even on hosts without
RWF_DSYNC?
> +#endif
> +}
> +
> #else
>
> static bool preadv_present = false;
> @@ -1698,10 +1709,15 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>
> len = RETRY_ON_EINTR(
> (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
> - qemu_pwritev(aiocb->aio_fildes,
> - aiocb->io.iov,
> - aiocb->io.niov,
> - aiocb->aio_offset) :
> + (aiocb->flags & BDRV_REQ_FUA) ?
> + qemu_pwrite_fua(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset) :
> + qemu_pwritev(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset) :
> qemu_preadv(aiocb->aio_fildes,
> aiocb->io.iov,
> aiocb->io.niov,
> @@ -1727,10 +1743,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>
> while (offset < aiocb->aio_nbytes) {
> if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> - len = pwrite(aiocb->aio_fildes,
> - (const char *)buf + offset,
> - aiocb->aio_nbytes - offset,
> - aiocb->aio_offset + offset);
> + if (aiocb->flags & BDRV_REQ_FUA) {
> + len = qemu_pwrite_fua(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset);
handle_aiocb_rw() can pass in a temporary buffer (instead of iov), so
it's incorrect to use aiocb->io.iov here. Instead, create a local struct
iov iov = { .iov_base = buf, .iov_len = aiocb->aio_nbytes - offset }
here.
> + } else {
> + len = pwrite(aiocb->aio_fildes,
> + (const char *)buf + offset,
> + aiocb->aio_nbytes - offset,
> + aiocb->aio_offset + offset);
> + }
> } else {
> len = pread(aiocb->aio_fildes,
> buf + offset,
> @@ -2539,14 +2562,17 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
> .iov = qiov->iov,
> .niov = qiov->niov,
> },
> + .flags = flags,
> };
>
> assert(qiov->size == bytes);
> ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
> +#ifndef RWD_DSYNC
RWF_DSYNC
> if (ret == 0 && (flags & BDRV_REQ_FUA)) {
> /* TODO Use pwritev2() instead if it's available */
This TODO comment can be deleted now.
> ret = raw_co_flush_to_disk(bs);
This can be removed if qemu_pwritev_fua() ensures that the flush has
occurred.
> }
> +#endif
> goto out; /* Avoid the compiler err of unused label */
>
> out:
> --
> 2.43.0
>
© 2016 - 2025 Red Hat, Inc.