[PATCH] block/file-posix: optimize append write

Sam Li posted 1 patch 1 week, 2 days ago
There is a newer version of this series
block/file-posix.c | 33 ++++++++++++++++-----------------
1 file changed, 16 insertions(+), 17 deletions(-)
[PATCH] block/file-posix: optimize append write
Posted by Sam Li 1 week, 2 days ago
When the file-posix driver emulates append write, it holds the lock
whenever accessing wp, which limits the IO queue depth to one.

The write IO flow can be optimized to allow concurrent writes. The lock
is held in two cases:
1. Assumed that the write IO succeeds, update the wp before issuing the
write.
2. If the write IO fails, report that zone and use the reported value
as the current wp.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/file-posix.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index ff928b5e85..64a57fadb1 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2489,11 +2489,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
 #if defined(CONFIG_BLKZONED)
     if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
         bs->bl.zoned != BLK_Z_NONE) {
-        qemu_co_mutex_lock(&bs->wps->colock);
+        BlockZoneWps *wps = bs->wps;
+        int index = offset / bs->bl.zone_size;
+        qemu_co_mutex_lock(&wps->colock);
+        uint64_t *wp = &wps->wp[index];
         if (type & QEMU_AIO_ZONE_APPEND) {
-            int index = offset / bs->bl.zone_size;
-            offset = bs->wps->wp[index];
+            offset = *wp;
+            *offset_ptr = offset;
+        }
+        /* Advance the wp if needed */
+        if (offset + bytes > *wp) {
+            *wp = offset + bytes;
         }
+        qemu_co_mutex_unlock(&bs->wps->colock);
     }
 #endif
 
@@ -2540,28 +2548,19 @@ out:
 #if defined(CONFIG_BLKZONED)
     if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
         bs->bl.zoned != BLK_Z_NONE) {
-        BlockZoneWps *wps = bs->wps;
         if (ret == 0) {
-            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
-            if (!BDRV_ZT_IS_CONV(*wp)) {
-                if (type & QEMU_AIO_ZONE_APPEND) {
-                    *offset_ptr = *wp;
-                    trace_zbd_zone_append_complete(bs, *offset_ptr
-                        >> BDRV_SECTOR_BITS);
-                }
-                /* Advance the wp if needed */
-                if (offset + bytes > *wp) {
-                    *wp = offset + bytes;
-                }
+            if (type & QEMU_AIO_ZONE_APPEND) {
+                trace_zbd_zone_append_complete(bs, *offset_ptr
+                    >> BDRV_SECTOR_BITS);
             }
         } else {
+            qemu_co_mutex_lock(&bs->wps->colock);
             /*
              * write and append write are not allowed to cross zone boundaries
              */
             update_zones_wp(bs, s->fd, offset, 1);
+            qemu_co_mutex_unlock(&bs->wps->colock);
         }
-
-        qemu_co_mutex_unlock(&wps->colock);
     }
 #endif
     return ret;
-- 
2.34.1
Re: [PATCH] block/file-posix: optimize append write
Posted by Damien Le Moal 1 week, 1 day ago
On 9/30/24 01:03, Sam Li wrote:
> When the file-posix driver emulates append write, it holds the lock
> whenever accessing wp, which limits the IO queue depth to one.
> 
> The write IO flow can be optimized to allow concurrent writes. The lock
> is held in two cases:
> 1. Assumed that the write IO succeeds, update the wp before issuing the
> write.
> 2. If the write IO fails, report that zone and use the reported value
> as the current wp.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c | 33 ++++++++++++++++-----------------
>  1 file changed, 16 insertions(+), 17 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index ff928b5e85..64a57fadb1 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2489,11 +2489,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
>  #if defined(CONFIG_BLKZONED)
>      if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
>          bs->bl.zoned != BLK_Z_NONE) {
> -        qemu_co_mutex_lock(&bs->wps->colock);
> +        BlockZoneWps *wps = bs->wps;
> +        int index = offset / bs->bl.zone_size;

A blank line after this declaration (to separate declarations from code) woule
make this code more readable...

> +        qemu_co_mutex_lock(&wps->colock);
> +        uint64_t *wp = &wps->wp[index];
>          if (type & QEMU_AIO_ZONE_APPEND) {
> -            int index = offset / bs->bl.zone_size;
> -            offset = bs->wps->wp[index];
> +            offset = *wp;
> +            *offset_ptr = offset;
> +        }
> +        /* Advance the wp if needed */
> +        if (offset + bytes > *wp) {

Why the if ? offset MUST be equal to wp for writes, and for zone append we do
not need to check the offset at all. So advancing the wp should be unconditional.

BUT ! where are the checks for "zone is full" and "offset == wp" for write
operations ? These must be checked while holding the zone lock.

> +            *wp = offset + bytes;
>          }
> +        qemu_co_mutex_unlock(&bs->wps->colock);
>      }
>  #endif
>  
> @@ -2540,28 +2548,19 @@ out:
>  #if defined(CONFIG_BLKZONED)
>      if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
>          bs->bl.zoned != BLK_Z_NONE) {
> -        BlockZoneWps *wps = bs->wps;
>          if (ret == 0) {
> -            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
> -            if (!BDRV_ZT_IS_CONV(*wp)) {
> -                if (type & QEMU_AIO_ZONE_APPEND) {
> -                    *offset_ptr = *wp;
> -                    trace_zbd_zone_append_complete(bs, *offset_ptr
> -                        >> BDRV_SECTOR_BITS);
> -                }
> -                /* Advance the wp if needed */
> -                if (offset + bytes > *wp) {
> -                    *wp = offset + bytes;
> -                }
> +            if (type & QEMU_AIO_ZONE_APPEND) {
> +                trace_zbd_zone_append_complete(bs, *offset_ptr
> +                    >> BDRV_SECTOR_BITS);
>              }
>          } else {
> +            qemu_co_mutex_lock(&bs->wps->colock);
>              /*
>               * write and append write are not allowed to cross zone boundaries
>               */
>              update_zones_wp(bs, s->fd, offset, 1);
> +            qemu_co_mutex_unlock(&bs->wps->colock);
>          }
> -
> -        qemu_co_mutex_unlock(&wps->colock);
>      }
>  #endif
>      return ret;


-- 
Damien Le Moal
Western Digital Research