[RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.

Sam Li posted 1 patch 1 year, 11 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20220620033611.82166-1-faithilikerun@gmail.com
Maintainers: Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>, Stefan Hajnoczi <stefanha@redhat.com>, Fam Zheng <fam@euphon.net>
block/block-backend.c             |  22 ++++
block/coroutines.h                |   5 +
block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
block/io.c                        |  23 ++++
include/block/block-common.h      |  43 ++++++-
include/block/block-io.h          |  13 +++
include/block/block_int-common.h  |  20 ++++
qemu-io-cmds.c                    | 118 +++++++++++++++++++
tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
9 files changed, 477 insertions(+), 1 deletion(-)
create mode 100644 tests/qemu-iotests/tests/zoned.sh
[RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Sam Li 1 year, 11 months ago
Fix some mistakes before. It can report a range of zones now.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/block-backend.c             |  22 ++++
 block/coroutines.h                |   5 +
 block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
 block/io.c                        |  23 ++++
 include/block/block-common.h      |  43 ++++++-
 include/block/block-io.h          |  13 +++
 include/block/block_int-common.h  |  20 ++++
 qemu-io-cmds.c                    | 118 +++++++++++++++++++
 tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
 9 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 tests/qemu-iotests/tests/zoned.sh

diff --git a/block/block-backend.c b/block/block-backend.c
index e0e1aff4b1..20248e4a35 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
     int ret;
 } BlockBackendAIOCB;
 
+
+
 static const AIOCBInfo block_backend_aiocb_info = {
     .get_aio_context = blk_aiocb_get_aio_context,
     .aiocb_size = sizeof(BlockBackendAIOCB),
@@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
     return ret;
 }
 
+int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,
+                       int64_t *nr_zones,
+                       struct BlockZoneDescriptor *zones)
+{
+    int ret;
+    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);
+    return ret;
+}
+
+int blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
+
+    return ret;
+}
+
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
@@ -2634,3 +2655,4 @@ int blk_make_empty(BlockBackend *blk, Error **errp)
 
     return bdrv_make_empty(blk->root, errp);
 }
+
diff --git a/block/coroutines.h b/block/coroutines.h
index 830ecaa733..247326255f 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -80,6 +80,11 @@ int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
 
 int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    int64_t len, int64_t *nr_zones,
+                                    struct BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len);
 
 
 /*
diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..71fd21f8ba 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -178,6 +178,137 @@ typedef struct BDRVRawReopenState {
     bool check_cache_dropped;
 } BDRVRawReopenState;
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+        struct blk_zone *blkz) {
+    zone->start = blkz->start << BDRV_SECTOR_BITS;
+    zone->length = blkz->len << BDRV_SECTOR_BITS;
+    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
+    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
+    zone->type = blkz->type;
+    zone->cond = blkz->type;
+}
+
+/**
+ * zone report - Get a zone block device's information in the
+ * form of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: Space to hold zone information on reply
+ * @param offset: the location in the zone block device
+ * @return 0 on success, -1 on failure
+ */
+static int raw_co_zone_report(BlockDriverState *bs, int64_t offset, int64_t len,
+                              int64_t *nr_zones,
+                              struct BlockZoneDescriptor *zones) {
+    BDRVRawState *s = bs->opaque;
+    struct blk_zone_report *rep;
+    struct BlockZoneDescriptor bzd;
+    struct blk_zone *blkz;
+    int64_t zone_size_mask, end, rep_size, nrz;
+    int ret, n = 0, i = 0;
+
+    printf("%s\n", "start to report zone");
+    nrz = *nr_zones;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    rep = (struct blk_zone_report *)malloc(rep_size);
+    if (!rep) {
+        return -1;
+    }
+
+    zone_size_mask = zone_start_sector - 1;
+    /* align up */
+    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
+            >> BDRV_SECTOR_BITS;
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (offset < end) {
+        memset(rep, 0, rep_size);
+        rep->sector = offset;
+        rep->nr_zones = nrz;
+
+        ret = ioctl(s->fd, BLKREPORTZONE, rep);
+        if (ret != 0) {
+            ret = -errno;
+            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
+                         s->fd, offset, errno);
+            free(rep);
+            return ret;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++) {
+            parse_zone(&bzd, &blkz[i]);
+            if (zones) {
+                memcpy(&zones[n], &bzd, sizeof(bzd));
+            }
+        }
+
+        offset = blkz[i].start + blkz[i].len;
+    }
+
+    return ret;
+}
+
+/**
+ * zone management operations - Execute an operation on a zone
+ */
+static int raw_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
+        int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    struct blk_zone_range range;
+    const char *ioctl_name;
+    uint64_t ioctl_op;
+    int64_t zone_size_mask, end;
+    int ret;
+
+    zone_size_mask = zone_start_sector - 1;
+    /* align up */
+    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
+            >> BDRV_SECTOR_BITS;
+    offset = (offset & (~zone_size_mask)) >> BDRV_SECTOR_BITS; /* align down */
+
+    switch (op) {
+    case zone_open:
+        ioctl_name = "BLKOPENZONE";
+        ioctl_op = BLKOPENZONE;
+        break;
+    case zone_close:
+        ioctl_name = "BLKCLOSEZONE";
+        ioctl_op = BLKCLOSEZONE;
+        break;
+    case zone_finish:
+        ioctl_name = "BLKFINISHZONE";
+        ioctl_op = BLKFINISHZONE;
+        break;
+    case zone_reset:
+        ioctl_name = "BLKRESETZONE";
+        ioctl_op = BLKRESETZONE;
+        break;
+    default:
+        error_report("Invalid zone operation 0x%x", op);
+        errno = -EINVAL;
+        return -1;
+    }
+
+    /* Execute the operation */
+    range.sector = offset;
+    range.nr_sectors = end - offset;
+    ret = ioctl(s->fd, ioctl_op, &range);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d",
+                         ioctl_name, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
 static int fd_open(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
@@ -3324,6 +3455,9 @@ BlockDriver bdrv_file = {
     .bdrv_abort_perm_update = raw_abort_perm_update,
     .create_opts = &raw_create_opts,
     .mutable_opts = mutable_opts,
+
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
 };
 
 /***********************************************/
@@ -3703,6 +3837,53 @@ static BlockDriver bdrv_host_device = {
 #endif
 };
 
+static BlockDriver bdrv_zoned_host_device = {
+        .format_name = "zoned_host_device",
+        .protocol_name = "zoned_host_device",
+        .instance_size = sizeof(BDRVRawState),
+        .bdrv_needs_filename = true,
+        .bdrv_probe_device  = hdev_probe_device,
+        .bdrv_parse_filename = hdev_parse_filename,
+        .bdrv_file_open     = hdev_open,
+        .bdrv_close         = raw_close,
+        .bdrv_reopen_prepare = raw_reopen_prepare,
+        .bdrv_reopen_commit  = raw_reopen_commit,
+        .bdrv_reopen_abort   = raw_reopen_abort,
+        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+        .create_opts         = &bdrv_create_opts_simple,
+        .mutable_opts        = mutable_opts,
+        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+        .bdrv_co_preadv         = raw_co_preadv,
+        .bdrv_co_pwritev        = raw_co_pwritev,
+        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
+        .bdrv_co_pdiscard       = hdev_co_pdiscard,
+        .bdrv_co_copy_range_from = raw_co_copy_range_from,
+        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
+        .bdrv_refresh_limits = raw_refresh_limits,
+        .bdrv_io_plug = raw_aio_plug,
+        .bdrv_io_unplug = raw_aio_unplug,
+        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+        .bdrv_co_truncate       = raw_co_truncate,
+        .bdrv_getlength = raw_getlength,
+        .bdrv_get_info = raw_get_info,
+        .bdrv_get_allocated_file_size
+                            = raw_get_allocated_file_size,
+        .bdrv_get_specific_stats = hdev_get_specific_stats,
+        .bdrv_check_perm = raw_check_perm,
+        .bdrv_set_perm   = raw_set_perm,
+        .bdrv_abort_perm_update = raw_abort_perm_update,
+        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+        .bdrv_probe_geometry = hdev_probe_geometry,
+        .bdrv_co_ioctl = hdev_co_ioctl,
+
+        /* zone management operations */
+        .bdrv_co_zone_report = raw_co_zone_report,
+        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                  Error **errp)
@@ -3964,6 +4145,7 @@ static void bdrv_file_init(void)
 #if defined(HAVE_HOST_BLOCK_DEVICE)
     bdrv_register(&bdrv_host_device);
 #ifdef __linux__
+    bdrv_register(&bdrv_zoned_host_device);
     bdrv_register(&bdrv_host_cdrom);
 #endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/block/io.c b/block/io.c
index 789e6373d5..3e8bb83cc3 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3258,6 +3258,29 @@ out:
     return co.ret;
 }
 
+int bdrv_co_zone_report(BlockDriverState *bs,
+                        int64_t offset, int64_t len, int64_t *nr_zones,
+                        struct BlockZoneDescriptor *zones)
+{
+    int ret = 0;
+    if (!bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones)) {
+        return -ENOTSUP;
+    }
+
+    return ret;
+}
+
+int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
+        int64_t offset, int64_t len)
+{
+    int ret = 0;
+    if (!bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len)) {
+        return -ENOTSUP;
+    }
+
+    return ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..24c468d8ad 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -23,7 +23,7 @@
  */
 #ifndef BLOCK_COMMON_H
 #define BLOCK_COMMON_H
-
+#include <linux/blkzoned.h>
 #include "block/aio.h"
 #include "block/aio-wait.h"
 #include "qemu/iov.h"
@@ -48,6 +48,47 @@
 typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
+enum zone_type {
+    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
+    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
+    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,
+};
+
+enum zone_cond {
+    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
+    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
+    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
+    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
+    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
+    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
+    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
+    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
+};
+
+enum zone_op {
+    zone_open,
+    zone_close,
+    zone_finish,
+    zone_reset,
+};
+
+/*
+ * Zone descriptor data structure.
+ * Provide information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+    uint64_t start;
+    uint64_t length;
+    uint64_t cap;
+    uint64_t wp;
+    enum zone_type type;
+    enum zone_cond cond;
+} BlockZoneDescriptor;
+
+enum zone_model {
+    BLK_Z_HM,
+    BLK_Z_HA,
+};
 
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 62c84f0519..deb8902684 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
 /* Ensure contents are flushed to disk.  */
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
 
+/* Report zone information of zone block device. */
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                     int64_t len, int64_t *nr_zones,
+                                     struct BlockZoneDescriptor *zones);
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
+        int64_t offset, int64_t len);
+
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int bdrv_block_status(BlockDriverState *bs, int64_t offset,
@@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 int generated_co_wrapper
 bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
+int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
+                                         int64_t len, int64_t *nr_zones,
+                                         struct BlockZoneDescriptor *zones);
+int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len);
+
 /**
  * bdrv_parent_drained_begin_single:
  *
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..4d0180a7da 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -63,6 +63,7 @@
 #define BLOCK_OPT_EXTL2             "extended_l2"
 
 #define BLOCK_PROBE_BUF_SIZE        512
+#define zone_start_sector           512
 
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
@@ -94,6 +95,19 @@ typedef struct BdrvTrackedRequest {
     struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
+/**
+ * Zone device information data structure.
+ * Provide information on a device.
+ */
+typedef struct zbd_dev {
+    enum zone_model model;
+    uint32_t block_size;
+    uint32_t write_granularity;
+    uint32_t nr_zones;
+    struct BlockZoneDescriptor *zones; /* array of zones */
+    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
+    uint32_t max_nr_active_zones;
+} zbd_dev;
 
 struct BlockDriver {
     /*
@@ -691,6 +705,12 @@ struct BlockDriver {
                                           QEMUIOVector *qiov,
                                           int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, int64_t len, int64_t *nr_zones,
+            struct BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
     void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 2f0d8ac25a..51da6b7a89 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1706,6 +1706,119 @@ static const cmdinfo_t flush_cmd = {
     .oneline    = "flush all in-core file state to disk",
 };
 
+static int zone_report_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    printf("block report\n");
+    BlockZoneDescriptor *zones;
+    int64_t offset, len, nr_zones;
+    int i = 0;
+
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ++optind;
+    nr_zones = cvtnum(argv[optind]);
+    zones = malloc(sizeof(struct BlockZoneDescriptor) * nr_zones);
+    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
+    while (i < nr_zones) {
+        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
+                        "zcond:%u, [type: %u]\n",
+                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
+                zones[i].cond, zones[i].type);
+        ++i;
+    }
+    free(zones);
+    return ret;
+}
+
+static const cmdinfo_t zone_report_cmd = {
+        .name = "zone_report",
+        .altname = "f",
+        .cfunc = zone_report_f,
+        .argmin = 3,
+        .argmax = 3,
+        .args = "offset [offset..] len [len..] number [num..]",
+        .oneline = "report a number of bytes of zone information",
+};
+
+static int zone_open_f(BlockBackend *blk, int argc, char **argv)
+{
+    return blk_zone_mgmt(blk, zone_open, 0, 0);
+}
+
+static const cmdinfo_t zone_open_cmd = {
+        .name = "zone_open",
+        .altname = "f",
+        .cfunc = zone_open_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "explicit open a range of zones in zone block device",
+};
+
+static int zone_close_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_close, offset, len);
+}
+
+static const cmdinfo_t zone_close_cmd = {
+        .name = "zone_close",
+        .altname = "f",
+        .cfunc = zone_close_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "close a range of zones in zone block device",
+};
+
+static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_finish, offset, len);
+}
+
+static const cmdinfo_t zone_finish_cmd = {
+        .name = "zone_finish",
+        .altname = "f",
+        .cfunc = zone_finish_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "finish a range of zones in zone block device",
+};
+
+static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_reset, offset, len);
+}
+
+static const cmdinfo_t zone_reset_cmd = {
+        .name = "zone_reset",
+        .altname = "f",
+        .cfunc = zone_reset_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "reset a zone write pointer in zone block device",
+};
+
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2498,6 +2611,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&aio_write_cmd);
     qemuio_add_command(&aio_flush_cmd);
     qemuio_add_command(&flush_cmd);
+    qemuio_add_command(&zone_report_cmd);
+    qemuio_add_command(&zone_open_cmd);
+    qemuio_add_command(&zone_close_cmd);
+    qemuio_add_command(&zone_finish_cmd);
+    qemuio_add_command(&zone_reset_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
new file mode 100644
index 0000000000..042e47af27
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Test zone management operations.
+#
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1 # failure is the default!
+
+_cleanup()
+{
+  _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+# much of this could be generic for any format supporting compression.
+_supported_fmt qcow qcow2
+_supported_proto file
+_supported_os Linux
+
+TEST_OFFSETS="0"
+TEST_LENS="1000"
+IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0 -c"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo "Testing a null_blk device"
+echo
+echo "Simple cases: testing operations once at a time"
+echo
+echo "At beginning: report all of the zones"
+echo
+$QEMU_IO "$IMG zone_report"
+$QEMU_IO "$IMG zone_open"
+echo "After opening a zone:"
+$QEMU_IO "$IMG zone_report"
+$QEMU_IO "$IMG zone_close"
+echo "After closing a zone:"
+$QEMU_IO "$IMG zone_report"
+$QEMU_IO "$IMG zone_reset"
+echo "After resetting a zone:"
+$QEMU_IO "$IMG zone_report"
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
-- 
2.35.3
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Stefan Hajnoczi 1 year, 11 months ago
On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:

Hi Sam,
Is this version 2 of "[RFC v1] Add support for zoned device"? Please
keep the email subject line the same (except for "v2", "v3", etc) so
that it's clear which patch series this new version replaces.

> Fix some mistakes before. It can report a range of zones now.

This looks like the description of what changed compared to v1. Please
put the changelog below "---" in the future. When patch emails are
merged by git-am(1) it keeps the text above "---" and discards the text
below "---". The changelog is usually no longer useful once the patches
are merged, so it should be located below the "---" line.

The text above the "---" is the commit description (an explanation of
why this commit is necessary). In this case the commit description
should explain that this patch adds .bdrv_co_zone_report() and
.bdrv_co_zone_mgmt() to BlockDriver so that zoned block devices can be
supported.

> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c             |  22 ++++
>  block/coroutines.h                |   5 +
>  block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
>  block/io.c                        |  23 ++++
>  include/block/block-common.h      |  43 ++++++-
>  include/block/block-io.h          |  13 +++
>  include/block/block_int-common.h  |  20 ++++
>  qemu-io-cmds.c                    | 118 +++++++++++++++++++
>  tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
>  9 files changed, 477 insertions(+), 1 deletion(-)
>  create mode 100644 tests/qemu-iotests/tests/zoned.sh
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index e0e1aff4b1..20248e4a35 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
>      int ret;
>  } BlockBackendAIOCB;
>  
> +
> +

Please avoid whitespace changes in code that is otherwise untouched by
your patch. Code changes can cause merge conflicts and they make it
harder to use git-annotate(1), so only changes that are necessary should
be included in a patch.

>  static const AIOCBInfo block_backend_aiocb_info = {
>      .get_aio_context = blk_aiocb_get_aio_context,
>      .aiocb_size = sizeof(BlockBackendAIOCB),
> @@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
>      return ret;
>  }
>  

Please add a documentation comment for blk_co_zone_report() that
explains how to use the functions and the purpose of the arguments. For
example, does offset have to be the first byte in a zone or can it be
any byte offset? What are the alignment requirements of offset and len?
Why is nr_zones a pointer?

> +int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,

Functions that run in coroutine context must be labeled with
coroutine_fn:

    int coroutine_fn blk_co_zone_report(...)

This tells humans and tools that the function can only be called from a
coroutine. There is a blog post about coroutines in QEMU here:
https://blog.vmsplice.net/2014/01/coroutines-in-qemu-basics.html

> +                       int64_t *nr_zones,
> +                       struct BlockZoneDescriptor *zones)

QEMU coding style uses typedefs when defining structs, so "struct
BlockZoneDescriptor *zones" should be written as "BlockZoneDescriptor
*zones".

> +{
> +    int ret;

This function is called from the I/O code path, please mark it with:

  IO_CODE();

From include/block/block-io.h:

  * I/O API functions. These functions are thread-safe, and therefore
  * can run in any thread as long as the thread has called
  * aio_context_acquire/release().
  *
  * These functions can only call functions from I/O and Common categories,
  * but can be invoked by GS, "I/O or GS" and I/O APIs.
  *
  * All functions in this category must use the macro
  * IO_CODE();
  * to catch when they are accidentally called by the wrong API.

> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);

Please add blk_inc_in_flight(blk) and blk_dec_in_flight(blk) around this
function call to ensure that zone report requests finish before I/O is
drained (see bdrv_drained_begin()). This is necessary so that it's
possible to wait for I/O requests, including zone report, to complete.

Similar to blk_co_do_preadv() we need blk_wait_while_drained(blk),
blk_check_byte_request(), and bdrv_inc_in_flight(bs) before calling
bdrv_co_zone_report(). bdrv_dec_in_flight(bs) needs to be called after
bdrv_co_zone_report() returns.

> +    return ret;
> +}
> +
> +int blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> +
> +    return ret;
> +}

The same applies to this function.

> +
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> @@ -2634,3 +2655,4 @@ int blk_make_empty(BlockBackend *blk, Error **errp)
>  
>      return bdrv_make_empty(blk->root, errp);
>  }
> +

Unrelated whitespace change.

> diff --git a/block/coroutines.h b/block/coroutines.h
> index 830ecaa733..247326255f 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -80,6 +80,11 @@ int coroutine_fn
>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>  
>  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t len, int64_t *nr_zones,
> +                                    struct BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len);
>  
>  
>  /*
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..71fd21f8ba 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -178,6 +178,137 @@ typedef struct BDRVRawReopenState {
>      bool check_cache_dropped;
>  } BDRVRawReopenState;
>  
> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +        struct blk_zone *blkz) {
> +    zone->start = blkz->start << BDRV_SECTOR_BITS;
> +    zone->length = blkz->len << BDRV_SECTOR_BITS;
> +    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
> +    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
> +    zone->type = blkz->type;
> +    zone->cond = blkz->type;

Should this be "zone->cond = blkz->cond"?

> +}
> +
> +/**
> + * zone report - Get a zone block device's information in the
> + * form of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: Space to hold zone information on reply
> + * @param offset: the location in the zone block device
> + * @return 0 on success, -1 on failure
> + */
> +static int raw_co_zone_report(BlockDriverState *bs, int64_t offset, int64_t len,

coroutine_fn

> +                              int64_t *nr_zones,
> +                              struct BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    struct blk_zone_report *rep;
> +    struct BlockZoneDescriptor bzd;
> +    struct blk_zone *blkz;
> +    int64_t zone_size_mask, end, rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    printf("%s\n", "start to report zone");

This looks like debug output. QEMU has a tracing system that you can
use. See docs/devel/tracing.rst.

> +    nrz = *nr_zones;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    rep = (struct blk_zone_report *)malloc(rep_size);

Please use g_autofree and g_new(). QEMU avoids direct use of malloc(3)/free(3).

> +    if (!rep) {
> +        return -1;
> +    }
> +
> +    zone_size_mask = zone_start_sector - 1;
> +    /* align up */
> +    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
> +            >> BDRV_SECTOR_BITS;

More readable:

  end = DIV_ROUND_UP(offset + len, BDRV_SECTOR_SIZE);

> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (offset < end) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;
> +
> +        ret = ioctl(s->fd, BLKREPORTZONE, rep);

Can this ioctl() block? It seems likely. If yes, then the call needs to
be made from the thread pool to avoid blocking the current thread. See
raw_thread_pool_submit().

> +        if (ret != 0) {
> +            ret = -errno;
> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> +                         s->fd, offset, errno);
> +            free(rep);
> +            return ret;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++) {
> +            parse_zone(&bzd, &blkz[i]);
> +            if (zones) {
> +                memcpy(&zones[n], &bzd, sizeof(bzd));

n is never incremented so this always overwrites the first element.

> +            }
> +        }
> +
> +        offset = blkz[i].start + blkz[i].len;
> +    }
> +

Does this function need to update *nr_zones = n before returning? How does
the caller know how many zones were returned?

> +    return ret;
> +}
> +
> +/**
> + * zone management operations - Execute an operation on a zone
> + */
> +static int raw_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> +        int64_t offset, int64_t len) {
> +    BDRVRawState *s = bs->opaque;
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    uint64_t ioctl_op;

ioctl()'s second argument is unsigned long request. Please use unsigned
long to keep the types consistent.

> +    int64_t zone_size_mask, end;
> +    int ret;
> +
> +    zone_size_mask = zone_start_sector - 1;
> +    /* align up */
> +    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
> +            >> BDRV_SECTOR_BITS;
> +    offset = (offset & (~zone_size_mask)) >> BDRV_SECTOR_BITS; /* align down */
> +
> +    switch (op) {
> +    case zone_open:
> +        ioctl_name = "BLKOPENZONE";
> +        ioctl_op = BLKOPENZONE;
> +        break;
> +    case zone_close:
> +        ioctl_name = "BLKCLOSEZONE";
> +        ioctl_op = BLKCLOSEZONE;
> +        break;
> +    case zone_finish:
> +        ioctl_name = "BLKFINISHZONE";
> +        ioctl_op = BLKFINISHZONE;
> +        break;
> +    case zone_reset:
> +        ioctl_name = "BLKRESETZONE";
> +        ioctl_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        errno = -EINVAL;
> +        return -1;
> +    }
> +
> +    /* Execute the operation */
> +    range.sector = offset;
> +    range.nr_sectors = end - offset;
> +    ret = ioctl(s->fd, ioctl_op, &range);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d",
> +                         ioctl_name, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  static int fd_open(BlockDriverState *bs)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -3324,6 +3455,9 @@ BlockDriver bdrv_file = {
>      .bdrv_abort_perm_update = raw_abort_perm_update,
>      .create_opts = &raw_create_opts,
>      .mutable_opts = mutable_opts,
> +
> +    .bdrv_co_zone_report = raw_co_zone_report,
> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>  };
>  
>  /***********************************************/
> @@ -3703,6 +3837,53 @@ static BlockDriver bdrv_host_device = {
>  #endif
>  };
>  
> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",
> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_parse_filename = hdev_parse_filename,
> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};
> +
>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>  static void cdrom_parse_filename(const char *filename, QDict *options,
>                                   Error **errp)
> @@ -3964,6 +4145,7 @@ static void bdrv_file_init(void)
>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>      bdrv_register(&bdrv_host_device);
>  #ifdef __linux__
> +    bdrv_register(&bdrv_zoned_host_device);
>      bdrv_register(&bdrv_host_cdrom);
>  #endif
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> diff --git a/block/io.c b/block/io.c
> index 789e6373d5..3e8bb83cc3 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3258,6 +3258,29 @@ out:
>      return co.ret;
>  }
>  
> +int bdrv_co_zone_report(BlockDriverState *bs,
> +                        int64_t offset, int64_t len, int64_t *nr_zones,
> +                        struct BlockZoneDescriptor *zones)
> +{
> +    int ret = 0;
> +    if (!bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones)) {
> +        return -ENOTSUP;
> +    }

This returns -ENOTSUP any time ->bdrv_co_zone_report() returns 0. Should
this be:

  if (!bs->drv->bdrv_co_zone_report) {
      return -ENOTSUP;
  }

  return bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones);

?

> +
> +    return ret;
> +}
> +
> +int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret = 0;
> +    if (!bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len)) {
> +        return -ENOTSUP;
> +    }
> +
> +    return ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..24c468d8ad 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -23,7 +23,7 @@
>   */
>  #ifndef BLOCK_COMMON_H
>  #define BLOCK_COMMON_H
> -
> +#include <linux/blkzoned.h>

Linux-specific code must be #ifdef __linux__ to avoid compilation errors
on non-Linux hosts.

>  #include "block/aio.h"
>  #include "block/aio-wait.h"
>  #include "qemu/iov.h"
> @@ -48,6 +48,47 @@
>  typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
> +enum zone_type {

Please use "typedef enum { ... } BdrvZoneType" so that enums follow
QEMU's coding style.

> +    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
> +    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
> +    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,
> +};
> +
> +enum zone_cond {
> +    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
> +    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
> +    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
> +    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
> +    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
> +    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
> +    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
> +    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
> +};

This 1:1 correspondence with Linux constants could make the code a
little harder to port.

Maybe QEMU's block layer should define its own numeric constants so the
code doesn't rely on operating system-specific headers.
block/file-posix.c #ifdef __linux__ code can be responsible for
converting Linux-specific constants to QEMU constants (and the 1:1
mapping can be used there).

> +
> +enum zone_op {
> +    zone_open,
> +    zone_close,
> +    zone_finish,
> +    zone_reset,
> +};
> +
> +/*
> + * Zone descriptor data structure.
> + * Provide information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    enum zone_type type;
> +    enum zone_cond cond;
> +} BlockZoneDescriptor;
> +
> +enum zone_model {
> +    BLK_Z_HM,
> +    BLK_Z_HA,
> +};
>  
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index 62c84f0519..deb8902684 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>  /* Ensure contents are flushed to disk.  */
>  int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>  
> +/* Report zone information of zone block device. */
> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                     int64_t len, int64_t *nr_zones,
> +                                     struct BlockZoneDescriptor *zones);
> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> +        int64_t offset, int64_t len);
> +
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>  int bdrv_block_status(BlockDriverState *bs, int64_t offset,
> @@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>  int generated_co_wrapper
>  bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>  
> +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
> +                                         int64_t len, int64_t *nr_zones,
> +                                         struct BlockZoneDescriptor *zones);
> +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len);
> +
>  /**
>   * bdrv_parent_drained_begin_single:
>   *
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 8947abab76..4d0180a7da 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -63,6 +63,7 @@
>  #define BLOCK_OPT_EXTL2             "extended_l2"
>  
>  #define BLOCK_PROBE_BUF_SIZE        512
> +#define zone_start_sector           512
>  
>  enum BdrvTrackedRequestType {
>      BDRV_TRACKED_READ,
> @@ -94,6 +95,19 @@ typedef struct BdrvTrackedRequest {
>      struct BdrvTrackedRequest *waiting_for;
>  } BdrvTrackedRequest;
>  
> +/**
> + * Zone device information data structure.
> + * Provide information on a device.
> + */
> +typedef struct zbd_dev {
> +    enum zone_model model;
> +    uint32_t block_size;

How does this relate to QEMU's BlockLimits?

> +    uint32_t write_granularity;

Same. Maybe this belongs in BlockLimits?

> +    uint32_t nr_zones;

Should this really be limited to 32-bit? For example, take 256 MB zones,
then the max nr_zones * 256 MB is much smaller than a uint64_t capacity
value. It seems safer to make this 64-bit, but maybe Dmitry, Hannes, or
Damien can tell us what to do here.

> +    struct BlockZoneDescriptor *zones; /* array of zones */

When is this used?
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Sam Li 1 year, 10 months ago
Hi Stefan,

Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月20日周一 15:55写道:
>
> On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:
>
> Hi Sam,
> Is this version 2 of "[RFC v1] Add support for zoned device"? Please
> keep the email subject line the same (except for "v2", "v3", etc) so
> that it's clear which patch series this new version replaces.
>
> > Fix some mistakes before. It can report a range of zones now.
>
> This looks like the description of what changed compared to v1. Please
> put the changelog below "---" in the future. When patch emails are
> merged by git-am(1) it keeps the text above "---" and discards the text
> below "---". The changelog is usually no longer useful once the patches
> are merged, so it should be located below the "---" line.
>
> The text above the "---" is the commit description (an explanation of
> why this commit is necessary). In this case the commit description
> should explain that this patch adds .bdrv_co_zone_report() and
> .bdrv_co_zone_mgmt() to BlockDriver so that zoned block devices can be
> supported.
>
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/block-backend.c             |  22 ++++
> >  block/coroutines.h                |   5 +
> >  block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
> >  block/io.c                        |  23 ++++
> >  include/block/block-common.h      |  43 ++++++-
> >  include/block/block-io.h          |  13 +++
> >  include/block/block_int-common.h  |  20 ++++
> >  qemu-io-cmds.c                    | 118 +++++++++++++++++++
> >  tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
> >  9 files changed, 477 insertions(+), 1 deletion(-)
> >  create mode 100644 tests/qemu-iotests/tests/zoned.sh
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index e0e1aff4b1..20248e4a35 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
> >      int ret;
> >  } BlockBackendAIOCB;
> >
> > +
> > +
>
> Please avoid whitespace changes in code that is otherwise untouched by
> your patch. Code changes can cause merge conflicts and they make it
> harder to use git-annotate(1), so only changes that are necessary should
> be included in a patch.
>
> >  static const AIOCBInfo block_backend_aiocb_info = {
> >      .get_aio_context = blk_aiocb_get_aio_context,
> >      .aiocb_size = sizeof(BlockBackendAIOCB),
> > @@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
> >      return ret;
> >  }
> >
>
> Please add a documentation comment for blk_co_zone_report() that
> explains how to use the functions and the purpose of the arguments. For
> example, does offset have to be the first byte in a zone or can it be
> any byte offset? What are the alignment requirements of offset and len?
> Why is nr_zones a pointer?
>
> > +int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,
>
> Functions that run in coroutine context must be labeled with
> coroutine_fn:
>
>     int coroutine_fn blk_co_zone_report(...)
>
> This tells humans and tools that the function can only be called from a
> coroutine. There is a blog post about coroutines in QEMU here:
> https://blog.vmsplice.net/2014/01/coroutines-in-qemu-basics.html
>
> > +                       int64_t *nr_zones,
> > +                       struct BlockZoneDescriptor *zones)
>
> QEMU coding style uses typedefs when defining structs, so "struct
> BlockZoneDescriptor *zones" should be written as "BlockZoneDescriptor
> *zones".
>
> > +{
> > +    int ret;
>
> This function is called from the I/O code path, please mark it with:
>
>   IO_CODE();
>
> From include/block/block-io.h:
>
>   * I/O API functions. These functions are thread-safe, and therefore
>   * can run in any thread as long as the thread has called
>   * aio_context_acquire/release().
>   *
>   * These functions can only call functions from I/O and Common categories,
>   * but can be invoked by GS, "I/O or GS" and I/O APIs.
>   *
>   * All functions in this category must use the macro
>   * IO_CODE();
>   * to catch when they are accidentally called by the wrong API.
>
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);
>
> Please add blk_inc_in_flight(blk) and blk_dec_in_flight(blk) around this
> function call to ensure that zone report requests finish before I/O is
> drained (see bdrv_drained_begin()). This is necessary so that it's
> possible to wait for I/O requests, including zone report, to complete.
>
> Similar to blk_co_do_preadv() we need blk_wait_while_drained(blk),
> blk_check_byte_request(), and bdrv_inc_in_flight(bs) before calling
> bdrv_co_zone_report(). bdrv_dec_in_flight(bs) needs to be called after
> bdrv_co_zone_report() returns.
>
After adding similar structure to blk_co_do_preadv(), zone operation
command will always fail at blk_wait_while_drained(blk) because
blk->inflight <= 0. Would it be ok to just remove
blk_wait_while_drained?

> > +    return ret;
> > +}
> > +
> > +int blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +
> > +    return ret;
> > +}
>
> The same applies to this function.
>
> > +
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > @@ -2634,3 +2655,4 @@ int blk_make_empty(BlockBackend *blk, Error **errp)
> >
> >      return bdrv_make_empty(blk->root, errp);
> >  }
> > +
>
> Unrelated whitespace change.
>
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..247326255f 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t len, int64_t *nr_zones,
> > +                                    struct BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len);
> >
> >
> >  /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..71fd21f8ba 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -178,6 +178,137 @@ typedef struct BDRVRawReopenState {
> >      bool check_cache_dropped;
> >  } BDRVRawReopenState;
> >
> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +        struct blk_zone *blkz) {
> > +    zone->start = blkz->start << BDRV_SECTOR_BITS;
> > +    zone->length = blkz->len << BDRV_SECTOR_BITS;
> > +    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
> > +    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
> > +    zone->type = blkz->type;
> > +    zone->cond = blkz->type;
>
> Should this be "zone->cond = blkz->cond"?
>
> > +}
> > +
> > +/**
> > + * zone report - Get a zone block device's information in the
> > + * form of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: Space to hold zone information on reply
> > + * @param offset: the location in the zone block device
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int raw_co_zone_report(BlockDriverState *bs, int64_t offset, int64_t len,
>
> coroutine_fn
>
> > +                              int64_t *nr_zones,
> > +                              struct BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    struct blk_zone_report *rep;
> > +    struct BlockZoneDescriptor bzd;
> > +    struct blk_zone *blkz;
> > +    int64_t zone_size_mask, end, rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    printf("%s\n", "start to report zone");
>
> This looks like debug output. QEMU has a tracing system that you can
> use. See docs/devel/tracing.rst.
>
> > +    nrz = *nr_zones;
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    rep = (struct blk_zone_report *)malloc(rep_size);
>
> Please use g_autofree and g_new(). QEMU avoids direct use of malloc(3)/free(3).
>
> > +    if (!rep) {
> > +        return -1;
> > +    }
> > +
> > +    zone_size_mask = zone_start_sector - 1;
> > +    /* align up */
> > +    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
> > +            >> BDRV_SECTOR_BITS;
>
> More readable:
>
>   end = DIV_ROUND_UP(offset + len, BDRV_SECTOR_SIZE);
>
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (offset < end) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
> > +
> > +        ret = ioctl(s->fd, BLKREPORTZONE, rep);
>
> Can this ioctl() block? It seems likely. If yes, then the call needs to
> be made from the thread pool to avoid blocking the current thread. See
> raw_thread_pool_submit().
>
> > +        if (ret != 0) {
> > +            ret = -errno;
> > +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > +                         s->fd, offset, errno);
> > +            free(rep);
> > +            return ret;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++) {
> > +            parse_zone(&bzd, &blkz[i]);
> > +            if (zones) {
> > +                memcpy(&zones[n], &bzd, sizeof(bzd));
>
> n is never incremented so this always overwrites the first element.
>
> > +            }
> > +        }
> > +
> > +        offset = blkz[i].start + blkz[i].len;
> > +    }
> > +
>
> Does this function need to update *nr_zones = n before returning? How does
> the caller know how many zones were returned?
>
> > +    return ret;
> > +}
> > +
> > +/**
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int raw_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> > +        int64_t offset, int64_t len) {
> > +    BDRVRawState *s = bs->opaque;
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    uint64_t ioctl_op;
>
> ioctl()'s second argument is unsigned long request. Please use unsigned
> long to keep the types consistent.
>
> > +    int64_t zone_size_mask, end;
> > +    int ret;
> > +
> > +    zone_size_mask = zone_start_sector - 1;
> > +    /* align up */
> > +    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
> > +            >> BDRV_SECTOR_BITS;
> > +    offset = (offset & (~zone_size_mask)) >> BDRV_SECTOR_BITS; /* align down */
> > +
> > +    switch (op) {
> > +    case zone_open:
> > +        ioctl_name = "BLKOPENZONE";
> > +        ioctl_op = BLKOPENZONE;
> > +        break;
> > +    case zone_close:
> > +        ioctl_name = "BLKCLOSEZONE";
> > +        ioctl_op = BLKCLOSEZONE;
> > +        break;
> > +    case zone_finish:
> > +        ioctl_name = "BLKFINISHZONE";
> > +        ioctl_op = BLKFINISHZONE;
> > +        break;
> > +    case zone_reset:
> > +        ioctl_name = "BLKRESETZONE";
> > +        ioctl_op = BLKRESETZONE;
> > +        break;
> > +    default:
> > +        error_report("Invalid zone operation 0x%x", op);
> > +        errno = -EINVAL;
> > +        return -1;
> > +    }
> > +
> > +    /* Execute the operation */
> > +    range.sector = offset;
> > +    range.nr_sectors = end - offset;
> > +    ret = ioctl(s->fd, ioctl_op, &range);
> > +    if (ret != 0) {
> > +        error_report("ioctl %s failed %d",
> > +                         ioctl_name, errno);
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int fd_open(BlockDriverState *bs)
> >  {
> >      BDRVRawState *s = bs->opaque;
> > @@ -3324,6 +3455,9 @@ BlockDriver bdrv_file = {
> >      .bdrv_abort_perm_update = raw_abort_perm_update,
> >      .create_opts = &raw_create_opts,
> >      .mutable_opts = mutable_opts,
> > +
> > +    .bdrv_co_zone_report = raw_co_zone_report,
> > +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >  };
> >
> >  /***********************************************/
> > @@ -3703,6 +3837,53 @@ static BlockDriver bdrv_host_device = {
> >  #endif
> >  };
> >
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_parse_filename = hdev_parse_filename,
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
> > +
> >  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> >  static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                   Error **errp)
> > @@ -3964,6 +4145,7 @@ static void bdrv_file_init(void)
> >  #if defined(HAVE_HOST_BLOCK_DEVICE)
> >      bdrv_register(&bdrv_host_device);
> >  #ifdef __linux__
> > +    bdrv_register(&bdrv_zoned_host_device);
> >      bdrv_register(&bdrv_host_cdrom);
> >  #endif
> >  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/block/io.c b/block/io.c
> > index 789e6373d5..3e8bb83cc3 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3258,6 +3258,29 @@ out:
> >      return co.ret;
> >  }
> >
> > +int bdrv_co_zone_report(BlockDriverState *bs,
> > +                        int64_t offset, int64_t len, int64_t *nr_zones,
> > +                        struct BlockZoneDescriptor *zones)
> > +{
> > +    int ret = 0;
> > +    if (!bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones)) {
> > +        return -ENOTSUP;
> > +    }
>
> This returns -ENOTSUP any time ->bdrv_co_zone_report() returns 0. Should
> this be:
>
>   if (!bs->drv->bdrv_co_zone_report) {
>       return -ENOTSUP;
>   }
>
>   return bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones);
>
> ?
>
> > +
> > +    return ret;
> > +}
> > +
> > +int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret = 0;
> > +    if (!bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len)) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> >  void *qemu_blockalign(BlockDriverState *bs, size_t size)
> >  {
> >      IO_CODE();
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..24c468d8ad 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,7 @@
> >   */
> >  #ifndef BLOCK_COMMON_H
> >  #define BLOCK_COMMON_H
> > -
> > +#include <linux/blkzoned.h>
>
> Linux-specific code must be #ifdef __linux__ to avoid compilation errors
> on non-Linux hosts.
>
> >  #include "block/aio.h"
> >  #include "block/aio-wait.h"
> >  #include "qemu/iov.h"
> > @@ -48,6 +48,47 @@
> >  typedef struct BlockDriver BlockDriver;
> >  typedef struct BdrvChild BdrvChild;
> >  typedef struct BdrvChildClass BdrvChildClass;
> > +enum zone_type {
>
> Please use "typedef enum { ... } BdrvZoneType" so that enums follow
> QEMU's coding style.
>
> > +    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
> > +    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
> > +    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,
> > +};
> > +
> > +enum zone_cond {
> > +    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
> > +    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
> > +    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
> > +    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
> > +    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
> > +    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
> > +    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
> > +    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
> > +};
>
> This 1:1 correspondence with Linux constants could make the code a
> little harder to port.
>
> Maybe QEMU's block layer should define its own numeric constants so the
> code doesn't rely on operating system-specific headers.
> block/file-posix.c #ifdef __linux__ code can be responsible for
> converting Linux-specific constants to QEMU constants (and the 1:1
> mapping can be used there).
>
Can we define those constants in block-common.h? Because
BlockZoneDescriptor requires zone_condition, zone_type defined and
BlockZoneDesicriptor are used in header files and qemu-io
sub-commands. If we use #ifdef __linux__ in block-common.h, it can be
responsible for converting Linux constants instead.

Thanks for reviewing! If there is any problem, please let me know.

Best regards,
Sam

> > +
> > +enum zone_op {
> > +    zone_open,
> > +    zone_close,
> > +    zone_finish,
> > +    zone_reset,
> > +};
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > +    uint64_t start;
> > +    uint64_t length;
> > +    uint64_t cap;
> > +    uint64_t wp;
> > +    enum zone_type type;
> > +    enum zone_cond cond;
> > +} BlockZoneDescriptor;
> > +
> > +enum zone_model {
> > +    BLK_Z_HM,
> > +    BLK_Z_HA,
> > +};
> >
> >  typedef struct BlockDriverInfo {
> >      /* in bytes, 0 if irrelevant */
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index 62c84f0519..deb8902684 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
> >  /* Ensure contents are flushed to disk.  */
> >  int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
> >
> > +/* Report zone information of zone block device. */
> > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +                                     int64_t len, int64_t *nr_zones,
> > +                                     struct BlockZoneDescriptor *zones);
> > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> > +        int64_t offset, int64_t len);
> > +
> >  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> >  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> >  int bdrv_block_status(BlockDriverState *bs, int64_t offset,
> > @@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
> >  int generated_co_wrapper
> >  bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
> >
> > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
> > +                                         int64_t len, int64_t *nr_zones,
> > +                                         struct BlockZoneDescriptor *zones);
> > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len);
> > +
> >  /**
> >   * bdrv_parent_drained_begin_single:
> >   *
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 8947abab76..4d0180a7da 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -63,6 +63,7 @@
> >  #define BLOCK_OPT_EXTL2             "extended_l2"
> >
> >  #define BLOCK_PROBE_BUF_SIZE        512
> > +#define zone_start_sector           512
> >
> >  enum BdrvTrackedRequestType {
> >      BDRV_TRACKED_READ,
> > @@ -94,6 +95,19 @@ typedef struct BdrvTrackedRequest {
> >      struct BdrvTrackedRequest *waiting_for;
> >  } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > +    enum zone_model model;
> > +    uint32_t block_size;
>
> How does this relate to QEMU's BlockLimits?
>
> > +    uint32_t write_granularity;
>
> Same. Maybe this belongs in BlockLimits?
>
> > +    uint32_t nr_zones;
>
> Should this really be limited to 32-bit? For example, take 256 MB zones,
> then the max nr_zones * 256 MB is much smaller than a uint64_t capacity
> value. It seems safer to make this 64-bit, but maybe Dmitry, Hannes, or
> Damien can tell us what to do here.
>
> > +    struct BlockZoneDescriptor *zones; /* array of zones */
>
> When is this used?
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Stefan Hajnoczi 1 year, 10 months ago
On Fri, Jun 24, 2022 at 11:14:32AM +0800, Sam Li wrote:
> Hi Stefan,
> 
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月20日周一 15:55写道:
> >
> > On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:
> >
> > Hi Sam,
> > Is this version 2 of "[RFC v1] Add support for zoned device"? Please
> > keep the email subject line the same (except for "v2", "v3", etc) so
> > that it's clear which patch series this new version replaces.
> >
> > > Fix some mistakes before. It can report a range of zones now.
> >
> > This looks like the description of what changed compared to v1. Please
> > put the changelog below "---" in the future. When patch emails are
> > merged by git-am(1) it keeps the text above "---" and discards the text
> > below "---". The changelog is usually no longer useful once the patches
> > are merged, so it should be located below the "---" line.
> >
> > The text above the "---" is the commit description (an explanation of
> > why this commit is necessary). In this case the commit description
> > should explain that this patch adds .bdrv_co_zone_report() and
> > .bdrv_co_zone_mgmt() to BlockDriver so that zoned block devices can be
> > supported.
> >
> > >
> > > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > > ---
> > >  block/block-backend.c             |  22 ++++
> > >  block/coroutines.h                |   5 +
> > >  block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
> > >  block/io.c                        |  23 ++++
> > >  include/block/block-common.h      |  43 ++++++-
> > >  include/block/block-io.h          |  13 +++
> > >  include/block/block_int-common.h  |  20 ++++
> > >  qemu-io-cmds.c                    | 118 +++++++++++++++++++
> > >  tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
> > >  9 files changed, 477 insertions(+), 1 deletion(-)
> > >  create mode 100644 tests/qemu-iotests/tests/zoned.sh
> > >
> > > diff --git a/block/block-backend.c b/block/block-backend.c
> > > index e0e1aff4b1..20248e4a35 100644
> > > --- a/block/block-backend.c
> > > +++ b/block/block-backend.c
> > > @@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
> > >      int ret;
> > >  } BlockBackendAIOCB;
> > >
> > > +
> > > +
> >
> > Please avoid whitespace changes in code that is otherwise untouched by
> > your patch. Code changes can cause merge conflicts and they make it
> > harder to use git-annotate(1), so only changes that are necessary should
> > be included in a patch.
> >
> > >  static const AIOCBInfo block_backend_aiocb_info = {
> > >      .get_aio_context = blk_aiocb_get_aio_context,
> > >      .aiocb_size = sizeof(BlockBackendAIOCB),
> > > @@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
> > >      return ret;
> > >  }
> > >
> >
> > Please add a documentation comment for blk_co_zone_report() that
> > explains how to use the functions and the purpose of the arguments. For
> > example, does offset have to be the first byte in a zone or can it be
> > any byte offset? What are the alignment requirements of offset and len?
> > Why is nr_zones a pointer?
> >
> > > +int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,
> >
> > Functions that run in coroutine context must be labeled with
> > coroutine_fn:
> >
> >     int coroutine_fn blk_co_zone_report(...)
> >
> > This tells humans and tools that the function can only be called from a
> > coroutine. There is a blog post about coroutines in QEMU here:
> > https://blog.vmsplice.net/2014/01/coroutines-in-qemu-basics.html
> >
> > > +                       int64_t *nr_zones,
> > > +                       struct BlockZoneDescriptor *zones)
> >
> > QEMU coding style uses typedefs when defining structs, so "struct
> > BlockZoneDescriptor *zones" should be written as "BlockZoneDescriptor
> > *zones".
> >
> > > +{
> > > +    int ret;
> >
> > This function is called from the I/O code path, please mark it with:
> >
> >   IO_CODE();
> >
> > From include/block/block-io.h:
> >
> >   * I/O API functions. These functions are thread-safe, and therefore
> >   * can run in any thread as long as the thread has called
> >   * aio_context_acquire/release().
> >   *
> >   * These functions can only call functions from I/O and Common categories,
> >   * but can be invoked by GS, "I/O or GS" and I/O APIs.
> >   *
> >   * All functions in this category must use the macro
> >   * IO_CODE();
> >   * to catch when they are accidentally called by the wrong API.
> >
> > > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);
> >
> > Please add blk_inc_in_flight(blk) and blk_dec_in_flight(blk) around this
> > function call to ensure that zone report requests finish before I/O is
> > drained (see bdrv_drained_begin()). This is necessary so that it's
> > possible to wait for I/O requests, including zone report, to complete.
> >
> > Similar to blk_co_do_preadv() we need blk_wait_while_drained(blk),
> > blk_check_byte_request(), and bdrv_inc_in_flight(bs) before calling
> > bdrv_co_zone_report(). bdrv_dec_in_flight(bs) needs to be called after
> > bdrv_co_zone_report() returns.
> >
> After adding similar structure to blk_co_do_preadv(), zone operation
> command will always fail at blk_wait_while_drained(blk) because
> blk->inflight <= 0. Would it be ok to just remove
> blk_wait_while_drained?

Are you hitting the assertion in
block/block-backend.c:blk_wait_while_drained()?

  assert(blk->in_flight > 0);

If yes, then there is a bug in the code. You need to make sure that
blk_inc_in_flight() is called before blk_wait_while_drained().

> > > +    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
> > > +    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
> > > +    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,
> > > +};
> > > +
> > > +enum zone_cond {
> > > +    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
> > > +    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
> > > +    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
> > > +    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
> > > +    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
> > > +    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
> > > +    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
> > > +    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
> > > +};
> >
> > This 1:1 correspondence with Linux constants could make the code a
> > little harder to port.
> >
> > Maybe QEMU's block layer should define its own numeric constants so the
> > code doesn't rely on operating system-specific headers.
> > block/file-posix.c #ifdef __linux__ code can be responsible for
> > converting Linux-specific constants to QEMU constants (and the 1:1
> > mapping can be used there).
> >
> Can we define those constants in block-common.h? Because
> BlockZoneDescriptor requires zone_condition, zone_type defined and
> BlockZoneDesicriptor are used in header files and qemu-io
> sub-commands. If we use #ifdef __linux__ in block-common.h, it can be
> responsible for converting Linux constants instead.
> 
> Thanks for reviewing! If there is any problem, please let me know.

I suggest defining the constants in block-common.h. #ifdef __linux__ is
not necessary in block-common.h because the constants should just be an
enum with BLK_ZS_NOT_WP = 0 and so on (no need for Linux headers).

In block/file-posix.c you can define a helper function inside #ifdef
__linux__ that does something like:

  BlkZoneCond zone_cond_from_linux(enum blk_zone_cond val)
  {
      switch (val) {
      case BLK_ZONE_COND_NOT_WP:
          return BLK_ZS_NOT_WP;
      ...
  }

The code in block/file-posix.c should call this helper to convert from
Linux values to QEMU values.

This way the QEMU block layer does not use Linux constants and compiles
on non-Linux machines.

Stefan
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Damien Le Moal 1 year, 10 months ago
On 6/25/22 00:49, Stefan Hajnoczi wrote:
> On Fri, Jun 24, 2022 at 11:14:32AM +0800, Sam Li wrote:
>> Hi Stefan,
>>
>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月20日周一 15:55写道:
>>>
>>> On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:
>>>
>>> Hi Sam,
>>> Is this version 2 of "[RFC v1] Add support for zoned device"? Please
>>> keep the email subject line the same (except for "v2", "v3", etc) so
>>> that it's clear which patch series this new version replaces.
>>>
>>>> Fix some mistakes before. It can report a range of zones now.
>>>
>>> This looks like the description of what changed compared to v1. Please
>>> put the changelog below "---" in the future. When patch emails are
>>> merged by git-am(1) it keeps the text above "---" and discards the text
>>> below "---". The changelog is usually no longer useful once the patches
>>> are merged, so it should be located below the "---" line.
>>>
>>> The text above the "---" is the commit description (an explanation of
>>> why this commit is necessary). In this case the commit description
>>> should explain that this patch adds .bdrv_co_zone_report() and
>>> .bdrv_co_zone_mgmt() to BlockDriver so that zoned block devices can be
>>> supported.
>>>
>>>>
>>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>>> ---
>>>>  block/block-backend.c             |  22 ++++
>>>>  block/coroutines.h                |   5 +
>>>>  block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
>>>>  block/io.c                        |  23 ++++
>>>>  include/block/block-common.h      |  43 ++++++-
>>>>  include/block/block-io.h          |  13 +++
>>>>  include/block/block_int-common.h  |  20 ++++
>>>>  qemu-io-cmds.c                    | 118 +++++++++++++++++++
>>>>  tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
>>>>  9 files changed, 477 insertions(+), 1 deletion(-)
>>>>  create mode 100644 tests/qemu-iotests/tests/zoned.sh
>>>>
>>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>>> index e0e1aff4b1..20248e4a35 100644
>>>> --- a/block/block-backend.c
>>>> +++ b/block/block-backend.c
>>>> @@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
>>>>      int ret;
>>>>  } BlockBackendAIOCB;
>>>>
>>>> +
>>>> +
>>>
>>> Please avoid whitespace changes in code that is otherwise untouched by
>>> your patch. Code changes can cause merge conflicts and they make it
>>> harder to use git-annotate(1), so only changes that are necessary should
>>> be included in a patch.
>>>
>>>>  static const AIOCBInfo block_backend_aiocb_info = {
>>>>      .get_aio_context = blk_aiocb_get_aio_context,
>>>>      .aiocb_size = sizeof(BlockBackendAIOCB),
>>>> @@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
>>>>      return ret;
>>>>  }
>>>>
>>>
>>> Please add a documentation comment for blk_co_zone_report() that
>>> explains how to use the functions and the purpose of the arguments. For
>>> example, does offset have to be the first byte in a zone or can it be
>>> any byte offset? What are the alignment requirements of offset and len?
>>> Why is nr_zones a pointer?
>>>
>>>> +int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,
>>>
>>> Functions that run in coroutine context must be labeled with
>>> coroutine_fn:
>>>
>>>     int coroutine_fn blk_co_zone_report(...)
>>>
>>> This tells humans and tools that the function can only be called from a
>>> coroutine. There is a blog post about coroutines in QEMU here:
>>> https://blog.vmsplice.net/2014/01/coroutines-in-qemu-basics.html
>>>
>>>> +                       int64_t *nr_zones,
>>>> +                       struct BlockZoneDescriptor *zones)
>>>
>>> QEMU coding style uses typedefs when defining structs, so "struct
>>> BlockZoneDescriptor *zones" should be written as "BlockZoneDescriptor
>>> *zones".
>>>
>>>> +{
>>>> +    int ret;
>>>
>>> This function is called from the I/O code path, please mark it with:
>>>
>>>   IO_CODE();
>>>
>>> From include/block/block-io.h:
>>>
>>>   * I/O API functions. These functions are thread-safe, and therefore
>>>   * can run in any thread as long as the thread has called
>>>   * aio_context_acquire/release().
>>>   *
>>>   * These functions can only call functions from I/O and Common categories,
>>>   * but can be invoked by GS, "I/O or GS" and I/O APIs.
>>>   *
>>>   * All functions in this category must use the macro
>>>   * IO_CODE();
>>>   * to catch when they are accidentally called by the wrong API.
>>>
>>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);
>>>
>>> Please add blk_inc_in_flight(blk) and blk_dec_in_flight(blk) around this
>>> function call to ensure that zone report requests finish before I/O is
>>> drained (see bdrv_drained_begin()). This is necessary so that it's
>>> possible to wait for I/O requests, including zone report, to complete.
>>>
>>> Similar to blk_co_do_preadv() we need blk_wait_while_drained(blk),
>>> blk_check_byte_request(), and bdrv_inc_in_flight(bs) before calling
>>> bdrv_co_zone_report(). bdrv_dec_in_flight(bs) needs to be called after
>>> bdrv_co_zone_report() returns.
>>>
>> After adding similar structure to blk_co_do_preadv(), zone operation
>> command will always fail at blk_wait_while_drained(blk) because
>> blk->inflight <= 0. Would it be ok to just remove
>> blk_wait_while_drained?
> 
> Are you hitting the assertion in
> block/block-backend.c:blk_wait_while_drained()?
> 
>   assert(blk->in_flight > 0);
> 
> If yes, then there is a bug in the code. You need to make sure that
> blk_inc_in_flight() is called before blk_wait_while_drained().
> 
>>>> +    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
>>>> +    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
>>>> +    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,
>>>> +};
>>>> +
>>>> +enum zone_cond {
>>>> +    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
>>>> +    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
>>>> +    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
>>>> +    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
>>>> +    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
>>>> +    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
>>>> +    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
>>>> +    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
>>>> +};
>>>
>>> This 1:1 correspondence with Linux constants could make the code a
>>> little harder to port.
>>>
>>> Maybe QEMU's block layer should define its own numeric constants so the
>>> code doesn't rely on operating system-specific headers.
>>> block/file-posix.c #ifdef __linux__ code can be responsible for
>>> converting Linux-specific constants to QEMU constants (and the 1:1
>>> mapping can be used there).
>>>
>> Can we define those constants in block-common.h? Because
>> BlockZoneDescriptor requires zone_condition, zone_type defined and
>> BlockZoneDesicriptor are used in header files and qemu-io
>> sub-commands. If we use #ifdef __linux__ in block-common.h, it can be
>> responsible for converting Linux constants instead.
>>
>> Thanks for reviewing! If there is any problem, please let me know.
> 
> I suggest defining the constants in block-common.h. #ifdef __linux__ is
> not necessary in block-common.h because the constants should just be an
> enum with BLK_ZS_NOT_WP = 0 and so on (no need for Linux headers).
> 
> In block/file-posix.c you can define a helper function inside #ifdef
> __linux__ that does something like:
> 
>   BlkZoneCond zone_cond_from_linux(enum blk_zone_cond val)
>   {
>       switch (val) {
>       case BLK_ZONE_COND_NOT_WP:
>           return BLK_ZS_NOT_WP;
>       ...
>   }
> 
> The code in block/file-posix.c should call this helper to convert from
> Linux values to QEMU values.

Given that the entire zone API is Linux specific (as far as I know), we do
not need to have these helpers: the entire code for zones needs to be
under a #ifdef __linux__.

But the conversion from Linux struct blk_zone to qemu zone descriptor
still needs to be done. And the perfect place to do this is the
parse_zone() function. There, we can add:

	switch (blkz->cond) {
	case BLK_ZONE_COND_NOT_WP:
		zone->cond = BLK_ZS_NOT_WP;
		break;
	...
	}

And same for zone type. That will also allow checking the values returned
by Linux. ZBC-2 defines more zone types and zone conditions than currently
defined in /usr/include/linux/blkzoned.h. If these new zone
types/conditions ever get supported by Linux, qemu can catch the values it
does not support and reject the drive.

> 
> This way the QEMU block layer does not use Linux constants and compiles
> on non-Linux machines.
> 
> Stefan


-- 
Damien Le Moal
Western Digital Research
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Sam Li 1 year, 10 months ago
Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月24日周五 23:50写道:
>
> On Fri, Jun 24, 2022 at 11:14:32AM +0800, Sam Li wrote:
> > Hi Stefan,
> >
> > Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月20日周一 15:55写道:
> > >
> > > On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:
> > >
> > > Hi Sam,
> > > Is this version 2 of "[RFC v1] Add support for zoned device"? Please
> > > keep the email subject line the same (except for "v2", "v3", etc) so
> > > that it's clear which patch series this new version replaces.
> > >
> > > > Fix some mistakes before. It can report a range of zones now.
> > >
> > > This looks like the description of what changed compared to v1. Please
> > > put the changelog below "---" in the future. When patch emails are
> > > merged by git-am(1) it keeps the text above "---" and discards the text
> > > below "---". The changelog is usually no longer useful once the patches
> > > are merged, so it should be located below the "---" line.
> > >
> > > The text above the "---" is the commit description (an explanation of
> > > why this commit is necessary). In this case the commit description
> > > should explain that this patch adds .bdrv_co_zone_report() and
> > > .bdrv_co_zone_mgmt() to BlockDriver so that zoned block devices can be
> > > supported.
> > >
> > > >
> > > > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > > > ---
> > > >  block/block-backend.c             |  22 ++++
> > > >  block/coroutines.h                |   5 +
> > > >  block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
> > > >  block/io.c                        |  23 ++++
> > > >  include/block/block-common.h      |  43 ++++++-
> > > >  include/block/block-io.h          |  13 +++
> > > >  include/block/block_int-common.h  |  20 ++++
> > > >  qemu-io-cmds.c                    | 118 +++++++++++++++++++
> > > >  tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
> > > >  9 files changed, 477 insertions(+), 1 deletion(-)
> > > >  create mode 100644 tests/qemu-iotests/tests/zoned.sh
> > > >
> > > > diff --git a/block/block-backend.c b/block/block-backend.c
> > > > index e0e1aff4b1..20248e4a35 100644
> > > > --- a/block/block-backend.c
> > > > +++ b/block/block-backend.c
> > > > @@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
> > > >      int ret;
> > > >  } BlockBackendAIOCB;
> > > >
> > > > +
> > > > +
> > >
> > > Please avoid whitespace changes in code that is otherwise untouched by
> > > your patch. Code changes can cause merge conflicts and they make it
> > > harder to use git-annotate(1), so only changes that are necessary should
> > > be included in a patch.
> > >
> > > >  static const AIOCBInfo block_backend_aiocb_info = {
> > > >      .get_aio_context = blk_aiocb_get_aio_context,
> > > >      .aiocb_size = sizeof(BlockBackendAIOCB),
> > > > @@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
> > > >      return ret;
> > > >  }
> > > >
> > >
> > > Please add a documentation comment for blk_co_zone_report() that
> > > explains how to use the functions and the purpose of the arguments. For
> > > example, does offset have to be the first byte in a zone or can it be
> > > any byte offset? What are the alignment requirements of offset and len?
> > > Why is nr_zones a pointer?
> > >
> > > > +int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,
> > >
> > > Functions that run in coroutine context must be labeled with
> > > coroutine_fn:
> > >
> > >     int coroutine_fn blk_co_zone_report(...)
> > >
> > > This tells humans and tools that the function can only be called from a
> > > coroutine. There is a blog post about coroutines in QEMU here:
> > > https://blog.vmsplice.net/2014/01/coroutines-in-qemu-basics.html
> > >
> > > > +                       int64_t *nr_zones,
> > > > +                       struct BlockZoneDescriptor *zones)
> > >
> > > QEMU coding style uses typedefs when defining structs, so "struct
> > > BlockZoneDescriptor *zones" should be written as "BlockZoneDescriptor
> > > *zones".
> > >
> > > > +{
> > > > +    int ret;
> > >
> > > This function is called from the I/O code path, please mark it with:
> > >
> > >   IO_CODE();
> > >
> > > From include/block/block-io.h:
> > >
> > >   * I/O API functions. These functions are thread-safe, and therefore
> > >   * can run in any thread as long as the thread has called
> > >   * aio_context_acquire/release().
> > >   *
> > >   * These functions can only call functions from I/O and Common categories,
> > >   * but can be invoked by GS, "I/O or GS" and I/O APIs.
> > >   *
> > >   * All functions in this category must use the macro
> > >   * IO_CODE();
> > >   * to catch when they are accidentally called by the wrong API.
> > >
> > > > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);
> > >
> > > Please add blk_inc_in_flight(blk) and blk_dec_in_flight(blk) around this
> > > function call to ensure that zone report requests finish before I/O is
> > > drained (see bdrv_drained_begin()). This is necessary so that it's
> > > possible to wait for I/O requests, including zone report, to complete.
> > >
> > > Similar to blk_co_do_preadv() we need blk_wait_while_drained(blk),
> > > blk_check_byte_request(), and bdrv_inc_in_flight(bs) before calling
> > > bdrv_co_zone_report(). bdrv_dec_in_flight(bs) needs to be called after
> > > bdrv_co_zone_report() returns.
> > >
> > After adding similar structure to blk_co_do_preadv(), zone operation
> > command will always fail at blk_wait_while_drained(blk) because
> > blk->inflight <= 0. Would it be ok to just remove
> > blk_wait_while_drained?
>
> Are you hitting the assertion in
> block/block-backend.c:blk_wait_while_drained()?
>
>   assert(blk->in_flight > 0);
>
> If yes, then there is a bug in the code. You need to make sure that
> blk_inc_in_flight() is called before blk_wait_while_drained().
>

Right! I didn't add blk_inc_in/dec_flight() because similar
blockdriver functions in file-posix.c don't use blk_inc_in_flight much
and the location would be wrong.

> > > > +    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
> > > > +    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
> > > > +    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,
> > > > +};
> > > > +
> > > > +enum zone_cond {
> > > > +    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
> > > > +    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
> > > > +    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
> > > > +    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
> > > > +    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
> > > > +    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
> > > > +    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
> > > > +    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
> > > > +};
> > >
> > > This 1:1 correspondence with Linux constants could make the code a
> > > little harder to port.
> > >
> > > Maybe QEMU's block layer should define its own numeric constants so the
> > > code doesn't rely on operating system-specific headers.
> > > block/file-posix.c #ifdef __linux__ code can be responsible for
> > > converting Linux-specific constants to QEMU constants (and the 1:1
> > > mapping can be used there).
> > >
> > Can we define those constants in block-common.h? Because
> > BlockZoneDescriptor requires zone_condition, zone_type defined and
> > BlockZoneDesicriptor are used in header files and qemu-io
> > sub-commands. If we use #ifdef __linux__ in block-common.h, it can be
> > responsible for converting Linux constants instead.
> >
> > Thanks for reviewing! If there is any problem, please let me know.
>
> I suggest defining the constants in block-common.h. #ifdef __linux__ is
> not necessary in block-common.h because the constants should just be an
> enum with BLK_ZS_NOT_WP = 0 and so on (no need for Linux headers).
>
> In block/file-posix.c you can define a helper function inside #ifdef
> __linux__ that does something like:
>
>   BlkZoneCond zone_cond_from_linux(enum blk_zone_cond val)
>   {
>       switch (val) {
>       case BLK_ZONE_COND_NOT_WP:
>           return BLK_ZS_NOT_WP;
>       ...
>   }
>
> The code in block/file-posix.c should call this helper to convert from
> Linux values to QEMU values.
>
> This way the QEMU block layer does not use Linux constants and compiles
> on non-Linux machines.
>

Thanks!

> Stefan
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Damien Le Moal 1 year, 11 months ago
On 6/20/22 16:55, Stefan Hajnoczi wrote:
> On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:
> 
> Hi Sam,
> Is this version 2 of "[RFC v1] Add support for zoned device"? Please
> keep the email subject line the same (except for "v2", "v3", etc) so
> that it's clear which patch series this new version replaces.
> 
>> Fix some mistakes before. It can report a range of zones now.
> 
> This looks like the description of what changed compared to v1. Please
> put the changelog below "---" in the future. When patch emails are
> merged by git-am(1) it keeps the text above "---" and discards the text
> below "---". The changelog is usually no longer useful once the patches
> are merged, so it should be located below the "---" line.
> 
> The text above the "---" is the commit description (an explanation of
> why this commit is necessary). In this case the commit description
> should explain that this patch adds .bdrv_co_zone_report() and
> .bdrv_co_zone_mgmt() to BlockDriver so that zoned block devices can be
> supported.
> 
>>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> ---
>>  block/block-backend.c             |  22 ++++
>>  block/coroutines.h                |   5 +
>>  block/file-posix.c                | 182 ++++++++++++++++++++++++++++++
>>  block/io.c                        |  23 ++++
>>  include/block/block-common.h      |  43 ++++++-
>>  include/block/block-io.h          |  13 +++
>>  include/block/block_int-common.h  |  20 ++++
>>  qemu-io-cmds.c                    | 118 +++++++++++++++++++
>>  tests/qemu-iotests/tests/zoned.sh |  52 +++++++++
>>  9 files changed, 477 insertions(+), 1 deletion(-)
>>  create mode 100644 tests/qemu-iotests/tests/zoned.sh
>>
>> diff --git a/block/block-backend.c b/block/block-backend.c
>> index e0e1aff4b1..20248e4a35 100644
>> --- a/block/block-backend.c
>> +++ b/block/block-backend.c
>> @@ -104,6 +104,8 @@ typedef struct BlockBackendAIOCB {
>>      int ret;
>>  } BlockBackendAIOCB;
>>  
>> +
>> +
> 
> Please avoid whitespace changes in code that is otherwise untouched by
> your patch. Code changes can cause merge conflicts and they make it
> harder to use git-annotate(1), so only changes that are necessary should
> be included in a patch.
> 
>>  static const AIOCBInfo block_backend_aiocb_info = {
>>      .get_aio_context = blk_aiocb_get_aio_context,
>>      .aiocb_size = sizeof(BlockBackendAIOCB),
>> @@ -1810,6 +1812,25 @@ int blk_flush(BlockBackend *blk)
>>      return ret;
>>  }
>>  
> 
> Please add a documentation comment for blk_co_zone_report() that
> explains how to use the functions and the purpose of the arguments. For
> example, does offset have to be the first byte in a zone or can it be
> any byte offset? What are the alignment requirements of offset and len?
> Why is nr_zones a pointer?
> 
>> +int blk_co_zone_report(BlockBackend *blk, int64_t offset, int64_t len,
> 
> Functions that run in coroutine context must be labeled with
> coroutine_fn:
> 
>     int coroutine_fn blk_co_zone_report(...)
> 
> This tells humans and tools that the function can only be called from a
> coroutine. There is a blog post about coroutines in QEMU here:
> https://blog.vmsplice.net/2014/01/coroutines-in-qemu-basics.html
> 
>> +                       int64_t *nr_zones,
>> +                       struct BlockZoneDescriptor *zones)
> 
> QEMU coding style uses typedefs when defining structs, so "struct
> BlockZoneDescriptor *zones" should be written as "BlockZoneDescriptor
> *zones".
> 
>> +{
>> +    int ret;
> 
> This function is called from the I/O code path, please mark it with:
> 
>   IO_CODE();
> 
> From include/block/block-io.h:
> 
>   * I/O API functions. These functions are thread-safe, and therefore
>   * can run in any thread as long as the thread has called
>   * aio_context_acquire/release().
>   *
>   * These functions can only call functions from I/O and Common categories,
>   * but can be invoked by GS, "I/O or GS" and I/O APIs.
>   *
>   * All functions in this category must use the macro
>   * IO_CODE();
>   * to catch when they are accidentally called by the wrong API.
> 
>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len, nr_zones, zones);
> 
> Please add blk_inc_in_flight(blk) and blk_dec_in_flight(blk) around this
> function call to ensure that zone report requests finish before I/O is
> drained (see bdrv_drained_begin()). This is necessary so that it's
> possible to wait for I/O requests, including zone report, to complete.
> 
> Similar to blk_co_do_preadv() we need blk_wait_while_drained(blk),
> blk_check_byte_request(), and bdrv_inc_in_flight(bs) before calling
> bdrv_co_zone_report(). bdrv_dec_in_flight(bs) needs to be called after
> bdrv_co_zone_report() returns.
> 
>> +    return ret;
>> +}
>> +
>> +int blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
>> +        int64_t offset, int64_t len)
>> +{
>> +    int ret;
>> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
>> +
>> +    return ret;
>> +}
> 
> The same applies to this function.
> 
>> +
>> +
>>  void blk_drain(BlockBackend *blk)
>>  {
>>      BlockDriverState *bs = blk_bs(blk);
>> @@ -2634,3 +2655,4 @@ int blk_make_empty(BlockBackend *blk, Error **errp)
>>  
>>      return bdrv_make_empty(blk->root, errp);
>>  }
>> +
> 
> Unrelated whitespace change.
> 
>> diff --git a/block/coroutines.h b/block/coroutines.h
>> index 830ecaa733..247326255f 100644
>> --- a/block/coroutines.h
>> +++ b/block/coroutines.h
>> @@ -80,6 +80,11 @@ int coroutine_fn
>>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>>  
>>  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>> +                                    int64_t len, int64_t *nr_zones,
>> +                                    struct BlockZoneDescriptor *zones);
>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
>> +        int64_t offset, int64_t len);
>>  
>>  
>>  /*
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 48cd096624..71fd21f8ba 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -178,6 +178,137 @@ typedef struct BDRVRawReopenState {
>>      bool check_cache_dropped;
>>  } BDRVRawReopenState;
>>  
>> +/*
>> + * parse_zone - Fill a zone descriptor
>> + */
>> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
>> +        struct blk_zone *blkz) {
>> +    zone->start = blkz->start << BDRV_SECTOR_BITS;
>> +    zone->length = blkz->len << BDRV_SECTOR_BITS;
>> +    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
>> +    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
>> +    zone->type = blkz->type;
>> +    zone->cond = blkz->type;
> 
> Should this be "zone->cond = blkz->cond"?
> 
>> +}
>> +
>> +/**
>> + * zone report - Get a zone block device's information in the
>> + * form of an array of zone descriptors.
>> + *
>> + * @param bs: passing zone block device file descriptor
>> + * @param zones: Space to hold zone information on reply
>> + * @param offset: the location in the zone block device
>> + * @return 0 on success, -1 on failure
>> + */
>> +static int raw_co_zone_report(BlockDriverState *bs, int64_t offset, int64_t len,
> 
> coroutine_fn
> 
>> +                              int64_t *nr_zones,
>> +                              struct BlockZoneDescriptor *zones) {
>> +    BDRVRawState *s = bs->opaque;
>> +    struct blk_zone_report *rep;
>> +    struct BlockZoneDescriptor bzd;
>> +    struct blk_zone *blkz;
>> +    int64_t zone_size_mask, end, rep_size, nrz;
>> +    int ret, n = 0, i = 0;
>> +
>> +    printf("%s\n", "start to report zone");
> 
> This looks like debug output. QEMU has a tracing system that you can
> use. See docs/devel/tracing.rst.
> 
>> +    nrz = *nr_zones;
>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>> +    rep = (struct blk_zone_report *)malloc(rep_size);
> 
> Please use g_autofree and g_new(). QEMU avoids direct use of malloc(3)/free(3).
> 
>> +    if (!rep) {
>> +        return -1;
>> +    }
>> +
>> +    zone_size_mask = zone_start_sector - 1;
>> +    /* align up */
>> +    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
>> +            >> BDRV_SECTOR_BITS;
> 
> More readable:
> 
>   end = DIV_ROUND_UP(offset + len, BDRV_SECTOR_SIZE);
> 
>> +
>> +    blkz = (struct blk_zone *)(rep + 1);
>> +    while (offset < end) {
>> +        memset(rep, 0, rep_size);
>> +        rep->sector = offset;
>> +        rep->nr_zones = nrz;
>> +
>> +        ret = ioctl(s->fd, BLKREPORTZONE, rep);
> 
> Can this ioctl() block? It seems likely. If yes, then the call needs to
> be made from the thread pool to avoid blocking the current thread. See
> raw_thread_pool_submit().
> 
>> +        if (ret != 0) {
>> +            ret = -errno;
>> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
>> +                         s->fd, offset, errno);
>> +            free(rep);
>> +            return ret;
>> +        }
>> +
>> +        if (!rep->nr_zones) {
>> +            break;
>> +        }
>> +
>> +        for (i = 0; i < rep->nr_zones; i++) {
>> +            parse_zone(&bzd, &blkz[i]);
>> +            if (zones) {
>> +                memcpy(&zones[n], &bzd, sizeof(bzd));
> 
> n is never incremented so this always overwrites the first element.
> 
>> +            }
>> +        }
>> +
>> +        offset = blkz[i].start + blkz[i].len;
>> +    }
>> +
> 
> Does this function need to update *nr_zones = n before returning? How does
> the caller know how many zones were returned?
> 
>> +    return ret;
>> +}
>> +
>> +/**
>> + * zone management operations - Execute an operation on a zone
>> + */
>> +static int raw_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
>> +        int64_t offset, int64_t len) {
>> +    BDRVRawState *s = bs->opaque;
>> +    struct blk_zone_range range;
>> +    const char *ioctl_name;
>> +    uint64_t ioctl_op;
> 
> ioctl()'s second argument is unsigned long request. Please use unsigned
> long to keep the types consistent.
> 
>> +    int64_t zone_size_mask, end;
>> +    int ret;
>> +
>> +    zone_size_mask = zone_start_sector - 1;
>> +    /* align up */
>> +    end = ((offset + len + zone_size_mask) & (~zone_size_mask))
>> +            >> BDRV_SECTOR_BITS;
>> +    offset = (offset & (~zone_size_mask)) >> BDRV_SECTOR_BITS; /* align down */
>> +
>> +    switch (op) {
>> +    case zone_open:
>> +        ioctl_name = "BLKOPENZONE";
>> +        ioctl_op = BLKOPENZONE;
>> +        break;
>> +    case zone_close:
>> +        ioctl_name = "BLKCLOSEZONE";
>> +        ioctl_op = BLKCLOSEZONE;
>> +        break;
>> +    case zone_finish:
>> +        ioctl_name = "BLKFINISHZONE";
>> +        ioctl_op = BLKFINISHZONE;
>> +        break;
>> +    case zone_reset:
>> +        ioctl_name = "BLKRESETZONE";
>> +        ioctl_op = BLKRESETZONE;
>> +        break;
>> +    default:
>> +        error_report("Invalid zone operation 0x%x", op);
>> +        errno = -EINVAL;
>> +        return -1;
>> +    }
>> +
>> +    /* Execute the operation */
>> +    range.sector = offset;
>> +    range.nr_sectors = end - offset;
>> +    ret = ioctl(s->fd, ioctl_op, &range);
>> +    if (ret != 0) {
>> +        error_report("ioctl %s failed %d",
>> +                         ioctl_name, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  static int fd_open(BlockDriverState *bs)
>>  {
>>      BDRVRawState *s = bs->opaque;
>> @@ -3324,6 +3455,9 @@ BlockDriver bdrv_file = {
>>      .bdrv_abort_perm_update = raw_abort_perm_update,
>>      .create_opts = &raw_create_opts,
>>      .mutable_opts = mutable_opts,
>> +
>> +    .bdrv_co_zone_report = raw_co_zone_report,
>> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>  };
>>  
>>  /***********************************************/
>> @@ -3703,6 +3837,53 @@ static BlockDriver bdrv_host_device = {
>>  #endif
>>  };
>>  
>> +static BlockDriver bdrv_zoned_host_device = {
>> +        .format_name = "zoned_host_device",
>> +        .protocol_name = "zoned_host_device",
>> +        .instance_size = sizeof(BDRVRawState),
>> +        .bdrv_needs_filename = true,
>> +        .bdrv_probe_device  = hdev_probe_device,
>> +        .bdrv_parse_filename = hdev_parse_filename,
>> +        .bdrv_file_open     = hdev_open,
>> +        .bdrv_close         = raw_close,
>> +        .bdrv_reopen_prepare = raw_reopen_prepare,
>> +        .bdrv_reopen_commit  = raw_reopen_commit,
>> +        .bdrv_reopen_abort   = raw_reopen_abort,
>> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>> +        .create_opts         = &bdrv_create_opts_simple,
>> +        .mutable_opts        = mutable_opts,
>> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>> +
>> +        .bdrv_co_preadv         = raw_co_preadv,
>> +        .bdrv_co_pwritev        = raw_co_pwritev,
>> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>> +        .bdrv_refresh_limits = raw_refresh_limits,
>> +        .bdrv_io_plug = raw_aio_plug,
>> +        .bdrv_io_unplug = raw_aio_unplug,
>> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>> +
>> +        .bdrv_co_truncate       = raw_co_truncate,
>> +        .bdrv_getlength = raw_getlength,
>> +        .bdrv_get_info = raw_get_info,
>> +        .bdrv_get_allocated_file_size
>> +                            = raw_get_allocated_file_size,
>> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>> +        .bdrv_check_perm = raw_check_perm,
>> +        .bdrv_set_perm   = raw_set_perm,
>> +        .bdrv_abort_perm_update = raw_abort_perm_update,
>> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>> +        .bdrv_probe_geometry = hdev_probe_geometry,
>> +        .bdrv_co_ioctl = hdev_co_ioctl,
>> +
>> +        /* zone management operations */
>> +        .bdrv_co_zone_report = raw_co_zone_report,
>> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>> +};
>> +
>>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>>  static void cdrom_parse_filename(const char *filename, QDict *options,
>>                                   Error **errp)
>> @@ -3964,6 +4145,7 @@ static void bdrv_file_init(void)
>>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>>      bdrv_register(&bdrv_host_device);
>>  #ifdef __linux__
>> +    bdrv_register(&bdrv_zoned_host_device);
>>      bdrv_register(&bdrv_host_cdrom);
>>  #endif
>>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>> diff --git a/block/io.c b/block/io.c
>> index 789e6373d5..3e8bb83cc3 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -3258,6 +3258,29 @@ out:
>>      return co.ret;
>>  }
>>  
>> +int bdrv_co_zone_report(BlockDriverState *bs,
>> +                        int64_t offset, int64_t len, int64_t *nr_zones,
>> +                        struct BlockZoneDescriptor *zones)
>> +{
>> +    int ret = 0;
>> +    if (!bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones)) {
>> +        return -ENOTSUP;
>> +    }
> 
> This returns -ENOTSUP any time ->bdrv_co_zone_report() returns 0. Should
> this be:
> 
>   if (!bs->drv->bdrv_co_zone_report) {
>       return -ENOTSUP;
>   }
> 
>   return bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones);
> 
> ?
> 
>> +
>> +    return ret;
>> +}
>> +
>> +int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
>> +        int64_t offset, int64_t len)
>> +{
>> +    int ret = 0;
>> +    if (!bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len)) {
>> +        return -ENOTSUP;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>>  {
>>      IO_CODE();
>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>> index fdb7306e78..24c468d8ad 100644
>> --- a/include/block/block-common.h
>> +++ b/include/block/block-common.h
>> @@ -23,7 +23,7 @@
>>   */
>>  #ifndef BLOCK_COMMON_H
>>  #define BLOCK_COMMON_H
>> -
>> +#include <linux/blkzoned.h>
> 
> Linux-specific code must be #ifdef __linux__ to avoid compilation errors
> on non-Linux hosts.

This include should not be here anyway. It should be in block/file-posix.c
I think, and...

> 
>>  #include "block/aio.h"
>>  #include "block/aio-wait.h"
>>  #include "qemu/iov.h"
>> @@ -48,6 +48,47 @@
>>  typedef struct BlockDriver BlockDriver;
>>  typedef struct BdrvChild BdrvChild;
>>  typedef struct BdrvChildClass BdrvChildClass;
>> +enum zone_type {
> 
> Please use "typedef enum { ... } BdrvZoneType" so that enums follow
> QEMU's coding style.
> 
>> +    BLK_ZT_CONV = BLK_ZONE_TYPE_CONVENTIONAL,
>> +    BLK_ZT_SWR = BLK_ZONE_TYPE_SEQWRITE_REQ,
>> +    BLK_ZT_SWP = BLK_ZONE_TYPE_SEQWRITE_PREF,

These should use hard values instead of the macro defined in
linux/blkzoned.h so that the interface is truly independent of linux
kernel API.

>> +};
>> +
>> +enum zone_cond {
>> +    BLK_ZS_NOT_WP = BLK_ZONE_COND_NOT_WP,
>> +    BLK_ZS_EMPTY = BLK_ZONE_COND_EMPTY,
>> +    BLK_ZS_IOPEN = BLK_ZONE_COND_IMP_OPEN,
>> +    BLK_ZS_EOPEN = BLK_ZONE_COND_EXP_OPEN,
>> +    BLK_ZS_CLOSED = BLK_ZONE_COND_CLOSED,
>> +    BLK_ZS_RDONLY = BLK_ZONE_COND_READONLY,
>> +    BLK_ZS_FULL = BLK_ZONE_COND_FULL,
>> +    BLK_ZS_OFFLINE = BLK_ZONE_COND_OFFLINE,
>> +};
> 
> This 1:1 correspondence with Linux constants could make the code a
> little harder to port.
> 
> Maybe QEMU's block layer should define its own numeric constants so the
> code doesn't rely on operating system-specific headers.
> block/file-posix.c #ifdef __linux__ code can be responsible for
> converting Linux-specific constants to QEMU constants (and the 1:1
> mapping can be used there).

Yes !

> 
>> +
>> +enum zone_op {
>> +    zone_open,
>> +    zone_close,
>> +    zone_finish,
>> +    zone_reset,
>> +};
>> +
>> +/*
>> + * Zone descriptor data structure.
>> + * Provide information on a zone with all position and size values in bytes.
>> + */
>> +typedef struct BlockZoneDescriptor {
>> +    uint64_t start;
>> +    uint64_t length;
>> +    uint64_t cap;
>> +    uint64_t wp;
>> +    enum zone_type type;
>> +    enum zone_cond cond;
>> +} BlockZoneDescriptor;
>> +
>> +enum zone_model {
>> +    BLK_Z_HM,
>> +    BLK_Z_HA,
>> +};
>>  
>>  typedef struct BlockDriverInfo {
>>      /* in bytes, 0 if irrelevant */
>> diff --git a/include/block/block-io.h b/include/block/block-io.h
>> index 62c84f0519..deb8902684 100644
>> --- a/include/block/block-io.h
>> +++ b/include/block/block-io.h
>> @@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>>  /* Ensure contents are flushed to disk.  */
>>  int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>>  
>> +/* Report zone information of zone block device. */
>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>> +                                     int64_t len, int64_t *nr_zones,
>> +                                     struct BlockZoneDescriptor *zones);
>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
>> +        int64_t offset, int64_t len);
>> +
>>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>>  int bdrv_block_status(BlockDriverState *bs, int64_t offset,
>> @@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>>  int generated_co_wrapper
>>  bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>>  
>> +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
>> +                                         int64_t len, int64_t *nr_zones,
>> +                                         struct BlockZoneDescriptor *zones);
>> +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
>> +        int64_t offset, int64_t len);
>> +
>>  /**
>>   * bdrv_parent_drained_begin_single:
>>   *
>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>> index 8947abab76..4d0180a7da 100644
>> --- a/include/block/block_int-common.h
>> +++ b/include/block/block_int-common.h
>> @@ -63,6 +63,7 @@
>>  #define BLOCK_OPT_EXTL2             "extended_l2"
>>  
>>  #define BLOCK_PROBE_BUF_SIZE        512
>> +#define zone_start_sector           512
>>  
>>  enum BdrvTrackedRequestType {
>>      BDRV_TRACKED_READ,
>> @@ -94,6 +95,19 @@ typedef struct BdrvTrackedRequest {
>>      struct BdrvTrackedRequest *waiting_for;
>>  } BdrvTrackedRequest;
>>  
>> +/**
>> + * Zone device information data structure.
>> + * Provide information on a device.
>> + */
>> +typedef struct zbd_dev {
>> +    enum zone_model model;
>> +    uint32_t block_size;
> 
> How does this relate to QEMU's BlockLimits?
> 
>> +    uint32_t write_granularity;
> 
> Same. Maybe this belongs in BlockLimits?
> 
>> +    uint32_t nr_zones;
> 
> Should this really be limited to 32-bit? For example, take 256 MB zones,
> then the max nr_zones * 256 MB is much smaller than a uint64_t capacity
> value. It seems safer to make this 64-bit, but maybe Dmitry, Hannes, or
> Damien can tell us what to do here.

u32 is fine. We are nowhere near 4G zones :)
The max out there today is 20TB SMR drive with 128MB zones. About 150,000
zones. Nowhere near 4G limit. Linux kernel also uses unsigned int for
number of zones everywhere.

> 
>> +    struct BlockZoneDescriptor *zones; /* array of zones */
> 
> When is this used?

This will be needed when zone append implementation comes. But not there
yet :) A simpler form of the zone descriptor will likely be better anyway
to save memory.


-- 
Damien Le Moal
Western Digital Research
Re: [RFC v2] Adding block layer APIs resembling Linux ZoneBlockDevice ioctls.
Posted by Stefan Hajnoczi 1 year, 11 months ago
On Mon, Jun 20, 2022 at 07:11:40PM +0900, Damien Le Moal wrote:
> On 6/20/22 16:55, Stefan Hajnoczi wrote:
> > On Mon, Jun 20, 2022 at 11:36:11AM +0800, Sam Li wrote:
> >> +    uint32_t nr_zones;
> > 
> > Should this really be limited to 32-bit? For example, take 256 MB zones,
> > then the max nr_zones * 256 MB is much smaller than a uint64_t capacity
> > value. It seems safer to make this 64-bit, but maybe Dmitry, Hannes, or
> > Damien can tell us what to do here.
> 
> u32 is fine. We are nowhere near 4G zones :)
> The max out there today is 20TB SMR drive with 128MB zones. About 150,000
> zones. Nowhere near 4G limit. Linux kernel also uses unsigned int for
> number of zones everywhere.

Thanks!

Stefan