Series comparison

-[Qemu-devel] [PULL 0/5] Block patches
+[PULL 00/33] Block patches
-The following changes since commit a0def594286d9110a6035e02eef558cf3cf5d847:
+The following changes since commit 8507c9d5c9a62de2a0e281b640f995e26eac46af:
-  Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into staging (2017-01-30 10:23:20 +0000)
+  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-11-03 15:59:44 +0000)
-are available in the git repository at:
+are available in the Git repository at:
-  https://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request
+  https://gitlab.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to acf6e5f0962c4be670d4a93ede77423512521876:
+for you to fetch changes up to fc107d86840b3364e922c26cf7631b7fd38ce523:
-  sheepdog: reorganize check for overlapping requests (2017-02-01 00:17:20 -0500)
+  util/vfio-helpers: Assert offset is aligned to page size (2020-11-03 19:06:23 +0000)
 ----------------------------------------------------------------
-Block patches
+Pull request for 5.2
 NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
 improvements. Elena Afanasova's ioeventfd fixes are also included.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ----------------------------------------------------------------
-Paolo Bonzini (5):
+Elena Afanasova (2):
-  sheepdog: remove unused cancellation support
+  accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is
-  sheepdog: reorganize coroutine flow
+    true
-  sheepdog: do not use BlockAIOCB
+  softmmu/memory: fix memory_region_ioeventfd_equal()
   sheepdog: simplify inflight_aio_head management
   sheepdog: reorganize check for overlapping requests
- block/sheepdog.c | 289 ++++++++++++++++---------------------------------------
+Eric Auger (4):
-file changed, 84 insertions(+), 205 deletions(-)
+  block/nvme: Change size and alignment of IDENTIFY response buffer
   block/nvme: Change size and alignment of queue
   block/nvme: Change size and alignment of prp_list_pages
   block/nvme: Align iov's va and size on host page size
 Philippe Mathieu-Daudé (27):
   MAINTAINERS: Cover "block/nvme.h" file
   block/nvme: Use hex format to display offset in trace events
   block/nvme: Report warning with warn_report()
   block/nvme: Trace controller capabilities
   block/nvme: Trace nvme_poll_queue() per queue
   block/nvme: Improve nvme_free_req_queue_wait() trace information
   block/nvme: Trace queue pair creation/deletion
   block/nvme: Move definitions before structure declarations
   block/nvme: Use unsigned integer for queue counter/size
   block/nvme: Make nvme_identify() return boolean indicating error
   block/nvme: Make nvme_init_queue() return boolean indicating error
   block/nvme: Introduce Completion Queue definitions
   block/nvme: Use definitions instead of magic values in add_io_queue()
   block/nvme: Correctly initialize Admin Queue Attributes
   block/nvme: Simplify ADMIN queue access
   block/nvme: Simplify nvme_cmd_sync()
   block/nvme: Set request_alignment at initialization
   block/nvme: Correct minimum device page size
   block/nvme: Fix use of write-only doorbells page on Aarch64 arch
   block/nvme: Fix nvme_submit_command() on big-endian host
   util/vfio-helpers: Improve reporting unsupported IOMMU type
   util/vfio-helpers: Trace PCI I/O config accesses
   util/vfio-helpers: Trace PCI BAR region info
   util/vfio-helpers: Trace where BARs are mapped
   util/vfio-helpers: Improve DMA trace events
   util/vfio-helpers: Convert vfio_dump_mapping to trace events
   util/vfio-helpers: Assert offset is aligned to page size
  MAINTAINERS          |   2 +
  include/block/nvme.h |  18 ++--
  accel/kvm/kvm-all.c  |   6 +-
  block/nvme.c         | 209 ++++++++++++++++++++++++-------------------
  softmmu/memory.c     |  11 ++-
  util/vfio-helpers.c  |  43 +++++----
  block/trace-events   |  30 ++++---
  util/trace-events    |  10 ++-
 files changed, 195 insertions(+), 134 deletions(-)
 --
-.9.3
+.28.0

-New patch
+[PULL 01/33] accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is true
+From: Elena Afanasova <eafanasova@gmail.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
+Message-Id: <20201017210102.26036-1-eafanasova@gmail.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ accel/kvm/kvm-all.c | 6 ++++--
+file changed, 4 insertions(+), 2 deletions(-)
+diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/kvm/kvm-all.c
++++ b/accel/kvm/kvm-all.c
+@@ -XXX,XX +XXX,XX @@ static int kvm_init(MachineState *ms)
+     kvm_memory_listener_register(s, &s->memory_listener,
+                                  &address_space_memory, 0);
+-    memory_listener_register(&kvm_io_listener,
+-                             &address_space_io);
++    if (kvm_eventfds_allowed) {
++        memory_listener_register(&kvm_io_listener,
++                                 &address_space_io);
++    }
+     memory_listener_register(&kvm_coalesced_pio_listener,
+                              &address_space_io);
+--
+.28.0

-New patch
+[PULL 02/33] softmmu/memory: fix memory_region_ioeventfd_equal()
+From: Elena Afanasova <eafanasova@gmail.com>
+Eventfd can be registered with a zero length when fast_mmio is true.
+Handle this case properly when dispatching through QEMU.
+Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
+Message-id: cf71a62eb04e61932ff8ffdd02e0b2aab4f495a0.camel@gmail.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ softmmu/memory.c | 11 +++++++++--
+file changed, 9 insertions(+), 2 deletions(-)
+diff --git a/softmmu/memory.c b/softmmu/memory.c
+index XXXXXXX..XXXXXXX 100644
+--- a/softmmu/memory.c
++++ b/softmmu/memory.c
+@@ -XXX,XX +XXX,XX @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
+ static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
+                                           MemoryRegionIoeventfd *b)
+ {
+-    return !memory_region_ioeventfd_before(a, b)
+-        && !memory_region_ioeventfd_before(b, a);
++    if (int128_eq(a->addr.start, b->addr.start) &&
++        (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
++         (int128_eq(a->addr.size, b->addr.size) &&
++          (a->match_data == b->match_data) &&
++          ((a->match_data && (a->data == b->data)) || !a->match_data) &&
++          (a->e == b->e))))
++        return true;
++
++    return false;
+ }
+ /* Range of memory in the global map.  Addresses are absolute. */
+--
+.28.0

-New patch
+[PULL 03/33] MAINTAINERS: Cover "block/nvme.h" file
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+The "block/nvme.h" header is shared by both the NVMe block
+driver and the NVMe emulated device. Add the 'F:' entry to
+both sections, so all maintainers/reviewers are notified
+when it is changed.
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
+Message-Id: <20200701140634.25994-1-philmd@redhat.com>
+---
+ MAINTAINERS | 2 ++
+file changed, 2 insertions(+)
+diff --git a/MAINTAINERS b/MAINTAINERS
+index XXXXXXX..XXXXXXX 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -XXX,XX +XXX,XX @@ M: Klaus Jensen <its@irrelevant.dk>
+ L: qemu-block@nongnu.org
+ S: Supported
+ F: hw/block/nvme*
++F: include/block/nvme.h
+ F: tests/qtest/nvme-test.c
+ F: docs/specs/nvme.txt
+ T: git git://git.infradead.org/qemu-nvme.git nvme-next
+@@ -XXX,XX +XXX,XX @@ R: Fam Zheng <fam@euphon.net>
+ L: qemu-block@nongnu.org
+ S: Supported
+ F: block/nvme*
++F: include/block/nvme.h
+ T: git https://github.com/stefanha/qemu.git block
+ Bootdevice
+--
+.28.0

-New patch
+[PULL 04/33] block/nvme: Use hex format to display offset in trace events
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Use the same format used for the hw/vfio/ trace events.
+Suggested-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-3-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/trace-events | 12 ++++++------
+file changed, 6 insertions(+), 6 deletions(-)
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
+ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
+ nvme_handle_event(void *s) "s %p"
+ nvme_poll_cb(void *s) "s %p"
+-nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
+-nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
++nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
++nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
+ nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
+-nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
+-nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+-nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
+-nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
++nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d"
++nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d"
++nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
++nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
+ nvme_dma_map_flush(void *s) "s %p"
+ nvme_free_req_queue_wait(void *q) "q %p"
+ nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
+--
+.28.0

-New patch
+[PULL 05/33] block/nvme: Report warning with warn_report()
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Instead of displaying a warning on stderr, use warn_report()
+which also displays it on the monitor.
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-4-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 4 ++--
+file changed, 2 insertions(+), 2 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
+         }
+         cid = le16_to_cpu(c->cid);
+         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
+-            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
+-                    cid);
++            warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
++                        "queue size: %u", cid, NVME_QUEUE_SIZE);
+             continue;
+         }
+         trace_nvme_complete_command(s, q->index, cid);
+--
+.28.0

-New patch
+[PULL 06/33] block/nvme: Trace controller capabilities
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Controllers have different capabilities and report them in the
+CAP register. We are particularly interested in the page size
+limits.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-5-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c       | 13 +++++++++++++
+ block/trace-events |  2 ++
+files changed, 15 insertions(+)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+      * Initialization". */
+     cap = le64_to_cpu(regs->cap);
++    trace_nvme_controller_capability_raw(cap);
++    trace_nvme_controller_capability("Maximum Queue Entries Supported",
++                                     1 + NVME_CAP_MQES(cap));
++    trace_nvme_controller_capability("Contiguous Queues Required",
++                                     NVME_CAP_CQR(cap));
++    trace_nvme_controller_capability("Doorbell Stride",
++                                     2 << (2 + NVME_CAP_DSTRD(cap)));
++    trace_nvme_controller_capability("Subsystem Reset Supported",
++                                     NVME_CAP_NSSRS(cap));
++    trace_nvme_controller_capability("Memory Page Size Minimum",
++                                     1 << (12 + NVME_CAP_MPSMIN(cap)));
++    trace_nvme_controller_capability("Memory Page Size Maximum",
++                                     1 << (12 + NVME_CAP_MPSMAX(cap)));
+     if (!NVME_CAP_CSS(cap)) {
+         error_setg(errp, "Device doesn't support NVMe command set");
+         ret = -EINVAL;
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t
+ qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+ # nvme.c
++nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
++nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
+ nvme_kick(void *s, int queue) "s %p queue %d"
+ nvme_dma_flush_queue_wait(void *s) "s %p"
+ nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
+--
+.28.0

-New patch
+[PULL 07/33] block/nvme: Trace nvme_poll_queue() per queue
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+As we want to enable multiple queues, report the event
+in each nvme_poll_queue() call, rather than once in
+the callback calling nvme_poll_queues().
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-6-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c       | 2 +-
+ block/trace-events | 2 +-
+files changed, 2 insertions(+), 2 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queue(NVMeQueuePair *q)
+     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
++    trace_nvme_poll_queue(q->s, q->index);
+     /*
+      * Do an early check for completions. q->lock isn't needed because
+      * nvme_process_completion() only runs in the event loop thread and
+@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_cb(void *opaque)
+     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
+                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
+-    trace_nvme_poll_cb(s);
+     return nvme_poll_queues(s);
+ }
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
+ nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
+ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
+ nvme_handle_event(void *s) "s %p"
+-nvme_poll_cb(void *s) "s %p"
++nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
+ nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
+ nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
+ nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
+--
+.28.0

-New patch
+[PULL 08/33] block/nvme: Improve nvme_free_req_queue_wait() trace information
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+What we want to trace is the block driver state and the queue index.
+Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-7-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c       | 2 +-
+ block/trace-events | 2 +-
+files changed, 2 insertions(+), 2 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
+     while (q->free_req_head == -1) {
+         if (qemu_in_coroutine()) {
+-            trace_nvme_free_req_queue_wait(q);
++            trace_nvme_free_req_queue_wait(q->s, q->index);
+             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
+         } else {
+             qemu_mutex_unlock(&q->lock);
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s
+ nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
+ nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
+ nvme_dma_map_flush(void *s) "s %p"
+-nvme_free_req_queue_wait(void *q) "q %p"
++nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
+ nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
+ nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
+ nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
+--
+.28.0

-New patch
+[PULL 09/33] block/nvme: Trace queue pair creation/deletion
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-8-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c       | 3 +++
+ block/trace-events | 2 ++
+files changed, 5 insertions(+)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
+ static void nvme_free_queue_pair(NVMeQueuePair *q)
+ {
++    trace_nvme_free_queue_pair(q->index, q);
+     if (q->completion_bh) {
+         qemu_bh_delete(q->completion_bh);
+     }
+@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
+     if (!q) {
+         return NULL;
+     }
++    trace_nvme_create_queue_pair(idx, q, size, aio_context,
++                                 event_notifier_get_fd(s->irq_notifier));
+     q->prp_list_pages = qemu_try_memalign(s->page_size,
+                                           s->page_size * NVME_NUM_REQS);
+     if (!q->prp_list_pages) {
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" byte
+ nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
+ nvme_dma_map_flush(void *s) "s %p"
+ nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
++nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d"
++nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
+ nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
+ nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
+ nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
+--
+.28.0

-New patch
+[PULL 10/33] block/nvme: Move definitions before structure declarations
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+To be able to use some definitions in structure declarations,
+move them earlier. No logical change.
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-9-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 19 ++++++++++---------
+file changed, 10 insertions(+), 9 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@
+ typedef struct BDRVNVMeState BDRVNVMeState;
++/* Same index is used for queues and IRQs */
++#define INDEX_ADMIN     0
++#define INDEX_IO(n)     (1 + n)
++
++/* This driver shares a single MSIX IRQ for the admin and I/O queues */
++enum {
++    MSIX_SHARED_IRQ_IDX = 0,
++    MSIX_IRQ_COUNT = 1
++};
++
+ typedef struct {
+     int32_t  head, tail;
+     uint8_t  *queue;
+@@ -XXX,XX +XXX,XX @@ typedef struct {
+     QEMUBH      *completion_bh;
+ } NVMeQueuePair;
+-#define INDEX_ADMIN     0
+-#define INDEX_IO(n)     (1 + n)
+-
+-/* This driver shares a single MSIX IRQ for the admin and I/O queues */
+-enum {
+-    MSIX_SHARED_IRQ_IDX = 0,
+-    MSIX_IRQ_COUNT = 1
+-};
+-
+ struct BDRVNVMeState {
+     AioContext *aio_context;
+     QEMUVFIOState *vfio;
+--
+.28.0

-New patch
+[PULL 11/33] block/nvme: Use unsigned integer for queue counter/size
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
 We cannot have a negative queue count/size/index, so use an unsigned type.
 Rename 'nr_queues' as 'queue_count' to match the spec naming.
 Reviewed-by: Eric Auger <eric.auger@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-id: 20201029093306.1063879-10-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
  block/nvme.c       | 38 ++++++++++++++++++--------------------
  block/trace-events | 10 +++++-----
 files changed, 23 insertions(+), 25 deletions(-)
 diff --git a/block/nvme.c b/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nvme.c
 +++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
       * [1..]: io queues.
       */
      NVMeQueuePair **queues;
 -    int nr_queues;
 +    unsigned queue_count;
      size_t page_size;
      /* How many uint32_t elements does each doorbell entry take. */
      size_t doorbell_scale;
@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
  };
  static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
 -                            int nentries, int entry_bytes, Error **errp)
 +                            unsigned nentries, size_t entry_bytes, Error **errp)
  {
      size_t bytes;
      int r;
@@ -XXX,XX +XXX,XX @@ static void nvme_free_req_queue_cb(void *opaque)
  static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                               AioContext *aio_context,
 -                                             int idx, int size,
 +                                             unsigned idx, size_t size,
                                               Error **errp)
  {
      int i, r;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
      bool progress = false;
      int i;
 -    for (i = 0; i < s->nr_queues; i++) {
 +    for (i = 0; i < s->queue_count; i++) {
          if (nvme_poll_queue(s->queues[i])) {
              progress = true;
          }
@@ -XXX,XX +XXX,XX @@ static void nvme_handle_event(EventNotifier *n)
  static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
  {
      BDRVNVMeState *s = bs->opaque;
 -    int n = s->nr_queues;
 +    unsigned n = s->queue_count;
      NVMeQueuePair *q;
      NvmeCmd cmd;
 -    int queue_size = NVME_QUEUE_SIZE;
 +    unsigned queue_size = NVME_QUEUE_SIZE;
      q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                                 n, queue_size, errp);
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
          .cdw11 = cpu_to_le32(0x3),
      };
      if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
 -        error_setg(errp, "Failed to create CQ io queue [%d]", n);
 +        error_setg(errp, "Failed to create CQ io queue [%u]", n);
          goto out_error;
      }
      cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
          .cdw11 = cpu_to_le32(0x1 | (n << 16)),
      };
      if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
 -        error_setg(errp, "Failed to create SQ io queue [%d]", n);
 +        error_setg(errp, "Failed to create SQ io queue [%u]", n);
          goto out_error;
      }
      s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
      s->queues[n] = q;
 -    s->nr_queues++;
 +    s->queue_count++;
      return true;
  out_error:
      nvme_free_queue_pair(q);
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
          ret = -EINVAL;
          goto out;
      }
 -    s->nr_queues = 1;
 +    s->queue_count = 1;
      QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
      regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
                              (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
  static void nvme_close(BlockDriverState *bs)
  {
 -    int i;
      BDRVNVMeState *s = bs->opaque;
 -    for (i = 0; i < s->nr_queues; ++i) {
 +    for (unsigned i = 0; i < s->queue_count; ++i) {
          nvme_free_queue_pair(s->queues[i]);
      }
      g_free(s->queues);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
      };
      trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
 -    assert(s->nr_queues > 1);
 +    assert(s->queue_count > 1);
      req = nvme_get_free_req(ioq);
      assert(req);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
          .ret = -EINPROGRESS,
      };
 -    assert(s->nr_queues > 1);
 +    assert(s->queue_count > 1);
      req = nvme_get_free_req(ioq);
      assert(req);
      nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
      cmd.cdw12 = cpu_to_le32(cdw12);
      trace_nvme_write_zeroes(s, offset, bytes, flags);
 -    assert(s->nr_queues > 1);
 +    assert(s->queue_count > 1);
      req = nvme_get_free_req(ioq);
      assert(req);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
          return -ENOTSUP;
      }
 -    assert(s->nr_queues > 1);
 +    assert(s->queue_count > 1);
      buf = qemu_try_memalign(s->page_size, s->page_size);
      if (!buf) {
@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
  {
      BDRVNVMeState *s = bs->opaque;
 -    for (int i = 0; i < s->nr_queues; i++) {
 +    for (unsigned i = 0; i < s->queue_count; i++) {
          NVMeQueuePair *q = s->queues[i];
          qemu_bh_delete(q->completion_bh);
@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
      aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                             false, nvme_handle_event, nvme_poll_cb);
 -    for (int i = 0; i < s->nr_queues; i++) {
 +    for (unsigned i = 0; i < s->queue_count; i++) {
          NVMeQueuePair *q = s->queues[i];
          q->completion_bh =
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_plug(BlockDriverState *bs)
  static void nvme_aio_unplug(BlockDriverState *bs)
  {
 -    int i;
      BDRVNVMeState *s = bs->opaque;
      assert(s->plugged);
      s->plugged = false;
 -    for (i = INDEX_IO(0); i < s->nr_queues; i++) {
 +    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
          NVMeQueuePair *q = s->queues[i];
          qemu_mutex_lock(&q->lock);
          nvme_kick(q);
 diff --git a/block/trace-events b/block/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/block/trace-events
 +++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
  # nvme.c
  nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
  nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
 -nvme_kick(void *s, int queue) "s %p queue %d"
 +nvme_kick(void *s, unsigned q_index) "s %p q #%u"
  nvme_dma_flush_queue_wait(void *s) "s %p"
  nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 -nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
 -nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
 -nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 -nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 +nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
 +nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
 +nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 +nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
  nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
  nvme_handle_event(void *s) "s %p"
  nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
 --
 .28.0

-[Qemu-devel] [PULL 2/5] sheepdog: reorganize coroutine flow
+[PULL 12/33] block/nvme: Make nvme_identify() return boolean indicating error
-From: Paolo Bonzini <pbonzini@redhat.com>
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Delimit co_recv's lifetime clearly in aio_read_response.
+Just for consistency, following the example documented since
 commit e3fe3988d7 ("error: Document Error API usage rules"),
 return a boolean value indicating whether an error is set or not.
 Directly pass errp as the local_err is not requested in our
 case.
-Do a simple qemu_coroutine_enter in aio_read_response, letting
+Tested-by: Eric Auger <eric.auger@redhat.com>
-sd_co_writev call sd_write_done.
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-id: 20201029093306.1063879-11-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
  block/nvme.c | 12 +++++++-----
 file changed, 7 insertions(+), 5 deletions(-)
-Handle nr_pending in the same way in sd_co_rw_vector,
+diff --git a/block/nvme.c b/block/nvme.c
 sd_write_done and sd_co_flush_to_disk.
 Remove sd_co_rw_vector's return value; just leave with no
 pending requests.
 [Jeff: added missing 'return' back, spotted by Paolo after
        series was applied.]
 Signed-off-by: Jeff Cody <jcody@redhat.com>
 ---
  block/sheepdog.c | 115 ++++++++++++++++++++-----------------------------------
 file changed, 42 insertions(+), 73 deletions(-)
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/block/nvme.c
-+++ b/block/sheepdog.c
++++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ struct SheepdogAIOCB {
+@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
-     enum AIOCBState aiocb_type;
+     return ret;
      Coroutine *coroutine;
 -    void (*aio_done_func)(SheepdogAIOCB *);
 -
      int nr_pending;
      uint32_t min_affect_data_idx;
@@ -XXX,XX +XXX,XX @@ static const char * sd_strerror(int err)
   *
   * 1. In sd_co_rw_vector, we send the I/O requests to the server and
   *    link the requests to the inflight_list in the
 - *    BDRVSheepdogState.  The function exits without waiting for
 + *    BDRVSheepdogState.  The function yields while waiting for
   *    receiving the response.
   *
   * 2. We receive the response in aio_read_response, the fd handler to
 - *    the sheepdog connection.  If metadata update is needed, we send
 - *    the write request to the vdi object in sd_write_done, the write
 - *    completion function.  We switch back to sd_co_readv/writev after
 - *    all the requests belonging to the AIOCB are finished.
 + *    the sheepdog connection.  We switch back to sd_co_readv/sd_writev
 + *    after all the requests belonging to the AIOCB are finished.  If
 + *    needed, sd_co_writev will send another requests for the vdi object.
   */
  static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -XXX,XX +XXX,XX @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
      acb->nr_pending--;
  }
--static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
+-static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
--{
++/* Returns true on success, false on failure. */
--    qemu_coroutine_enter(acb->coroutine);
++static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
--    qemu_aio_unref(acb);
+ {
--}
+     BDRVNVMeState *s = bs->opaque;
--
++    bool ret = false;
- static const AIOCBInfo sd_aiocb_info = {
+     union {
-     .aiocb_size     = sizeof(SheepdogAIOCB),
+         NvmeIdCtrl ctrl;
- };
+         NvmeIdNs ns;
-@@ -XXX,XX +XXX,XX @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
+@@ -XXX,XX +XXX,XX @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-     acb->sector_num = sector_num;
+         goto out;
      acb->nb_sectors = nb_sectors;
 -    acb->aio_done_func = NULL;
      acb->coroutine = qemu_coroutine_self();
      acb->ret = 0;
      acb->nr_pending = 0;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
      switch (acb->aiocb_type) {
      case AIOCB_WRITE_UDATA:
 -        /* this coroutine context is no longer suitable for co_recv
 -         * because we may send data to update vdi objects */
 -        s->co_recv = NULL;
          if (!is_data_obj(aio_req->oid)) {
              break;
          }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
          }
      }
-+    /* No more data for this aio_req (reload_inode below uses its own file
++    ret = true;
-+     * descriptor handler which doesn't use co_recv).
+     s->blkshift = lbaf->ds;
-+    */
+ out:
-+    s->co_recv = NULL;
+     qemu_vfio_dma_unmap(s->vfio, id);
      qemu_vfree(id);
 +
-     switch (rsp.result) {
-     case SD_RES_SUCCESS:
-         break;
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
-             aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
-         }
-         resend_aioreq(s, aio_req);
--        goto out;
-+        return;
-     default:
-         acb->ret = -EIO;
-         error_report("%s", sd_strerror(rsp.result));
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
-          * We've finished all requests which belong to the AIOCB, so
-          * we can switch back to sd_co_readv/writev now.
-          */
--        acb->aio_done_func(acb);
-+        qemu_coroutine_enter(acb->coroutine);
-     }
--out:
--    s->co_recv = NULL;
-+
-     return;
-+
- err:
--    s->co_recv = NULL;
-     reconnect_to_sdog(opaque);
- }
-@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
- /*
-  * This function is called after writing data objects.  If we need to
-  * update metadata, this sends a write request to the vdi object.
-- * Otherwise, this switches back to sd_co_readv/writev.
-  */
- static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
- {
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
-     mx = acb->max_dirty_data_idx;
-     if (mn <= mx) {
-         /* we need to update the vdi object. */
-+        ++acb->nr_pending;
-         offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
-             mn * sizeof(s->inode.data_vdi_id[0]);
-         data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
-                                 data_len, offset, 0, false, 0, offset);
-         QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-         add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
--
--        acb->aio_done_func = sd_finish_aiocb;
--        acb->aiocb_type = AIOCB_WRITE_UDATA;
--        return;
-+        if (--acb->nr_pending) {
-+            qemu_coroutine_yield();
-+        }
-     }
--
--    sd_finish_aiocb(acb);
- }
- /* Delete current working VDI on the snapshot chain */
-@@ -XXX,XX +XXX,XX @@ out:
-  * Returns 1 when we need to wait a response, 0 when there is no sent
-  * request and -errno in error cases.
-  */
--static int coroutine_fn sd_co_rw_vector(void *p)
-+static void coroutine_fn sd_co_rw_vector(void *p)
- {
-     SheepdogAIOCB *acb = p;
-     int ret = 0;
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_rw_vector(void *p)
-         ret = sd_create_branch(s);
-         if (ret) {
-             acb->ret = -EIO;
--            goto out;
-+            return;
-         }
-     }
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_rw_vector(void *p)
-         idx++;
-         done += len;
-     }
--out:
--    if (!--acb->nr_pending) {
--        return acb->ret;
-+    if (--acb->nr_pending) {
-+        qemu_coroutine_yield();
-     }
--    return 1;
- }
- static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
-     }
-     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
--    acb->aio_done_func = sd_write_done;
-     acb->aiocb_type = AIOCB_WRITE_UDATA;
- retry:
-@@ -XXX,XX +XXX,XX @@ retry:
-         goto retry;
-     }
--    ret = sd_co_rw_vector(acb);
--    if (ret <= 0) {
--        QLIST_REMOVE(acb, aiocb_siblings);
--        qemu_co_queue_restart_all(&s->overlapping_queue);
--        qemu_aio_unref(acb);
--        return ret;
--    }
--
--    qemu_coroutine_yield();
-+    sd_co_rw_vector(acb);
-+    sd_write_done(acb);
-     QLIST_REMOVE(acb, aiocb_siblings);
-     qemu_co_queue_restart_all(&s->overlapping_queue);
--
--    return acb->ret;
-+    ret = acb->ret;
-+    qemu_aio_unref(acb);
 +    return ret;
  }
- static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+ static bool nvme_poll_queue(NVMeQueuePair *q)
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+     uint64_t cap;
-     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
+     uint64_t timeout_ms;
-     acb->aiocb_type = AIOCB_READ_UDATA;
+     uint64_t deadline, now;
--    acb->aio_done_func = sd_finish_aiocb;
+-    Error *local_err = NULL;
+     volatile NvmeBar *regs = NULL;
- retry:
-     if (check_overlapping_aiocb(s, acb)) {
+     qemu_co_mutex_init(&s->dma_map_lock);
-@@ -XXX,XX +XXX,XX @@ retry:
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-         goto retry;
+                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                             false, nvme_handle_event, nvme_poll_cb);
 -    nvme_identify(bs, namespace, &local_err);
 -    if (local_err) {
 -        error_propagate(errp, local_err);
 +    if (!nvme_identify(bs, namespace, errp)) {
          ret = -EIO;
          goto out;
      }
--    ret = sd_co_rw_vector(acb);
--    if (ret <= 0) {
--        QLIST_REMOVE(acb, aiocb_siblings);
--        qemu_co_queue_restart_all(&s->overlapping_queue);
--        qemu_aio_unref(acb);
--        return ret;
--    }
--
--    qemu_coroutine_yield();
-+    sd_co_rw_vector(acb);
-     QLIST_REMOVE(acb, aiocb_siblings);
-     qemu_co_queue_restart_all(&s->overlapping_queue);
--    return acb->ret;
-+    ret = acb->ret;
-+    qemu_aio_unref(acb);
-+    return ret;
- }
- static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
- {
-     BDRVSheepdogState *s = bs->opaque;
-     SheepdogAIOCB *acb;
-+    int ret;
-     AIOReq *aio_req;
-     if (s->cache_flags != SD_FLAG_CMD_CACHE) {
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
-     acb = sd_aio_setup(bs, NULL, 0, 0);
-     acb->aiocb_type = AIOCB_FLUSH_CACHE;
--    acb->aio_done_func = sd_finish_aiocb;
-+    acb->nr_pending++;
-     aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
-, 0, 0, false, 0, 0);
-     QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-     add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type);
--    qemu_coroutine_yield();
--    return acb->ret;
-+    if (--acb->nr_pending) {
-+        qemu_coroutine_yield();
-+    }
-+    ret = acb->ret;
-+    qemu_aio_unref(acb);
-+    return ret;
- }
- static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
-     acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
-                        count >> BDRV_SECTOR_BITS);
-     acb->aiocb_type = AIOCB_DISCARD_OBJ;
--    acb->aio_done_func = sd_finish_aiocb;
- retry:
-     if (check_overlapping_aiocb(s, acb)) {
-@@ -XXX,XX +XXX,XX @@ retry:
-         goto retry;
-     }
--    ret = sd_co_rw_vector(acb);
--    if (ret <= 0) {
--        QLIST_REMOVE(acb, aiocb_siblings);
--        qemu_co_queue_restart_all(&s->overlapping_queue);
--        qemu_aio_unref(acb);
--        return ret;
--    }
--
--    qemu_coroutine_yield();
-+    sd_co_rw_vector(acb);
-     QLIST_REMOVE(acb, aiocb_siblings);
-     qemu_co_queue_restart_all(&s->overlapping_queue);
--
--    return acb->ret;
-+    ret = acb->ret;
-+    qemu_aio_unref(acb);
-+    return ret;
- }
- static coroutine_fn int64_t
 --
-.9.3
+.28.0

-[Qemu-devel] [PULL 3/5] sheepdog: do not use BlockAIOCB
+[PULL 13/33] block/nvme: Make nvme_init_queue() return boolean indicating error
-From: Paolo Bonzini <pbonzini@redhat.com>
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Sheepdog's AIOCB are completely internal entities for a group of
+Just for consistency, following the example documented since
-requests and do not need dynamic allocation.
+commit e3fe3988d7 ("error: Document Error API usage rules"),
 return a boolean value indicating whether an error is set or not.
 Directly pass errp as the local_err is not requested in our
 case. This simplifies nvme_create_queue_pair() a bit.
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20161129113245.32724-4-pbonzini@redhat.com
+Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Jeff Cody <jcody@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-id: 20201029093306.1063879-12-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
- block/sheepdog.c | 99 ++++++++++++++++++++++----------------------------------
+ block/nvme.c | 16 +++++++---------
-file changed, 39 insertions(+), 60 deletions(-)
+file changed, 7 insertions(+), 9 deletions(-)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/block/nvme.c b/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/block/nvme.c
-+++ b/block/sheepdog.c
++++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static inline size_t count_data_objs(const struct SheepdogInode *inode)
+@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
-     } while (0)
+     },
  typedef struct SheepdogAIOCB SheepdogAIOCB;
 +typedef struct BDRVSheepdogState BDRVSheepdogState;
  typedef struct AIOReq {
      SheepdogAIOCB *aiocb;
@@ -XXX,XX +XXX,XX @@ enum AIOCBState {
         || y->max_affect_data_idx < x->min_affect_data_idx))
  struct SheepdogAIOCB {
 -    BlockAIOCB common;
 +    BDRVSheepdogState *s;
      QEMUIOVector *qiov;
@@ -XXX,XX +XXX,XX @@ struct SheepdogAIOCB {
      QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
  };
--typedef struct BDRVSheepdogState {
+-static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-+struct BDRVSheepdogState {
++/* Returns true on success, false on failure. */
-     BlockDriverState *bs;
++static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-     AioContext *aio_context;
+                             unsigned nentries, size_t entry_bytes, Error **errp)
+ {
-@@ -XXX,XX +XXX,XX @@ typedef struct BDRVSheepdogState {
+     size_t bytes;
+@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-     CoQueue overlapping_queue;
+     q->queue = qemu_try_memalign(s->page_size, bytes);
-     QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
+     if (!q->queue) {
--} BDRVSheepdogState;
+         error_setg(errp, "Cannot allocate queue");
-+};
+-        return;
++        return false;
- typedef struct BDRVSheepdogReopenState {
+     }
-     int fd;
+     memset(q->queue, 0, bytes);
-@@ -XXX,XX +XXX,XX @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
+     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
-     acb->nr_pending--;
+     if (r) {
          error_setg(errp, "Cannot map queue");
 +        return false;
      }
 +    return true;
  }
--static const AIOCBInfo sd_aiocb_info = {
+ static void nvme_free_queue_pair(NVMeQueuePair *q)
--    .aiocb_size     = sizeof(SheepdogAIOCB),
+@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
--};
+                                              Error **errp)
 -
 -static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
 -                                   int64_t sector_num, int nb_sectors)
 +static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
 +                         QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
 +                         int type)
  {
--    SheepdogAIOCB *acb;
+     int i, r;
-     uint32_t object_size;
+-    Error *local_err = NULL;
--    BDRVSheepdogState *s = bs->opaque;
+     NVMeQueuePair *q;
+     uint64_t prp_list_iova;
-     object_size = (UINT32_C(1) << s->inode.block_size_shift);
+@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
--    acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
+         req->prp_list_iova = prp_list_iova + i * s->page_size;
 +    acb->s = s;
      acb->qiov = qiov;
@@ -XXX,XX +XXX,XX @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
      acb->min_dirty_data_idx = UINT32_MAX;
      acb->max_dirty_data_idx = 0;
 -
 -    return acb;
 +    acb->aiocb_type = type;
  }
  /* Return -EIO in case of error, file descriptor on success */
@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
   */
  static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
  {
 -    BDRVSheepdogState *s = acb->common.bs->opaque;
 +    BDRVSheepdogState *s = acb->s;
      struct iovec iov;
      AIOReq *aio_req;
      uint32_t offset, data_len, mn, mx;
@@ -XXX,XX +XXX,XX @@ out:
   * Returns 1 when we need to wait a response, 0 when there is no sent
   * request and -errno in error cases.
   */
 -static void coroutine_fn sd_co_rw_vector(void *p)
 +static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
  {
 -    SheepdogAIOCB *acb = p;
      int ret = 0;
      unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
      unsigned long idx;
      uint32_t object_size;
      uint64_t oid;
      uint64_t offset;
 -    BDRVSheepdogState *s = acb->common.bs->opaque;
 +    BDRVSheepdogState *s = acb->s;
      SheepdogInode *inode = &s->inode;
      AIOReq *aio_req;
@@ -XXX,XX +XXX,XX @@ static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
  static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
  {
 -    SheepdogAIOCB *acb;
 +    SheepdogAIOCB acb;
      int ret;
      int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
      BDRVSheepdogState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
          }
      }
--    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
+-    nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
--    acb->aiocb_type = AIOCB_WRITE_UDATA;
+-    if (local_err) {
-+    sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
+-        error_propagate(errp, local_err);
++    if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
- retry:
+         goto fail;
 -    if (check_overlapping_aiocb(s, acb)) {
 +    if (check_overlapping_aiocb(s, &acb)) {
          qemu_co_queue_wait(&s->overlapping_queue);
          goto retry;
      }
+     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
--    sd_co_rw_vector(acb);
--    sd_write_done(acb);
+-    nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
-+    sd_co_rw_vector(&acb);
+-    if (local_err) {
-+    sd_write_done(&acb);
+-        error_propagate(errp, local_err);
++    if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
--    QLIST_REMOVE(acb, aiocb_siblings);
+         goto fail;
 +    QLIST_REMOVE(&acb, aiocb_siblings);
      qemu_co_queue_restart_all(&s->overlapping_queue);
 -    ret = acb->ret;
 -    qemu_aio_unref(acb);
 -    return ret;
 +    return acb.ret;
  }
  static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
  {
 -    SheepdogAIOCB *acb;
 -    int ret;
 +    SheepdogAIOCB acb;
      BDRVSheepdogState *s = bs->opaque;
 -    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
 -    acb->aiocb_type = AIOCB_READ_UDATA;
 +    sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
  retry:
 -    if (check_overlapping_aiocb(s, acb)) {
 +    if (check_overlapping_aiocb(s, &acb)) {
          qemu_co_queue_wait(&s->overlapping_queue);
          goto retry;
      }
+     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
 -    sd_co_rw_vector(acb);
 +    sd_co_rw_vector(&acb);
 -    QLIST_REMOVE(acb, aiocb_siblings);
 +    QLIST_REMOVE(&acb, aiocb_siblings);
      qemu_co_queue_restart_all(&s->overlapping_queue);
 -    ret = acb->ret;
 -    qemu_aio_unref(acb);
 -    return ret;
 +    return acb.ret;
  }
  static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
  {
      BDRVSheepdogState *s = bs->opaque;
 -    SheepdogAIOCB *acb;
 -    int ret;
 +    SheepdogAIOCB acb;
      AIOReq *aio_req;
      if (s->cache_flags != SD_FLAG_CMD_CACHE) {
          return 0;
      }
 -    acb = sd_aio_setup(bs, NULL, 0, 0);
 -    acb->aiocb_type = AIOCB_FLUSH_CACHE;
 +    sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
 -    acb->nr_pending++;
 -    aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
 +    acb.nr_pending++;
 +    aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
 , 0, 0, false, 0, 0);
      QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 -    add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type);
 +    add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
 -    if (--acb->nr_pending) {
 +    if (--acb.nr_pending) {
          qemu_coroutine_yield();
      }
 -    ret = acb->ret;
 -    qemu_aio_unref(acb);
 -    return ret;
 +    return acb.ret;
  }
  static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -XXX,XX +XXX,XX @@ static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
  static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                        int count)
  {
 -    SheepdogAIOCB *acb;
 +    SheepdogAIOCB acb;
      BDRVSheepdogState *s = bs->opaque;
 -    int ret;
      QEMUIOVector discard_iov;
      struct iovec iov;
      uint32_t zero = 0;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
      if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {
          return -ENOTSUP;
      }
 -    acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
 -                       count >> BDRV_SECTOR_BITS);
 -    acb->aiocb_type = AIOCB_DISCARD_OBJ;
 +    sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
 +                 count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
  retry:
 -    if (check_overlapping_aiocb(s, acb)) {
 +    if (check_overlapping_aiocb(s, &acb)) {
          qemu_co_queue_wait(&s->overlapping_queue);
          goto retry;
      }
 -    sd_co_rw_vector(acb);
 +    sd_co_rw_vector(&acb);
 -    QLIST_REMOVE(acb, aiocb_siblings);
 +    QLIST_REMOVE(&acb, aiocb_siblings);
      qemu_co_queue_restart_all(&s->overlapping_queue);
 -    ret = acb->ret;
 -    qemu_aio_unref(acb);
 -    return ret;
 +    return acb.ret;
  }
  static coroutine_fn int64_t
 --
-.9.3
+.28.0

-New patch
+[PULL 14/33] block/nvme: Introduce Completion Queue definitions
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Rename Submission Queue flags with 'Sq' to differentiate
+submission queue flags from command queue flags, and introduce
+Completion Queue flag definitions.
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20201029093306.1063879-13-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ include/block/nvme.h | 18 ++++++++++++------
+file changed, 12 insertions(+), 6 deletions(-)
+diff --git a/include/block/nvme.h b/include/block/nvme.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/nvme.h
++++ b/include/block/nvme.h
+@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateCq {
+ #define NVME_CQ_FLAGS_PC(cq_flags)  (cq_flags & 0x1)
+ #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
++enum NvmeFlagsCq {
++    NVME_CQ_PC          = 1,
++    NVME_CQ_IEN         = 2,
++};
++
+ typedef struct QEMU_PACKED NvmeCreateSq {
+     uint8_t     opcode;
+     uint8_t     flags;
+@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateSq {
+ #define NVME_SQ_FLAGS_PC(sq_flags)      (sq_flags & 0x1)
+ #define NVME_SQ_FLAGS_QPRIO(sq_flags)   ((sq_flags >> 1) & 0x3)
+-enum NvmeQueueFlags {
+-    NVME_Q_PC           = 1,
+-    NVME_Q_PRIO_URGENT  = 0,
+-    NVME_Q_PRIO_HIGH    = 1,
+-    NVME_Q_PRIO_NORMAL  = 2,
+-    NVME_Q_PRIO_LOW     = 3,
++enum NvmeFlagsSq {
++    NVME_SQ_PC          = 1,
++
++    NVME_SQ_PRIO_URGENT = 0,
++    NVME_SQ_PRIO_HIGH   = 1,
++    NVME_SQ_PRIO_NORMAL = 2,
++    NVME_SQ_PRIO_LOW    = 3,
+ };
+ typedef struct QEMU_PACKED NvmeIdentify {
+--
+.28.0

-New patch
+[PULL 15/33] block/nvme: Use definitions instead of magic values in add_io_queue()
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Replace magic values with definitions, and simplify since the
+number of queues will never reach 64K.
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-14-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 9 +++++----
+file changed, 5 insertions(+), 4 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
+     NvmeCmd cmd;
+     unsigned queue_size = NVME_QUEUE_SIZE;
++    assert(n <= UINT16_MAX);
+     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
+                                n, queue_size, errp);
+     if (!q) {
+@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
+     cmd = (NvmeCmd) {
+         .opcode = NVME_ADM_CMD_CREATE_CQ,
+         .dptr.prp1 = cpu_to_le64(q->cq.iova),
+-        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
+-        .cdw11 = cpu_to_le32(0x3),
++        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
++        .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
+     };
+     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+         error_setg(errp, "Failed to create CQ io queue [%u]", n);
+@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
+     cmd = (NvmeCmd) {
+         .opcode = NVME_ADM_CMD_CREATE_SQ,
+         .dptr.prp1 = cpu_to_le64(q->sq.iova),
+-        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
+-        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
++        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
++        .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
+     };
+     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+         error_setg(errp, "Failed to create SQ io queue [%u]", n);
+--
+.28.0

-New patch
+[PULL 16/33] block/nvme: Correctly initialize Admin Queue Attributes
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+From the specification chapter 3.1.8 "AQA - Admin Queue Attributes"
+the Admin Submission Queue Size field is a 0’s based value:
+  Admin Submission Queue Size (ASQS):
+    Defines the size of the Admin Submission Queue in entries.
+    Enabling a controller while this field is cleared to 00h
+    produces undefined results. The minimum size of the Admin
+    Submission Queue is two entries. The maximum size of the
+    Admin Submission Queue is 4096 entries.
+    This is a 0’s based value.
+This bug has never been hit because the device initialization
+uses a single command synchronously :)
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-15-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 6 +++---
+file changed, 3 insertions(+), 3 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+         goto out;
+     }
+     s->queue_count = 1;
+-    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
+-    regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
+-                            (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
++    QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
++    regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
++                            ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
+     regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
+     regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
+--
+.28.0

-New patch
+[PULL 17/33] block/nvme: Simplify ADMIN queue access
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+We don't need to dereference from BDRVNVMeState each time.
+Use an NVMeQueuePair pointer to the admin queue.
+The nvme_init() becomes easier to review, matching the style
+of nvme_add_io_queue().
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-16-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 12 ++++++------
+file changed, 6 insertions(+), 6 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+                      Error **errp)
+ {
+     BDRVNVMeState *s = bs->opaque;
++    NVMeQueuePair *q;
+     AioContext *aio_context = bdrv_get_aio_context(bs);
+     int ret;
+     uint64_t cap;
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+     /* Set up admin queue. */
+     s->queues = g_new(NVMeQueuePair *, 1);
+-    s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
+-                                                          NVME_QUEUE_SIZE,
+-                                                          errp);
+-    if (!s->queues[INDEX_ADMIN]) {
++    q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
++    if (!q) {
+         ret = -EINVAL;
+         goto out;
+     }
++    s->queues[INDEX_ADMIN] = q;
+     s->queue_count = 1;
+     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
+     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
+                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
+-    regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
+-    regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
++    regs->asq = cpu_to_le64(q->sq.iova);
++    regs->acq = cpu_to_le64(q->cq.iova);
+     /* After setting up all control registers we can enable device now. */
+     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
+--
+.28.0

-[Qemu-devel] [PULL 5/5] sheepdog: reorganize check for overlapping requests
+[PULL 18/33] block/nvme: Simplify nvme_cmd_sync()
-From: Paolo Bonzini <pbonzini@redhat.com>
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Wrap the code that was copied repeatedly in the two functions,
+As all commands use the ADMIN queue, it is pointless to pass
-sd_aio_setup and sd_aio_complete.
+it as argument each time. Remove the argument, and rename the
 function as nvme_admin_cmd_sync() to make this new behavior
 clearer.
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Message-id: 20161129113245.32724-6-pbonzini@redhat.com
+Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Jeff Cody <jcody@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-id: 20201029093306.1063879-17-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
- block/sheepdog.c | 66 ++++++++++++++++++++++++++------------------------------
+ block/nvme.c | 19 ++++++++++---------
-file changed, 30 insertions(+), 36 deletions(-)
+file changed, 10 insertions(+), 9 deletions(-)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/block/nvme.c b/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/block/nvme.c
-+++ b/block/sheepdog.c
++++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
+@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
-     return aio_req;
+     qemu_mutex_unlock(&q->lock);
  }
-+static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
+-static void nvme_cmd_sync_cb(void *opaque, int ret)
-+{
++static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
-+    SheepdogAIOCB *cb;
+ {
-+
+     int *pret = opaque;
-+retry:
+     *pret = ret;
-+    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+     aio_wait_kick();
 +        if (AIOCBOverlapping(acb, cb)) {
 +            qemu_co_queue_wait(&s->overlapping_queue);
 +            goto retry;
 +        }
 +    }
 +}
 +
  static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
                           QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
                           int type)
@@ -XXX,XX +XXX,XX @@ static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
      acb->min_dirty_data_idx = UINT32_MAX;
      acb->max_dirty_data_idx = 0;
      acb->aiocb_type = type;
 +
 +    if (type == AIOCB_FLUSH_CACHE) {
 +        return;
 +    }
 +
 +    wait_for_overlapping_aiocb(s, acb);
 +    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
  }
- /* Return -EIO in case of error, file descriptor on success */
+-static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
+-                         NvmeCmd *cmd)
 +static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
  {
 +    BDRVNVMeState *s = bs->opaque;
 +    NVMeQueuePair *q = s->queues[INDEX_ADMIN];
      AioContext *aio_context = bdrv_get_aio_context(bs);
      NVMeRequest *req;
      int ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
      if (!req) {
          return -EBUSY;
      }
- }
+-    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
++    nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
--static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
-+static void sd_aio_complete(SheepdogAIOCB *acb)
+     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
- {
+     return ret;
--    SheepdogAIOCB *cb;
+@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
--
--    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+     memset(id, 0, sizeof(*id));
--        if (AIOCBOverlapping(aiocb, cb)) {
+     cmd.dptr.prp1 = cpu_to_le64(iova);
--            return true;
+-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
--        }
++    if (nvme_admin_cmd_sync(bs, &cmd)) {
-+    if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
+         error_setg(errp, "Failed to identify controller");
-+        return;
+         goto out;
      }
+@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
--    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
+     memset(id, 0, sizeof(*id));
--    return false;
+     cmd.cdw10 = 0;
-+    QLIST_REMOVE(acb, aiocb_siblings);
+     cmd.nsid = cpu_to_le32(namespace);
-+    qemu_co_queue_restart_all(&acb->s->overlapping_queue);
+-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
- }
++    if (nvme_admin_cmd_sync(bs, &cmd)) {
+         error_setg(errp, "Failed to identify namespace");
- static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+         goto out;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
      }
+@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-     sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
+         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
--
+         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
--retry:
+     };
--    if (check_overlapping_aiocb(s, &acb)) {
+-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
--        qemu_co_queue_wait(&s->overlapping_queue);
++    if (nvme_admin_cmd_sync(bs, &cmd)) {
--        goto retry;
+         error_setg(errp, "Failed to create CQ io queue [%u]", n);
--    }
+         goto out_error;
 -
      sd_co_rw_vector(&acb);
      sd_write_done(&acb);
 +    sd_aio_complete(&acb);
 -    QLIST_REMOVE(&acb, aiocb_siblings);
 -    qemu_co_queue_restart_all(&s->overlapping_queue);
      return acb.ret;
  }
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
      BDRVSheepdogState *s = bs->opaque;
      sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
 -
 -retry:
 -    if (check_overlapping_aiocb(s, &acb)) {
 -        qemu_co_queue_wait(&s->overlapping_queue);
 -        goto retry;
 -    }
 -
      sd_co_rw_vector(&acb);
 +    sd_aio_complete(&acb);
 -    QLIST_REMOVE(&acb, aiocb_siblings);
 -    qemu_co_queue_restart_all(&s->overlapping_queue);
      return acb.ret;
  }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
      if (--acb.nr_pending) {
          qemu_coroutine_yield();
      }
-+
+@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-+    sd_aio_complete(&acb);
+         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
-     return acb.ret;
+         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
- }
+     };
+-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
++    if (nvme_admin_cmd_sync(bs, &cmd)) {
          error_setg(errp, "Failed to create SQ io queue [%u]", n);
          goto out_error;
      }
-     sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
+@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
-                  count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
+         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
--
+     };
--retry:
--    if (check_overlapping_aiocb(s, &acb)) {
+-    ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
--        qemu_co_queue_wait(&s->overlapping_queue);
++    ret = nvme_admin_cmd_sync(bs, &cmd);
--        goto retry;
+     if (ret) {
--    }
+         error_setg(errp, "Failed to configure NVMe write cache");
--
+     }
      sd_co_rw_vector(&acb);
 +    sd_aio_complete(&acb);
 -    QLIST_REMOVE(&acb, aiocb_siblings);
 -    qemu_co_queue_restart_all(&s->overlapping_queue);
      return acb.ret;
  }
 --
-.9.3
+.28.0

-New patch
+[PULL 19/33] block/nvme: Set request_alignment at initialization
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Commit bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
+sets the request_alignment in nvme_refresh_limits().
+For consistency, also set it during initialization.
+Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-18-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 1 +
+file changed, 1 insertion(+)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+     s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
+     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
+     bs->bl.opt_mem_alignment = s->page_size;
++    bs->bl.request_alignment = s->page_size;
+     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
+     /* Reset device to get a clean state. */
+--
+.28.0

-New patch
+[PULL 20/33] block/nvme: Correct minimum device page size
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+While trying to simplify the code using a macro, we forgot
+the 12-bit shift... Correct that.
+Fixes: fad1eb68862 ("block/nvme: Use register definitions from 'block/nvme.h'")
+Reported-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-19-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+         goto out;
+     }
+-    s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
++    s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
+     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
+     bs->bl.opt_mem_alignment = s->page_size;
+     bs->bl.request_alignment = s->page_size;
+--
+.28.0

-[Qemu-devel] [PULL 4/5] sheepdog: simplify inflight_aio_head management
+[PULL 21/33] block/nvme: Change size and alignment of IDENTIFY response buffer
-From: Paolo Bonzini <pbonzini@redhat.com>
+From: Eric Auger <eric.auger@redhat.com>
-Add to the list in add_aio_request and, indirectly, resend_aioreq.  Inline
+In preparation for 64kB host page support, let's change the size
-free_aio_req in the caller, it does not simply undo alloc_aio_req's job.
+and alignment of the IDENTIFY command response buffer so that
 the VFIO DMA MAP succeeds. We align on the host page size.
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
-Message-id: 20161129113245.32724-5-pbonzini@redhat.com
+Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Signed-off-by: Jeff Cody <jcody@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-id: 20201029093306.1063879-20-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
- block/sheepdog.c | 23 ++++++-----------------
+ block/nvme.c | 9 +++++----
-file changed, 6 insertions(+), 17 deletions(-)
+file changed, 5 insertions(+), 4 deletions(-)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/block/nvme.c b/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/block/nvme.c
-+++ b/block/sheepdog.c
++++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
+@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-     return aio_req;
+         .opcode = NVME_ADM_CMD_IDENTIFY,
- }
+         .cdw10 = cpu_to_le32(0x1),
+     };
--static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
++    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
--{
--    SheepdogAIOCB *acb = aio_req->aiocb;
+-    id = qemu_try_memalign(s->page_size, sizeof(*id));
--
++    id = qemu_try_memalign(qemu_real_host_page_size, id_size);
--    QLIST_REMOVE(aio_req, aio_siblings);
+     if (!id) {
--    g_free(aio_req);
+         error_setg(errp, "Cannot allocate buffer for identify response");
--
+         goto out;
 -    acb->nr_pending--;
 -}
 -
  static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
                           QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
                           int type)
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
      while (!QLIST_EMPTY(&s->failed_aio_head)) {
          aio_req = QLIST_FIRST(&s->failed_aio_head);
          QLIST_REMOVE(aio_req, aio_siblings);
 -        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
          resend_aioreq(s, aio_req);
      }
- }
+-    r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
++    r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova);
-     */
+     if (r) {
-     s->co_recv = NULL;
+         error_setg(errp, "Cannot map buffer for DMA");
+         goto out;
 +    QLIST_REMOVE(aio_req, aio_siblings);
      switch (rsp.result) {
      case SD_RES_SUCCESS:
          break;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
          break;
      }
--    free_aio_req(s, aio_req);
+-    memset(id, 0, sizeof(*id));
--    if (!acb->nr_pending) {
++    memset(id, 0, id_size);
-+    g_free(aio_req);
+     cmd.dptr.prp1 = cpu_to_le64(iova);
-+
+     if (nvme_admin_cmd_sync(bs, &cmd)) {
-+    if (!--acb->nr_pending) {
+         error_setg(errp, "Failed to identify controller");
-         /*
+@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-          * We've finished all requests which belong to the AIOCB, so
+     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
-          * we can switch back to sd_co_readv/writev now.
+     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-     uint64_t old_oid = aio_req->base_oid;
+-    memset(id, 0, sizeof(*id));
-     bool create = aio_req->create;
++    memset(id, 0, id_size);
+     cmd.cdw10 = 0;
-+    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+     cmd.nsid = cpu_to_le32(namespace);
-+
+     if (nvme_admin_cmd_sync(bs, &cmd)) {
      if (!nr_copies) {
          error_report("bug");
      }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
          iov.iov_len = sizeof(s->inode);
          aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
                                  data_len, offset, 0, false, 0, offset);
 -        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
          add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
          if (--acb->nr_pending) {
              qemu_coroutine_yield();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
                                  old_oid,
                                  acb->aiocb_type == AIOCB_DISCARD_OBJ ?
 : done);
 -        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 -
          add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                          acb->aiocb_type);
      done:
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
      acb.nr_pending++;
      aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
 , 0, 0, false, 0, 0);
 -    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
      add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
      if (--acb.nr_pending) {
 --
-.9.3
+.28.0

-New patch
+[PULL 22/33] block/nvme: Change size and alignment of queue
+From: Eric Auger <eric.auger@redhat.com>
+In preparation for 64kB host page support, let's change the size
+and alignment of the queue so that the VFIO DMA MAP succeeds.
+We align on the host page size.
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-21-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 4 ++--
+file changed, 2 insertions(+), 2 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
+     size_t bytes;
+     int r;
+-    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
++    bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
+     q->head = q->tail = 0;
+-    q->queue = qemu_try_memalign(s->page_size, bytes);
++    q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
+     if (!q->queue) {
+         error_setg(errp, "Cannot allocate queue");
+         return false;
+--
+.28.0

-New patch
+[PULL 23/33] block/nvme: Change size and alignment of prp_list_pages
+From: Eric Auger <eric.auger@redhat.com>
+In preparation for 64kB host page support, let's change the size
+and alignment of the prp_list_pages so that the VFIO DMA MAP succeeds
+with 64kB host page size. We align on the host page size.
+Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-22-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 11 ++++++-----
+file changed, 6 insertions(+), 5 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
+     int i, r;
+     NVMeQueuePair *q;
+     uint64_t prp_list_iova;
++    size_t bytes;
+     q = g_try_new0(NVMeQueuePair, 1);
+     if (!q) {
+@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
+     }
+     trace_nvme_create_queue_pair(idx, q, size, aio_context,
+                                  event_notifier_get_fd(s->irq_notifier));
+-    q->prp_list_pages = qemu_try_memalign(s->page_size,
+-                                          s->page_size * NVME_NUM_REQS);
++    bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
++                          qemu_real_host_page_size);
++    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
+     if (!q->prp_list_pages) {
+         goto fail;
+     }
+-    memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
++    memset(q->prp_list_pages, 0, bytes);
+     qemu_mutex_init(&q->lock);
+     q->s = s;
+     q->index = idx;
+     qemu_co_queue_init(&q->free_req_queue);
+     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
+-    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
+-                          s->page_size * NVME_NUM_REQS,
++    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
+                           false, &prp_list_iova);
+     if (r) {
+         goto fail;
+--
+.28.0

-New patch
+[PULL 24/33] block/nvme: Align iov's va and size on host page size
+From: Eric Auger <eric.auger@redhat.com>
+Make sure iov's va and size are properly aligned on the
+host page size.
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-23-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 14 ++++++++------
+file changed, 8 insertions(+), 6 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
+     for (i = 0; i < qiov->niov; ++i) {
+         bool retry = true;
+         uint64_t iova;
++        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
++                                   qemu_real_host_page_size);
+ try_map:
+         r = qemu_vfio_dma_map(s->vfio,
+                               qiov->iov[i].iov_base,
+-                              qiov->iov[i].iov_len,
+-                              true, &iova);
++                              len, true, &iova);
+         if (r == -ENOMEM && retry) {
+             retry = false;
+             trace_nvme_dma_flush_queue_wait(s);
+@@ -XXX,XX +XXX,XX @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
+     BDRVNVMeState *s = bs->opaque;
+     for (i = 0; i < qiov->niov; ++i) {
+-        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
+-            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
++        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
++                                 qemu_real_host_page_size) ||
++            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
+             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
+                                       qiov->iov[i].iov_len, s->page_size);
+             return false;
+@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+     int r;
+     uint8_t *buf = NULL;
+     QEMUIOVector local_qiov;
+-
++    size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
+     assert(QEMU_IS_ALIGNED(offset, s->page_size));
+     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
+     assert(bytes <= s->max_transfer);
+@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+     }
+     s->stats.unaligned_accesses++;
+     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
+-    buf = qemu_try_memalign(s->page_size, bytes);
++    buf = qemu_try_memalign(qemu_real_host_page_size, len);
+     if (!buf) {
+         return -ENOMEM;
+--
+.28.0

-New patch
+[PULL 25/33] block/nvme: Fix use of write-only doorbells page on Aarch64 arch
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+qemu_vfio_pci_map_bar() calls mmap(), and mmap(2) states:
+  'offset' must be a multiple of the page size as returned
+   by sysconf(_SC_PAGE_SIZE).
+In commit f68453237b9 we started to use an offset of 4K which
+broke this contract on Aarch64 arch.
+Fix by mapping at offset 0, and accessing doorbells at offset=4K.
+Fixes: f68453237b9 ("block/nvme: Map doorbells pages write-only")
+Reported-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201029093306.1063879-24-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 11 +++++++----
+file changed, 7 insertions(+), 4 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ typedef struct {
+ struct BDRVNVMeState {
+     AioContext *aio_context;
+     QEMUVFIOState *vfio;
++    void *bar0_wo_map;
+     /* Memory mapped registers */
+     volatile struct {
+         uint32_t sq_tail;
+@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
+         }
+     }
+-    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
+-                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
++    s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
++                                           sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
++                                           PROT_WRITE, errp);
++    s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
+     if (!s->doorbells) {
+         ret = -EINVAL;
+         goto out;
+@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
+                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
+                            false, NULL, NULL);
+     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
+-    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
+-                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
++    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
++                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
+     qemu_vfio_close(s->vfio);
+     g_free(s->device);
+--
+.28.0

-New patch
+[PULL 26/33] block/nvme: Fix nvme_submit_command() on big-endian host
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+The Completion Queue Command Identifier is a 16-bit value,
+so nvme_submit_command() is unlikely to work on big-endian
+hosts, as the relevant bits are truncated.
+Fix by using the correct byte-swap function.
+Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
+Reported-by: Keith Busch <kbusch@kernel.org>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20201029093306.1063879-25-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ block/nvme.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
+     assert(!req->cb);
+     req->cb = cb;
+     req->opaque = opaque;
+-    cmd->cid = cpu_to_le32(req->cid);
++    cmd->cid = cpu_to_le16(req->cid);
+     trace_nvme_submit_command(q->s, q->index, req->cid);
+     nvme_trace_command(cmd);
+--
+.28.0

-New patch
+[PULL 27/33] util/vfio-helpers: Improve reporting unsupported IOMMU type
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Replace the confusing "VFIO IOMMU check failed" error message with
+the explicit "VFIO IOMMU Type1 is not supported" one.
+Example on POWER:
+ $ qemu-system-ppc64 -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw
+ qemu-system-ppc64: -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw: VFIO IOMMU Type1 is not supported
+Suggested-by: Alex Williamson <alex.williamson@redhat.com>
+Reviewed-by: Fam Zheng <fam@euphon.net>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201103020733.2303148-2-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ util/vfio-helpers.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vfio-helpers.c
++++ b/util/vfio-helpers.c
+@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+     }
+     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+-        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
++        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
+         ret = -EINVAL;
+         goto fail_container;
+     }
+--
+.28.0

-New patch
+[PULL 28/33] util/vfio-helpers: Trace PCI I/O config accesses
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+We sometimes get kernel panics with some devices on Aarch64
+hosts. Alex Williamson suggests it might be a broken PCIe
+root complex. Add trace events to record the latest I/O
+access before crashing. Just in case, assert that our accesses
+are aligned.
+Reviewed-by: Fam Zheng <fam@euphon.net>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201103020733.2303148-3-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ util/vfio-helpers.c | 8 ++++++++
+ util/trace-events   | 2 ++
+files changed, 10 insertions(+)
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vfio-helpers.c
++++ b/util/vfio-helpers.c
+@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
+ {
+     int ret;
++    trace_qemu_vfio_pci_read_config(buf, ofs, size,
++                                    s->config_region_info.offset,
++                                    s->config_region_info.size);
++    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
+     do {
+         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
+     } while (ret == -1 && errno == EINTR);
+@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
+ {
+     int ret;
++    trace_qemu_vfio_pci_write_config(buf, ofs, size,
++                                     s->config_region_info.offset,
++                                     s->config_region_info.size);
++    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
+     do {
+         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
+     } while (ret == -1 && errno == EINTR);
+diff --git a/util/trace-events b/util/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/util/trace-events
++++ b/util/trace-events
+@@ -XXX,XX +XXX,XX @@ qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova
+ qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
+ qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
+ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
++qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
++qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+--
+.28.0

-New patch
+[PULL 29/33] util/vfio-helpers: Trace PCI BAR region info
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+For debugging purposes, trace BAR region info.
+Reviewed-by: Fam Zheng <fam@euphon.net>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201103020733.2303148-4-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ util/vfio-helpers.c | 8 ++++++++
+ util/trace-events   | 1 +
+files changed, 9 insertions(+)
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vfio-helpers.c
++++ b/util/vfio-helpers.c
+@@ -XXX,XX +XXX,XX @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
+ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
+ {
++    g_autofree char *barname = NULL;
+     assert_bar_index_valid(s, index);
+     s->bar_region_info[index] = (struct vfio_region_info) {
+         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
+@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
+         error_setg_errno(errp, errno, "Failed to get BAR region info");
+         return -errno;
+     }
++    barname = g_strdup_printf("bar[%d]", index);
++    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
++                                s->bar_region_info[index].size,
++                                s->bar_region_info[index].cap_offset);
+     return 0;
+ }
+@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+         ret = -errno;
+         goto fail;
+     }
++    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
++                                s->config_region_info.size,
++                                s->config_region_info.cap_offset);
+     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
+         ret = qemu_vfio_pci_init_bar(s, i, errp);
+diff --git a/util/trace-events b/util/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/util/trace-events
++++ b/util/trace-events
+@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *io
+ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+ qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
++qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
+--
+.28.0

-New patch
+[PULL 30/33] util/vfio-helpers: Trace where BARs are mapped
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+For debugging purposes, trace where a BAR is mapped.
+Reviewed-by: Fam Zheng <fam@euphon.net>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201103020733.2303148-5-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ util/vfio-helpers.c | 2 ++
+ util/trace-events   | 1 +
+files changed, 3 insertions(+)
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vfio-helpers.c
++++ b/util/vfio-helpers.c
+@@ -XXX,XX +XXX,XX @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
+     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
+              prot, MAP_SHARED,
+              s->device, s->bar_region_info[index].offset + offset);
++    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
++                                size, offset, p);
+     if (p == MAP_FAILED) {
+         error_setg_errno(errp, errno, "Failed to map BAR region");
+         p = NULL;
+diff --git a/util/trace-events b/util/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/util/trace-events
++++ b/util/trace-events
+@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+ qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
++qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
+--
+.28.0

-New patch
+[PULL 31/33] util/vfio-helpers: Improve DMA trace events
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+For debugging purposes, trace where DMA regions are mapped.
+Reviewed-by: Fam Zheng <fam@euphon.net>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201103020733.2303148-6-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ util/vfio-helpers.c | 3 ++-
+ util/trace-events   | 5 +++--
+files changed, 5 insertions(+), 3 deletions(-)
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vfio-helpers.c
++++ b/util/vfio-helpers.c
+@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
+         .vaddr = (uintptr_t)host,
+         .size = size,
+     };
+-    trace_qemu_vfio_do_mapping(s, host, size, iova);
++    trace_qemu_vfio_do_mapping(s, host, iova, size);
+     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+         error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
+@@ -XXX,XX +XXX,XX @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+             }
+         }
+     }
++    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
+     if (iova) {
+         *iova = iova0;
+     }
+diff --git a/util/trace-events b/util/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/util/trace-events
++++ b/util/trace-events
+@@ -XXX,XX +XXX,XX @@ qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%z
+ qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+ qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+ qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
+-qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
+-qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
++qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
++qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
++qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
+ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+ qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+--
+.28.0

-[Qemu-devel] [PULL 1/5] sheepdog: remove unused cancellation support
+[PULL 32/33] util/vfio-helpers: Convert vfio_dump_mapping to trace events
-From: Paolo Bonzini <pbonzini@redhat.com>
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
-SheepdogAIOCB is internal to sheepdog.c, hence it is never canceled.
+The QEMU_VFIO_DEBUG definition is only modifiable at build-time.
 Trace events can be enabled at run-time. As we prefer the latter,
 convert qemu_vfio_dump_mappings() to use trace events instead
 of fprintf().
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <fam@euphon.net>
-Message-id: 20161129113245.32724-2-pbonzini@redhat.com
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Jeff Cody <jcody@redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-id: 20201103020733.2303148-7-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
- block/sheepdog.c | 52 ----------------------------------------------------
+ util/vfio-helpers.c | 19 ++++---------------
-file changed, 52 deletions(-)
+ util/trace-events   |  1 +
 files changed, 5 insertions(+), 15 deletions(-)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/util/vfio-helpers.c
-+++ b/block/sheepdog.c
++++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ struct SheepdogAIOCB {
+@@ -XXX,XX +XXX,XX @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
-     Coroutine *coroutine;
+     return s;
      void (*aio_done_func)(SheepdogAIOCB *);
 -    bool cancelable;
      int nr_pending;
      uint32_t min_affect_data_idx;
@@ -XXX,XX +XXX,XX @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
  {
      SheepdogAIOCB *acb = aio_req->aiocb;
 -    acb->cancelable = false;
      QLIST_REMOVE(aio_req, aio_siblings);
      g_free(aio_req);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
      qemu_aio_unref(acb);
  }
--/*
+-static void qemu_vfio_dump_mapping(IOVAMapping *m)
 - * Check whether the specified acb can be canceled
 - *
 - * We can cancel aio when any request belonging to the acb is:
 - *  - Not processed by the sheepdog server.
 - *  - Not linked to the inflight queue.
 - */
 -static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
 -{
--    BDRVSheepdogState *s = acb->common.bs->opaque;
+-    if (QEMU_VFIO_DEBUG) {
--    AIOReq *aioreq;
+-        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
--
+-               (uint64_t)m->size, (uint64_t)m->iova);
 -    if (!acb->cancelable) {
 -        return false;
 -    }
 -
 -    QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) {
 -        if (aioreq->aiocb == acb) {
 -            return false;
 -        }
 -    }
 -
 -    return true;
 -}
 -
 -static void sd_aio_cancel(BlockAIOCB *blockacb)
 -{
 -    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
 -    BDRVSheepdogState *s = acb->common.bs->opaque;
 -    AIOReq *aioreq, *next;
 -
 -    if (sd_acb_cancelable(acb)) {
 -        /* Remove outstanding requests from failed queue.  */
 -        QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
 -                           next) {
 -            if (aioreq->aiocb == acb) {
 -                free_aio_req(s, aioreq);
 -            }
 -        }
 -
 -        assert(acb->nr_pending == 0);
 -        if (acb->common.cb) {
 -            acb->common.cb(acb->common.opaque, -ECANCELED);
 -        }
 -        sd_finish_aiocb(acb);
 -    }
 -}
 -
- static const AIOCBInfo sd_aiocb_info = {
+ static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
-     .aiocb_size     = sizeof(SheepdogAIOCB),
+ {
--    .cancel_async   = sd_aio_cancel,
+-    int i;
- };
+-
+-    if (QEMU_VFIO_DEBUG) {
- static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
+-        printf("vfio mappings\n");
-@@ -XXX,XX +XXX,XX @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
+-        for (i = 0; i < s->nr_mappings; ++i) {
-     acb->nb_sectors = nb_sectors;
+-            qemu_vfio_dump_mapping(&s->mappings[i]);
+-        }
-     acb->aio_done_func = NULL;
++    for (int i = 0; i < s->nr_mappings; ++i) {
--    acb->cancelable = true;
++        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
-     acb->coroutine = qemu_coroutine_self();
++                                     s->mappings[i].iova,
-     acb->ret = 0;
++                                     s->mappings[i].size);
-     acb->nr_pending = 0;
+     }
  }
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
  qemu_vfio_dma_reset_temporary(void *s) "s %p"
  qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
  qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
 +qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
  qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
  qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
  qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
 --
-.9.3
+.28.0

-New patch
+[PULL 33/33] util/vfio-helpers: Assert offset is aligned to page size
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
+mmap(2) states:
+  'offset' must be a multiple of the page size as returned
+   by sysconf(_SC_PAGE_SIZE).
+Add an assertion to be sure we don't break this contract.
+Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201103020733.2303148-8-philmd@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+---
+ util/vfio-helpers.c | 1 +
+file changed, 1 insertion(+)
+diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vfio-helpers.c
++++ b/util/vfio-helpers.c
+@@ -XXX,XX +XXX,XX @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
+                             Error **errp)
+ {
+     void *p;
++    assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
+     assert_bar_index_valid(s, index);
+     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
+              prot, MAP_SHARED,
+--
+.28.0

From: Paolo Bonzini <pbonzini@redhat.com>

SheepdogAIOCB is internal to sheepdog.c, hence it is never canceled.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20161129113245.32724-2-pbonzini@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 52 ----------------------------------------------------
 1 file changed, 52 deletions(-)

From: Paolo Bonzini <pbonzini@redhat.com>

Delimit co_recv's lifetime clearly in aio_read_response.

Do a simple qemu_coroutine_enter in aio_read_response, letting
sd_co_writev call sd_write_done.

Handle nr_pending in the same way in sd_co_rw_vector,
sd_write_done and sd_co_flush_to_disk.

Remove sd_co_rw_vector's return value; just leave with no
pending requests.

[Jeff: added missing 'return' back, spotted by Paolo after
       series was applied.]

Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 115 ++++++++++++++++++++-----------------------------------
 1 file changed, 42 insertions(+), 73 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ struct SheepdogAIOCB {
     enum AIOCBState aiocb_type;
 
     Coroutine *coroutine;
-    void (*aio_done_func)(SheepdogAIOCB *);
-
     int nr_pending;
 
     uint32_t min_affect_data_idx;
@@ -XXX,XX +XXX,XX @@ static const char * sd_strerror(int err)
  *
  * 1. In sd_co_rw_vector, we send the I/O requests to the server and
  *    link the requests to the inflight_list in the
- *    BDRVSheepdogState.  The function exits without waiting for
+ *    BDRVSheepdogState.  The function yields while waiting for
  *    receiving the response.
  *
  * 2. We receive the response in aio_read_response, the fd handler to
- *    the sheepdog connection.  If metadata update is needed, we send
- *    the write request to the vdi object in sd_write_done, the write
- *    completion function.  We switch back to sd_co_readv/writev after
- *    all the requests belonging to the AIOCB are finished.
+ *    the sheepdog connection.  We switch back to sd_co_readv/sd_co_writev
+ *    after all the requests belonging to the AIOCB are finished.  If
+ *    needed, sd_co_writev will send another request for the vdi object.
  */
 
 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -XXX,XX +XXX,XX @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
     acb->nr_pending--;
 }
 
-static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
-{
-    qemu_coroutine_enter(acb->coroutine);
-    qemu_aio_unref(acb);
-}
-
 static const AIOCBInfo sd_aiocb_info = {
     .aiocb_size     = sizeof(SheepdogAIOCB),
 };
@@ -XXX,XX +XXX,XX @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
     acb->sector_num = sector_num;
     acb->nb_sectors = nb_sectors;
 
-    acb->aio_done_func = NULL;
     acb->coroutine = qemu_coroutine_self();
     acb->ret = 0;
     acb->nr_pending = 0;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
 
     switch (acb->aiocb_type) {
     case AIOCB_WRITE_UDATA:
-        /* this coroutine context is no longer suitable for co_recv
-         * because we may send data to update vdi objects */
-        s->co_recv = NULL;
         if (!is_data_obj(aio_req->oid)) {
             break;
         }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
         }
     }
 
+    /* No more data for this aio_req (reload_inode below uses its own file
+     * descriptor handler which doesn't use co_recv).
+    */
+    s->co_recv = NULL;
+
     switch (rsp.result) {
     case SD_RES_SUCCESS:
         break;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
             aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
         }
         resend_aioreq(s, aio_req);
-        goto out;
+        return;
     default:
         acb->ret = -EIO;
         error_report("%s", sd_strerror(rsp.result));
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
          * We've finished all requests which belong to the AIOCB, so
          * we can switch back to sd_co_readv/writev now.
          */
-        acb->aio_done_func(acb);
+        qemu_coroutine_enter(acb->coroutine);
     }
-out:
-    s->co_recv = NULL;
+
     return;
+
 err:
-    s->co_recv = NULL;
     reconnect_to_sdog(opaque);
 }
 
@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
 /*
  * This function is called after writing data objects.  If we need to
  * update metadata, this sends a write request to the vdi object.
- * Otherwise, this switches back to sd_co_readv/writev.
  */
 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
     mx = acb->max_dirty_data_idx;
     if (mn <= mx) {
         /* we need to update the vdi object. */
+        ++acb->nr_pending;
         offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
             mn * sizeof(s->inode.data_vdi_id[0]);
         data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
                                 data_len, offset, 0, false, 0, offset);
         QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
         add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
-
-        acb->aio_done_func = sd_finish_aiocb;
-        acb->aiocb_type = AIOCB_WRITE_UDATA;
-        return;
+        if (--acb->nr_pending) {
+            qemu_coroutine_yield();
+        }
     }
-
-    sd_finish_aiocb(acb);
 }
 
 /* Delete current working VDI on the snapshot chain */
@@ -XXX,XX +XXX,XX @@ out:
  * Returns 1 when we need to wait a response, 0 when there is no sent
  * request and -errno in error cases.
  */
-static int coroutine_fn sd_co_rw_vector(void *p)
+static void coroutine_fn sd_co_rw_vector(void *p)
 {
     SheepdogAIOCB *acb = p;
     int ret = 0;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_rw_vector(void *p)
         ret = sd_create_branch(s);
         if (ret) {
             acb->ret = -EIO;
-            goto out;
+            return;
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_rw_vector(void *p)
         idx++;
         done += len;
     }
-out:
-    if (!--acb->nr_pending) {
-        return acb->ret;
+    if (--acb->nr_pending) {
+        qemu_coroutine_yield();
     }
-    return 1;
 }
 
 static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     }
 
     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-    acb->aio_done_func = sd_write_done;
     acb->aiocb_type = AIOCB_WRITE_UDATA;
 
 retry:
@@ -XXX,XX +XXX,XX @@ retry:
         goto retry;
     }
 
-    ret = sd_co_rw_vector(acb);
-    if (ret <= 0) {
-        QLIST_REMOVE(acb, aiocb_siblings);
-        qemu_co_queue_restart_all(&s->overlapping_queue);
-        qemu_aio_unref(acb);
-        return ret;
-    }
-
-    qemu_coroutine_yield();
+    sd_co_rw_vector(acb);
+    sd_write_done(acb);
 
     QLIST_REMOVE(acb, aiocb_siblings);
     qemu_co_queue_restart_all(&s->overlapping_queue);
-
-    return acb->ret;
+    ret = acb->ret;
+    qemu_aio_unref(acb);
+    return ret;
 }
 
 static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
 
     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
     acb->aiocb_type = AIOCB_READ_UDATA;
-    acb->aio_done_func = sd_finish_aiocb;
 
 retry:
     if (check_overlapping_aiocb(s, acb)) {
@@ -XXX,XX +XXX,XX @@ retry:
         goto retry;
     }
 
-    ret = sd_co_rw_vector(acb);
-    if (ret <= 0) {
-        QLIST_REMOVE(acb, aiocb_siblings);
-        qemu_co_queue_restart_all(&s->overlapping_queue);
-        qemu_aio_unref(acb);
-        return ret;
-    }
-
-    qemu_coroutine_yield();
+    sd_co_rw_vector(acb);
 
     QLIST_REMOVE(acb, aiocb_siblings);
     qemu_co_queue_restart_all(&s->overlapping_queue);
-    return acb->ret;
+    ret = acb->ret;
+    qemu_aio_unref(acb);
+    return ret;
 }
 
 static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogAIOCB *acb;
+    int ret;
     AIOReq *aio_req;
 
     if (s->cache_flags != SD_FLAG_CMD_CACHE) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
 
     acb = sd_aio_setup(bs, NULL, 0, 0);
     acb->aiocb_type = AIOCB_FLUSH_CACHE;
-    acb->aio_done_func = sd_finish_aiocb;
 
+    acb->nr_pending++;
     aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
                             0, 0, 0, false, 0, 0);
     QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
     add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type);
 
-    qemu_coroutine_yield();
-    return acb->ret;
+    if (--acb->nr_pending) {
+        qemu_coroutine_yield();
+    }
+    ret = acb->ret;
+    qemu_aio_unref(acb);
+    return ret;
 }
 
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
     acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
                        count >> BDRV_SECTOR_BITS);
     acb->aiocb_type = AIOCB_DISCARD_OBJ;
-    acb->aio_done_func = sd_finish_aiocb;
 
 retry:
     if (check_overlapping_aiocb(s, acb)) {
@@ -XXX,XX +XXX,XX @@ retry:
         goto retry;
     }
 
-    ret = sd_co_rw_vector(acb);
-    if (ret <= 0) {
-        QLIST_REMOVE(acb, aiocb_siblings);
-        qemu_co_queue_restart_all(&s->overlapping_queue);
-        qemu_aio_unref(acb);
-        return ret;
-    }
-
-    qemu_coroutine_yield();
+    sd_co_rw_vector(acb);
 
     QLIST_REMOVE(acb, aiocb_siblings);
     qemu_co_queue_restart_all(&s->overlapping_queue);
-
-    return acb->ret;
+    ret = acb->ret;
+    qemu_aio_unref(acb);
+    return ret;
 }
 
 static coroutine_fn int64_t
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Sheepdog's AIOCB are completely internal entities for a group of
requests and do not need dynamic allocation.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20161129113245.32724-4-pbonzini@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 99 ++++++++++++++++++++++----------------------------------
 1 file changed, 39 insertions(+), 60 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static inline size_t count_data_objs(const struct SheepdogInode *inode)
     } while (0)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
+typedef struct BDRVSheepdogState BDRVSheepdogState;
 
 typedef struct AIOReq {
     SheepdogAIOCB *aiocb;
@@ -XXX,XX +XXX,XX @@ enum AIOCBState {
        || y->max_affect_data_idx < x->min_affect_data_idx))
 
 struct SheepdogAIOCB {
-    BlockAIOCB common;
+    BDRVSheepdogState *s;
 
     QEMUIOVector *qiov;
 
@@ -XXX,XX +XXX,XX @@ struct SheepdogAIOCB {
     QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
 };
 
-typedef struct BDRVSheepdogState {
+struct BDRVSheepdogState {
     BlockDriverState *bs;
     AioContext *aio_context;
 
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVSheepdogState {
 
     CoQueue overlapping_queue;
     QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
-} BDRVSheepdogState;
+};
 
 typedef struct BDRVSheepdogReopenState {
     int fd;
@@ -XXX,XX +XXX,XX @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
     acb->nr_pending--;
 }
 
-static const AIOCBInfo sd_aiocb_info = {
-    .aiocb_size     = sizeof(SheepdogAIOCB),
-};
-
-static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
-                                   int64_t sector_num, int nb_sectors)
+static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
+                         QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
+                         int type)
 {
-    SheepdogAIOCB *acb;
     uint32_t object_size;
-    BDRVSheepdogState *s = bs->opaque;
 
     object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
-    acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
+    acb->s = s;
 
     acb->qiov = qiov;
 
@@ -XXX,XX +XXX,XX @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
 
     acb->min_dirty_data_idx = UINT32_MAX;
     acb->max_dirty_data_idx = 0;
-
-    return acb;
+    acb->aiocb_type = type;
 }
 
 /* Return -EIO in case of error, file descriptor on success */
@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
  */
 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 {
-    BDRVSheepdogState *s = acb->common.bs->opaque;
+    BDRVSheepdogState *s = acb->s;
     struct iovec iov;
     AIOReq *aio_req;
     uint32_t offset, data_len, mn, mx;
@@ -XXX,XX +XXX,XX @@ out:
  * Returns 1 when we need to wait a response, 0 when there is no sent
  * request and -errno in error cases.
  */
-static void coroutine_fn sd_co_rw_vector(void *p)
+static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
 {
-    SheepdogAIOCB *acb = p;
     int ret = 0;
     unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
     unsigned long idx;
     uint32_t object_size;
     uint64_t oid;
     uint64_t offset;
-    BDRVSheepdogState *s = acb->common.bs->opaque;
+    BDRVSheepdogState *s = acb->s;
     SheepdogInode *inode = &s->inode;
     AIOReq *aio_req;
 
@@ -XXX,XX +XXX,XX @@ static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
 {
-    SheepdogAIOCB *acb;
+    SheepdogAIOCB acb;
     int ret;
     int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
     BDRVSheepdogState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
         }
     }
 
-    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-    acb->aiocb_type = AIOCB_WRITE_UDATA;
+    sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
 
 retry:
-    if (check_overlapping_aiocb(s, acb)) {
+    if (check_overlapping_aiocb(s, &acb)) {
         qemu_co_queue_wait(&s->overlapping_queue);
         goto retry;
     }
 
-    sd_co_rw_vector(acb);
-    sd_write_done(acb);
+    sd_co_rw_vector(&acb);
+    sd_write_done(&acb);
 
-    QLIST_REMOVE(acb, aiocb_siblings);
+    QLIST_REMOVE(&acb, aiocb_siblings);
     qemu_co_queue_restart_all(&s->overlapping_queue);
-    ret = acb->ret;
-    qemu_aio_unref(acb);
-    return ret;
+    return acb.ret;
 }
 
 static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
 {
-    SheepdogAIOCB *acb;
-    int ret;
+    SheepdogAIOCB acb;
     BDRVSheepdogState *s = bs->opaque;
 
-    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-    acb->aiocb_type = AIOCB_READ_UDATA;
+    sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
 
 retry:
-    if (check_overlapping_aiocb(s, acb)) {
+    if (check_overlapping_aiocb(s, &acb)) {
         qemu_co_queue_wait(&s->overlapping_queue);
         goto retry;
     }
 
-    sd_co_rw_vector(acb);
+    sd_co_rw_vector(&acb);
 
-    QLIST_REMOVE(acb, aiocb_siblings);
+    QLIST_REMOVE(&acb, aiocb_siblings);
     qemu_co_queue_restart_all(&s->overlapping_queue);
-    ret = acb->ret;
-    qemu_aio_unref(acb);
-    return ret;
+    return acb.ret;
 }
 
 static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
 {
     BDRVSheepdogState *s = bs->opaque;
-    SheepdogAIOCB *acb;
-    int ret;
+    SheepdogAIOCB acb;
     AIOReq *aio_req;
 
     if (s->cache_flags != SD_FLAG_CMD_CACHE) {
         return 0;
     }
 
-    acb = sd_aio_setup(bs, NULL, 0, 0);
-    acb->aiocb_type = AIOCB_FLUSH_CACHE;
+    sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
 
-    acb->nr_pending++;
-    aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
+    acb.nr_pending++;
+    aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
                             0, 0, 0, false, 0, 0);
     QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-    add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type);
+    add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
 
-    if (--acb->nr_pending) {
+    if (--acb.nr_pending) {
         qemu_coroutine_yield();
     }
-    ret = acb->ret;
-    qemu_aio_unref(acb);
-    return ret;
+    return acb.ret;
 }
 
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -XXX,XX +XXX,XX @@ static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
 static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                       int count)
 {
-    SheepdogAIOCB *acb;
+    SheepdogAIOCB acb;
     BDRVSheepdogState *s = bs->opaque;
-    int ret;
     QEMUIOVector discard_iov;
     struct iovec iov;
     uint32_t zero = 0;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
     if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {
         return -ENOTSUP;
     }
-    acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
-                       count >> BDRV_SECTOR_BITS);
-    acb->aiocb_type = AIOCB_DISCARD_OBJ;
+    sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
+                 count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
 
 retry:
-    if (check_overlapping_aiocb(s, acb)) {
+    if (check_overlapping_aiocb(s, &acb)) {
         qemu_co_queue_wait(&s->overlapping_queue);
         goto retry;
     }
 
-    sd_co_rw_vector(acb);
+    sd_co_rw_vector(&acb);
 
-    QLIST_REMOVE(acb, aiocb_siblings);
+    QLIST_REMOVE(&acb, aiocb_siblings);
     qemu_co_queue_restart_all(&s->overlapping_queue);
-    ret = acb->ret;
-    qemu_aio_unref(acb);
-    return ret;
+    return acb.ret;
 }
 
 static coroutine_fn int64_t
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Add to the list in add_aio_request and, indirectly, resend_aioreq.  Inline
free_aio_req in the caller, since it does not simply undo alloc_aio_req's job.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20161129113245.32724-5-pbonzini@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
     return aio_req;
 }
 
-static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
-{
-    SheepdogAIOCB *acb = aio_req->aiocb;
-
-    QLIST_REMOVE(aio_req, aio_siblings);
-    g_free(aio_req);
-
-    acb->nr_pending--;
-}
-
 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
                          QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
                          int type)
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
     while (!QLIST_EMPTY(&s->failed_aio_head)) {
         aio_req = QLIST_FIRST(&s->failed_aio_head);
         QLIST_REMOVE(aio_req, aio_siblings);
-        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
         resend_aioreq(s, aio_req);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
     */
     s->co_recv = NULL;
 
+    QLIST_REMOVE(aio_req, aio_siblings);
     switch (rsp.result) {
     case SD_RES_SUCCESS:
         break;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
         break;
     }
 
-    free_aio_req(s, aio_req);
-    if (!acb->nr_pending) {
+    g_free(aio_req);
+
+    if (!--acb->nr_pending) {
         /*
          * We've finished all requests which belong to the AIOCB, so
          * we can switch back to sd_co_readv/writev now.
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
     uint64_t old_oid = aio_req->base_oid;
     bool create = aio_req->create;
 
+    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+
     if (!nr_copies) {
         error_report("bug");
     }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
         iov.iov_len = sizeof(s->inode);
         aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
                                 data_len, offset, 0, false, 0, offset);
-        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
         add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
         if (--acb->nr_pending) {
             qemu_coroutine_yield();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
                                 old_oid,
                                 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
                                 0 : done);
-        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-
         add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                         acb->aiocb_type);
     done:
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
     acb.nr_pending++;
     aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
                             0, 0, 0, false, 0, 0);
-    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
     add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
 
     if (--acb.nr_pending) {
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Wrap the code that was copied repeatedly in the two functions,
sd_aio_setup and sd_aio_complete.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20161129113245.32724-6-pbonzini@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 66 ++++++++++++++++++++++++++------------------------------
 1 file changed, 30 insertions(+), 36 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
     return aio_req;
 }
 
+static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
+{
+    SheepdogAIOCB *cb;
+
+retry:
+    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+        if (AIOCBOverlapping(acb, cb)) {
+            qemu_co_queue_wait(&s->overlapping_queue);
+            goto retry;
+        }
+    }
+}
+
 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
                          QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
                          int type)
@@ -XXX,XX +XXX,XX @@ static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
     acb->min_dirty_data_idx = UINT32_MAX;
     acb->max_dirty_data_idx = 0;
     acb->aiocb_type = type;
+
+    if (type == AIOCB_FLUSH_CACHE) {
+        return;
+    }
+
+    wait_for_overlapping_aiocb(s, acb);
+    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
 }
 
 /* Return -EIO in case of error, file descriptor on success */
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
     }
 }
 
-static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+static void sd_aio_complete(SheepdogAIOCB *acb)
 {
-    SheepdogAIOCB *cb;
-
-    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
-        if (AIOCBOverlapping(aiocb, cb)) {
-            return true;
-        }
+    if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
+        return;
     }
 
-    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
-    return false;
+    QLIST_REMOVE(acb, aiocb_siblings);
+    qemu_co_queue_restart_all(&acb->s->overlapping_queue);
 }
 
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     }
 
     sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
-
-retry:
-    if (check_overlapping_aiocb(s, &acb)) {
-        qemu_co_queue_wait(&s->overlapping_queue);
-        goto retry;
-    }
-
     sd_co_rw_vector(&acb);
     sd_write_done(&acb);
+    sd_aio_complete(&acb);
 
-    QLIST_REMOVE(&acb, aiocb_siblings);
-    qemu_co_queue_restart_all(&s->overlapping_queue);
     return acb.ret;
 }
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
     BDRVSheepdogState *s = bs->opaque;
 
     sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
-
-retry:
-    if (check_overlapping_aiocb(s, &acb)) {
-        qemu_co_queue_wait(&s->overlapping_queue);
-        goto retry;
-    }
-
     sd_co_rw_vector(&acb);
+    sd_aio_complete(&acb);
 
-    QLIST_REMOVE(&acb, aiocb_siblings);
-    qemu_co_queue_restart_all(&s->overlapping_queue);
     return acb.ret;
 }
 
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
     if (--acb.nr_pending) {
         qemu_coroutine_yield();
     }
+
+    sd_aio_complete(&acb);
     return acb.ret;
 }
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
     }
     sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
                  count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
-
-retry:
-    if (check_overlapping_aiocb(s, &acb)) {
-        qemu_co_queue_wait(&s->overlapping_queue);
-        goto retry;
-    }
-
     sd_co_rw_vector(&acb);
+    sd_aio_complete(&acb);
 
-    QLIST_REMOVE(&acb, aiocb_siblings);
-    qemu_co_queue_restart_all(&s->overlapping_queue);
     return acb.ret;
 }
 
-- 
2.9.3

The following changes since commit 8507c9d5c9a62de2a0e281b640f995e26eac46af:

Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-11-03 15:59:44 +0000)

are available in the Git repository at:

https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to fc107d86840b3364e922c26cf7631b7fd38ce523:

util/vfio-helpers: Assert offset is aligned to page size (2020-11-03 19:06:23 +0000)

----------------------------------------------------------------
Pull request for 5.2

NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
improvements. Elena Afanasova's ioeventfd fixes are also included.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

----------------------------------------------------------------

Elena Afanasova (2):
  accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is
    true
  softmmu/memory: fix memory_region_ioeventfd_equal()

Eric Auger (4):
  block/nvme: Change size and alignment of IDENTIFY response buffer
  block/nvme: Change size and alignment of queue
  block/nvme: Change size and alignment of prp_list_pages
  block/nvme: Align iov's va and size on host page size

Philippe Mathieu-Daudé (27):
  MAINTAINERS: Cover "block/nvme.h" file
  block/nvme: Use hex format to display offset in trace events
  block/nvme: Report warning with warn_report()
  block/nvme: Trace controller capabilities
  block/nvme: Trace nvme_poll_queue() per queue
  block/nvme: Improve nvme_free_req_queue_wait() trace information
  block/nvme: Trace queue pair creation/deletion
  block/nvme: Move definitions before structure declarations
  block/nvme: Use unsigned integer for queue counter/size
  block/nvme: Make nvme_identify() return boolean indicating error
  block/nvme: Make nvme_init_queue() return boolean indicating error
  block/nvme: Introduce Completion Queue definitions
  block/nvme: Use definitions instead of magic values in add_io_queue()
  block/nvme: Correctly initialize Admin Queue Attributes
  block/nvme: Simplify ADMIN queue access
  block/nvme: Simplify nvme_cmd_sync()
  block/nvme: Set request_alignment at initialization
  block/nvme: Correct minimum device page size
  block/nvme: Fix use of write-only doorbells page on Aarch64 arch
  block/nvme: Fix nvme_submit_command() on big-endian host
  util/vfio-helpers: Improve reporting unsupported IOMMU type
  util/vfio-helpers: Trace PCI I/O config accesses
  util/vfio-helpers: Trace PCI BAR region info
  util/vfio-helpers: Trace where BARs are mapped
  util/vfio-helpers: Improve DMA trace events
  util/vfio-helpers: Convert vfio_dump_mapping to trace events
  util/vfio-helpers: Assert offset is aligned to page size

-- 
2.28.0

From: Elena Afanasova <eafanasova@gmail.com>

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
Message-Id: <20201017210102.26036-1-eafanasova@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 accel/kvm/kvm-all.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -XXX,XX +XXX,XX @@ static int kvm_init(MachineState *ms)
 
     kvm_memory_listener_register(s, &s->memory_listener,
                                  &address_space_memory, 0);
-    memory_listener_register(&kvm_io_listener,
-                             &address_space_io);
+    if (kvm_eventfds_allowed) {
+        memory_listener_register(&kvm_io_listener,
+                                 &address_space_io);
+    }
     memory_listener_register(&kvm_coalesced_pio_listener,
                              &address_space_io);
 
-- 
2.28.0

From: Elena Afanasova <eafanasova@gmail.com>

Eventfd can be registered with a zero length when fast_mmio is true.
Handle this case properly when dispatching through QEMU.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
Message-id: cf71a62eb04e61932ff8ffdd02e0b2aab4f495a0.camel@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 softmmu/memory.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/softmmu/memory.c b/softmmu/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -XXX,XX +XXX,XX @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
 static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
                                           MemoryRegionIoeventfd *b)
 {
-    return !memory_region_ioeventfd_before(a, b)
-        && !memory_region_ioeventfd_before(b, a);
+    if (int128_eq(a->addr.start, b->addr.start) &&
+        (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
+         (int128_eq(a->addr.size, b->addr.size) &&
+          (a->match_data == b->match_data) &&
+          ((a->match_data && (a->data == b->data)) || !a->match_data) &&
+          (a->e == b->e))))
+        return true;
+
+    return false;
 }
 
 /* Range of memory in the global map.  Addresses are absolute. */
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The "block/nvme.h" header is shared by both the NVMe block
driver and the NVMe emulated device. Add the 'F:' entry to
both sections, so all maintainers/reviewers are notified
when it is changed.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Message-Id: <20200701140634.25994-1-philmd@redhat.com>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ M: Klaus Jensen <its@irrelevant.dk>
 L: qemu-block@nongnu.org
 S: Supported
 F: hw/block/nvme*
+F: include/block/nvme.h
 F: tests/qtest/nvme-test.c
 F: docs/specs/nvme.txt
 T: git git://git.infradead.org/qemu-nvme.git nvme-next
@@ -XXX,XX +XXX,XX @@ R: Fam Zheng <fam@euphon.net>
 L: qemu-block@nongnu.org
 S: Supported
 F: block/nvme*
+F: include/block/nvme.h
 T: git https://github.com/stefanha/qemu.git block
 
 Bootdevice
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Use the same format used for the hw/vfio/ trace events.

Suggested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-3-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/trace-events | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
 nvme_handle_event(void *s) "s %p"
 nvme_poll_cb(void *s) "s %p"
-nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
-nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
+nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
-nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
-nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
-nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
-nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d"
+nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Instead of displaying a warning on stderr, use warn_report()
which also displays it on the monitor.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-4-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
         }
         cid = le16_to_cpu(c->cid);
         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
-            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
-                    cid);
+            warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
+                        "queue size: %u", cid, NVME_QUEUE_SIZE);
             continue;
         }
         trace_nvme_complete_command(s, q->index, cid);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Controllers have different capabilities and report them in the
CAP register. We are particularly interested in the page size
limits.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-5-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 13 +++++++++++++
 block/trace-events |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
      * Initialization". */
 
     cap = le64_to_cpu(regs->cap);
+    trace_nvme_controller_capability_raw(cap);
+    trace_nvme_controller_capability("Maximum Queue Entries Supported",
+                                     1 + NVME_CAP_MQES(cap));
+    trace_nvme_controller_capability("Contiguous Queues Required",
+                                     NVME_CAP_CQR(cap));
+    trace_nvme_controller_capability("Doorbell Stride",
+                                     2 << (2 + NVME_CAP_DSTRD(cap)));
+    trace_nvme_controller_capability("Subsystem Reset Supported",
+                                     NVME_CAP_NSSRS(cap));
+    trace_nvme_controller_capability("Memory Page Size Minimum",
+                                     1 << (12 + NVME_CAP_MPSMIN(cap)));
+    trace_nvme_controller_capability("Memory Page Size Maximum",
+                                     1 << (12 + NVME_CAP_MPSMAX(cap)));
     if (!NVME_CAP_CSS(cap)) {
         error_setg(errp, "Device doesn't support NVMe command set");
         ret = -EINVAL;
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t
 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
 
 # nvme.c
+nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
+nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
 nvme_kick(void *s, int queue) "s %p queue %d"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

As we want to enable multiple queues, report the event
in each nvme_poll_queue() call, rather than once in
the callback calling nvme_poll_queues().

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-6-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 2 +-
 block/trace-events | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queue(NVMeQueuePair *q)
     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
 
+    trace_nvme_poll_queue(q->s, q->index);
     /*
      * Do an early check for completions. q->lock isn't needed because
      * nvme_process_completion() only runs in the event loop thread and
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_cb(void *opaque)
     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
 
-    trace_nvme_poll_cb(s);
     return nvme_poll_queues(s);
 }
 
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
 nvme_handle_event(void *s) "s %p"
-nvme_poll_cb(void *s) "s %p"
+nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
 nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
 nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

What we want to trace is the block driver state and the queue index.

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-7-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 2 +-
 block/trace-events | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 
     while (q->free_req_head == -1) {
         if (qemu_in_coroutine()) {
-            trace_nvme_free_req_queue_wait(q);
+            trace_nvme_free_req_queue_wait(q->s, q->index);
             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
         } else {
             qemu_mutex_unlock(&q->lock);
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s
 nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
 nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
-nvme_free_req_queue_wait(void *q) "q %p"
+nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
 nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-8-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 3 +++
 block/trace-events | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    trace_nvme_free_queue_pair(q->index, q);
     if (q->completion_bh) {
         qemu_bh_delete(q->completion_bh);
     }
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
     if (!q) {
         return NULL;
     }
+    trace_nvme_create_queue_pair(idx, q, size, aio_context,
+                                 event_notifier_get_fd(s->irq_notifier));
     q->prp_list_pages = qemu_try_memalign(s->page_size,
                                           s->page_size * NVME_NUM_REQS);
     if (!q->prp_list_pages) {
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" byte
 nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
+nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d"
+nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
 nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

To be able to use some definitions in structure declarations,
move them earlier. No logical change.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-9-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 
 typedef struct BDRVNVMeState BDRVNVMeState;
 
+/* Same index is used for queues and IRQs */
+#define INDEX_ADMIN     0
+#define INDEX_IO(n)     (1 + n)
+
+/* This driver shares a single MSIX IRQ for the admin and I/O queues */
+enum {
+    MSIX_SHARED_IRQ_IDX = 0,
+    MSIX_IRQ_COUNT = 1
+};
+
 typedef struct {
     int32_t  head, tail;
     uint8_t  *queue;
@@ -XXX,XX +XXX,XX @@ typedef struct {
     QEMUBH      *completion_bh;
 } NVMeQueuePair;
 
-#define INDEX_ADMIN     0
-#define INDEX_IO(n)     (1 + n)
-
-/* This driver shares a single MSIX IRQ for the admin and I/O queues */
-enum {
-    MSIX_SHARED_IRQ_IDX = 0,
-    MSIX_IRQ_COUNT = 1
-};
-
 struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We cannot have a negative queue count/size/index, so use an unsigned type.
Rename 'nr_queues' as 'queue_count' to match the spec naming.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-10-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 38 ++++++++++++++++++--------------------
 block/trace-events | 10 +++++-----
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
      * [1..]: io queues.
      */
     NVMeQueuePair **queues;
-    int nr_queues;
+    unsigned queue_count;
     size_t page_size;
     /* How many uint32_t elements does each doorbell entry take. */
     size_t doorbell_scale;
@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
 };
 
 static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-                            int nentries, int entry_bytes, Error **errp)
+                            unsigned nentries, size_t entry_bytes, Error **errp)
 {
     size_t bytes;
     int r;
@@ -XXX,XX +XXX,XX @@ static void nvme_free_req_queue_cb(void *opaque)
 
 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                              AioContext *aio_context,
-                                             int idx, int size,
+                                             unsigned idx, size_t size,
                                              Error **errp)
 {
     int i, r;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
     bool progress = false;
     int i;
 
-    for (i = 0; i < s->nr_queues; i++) {
+    for (i = 0; i < s->queue_count; i++) {
         if (nvme_poll_queue(s->queues[i])) {
             progress = true;
         }
@@ -XXX,XX +XXX,XX @@ static void nvme_handle_event(EventNotifier *n)
 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
-    int n = s->nr_queues;
+    unsigned n = s->queue_count;
     NVMeQueuePair *q;
     NvmeCmd cmd;
-    int queue_size = NVME_QUEUE_SIZE;
+    unsigned queue_size = NVME_QUEUE_SIZE;
 
     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                                n, queue_size, errp);
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw11 = cpu_to_le32(0x3),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-        error_setg(errp, "Failed to create CQ io queue [%d]", n);
+        error_setg(errp, "Failed to create CQ io queue [%u]", n);
         goto out_error;
     }
     cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-        error_setg(errp, "Failed to create SQ io queue [%d]", n);
+        error_setg(errp, "Failed to create SQ io queue [%u]", n);
         goto out_error;
     }
     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
     s->queues[n] = q;
-    s->nr_queues++;
+    s->queue_count++;
     return true;
 out_error:
     nvme_free_queue_pair(q);
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         ret = -EINVAL;
         goto out;
     }
-    s->nr_queues = 1;
+    s->queue_count = 1;
     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
     regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
                             (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
 
 static void nvme_close(BlockDriverState *bs)
 {
-    int i;
     BDRVNVMeState *s = bs->opaque;
 
-    for (i = 0; i < s->nr_queues; ++i) {
+    for (unsigned i = 0; i < s->queue_count; ++i) {
         nvme_free_queue_pair(s->queues[i]);
     }
     g_free(s->queues);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     };
 
     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
         .ret = -EINPROGRESS,
     };
 
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     cmd.cdw12 = cpu_to_le32(cdw12);
 
     trace_nvme_write_zeroes(s, offset, bytes, flags);
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
 
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
         return -ENOTSUP;
     }
 
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
 
     buf = qemu_try_memalign(s->page_size, s->page_size);
     if (!buf) {
@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;
 
-    for (int i = 0; i < s->nr_queues; i++) {
+    for (unsigned i = 0; i < s->queue_count; i++) {
         NVMeQueuePair *q = s->queues[i];
 
         qemu_bh_delete(q->completion_bh);
@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, nvme_handle_event, nvme_poll_cb);
 
-    for (int i = 0; i < s->nr_queues; i++) {
+    for (unsigned i = 0; i < s->queue_count; i++) {
         NVMeQueuePair *q = s->queues[i];
 
         q->completion_bh =
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_plug(BlockDriverState *bs)
 
 static void nvme_aio_unplug(BlockDriverState *bs)
 {
-    int i;
     BDRVNVMeState *s = bs->opaque;
     assert(s->plugged);
     s->plugged = false;
-    for (i = INDEX_IO(0); i < s->nr_queues; i++) {
+    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
         NVMeQueuePair *q = s->queues[i];
         qemu_mutex_lock(&q->lock);
         nvme_kick(q);
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
 # nvme.c
 nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
 nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
-nvme_kick(void *s, int queue) "s %p queue %d"
+nvme_kick(void *s, unsigned q_index) "s %p q #%u"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
-nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
-nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
-nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
+nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
+nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
+nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
+nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
 nvme_handle_event(void *s) "s %p"
 nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Just for consistency, following the example documented since
commit e3fe3988d7 ("error: Document Error API usage rules"),
return a boolean value indicating success (true) or failure
(false, with an error set). Pass errp directly, as local_err
is not required in our case.

Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-11-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
     return ret;
 }
 
-static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
+/* Returns true on success, false on failure. */
+static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
+    bool ret = false;
     union {
         NvmeIdCtrl ctrl;
         NvmeIdNs ns;
@@ -XXX,XX +XXX,XX @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
         goto out;
     }
 
+    ret = true;
     s->blkshift = lbaf->ds;
 out:
     qemu_vfio_dma_unmap(s->vfio, id);
     qemu_vfree(id);
+
+    return ret;
 }
 
 static bool nvme_poll_queue(NVMeQueuePair *q)
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
     uint64_t cap;
     uint64_t timeout_ms;
     uint64_t deadline, now;
-    Error *local_err = NULL;
     volatile NvmeBar *regs = NULL;
 
     qemu_co_mutex_init(&s->dma_map_lock);
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, nvme_handle_event, nvme_poll_cb);
 
-    nvme_identify(bs, namespace, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (!nvme_identify(bs, namespace, errp)) {
         ret = -EIO;
         goto out;
     }
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-12-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
     },
 };
 
-static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
+/* Returns true on success, false on failure. */
+static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
                             unsigned nentries, size_t entry_bytes, Error **errp)
 {
     size_t bytes;
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
     q->queue = qemu_try_memalign(s->page_size, bytes);
     if (!q->queue) {
         error_setg(errp, "Cannot allocate queue");
-        return;
+        return false;
     }
     memset(q->queue, 0, bytes);
     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
     if (r) {
         error_setg(errp, "Cannot map queue");
+        return false;
     }
+    return true;
 }
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                              Error **errp)
 {
     int i, r;
-    Error *local_err = NULL;
     NVMeQueuePair *q;
     uint64_t prp_list_iova;
 
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
         req->prp_list_iova = prp_list_iova + i * s->page_size;
     }
 
-    nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
         goto fail;
     }
     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
 
-    nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
         goto fail;
     }
     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Rename Submission Queue flags with 'Sq' to differentiate
submission queue flags from completion queue flags, and introduce
Completion Queue flag definitions.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-13-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 include/block/nvme.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateCq {
 #define NVME_CQ_FLAGS_PC(cq_flags)  (cq_flags & 0x1)
 #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
 
+enum NvmeFlagsCq {
+    NVME_CQ_PC          = 1,
+    NVME_CQ_IEN         = 2,
+};
+
 typedef struct QEMU_PACKED NvmeCreateSq {
     uint8_t     opcode;
     uint8_t     flags;
@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateSq {
 #define NVME_SQ_FLAGS_PC(sq_flags)      (sq_flags & 0x1)
 #define NVME_SQ_FLAGS_QPRIO(sq_flags)   ((sq_flags >> 1) & 0x3)
 
-enum NvmeQueueFlags {
-    NVME_Q_PC           = 1,
-    NVME_Q_PRIO_URGENT  = 0,
-    NVME_Q_PRIO_HIGH    = 1,
-    NVME_Q_PRIO_NORMAL  = 2,
-    NVME_Q_PRIO_LOW     = 3,
+enum NvmeFlagsSq {
+    NVME_SQ_PC          = 1,
+
+    NVME_SQ_PRIO_URGENT = 0,
+    NVME_SQ_PRIO_HIGH   = 1,
+    NVME_SQ_PRIO_NORMAL = 2,
+    NVME_SQ_PRIO_LOW    = 3,
 };
 
 typedef struct QEMU_PACKED NvmeIdentify {
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Replace magic values with definitions, and simplify since the
number of queues will never reach 64K.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-14-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     NvmeCmd cmd;
     unsigned queue_size = NVME_QUEUE_SIZE;
 
+    assert(n <= UINT16_MAX);
     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                                n, queue_size, errp);
     if (!q) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     cmd = (NvmeCmd) {
         .opcode = NVME_ADM_CMD_CREATE_CQ,
         .dptr.prp1 = cpu_to_le64(q->cq.iova),
-        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
-        .cdw11 = cpu_to_le32(0x3),
+        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
+        .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
         error_setg(errp, "Failed to create CQ io queue [%u]", n);
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     cmd = (NvmeCmd) {
         .opcode = NVME_ADM_CMD_CREATE_SQ,
         .dptr.prp1 = cpu_to_le64(q->sq.iova),
-        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
-        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
+        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
+        .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
         error_setg(errp, "Failed to create SQ io queue [%u]", n);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

From the specification chapter 3.1.8 "AQA - Admin Queue Attributes"
the Admin Submission Queue Size field is a 0’s based value:

Admin Submission Queue Size (ASQS):

Defines the size of the Admin Submission Queue in entries.
    Enabling a controller while this field is cleared to 00h
    produces undefined results. The minimum size of the Admin
    Submission Queue is two entries. The maximum size of the
    Admin Submission Queue is 4096 entries.
    This is a 0’s based value.

This bug has never been hit because the device initialization
uses a single command synchronously :)

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-15-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
     s->queue_count = 1;
-    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
-    regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
-                            (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
+    QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
+    regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
+                            ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
     regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
     regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
 
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We don't need to dereference from BDRVNVMeState each time.
Use an NVMeQueuePair pointer to the admin queue.
nvme_init() becomes easier to review, matching the style
of nvme_add_io_queue().

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-16-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                      Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *q;
     AioContext *aio_context = bdrv_get_aio_context(bs);
     int ret;
     uint64_t cap;
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
 
     /* Set up admin queue. */
     s->queues = g_new(NVMeQueuePair *, 1);
-    s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
-                                                          NVME_QUEUE_SIZE,
-                                                          errp);
-    if (!s->queues[INDEX_ADMIN]) {
+    q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
+    if (!q) {
         ret = -EINVAL;
         goto out;
     }
+    s->queues[INDEX_ADMIN] = q;
     s->queue_count = 1;
     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
-    regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
-    regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
+    regs->asq = cpu_to_le64(q->sq.iova);
+    regs->acq = cpu_to_le64(q->cq.iova);
 
     /* After setting up all control registers we can enable device now. */
     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

As all commands use the ADMIN queue, it is pointless to pass
it as argument each time. Remove the argument, and rename the
function as nvme_admin_cmd_sync() to make this new behavior
clearer.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-17-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
     qemu_mutex_unlock(&q->lock);
 }
 
-static void nvme_cmd_sync_cb(void *opaque, int ret)
+static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
 {
     int *pret = opaque;
     *pret = ret;
     aio_wait_kick();
 }
 
-static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
-                         NvmeCmd *cmd)
+static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
 {
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *q = s->queues[INDEX_ADMIN];
     AioContext *aio_context = bdrv_get_aio_context(bs);
     NVMeRequest *req;
     int ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
     if (!req) {
         return -EBUSY;
     }
-    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
+    nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
 
     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
     return ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 
     memset(id, 0, sizeof(*id));
     cmd.dptr.prp1 = cpu_to_le64(iova);
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to identify controller");
         goto out;
     }
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     memset(id, 0, sizeof(*id));
     cmd.cdw10 = 0;
     cmd.nsid = cpu_to_le32(namespace);
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to identify namespace");
         goto out;
     }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
     };
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to create CQ io queue [%u]", n);
         goto out_error;
     }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
     };
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to create SQ io queue [%u]", n);
         goto out_error;
     }
@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
     };
 
-    ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
+    ret = nvme_admin_cmd_sync(bs, &cmd);
     if (ret) {
         error_setg(errp, "Failed to configure NVMe write cache");
     }
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Commit bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
sets the request_alignment in nvme_refresh_limits().
For consistency, also set it during initialization.

Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-18-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
     s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
     bs->bl.opt_mem_alignment = s->page_size;
+    bs->bl.request_alignment = s->page_size;
     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
 
     /* Reset device to get a clean state. */
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

While trying to simplify the code using a macro, we forgot
the 12-bit shift... Correct that.

Fixes: fad1eb68862 ("block/nvme: Use register definitions from 'block/nvme.h'")
Reported-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-19-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
 
-    s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
+    s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
     bs->bl.opt_mem_alignment = s->page_size;
     bs->bl.request_alignment = s->page_size;
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

In preparation for 64kB host page support, let's change the size
and alignment of the IDENTIFY command response buffer so that
the VFIO DMA MAP succeeds. We align on the host page size.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-20-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
         .opcode = NVME_ADM_CMD_IDENTIFY,
         .cdw10 = cpu_to_le32(0x1),
     };
+    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
 
-    id = qemu_try_memalign(s->page_size, sizeof(*id));
+    id = qemu_try_memalign(qemu_real_host_page_size, id_size);
     if (!id) {
         error_setg(errp, "Cannot allocate buffer for identify response");
         goto out;
     }
-    r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
+    r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova);
     if (r) {
         error_setg(errp, "Cannot map buffer for DMA");
         goto out;
     }
 
-    memset(id, 0, sizeof(*id));
+    memset(id, 0, id_size);
     cmd.dptr.prp1 = cpu_to_le64(iova);
     if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to identify controller");
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 
-    memset(id, 0, sizeof(*id));
+    memset(id, 0, id_size);
     cmd.cdw10 = 0;
     cmd.nsid = cpu_to_le32(namespace);
     if (nvme_admin_cmd_sync(bs, &cmd)) {
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

In preparation for 64kB host page support, let's change the size
and alignment of the queue so that the VFIO DMA MAP succeeds.
We align on the host page size.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-21-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
     size_t bytes;
     int r;
 
-    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
+    bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
     q->head = q->tail = 0;
-    q->queue = qemu_try_memalign(s->page_size, bytes);
+    q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
     if (!q->queue) {
         error_setg(errp, "Cannot allocate queue");
         return false;
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

In preparation for 64kB host page support, let's change the size
and alignment of the prp_list_pages so that the VFIO DMA MAP succeeds
with 64kB host page size. We align on the host page size.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-22-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
     int i, r;
     NVMeQueuePair *q;
     uint64_t prp_list_iova;
+    size_t bytes;
 
     q = g_try_new0(NVMeQueuePair, 1);
     if (!q) {
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
     }
     trace_nvme_create_queue_pair(idx, q, size, aio_context,
                                  event_notifier_get_fd(s->irq_notifier));
-    q->prp_list_pages = qemu_try_memalign(s->page_size,
-                                          s->page_size * NVME_NUM_REQS);
+    bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
+                          qemu_real_host_page_size);
+    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
     if (!q->prp_list_pages) {
         goto fail;
     }
-    memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
+    memset(q->prp_list_pages, 0, bytes);
     qemu_mutex_init(&q->lock);
     q->s = s;
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
-    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-                          s->page_size * NVME_NUM_REQS,
+    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
                           false, &prp_list_iova);
     if (r) {
         goto fail;
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

Make sure iov's va and size are properly aligned on the
host page size.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-23-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
     for (i = 0; i < qiov->niov; ++i) {
         bool retry = true;
         uint64_t iova;
+        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
+                                   qemu_real_host_page_size);
 try_map:
         r = qemu_vfio_dma_map(s->vfio,
                               qiov->iov[i].iov_base,
-                              qiov->iov[i].iov_len,
-                              true, &iova);
+                              len, true, &iova);
         if (r == -ENOMEM && retry) {
             retry = false;
             trace_nvme_dma_flush_queue_wait(s);
@@ -XXX,XX +XXX,XX @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
     BDRVNVMeState *s = bs->opaque;
 
     for (i = 0; i < qiov->niov; ++i) {
-        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
-            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
+        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
+                                 qemu_real_host_page_size) ||
+            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
                                       qiov->iov[i].iov_len, s->page_size);
             return false;
@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int r;
     uint8_t *buf = NULL;
     QEMUIOVector local_qiov;
-
+    size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
     assert(QEMU_IS_ALIGNED(offset, s->page_size));
     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
     assert(bytes <= s->max_transfer);
@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     }
     s->stats.unaligned_accesses++;
     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
-    buf = qemu_try_memalign(s->page_size, bytes);
+    buf = qemu_try_memalign(qemu_real_host_page_size, len);
 
     if (!buf) {
         return -ENOMEM;
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

qemu_vfio_pci_map_bar() calls mmap(), and mmap(2) states:

'offset' must be a multiple of the page size as returned
   by sysconf(_SC_PAGE_SIZE).

In commit f68453237b9 we started to use an offset of 4K which
broke this contract on Aarch64 arch.

Fix by mapping at offset 0, and accessing doorbells at offset=4K.

Fixes: f68453237b9 ("block/nvme: Map doorbells pages write-only")
Reported-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-24-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
 struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
+    void *bar0_wo_map;
     /* Memory mapped registers */
     volatile struct {
         uint32_t sq_tail;
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         }
     }
 
-    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
-                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+    s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
+                                           sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
+                                           PROT_WRITE, errp);
+    s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
     if (!s->doorbells) {
         ret = -EINVAL;
         goto out;
@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, NULL, NULL);
     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
-    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
-                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
+                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
     qemu_vfio_close(s->vfio);
 
     g_free(s->device);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The Completion Queue Command Identifier is a 16-bit value,
so nvme_submit_command() is unlikely to work on big-endian
hosts, as the relevant bits are truncated.
Fix by using the correct byte-swap function.

Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
Reported-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-25-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
     assert(!req->cb);
     req->cb = cb;
     req->opaque = opaque;
-    cmd->cid = cpu_to_le32(req->cid);
+    cmd->cid = cpu_to_le16(req->cid);
 
     trace_nvme_submit_command(q->s, q->index, req->cid);
     nvme_trace_command(cmd);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Change the confusing "VFIO IOMMU check failed" error message to
the explicit "VFIO IOMMU Type1 is not supported" one.

Example on POWER:

$ qemu-system-ppc64 -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw
 qemu-system-ppc64: -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw: VFIO IOMMU Type1 is not supported

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-2-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
     }
 
     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
-        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
+        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
         ret = -EINVAL;
         goto fail_container;
     }
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We sometimes get kernel panics with some devices on Aarch64
hosts. Alex Williamson suggests it might be a broken PCIe
root complex. Add trace events to record the latest I/O
access before crashing. Just in case, assert our accesses are
aligned.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-3-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 8 ++++++++
 util/trace-events   | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
 {
     int ret;
 
+    trace_qemu_vfio_pci_read_config(buf, ofs, size,
+                                    s->config_region_info.offset,
+                                    s->config_region_info.size);
+    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
     do {
         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
     } while (ret == -1 && errno == EINTR);
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
 {
     int ret;
 
+    trace_qemu_vfio_pci_write_config(buf, ofs, size,
+                                     s->config_region_info.offset,
+                                     s->config_region_info.size);
+    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
     do {
         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
     } while (ret == -1 && errno == EINTR);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova
 qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
 qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

For debugging purposes, trace BAR region info.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-4-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 8 ++++++++
 util/trace-events   | 1 +
 2 files changed, 9 insertions(+)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
 
 static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
 {
+    g_autofree char *barname = NULL;
     assert_bar_index_valid(s, index);
     s->bar_region_info[index] = (struct vfio_region_info) {
         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
         error_setg_errno(errp, errno, "Failed to get BAR region info");
         return -errno;
     }
+    barname = g_strdup_printf("bar[%d]", index);
+    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
+                                s->bar_region_info[index].size,
+                                s->bar_region_info[index].cap_offset);
 
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
         ret = -errno;
         goto fail;
     }
+    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
+                                s->config_region_info.size,
+                                s->config_region_info.cap_offset);
 
     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
         ret = qemu_vfio_pci_init_bar(s, i, errp);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *io
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
 qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

For debugging purposes, trace where a BAR is mapped.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-5-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 2 ++
 util/trace-events   | 1 +
 2 files changed, 3 insertions(+)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
              prot, MAP_SHARED,
              s->device, s->bar_region_info[index].offset + offset);
+    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
+                                size, offset, p);
     if (p == MAP_FAILED) {
         error_setg_errno(errp, errno, "Failed to map BAR region");
         p = NULL;
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
 qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
+qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

For debugging purposes, trace where DMA regions are mapped.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-6-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 3 ++-
 util/trace-events   | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
         .vaddr = (uintptr_t)host,
         .size = size,
     };
-    trace_qemu_vfio_do_mapping(s, host, size, iova);
+    trace_qemu_vfio_do_mapping(s, host, iova, size);
 
     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
         error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
@@ -XXX,XX +XXX,XX @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
             }
         }
     }
+    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
     if (iova) {
         *iova = iova0;
     }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%z
 qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
 qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
 qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
-qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
-qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
+qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
+qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
 qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The QEMU_VFIO_DEBUG definition is only modifiable at build-time.
Trace events can be enabled at run-time. As we prefer the latter,
convert qemu_vfio_dump_mappings() to use trace events instead
of printf().

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-7-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 19 ++++---------------
 util/trace-events   |  1 +
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
     return s;
 }
 
-static void qemu_vfio_dump_mapping(IOVAMapping *m)
-{
-    if (QEMU_VFIO_DEBUG) {
-        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
-               (uint64_t)m->size, (uint64_t)m->iova);
-    }
-}
-
 static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
 {
-    int i;
-
-    if (QEMU_VFIO_DEBUG) {
-        printf("vfio mappings\n");
-        for (i = 0; i < s->nr_mappings; ++i) {
-            qemu_vfio_dump_mapping(&s->mappings[i]);
-        }
+    for (int i = 0; i < s->nr_mappings; ++i) {
+        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
+                                     s->mappings[i].iova,
+                                     s->mappings[i].size);
     }
 }
 
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
 qemu_vfio_dma_reset_temporary(void *s) "s %p"
 qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
 qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
 qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
 qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
 qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
-- 
2.28.0