This patch allocates an IOThread object for each xen_disk instance and
sets the AIO context appropriately on connect. This allows processing
of I/O to proceed in parallel.
The patch also adds tracepoints into xen_disk to make it possible to
follow the state transitions of an instance in the log.
Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
---
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Anthony Perard <anthony.perard@citrix.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Max Reitz <mreitz@redhat.com>
v2:
- explicitly acquire and release AIO context in qemu_aio_complete() and
blk_bh()
---
hw/block/trace-events | 7 ++++++
hw/block/xen_disk.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 67 insertions(+), 9 deletions(-)
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 65e83dc258..608b24ba66 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -10,3 +10,10 @@ virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t offset,
# hw/block/hd-geometry.c
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
+
+# hw/block/xen_disk.c
+xen_disk_alloc(char *name) "%s"
+xen_disk_init(char *name) "%s"
+xen_disk_connect(char *name) "%s"
+xen_disk_disconnect(char *name) "%s"
+xen_disk_free(char *name) "%s"
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 0e6513708e..8548195195 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -27,10 +27,13 @@
#include "hw/xen/xen_backend.h"
#include "xen_blkif.h"
#include "sysemu/blockdev.h"
+#include "sysemu/iothread.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
+#include "qom/object_interfaces.h"
+#include "trace.h"
/* ------------------------------------------------------------- */
@@ -128,6 +131,9 @@ struct XenBlkDev {
DriveInfo *dinfo;
BlockBackend *blk;
QEMUBH *bh;
+
+ IOThread *iothread;
+ AioContext *ctx;
};
/* ------------------------------------------------------------- */
@@ -599,9 +605,12 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
static void qemu_aio_complete(void *opaque, int ret)
{
struct ioreq *ioreq = opaque;
+ struct XenBlkDev *blkdev = ioreq->blkdev;
+
+ aio_context_acquire(blkdev->ctx);
if (ret != 0) {
- xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
+ xen_pv_printf(&blkdev->xendev, 0, "%s I/O error\n",
ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
ioreq->aio_errors++;
}
@@ -610,13 +619,13 @@ static void qemu_aio_complete(void *opaque, int ret)
if (ioreq->presync) {
ioreq->presync = 0;
ioreq_runio_qemu_aio(ioreq);
- return;
+ goto done;
}
if (ioreq->aio_inflight > 0) {
- return;
+ goto done;
}
- if (ioreq->blkdev->feature_grant_copy) {
+ if (blkdev->feature_grant_copy) {
switch (ioreq->req.operation) {
case BLKIF_OP_READ:
/* in case of failure ioreq->aio_errors is increased */
@@ -638,7 +647,7 @@ static void qemu_aio_complete(void *opaque, int ret)
}
ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
- if (!ioreq->blkdev->feature_grant_copy) {
+ if (!blkdev->feature_grant_copy) {
ioreq_unmap(ioreq);
}
ioreq_finish(ioreq);
@@ -650,16 +659,19 @@ static void qemu_aio_complete(void *opaque, int ret)
}
case BLKIF_OP_READ:
if (ioreq->status == BLKIF_RSP_OKAY) {
- block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+ block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
} else {
- block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+ block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
}
break;
case BLKIF_OP_DISCARD:
default:
break;
}
- qemu_bh_schedule(ioreq->blkdev->bh);
+ qemu_bh_schedule(blkdev->bh);
+
+done:
+ aio_context_release(blkdev->ctx);
}
static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
@@ -917,17 +929,40 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
static void blk_bh(void *opaque)
{
struct XenBlkDev *blkdev = opaque;
+
+ aio_context_acquire(blkdev->ctx);
blk_handle_requests(blkdev);
+ aio_context_release(blkdev->ctx);
}
static void blk_alloc(struct XenDevice *xendev)
{
struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+ Object *obj;
+ char *name;
+ Error *err = NULL;
+
+ trace_xen_disk_alloc(xendev->name);
QLIST_INIT(&blkdev->inflight);
QLIST_INIT(&blkdev->finished);
QLIST_INIT(&blkdev->freelist);
- blkdev->bh = qemu_bh_new(blk_bh, blkdev);
+
+ obj = object_new(TYPE_IOTHREAD);
+ name = g_strdup_printf("iothread-%s", xendev->name);
+
+ object_property_add_child(object_get_objects_root(), name, obj, &err);
+ assert(!err);
+
+ g_free(name);
+
+ user_creatable_complete(obj, &err);
+ assert(!err);
+
+ blkdev->iothread = (IOThread *)object_dynamic_cast(obj, TYPE_IOTHREAD);
+ blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
+ blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
+
if (xen_mode != XEN_EMULATE) {
batch_maps = 1;
}
@@ -954,6 +989,8 @@ static int blk_init(struct XenDevice *xendev)
int info = 0;
char *directiosafe = NULL;
+ trace_xen_disk_init(xendev->name);
+
/* read xenstore entries */
if (blkdev->params == NULL) {
char *h = NULL;
@@ -1069,6 +1106,8 @@ static int blk_connect(struct XenDevice *xendev)
unsigned int i;
uint32_t *domids;
+ trace_xen_disk_connect(xendev->name);
+
/* read-only ? */
if (blkdev->directiosafe) {
qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
@@ -1288,6 +1327,8 @@ static int blk_connect(struct XenDevice *xendev)
blkdev->persistent_gnt_count = 0;
}
+ blk_set_aio_context(blkdev->blk, blkdev->ctx);
+
xen_be_bind_evtchn(&blkdev->xendev);
xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, nr-ring-ref %u, "
@@ -1301,13 +1342,20 @@ static void blk_disconnect(struct XenDevice *xendev)
{
struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+ trace_xen_disk_disconnect(xendev->name);
+
+ aio_context_acquire(blkdev->ctx);
+
if (blkdev->blk) {
+ blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
blk_detach_dev(blkdev->blk, blkdev);
blk_unref(blkdev->blk);
blkdev->blk = NULL;
}
xen_pv_unbind_evtchn(&blkdev->xendev);
+ aio_context_release(blkdev->ctx);
+
if (blkdev->sring) {
xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring,
blkdev->nr_ring_ref);
@@ -1341,6 +1389,8 @@ static int blk_free(struct XenDevice *xendev)
struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
struct ioreq *ioreq;
+ trace_xen_disk_free(xendev->name);
+
if (blkdev->blk || blkdev->sring) {
blk_disconnect(xendev);
}
@@ -1358,6 +1408,7 @@ static int blk_free(struct XenDevice *xendev)
g_free(blkdev->dev);
g_free(blkdev->devtype);
qemu_bh_delete(blkdev->bh);
+ object_unparent(OBJECT(blkdev->iothread));
return 0;
}
--
2.11.0
CC'ing Andreas Färber. Could you please give a quick look below at the
way the iothread object is instantiated and destroyed? I am no object
model expert and would appreciate a second opinion.
On Wed, 21 Jun 2017, Paul Durrant wrote:
> This patch allocates an IOThread object for each xen_disk instance and
> sets the AIO context appropriately on connect. This allows processing
> of I/O to proceed in parallel.
>
> The patch also adds tracepoints into xen_disk to make it possible to
> follow the state transitions of an instance in the log.
>
> Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
> ---
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Anthony Perard <anthony.perard@citrix.com>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Max Reitz <mreitz@redhat.com>
>
> v2:
> - explicitly acquire and release AIO context in qemu_aio_complete() and
> blk_bh()
> ---
> hw/block/trace-events | 7 ++++++
> hw/block/xen_disk.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-------
> 2 files changed, 67 insertions(+), 9 deletions(-)
>
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 65e83dc258..608b24ba66 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -10,3 +10,10 @@ virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t offset,
> # hw/block/hd-geometry.c
> hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
> hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
> +
> +# hw/block/xen_disk.c
> +xen_disk_alloc(char *name) "%s"
> +xen_disk_init(char *name) "%s"
> +xen_disk_connect(char *name) "%s"
> +xen_disk_disconnect(char *name) "%s"
> +xen_disk_free(char *name) "%s"
> diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
> index 0e6513708e..8548195195 100644
> --- a/hw/block/xen_disk.c
> +++ b/hw/block/xen_disk.c
> @@ -27,10 +27,13 @@
> #include "hw/xen/xen_backend.h"
> #include "xen_blkif.h"
> #include "sysemu/blockdev.h"
> +#include "sysemu/iothread.h"
> #include "sysemu/block-backend.h"
> #include "qapi/error.h"
> #include "qapi/qmp/qdict.h"
> #include "qapi/qmp/qstring.h"
> +#include "qom/object_interfaces.h"
> +#include "trace.h"
>
> /* ------------------------------------------------------------- */
>
> @@ -128,6 +131,9 @@ struct XenBlkDev {
> DriveInfo *dinfo;
> BlockBackend *blk;
> QEMUBH *bh;
> +
> + IOThread *iothread;
> + AioContext *ctx;
> };
>
> /* ------------------------------------------------------------- */
> @@ -599,9 +605,12 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
> static void qemu_aio_complete(void *opaque, int ret)
> {
> struct ioreq *ioreq = opaque;
> + struct XenBlkDev *blkdev = ioreq->blkdev;
> +
> + aio_context_acquire(blkdev->ctx);
I think that Paolo was right that we need an aio_context_acquire here;
however, the issue is that with the current code:
blk_handle_requests -> ioreq_runio_qemu_aio -> qemu_aio_complete
leading to aio_context_acquire being called twice on the same lock,
which I don't think is allowed?
I think we need to get rid of the qemu_aio_complete call from
ioreq_runio_qemu_aio, but to do that we need to be careful with the
accounting of aio_inflight (today it's incremented unconditionally at
the beginning of ioreq_runio_qemu_aio, I think we would have to change
that to increment it only if presync).
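To make the concern concrete, here is a minimal standalone sketch of that
nesting (the function names mirror xen_disk, but everything below is
invented for illustration and is not QEMU code). An error-checking pthread
mutex makes the double acquire visible instead of deadlocking:

    #include <pthread.h>
    #include <errno.h>
    #include <stdio.h>

    static pthread_mutex_t ctx_lock;

    /* stands in for qemu_aio_complete(): second acquire, same thread */
    static void aio_complete_mock(void)
    {
        int ret = pthread_mutex_lock(&ctx_lock);
        printf("nested lock: %s\n", ret == EDEADLK ? "EDEADLK" : "ok");
        if (ret == 0) {
            pthread_mutex_unlock(&ctx_lock);
        }
    }

    /* stands in for blk_bh() -> blk_handle_requests() ->
     * ioreq_runio_qemu_aio(), which already holds the context */
    static void blk_bh_mock(void)
    {
        pthread_mutex_lock(&ctx_lock);      /* aio_context_acquire() */
        aio_complete_mock();
        pthread_mutex_unlock(&ctx_lock);    /* aio_context_release() */
    }

    int main(void)
    {
        pthread_mutexattr_t attr;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
        pthread_mutex_init(&ctx_lock, &attr);

        blk_bh_mock();    /* prints "nested lock: EDEADLK" */
        return 0;
    }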
> if (ret != 0) {
> - xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
> + xen_pv_printf(&blkdev->xendev, 0, "%s I/O error\n",
> ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
> ioreq->aio_errors++;
> }
> @@ -610,13 +619,13 @@ static void qemu_aio_complete(void *opaque, int ret)
> if (ioreq->presync) {
> ioreq->presync = 0;
> ioreq_runio_qemu_aio(ioreq);
> - return;
> + goto done;
> }
> if (ioreq->aio_inflight > 0) {
> - return;
> + goto done;
> }
>
> - if (ioreq->blkdev->feature_grant_copy) {
> + if (blkdev->feature_grant_copy) {
> switch (ioreq->req.operation) {
> case BLKIF_OP_READ:
> /* in case of failure ioreq->aio_errors is increased */
> @@ -638,7 +647,7 @@ static void qemu_aio_complete(void *opaque, int ret)
> }
>
> ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
> - if (!ioreq->blkdev->feature_grant_copy) {
> + if (!blkdev->feature_grant_copy) {
> ioreq_unmap(ioreq);
> }
> ioreq_finish(ioreq);
> @@ -650,16 +659,19 @@ static void qemu_aio_complete(void *opaque, int ret)
> }
> case BLKIF_OP_READ:
> if (ioreq->status == BLKIF_RSP_OKAY) {
> - block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
> + block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
> } else {
> - block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
> + block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
> }
> break;
> case BLKIF_OP_DISCARD:
> default:
> break;
> }
> - qemu_bh_schedule(ioreq->blkdev->bh);
> + qemu_bh_schedule(blkdev->bh);
> +
> +done:
> + aio_context_release(blkdev->ctx);
> }
>
> static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
> @@ -917,17 +929,40 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
> static void blk_bh(void *opaque)
> {
> struct XenBlkDev *blkdev = opaque;
> +
> + aio_context_acquire(blkdev->ctx);
> blk_handle_requests(blkdev);
> + aio_context_release(blkdev->ctx);
> }
>
> static void blk_alloc(struct XenDevice *xendev)
> {
> struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
> + Object *obj;
> + char *name;
> + Error *err = NULL;
> +
> + trace_xen_disk_alloc(xendev->name);
>
> QLIST_INIT(&blkdev->inflight);
> QLIST_INIT(&blkdev->finished);
> QLIST_INIT(&blkdev->freelist);
> - blkdev->bh = qemu_bh_new(blk_bh, blkdev);
> +
> + obj = object_new(TYPE_IOTHREAD);
> + name = g_strdup_printf("iothread-%s", xendev->name);
> +
> + object_property_add_child(object_get_objects_root(), name, obj, &err);
> + assert(!err);
Would it be enough to call object_ref?
> + g_free(name);
> +
> + user_creatable_complete(obj, &err);
Why do we need to call this?
> + assert(!err);
> +
> + blkdev->iothread = (IOThread *)object_dynamic_cast(obj, TYPE_IOTHREAD);
> + blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
> + blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
> +
> if (xen_mode != XEN_EMULATE) {
> batch_maps = 1;
> }
> @@ -1288,6 +1327,8 @@ static int blk_connect(struct XenDevice *xendev)
> blkdev->persistent_gnt_count = 0;
> }
>
> + blk_set_aio_context(blkdev->blk, blkdev->ctx);
> +
> xen_be_bind_evtchn(&blkdev->xendev);
>
> xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, nr-ring-ref %u, "
> @@ -1301,13 +1342,20 @@ static void blk_disconnect(struct XenDevice *xendev)
> {
> struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
>
> + trace_xen_disk_disconnect(xendev->name);
> +
> + aio_context_acquire(blkdev->ctx);
> +
> if (blkdev->blk) {
> + blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
> blk_detach_dev(blkdev->blk, blkdev);
> blk_unref(blkdev->blk);
> blkdev->blk = NULL;
> }
> xen_pv_unbind_evtchn(&blkdev->xendev);
>
> + aio_context_release(blkdev->ctx);
> +
> if (blkdev->sring) {
> xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring,
> blkdev->nr_ring_ref);
> @@ -1358,6 +1408,7 @@ static int blk_free(struct XenDevice *xendev)
> g_free(blkdev->dev);
> g_free(blkdev->devtype);
> qemu_bh_delete(blkdev->bh);
> + object_unparent(OBJECT(blkdev->iothread));
Shouldn't this be object_unref?
> return 0;
> }
> -----Original Message-----
> From: Stefano Stabellini [mailto:sstabellini@kernel.org]
> Sent: 22 June 2017 23:15
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; qemu-devel@nongnu.org; qemu-
> block@nongnu.org; Stefano Stabellini <sstabellini@kernel.org>; Anthony
> Perard <anthony.perard@citrix.com>; Kevin Wolf <kwolf@redhat.com>;
> Max Reitz <mreitz@redhat.com>; afaerber@suse.de
> Subject: Re: [PATCH v2 3/3] xen-disk: use an IOThread per instance
>
> CC'ing Andreas Färber. Could you please give a quick look below at the
> way the iothread object is instantiated and destroyed? I am no object
> model expert and would appreciate a second opinion.
>
I have not seen any response so far.
>
> On Wed, 21 Jun 2017, Paul Durrant wrote:
> > This patch allocates an IOThread object for each xen_disk instance and
> > sets the AIO context appropriately on connect. This allows processing
> > of I/O to proceed in parallel.
> >
> > The patch also adds tracepoints into xen_disk to make it possible to
> > follow the state transitions of an instance in the log.
> >
> > Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
> > ---
> > Cc: Stefano Stabellini <sstabellini@kernel.org>
> > Cc: Anthony Perard <anthony.perard@citrix.com>
> > Cc: Kevin Wolf <kwolf@redhat.com>
> > Cc: Max Reitz <mreitz@redhat.com>
> >
> > v2:
> > - explicitly acquire and release AIO context in qemu_aio_complete() and
> > blk_bh()
> > ---
> > hw/block/trace-events | 7 ++++++
> > hw/block/xen_disk.c | 69
> ++++++++++++++++++++++++++++++++++++++++++++-------
> > 2 files changed, 67 insertions(+), 9 deletions(-)
> >
> > diff --git a/hw/block/trace-events b/hw/block/trace-events
> > index 65e83dc258..608b24ba66 100644
> > --- a/hw/block/trace-events
> > +++ b/hw/block/trace-events
> > @@ -10,3 +10,10 @@ virtio_blk_submit_multireq(void *mrb, int start, int
> num_reqs, uint64_t offset,
> > # hw/block/hd-geometry.c
> > hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p
> LCHS %d %d %d"
> > hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t
> secs, int trans) "blk %p CHS %u %u %u trans %d"
> > +
> > +# hw/block/xen_disk.c
> > +xen_disk_alloc(char *name) "%s"
> > +xen_disk_init(char *name) "%s"
> > +xen_disk_connect(char *name) "%s"
> > +xen_disk_disconnect(char *name) "%s"
> > +xen_disk_free(char *name) "%s"
> > diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
> > index 0e6513708e..8548195195 100644
> > --- a/hw/block/xen_disk.c
> > +++ b/hw/block/xen_disk.c
> > @@ -27,10 +27,13 @@
> > #include "hw/xen/xen_backend.h"
> > #include "xen_blkif.h"
> > #include "sysemu/blockdev.h"
> > +#include "sysemu/iothread.h"
> > #include "sysemu/block-backend.h"
> > #include "qapi/error.h"
> > #include "qapi/qmp/qdict.h"
> > #include "qapi/qmp/qstring.h"
> > +#include "qom/object_interfaces.h"
> > +#include "trace.h"
> >
> > /* ------------------------------------------------------------- */
> >
> > @@ -128,6 +131,9 @@ struct XenBlkDev {
> > DriveInfo *dinfo;
> > BlockBackend *blk;
> > QEMUBH *bh;
> > +
> > + IOThread *iothread;
> > + AioContext *ctx;
> > };
> >
> > /* ------------------------------------------------------------- */
> > @@ -599,9 +605,12 @@ static int ioreq_runio_qemu_aio(struct ioreq
> *ioreq);
> > static void qemu_aio_complete(void *opaque, int ret)
> > {
> > struct ioreq *ioreq = opaque;
> > + struct XenBlkDev *blkdev = ioreq->blkdev;
> > +
> > + aio_context_acquire(blkdev->ctx);
>
> I think that Paolo was right that we need an aio_context_acquire here;
> however, the issue is that with the current code:
>
> blk_handle_requests -> ioreq_runio_qemu_aio -> qemu_aio_complete
>
> leading to aio_context_acquire being called twice on the same lock,
> which I don't think is allowed?
It resolves to a qemu_rec_mutex_lock(), which I believe is a recursive lock, so I think that's ok.
>
> I think we need to get rid of the qemu_aio_complete call from
> ioreq_runio_qemu_aio, but to do that we need to be careful with the
> accounting of aio_inflight (today it's incremented unconditionally at
> the beginning of ioreq_runio_qemu_aio, I think we would have to change
> that to increment it only if presync).
>
If the lock is indeed recursive then I think we can avoid this complication.
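For reference, as far as I recall aio_context_acquire()/release() are just
thin wrappers in util/async.c; this is a sketch from memory, so treat the
field name as approximate:

    /* approximately what util/async.c does in this era of the tree */
    void aio_context_acquire(AioContext *ctx)
    {
        qemu_rec_mutex_lock(&ctx->lock);    /* recursive: nested acquire is fine */
    }

    void aio_context_release(AioContext *ctx)
    {
        qemu_rec_mutex_unlock(&ctx->lock);
    }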
>
> > if (ret != 0) {
> > - xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
> > + xen_pv_printf(&blkdev->xendev, 0, "%s I/O error\n",
> > ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
> > ioreq->aio_errors++;
> > }
> > @@ -610,13 +619,13 @@ static void qemu_aio_complete(void *opaque, int
> ret)
> > if (ioreq->presync) {
> > ioreq->presync = 0;
> > ioreq_runio_qemu_aio(ioreq);
> > - return;
> > + goto done;
> > }
> > if (ioreq->aio_inflight > 0) {
> > - return;
> > + goto done;
> > }
> >
> > - if (ioreq->blkdev->feature_grant_copy) {
> > + if (blkdev->feature_grant_copy) {
> > switch (ioreq->req.operation) {
> > case BLKIF_OP_READ:
> > /* in case of failure ioreq->aio_errors is increased */
> > @@ -638,7 +647,7 @@ static void qemu_aio_complete(void *opaque, int
> ret)
> > }
> >
> > ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR :
> BLKIF_RSP_OKAY;
> > - if (!ioreq->blkdev->feature_grant_copy) {
> > + if (!blkdev->feature_grant_copy) {
> > ioreq_unmap(ioreq);
> > }
> > ioreq_finish(ioreq);
> > @@ -650,16 +659,19 @@ static void qemu_aio_complete(void *opaque, int
> ret)
> > }
> > case BLKIF_OP_READ:
> > if (ioreq->status == BLKIF_RSP_OKAY) {
> > - block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
> > + block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
> > } else {
> > - block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
> > + block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
> > }
> > break;
> > case BLKIF_OP_DISCARD:
> > default:
> > break;
> > }
> > - qemu_bh_schedule(ioreq->blkdev->bh);
> > + qemu_bh_schedule(blkdev->bh);
> > +
> > +done:
> > + aio_context_release(blkdev->ctx);
> > }
> >
> > static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t
> sector_number,
> > @@ -917,17 +929,40 @@ static void blk_handle_requests(struct XenBlkDev
> *blkdev)
> > static void blk_bh(void *opaque)
> > {
> > struct XenBlkDev *blkdev = opaque;
> > +
> > + aio_context_acquire(blkdev->ctx);
> > blk_handle_requests(blkdev);
> > + aio_context_release(blkdev->ctx);
> > }
> >
> > static void blk_alloc(struct XenDevice *xendev)
> > {
> > struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev,
> xendev);
> > + Object *obj;
> > + char *name;
> > + Error *err = NULL;
> > +
> > + trace_xen_disk_alloc(xendev->name);
> >
> > QLIST_INIT(&blkdev->inflight);
> > QLIST_INIT(&blkdev->finished);
> > QLIST_INIT(&blkdev->freelist);
> > - blkdev->bh = qemu_bh_new(blk_bh, blkdev);
> > +
> > + obj = object_new(TYPE_IOTHREAD);
> > + name = g_strdup_printf("iothread-%s", xendev->name);
> > +
> > + object_property_add_child(object_get_objects_root(), name, obj,
> &err);
> > + assert(!err);
>
> Would it be enough to call object_ref?
>
You mean to avoid the assert? I guess so, but I think any failure here would be indicative of a larger problem.
>
> > + g_free(name);
> > +
> > + user_creatable_complete(obj, &err);
>
> Why do we need to call this?
>
I'm not entirely sure, but looking around the object code it seemed to be a necessary part of instantiation. Maybe it is not required for iothread objects, but I could not figure that out from looking at the code, and the comments in the header suggest it is harmless if it is not required.
>
> > + assert(!err);
> > +
> > + blkdev->iothread = (IOThread *)object_dynamic_cast(obj,
> TYPE_IOTHREAD);
> > + blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
> > + blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
> > +
> > if (xen_mode != XEN_EMULATE) {
> > batch_maps = 1;
> > }
> > @@ -1288,6 +1327,8 @@ static int blk_connect(struct XenDevice *xendev)
> > blkdev->persistent_gnt_count = 0;
> > }
> >
> > + blk_set_aio_context(blkdev->blk, blkdev->ctx);
> > +
> > xen_be_bind_evtchn(&blkdev->xendev);
> >
> > xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, nr-ring-ref %u, "
> > @@ -1301,13 +1342,20 @@ static void blk_disconnect(struct XenDevice
> *xendev)
> > {
> > struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev,
> xendev);
> >
> > + trace_xen_disk_disconnect(xendev->name);
> > +
> > + aio_context_acquire(blkdev->ctx);
> > +
> > if (blkdev->blk) {
> > + blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
> > blk_detach_dev(blkdev->blk, blkdev);
> > blk_unref(blkdev->blk);
> > blkdev->blk = NULL;
> > }
> > xen_pv_unbind_evtchn(&blkdev->xendev);
> >
> > + aio_context_release(blkdev->ctx);
> > +
> > if (blkdev->sring) {
> > xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring,
> > blkdev->nr_ring_ref);
> > @@ -1358,6 +1408,7 @@ static int blk_free(struct XenDevice *xendev)
> > g_free(blkdev->dev);
> > g_free(blkdev->devtype);
> > qemu_bh_delete(blkdev->bh);
> > + object_unparent(OBJECT(blkdev->iothread));
>
> Shouldn't this be object_unref?
>
I don't think so. I think this is required to undo what was done by calling object_property_add_child() on the root object. Looking at other code such as object_new_with_propv(), it looks like the right thing to do is to call object_unref() after calling object_property_add_child() to drop the implicit ref taken by object_new(), so I'd need to add that call in blk_alloc().
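Concretely, the blk_alloc() change would look something like this (an
untested sketch of what I describe above, not part of the posted patch):

    obj = object_new(TYPE_IOTHREAD);
    name = g_strdup_printf("iothread-%s", xendev->name);

    object_property_add_child(object_get_objects_root(), name, obj, &err);
    assert(!err);
    object_unref(obj); /* the child property now owns the sole reference */

    g_free(name);

    user_creatable_complete(obj, &err);
    assert(!err);

blk_free() would still call object_unparent() to drop the child property's
reference and finalize the iothread.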
Paul
>
> > return 0;
> > }
On Fri, 7 Jul 2017, Paul Durrant wrote:
> > -----Original Message-----
> > From: Stefano Stabellini [mailto:sstabellini@kernel.org]
> > Sent: 22 June 2017 23:15
> > To: Paul Durrant <Paul.Durrant@citrix.com>
> > Cc: xen-devel@lists.xenproject.org; qemu-devel@nongnu.org; qemu-
> > block@nongnu.org; Stefano Stabellini <sstabellini@kernel.org>; Anthony
> > Perard <anthony.perard@citrix.com>; Kevin Wolf <kwolf@redhat.com>;
> > Max Reitz <mreitz@redhat.com>; afaerber@suse.de
> > Subject: Re: [PATCH v2 3/3] xen-disk: use an IOThread per instance
> >
> > CC'ing Andreas Färber. Could you please give a quick look below at the
> > way the iothread object is instantiated and destroyed? I am no object
> > model expert and would appreciate a second opinion.
> >
>
> I have not seen any response so far.
>
> >
> > On Wed, 21 Jun 2017, Paul Durrant wrote:
> > > This patch allocates an IOThread object for each xen_disk instance and
> > > sets the AIO context appropriately on connect. This allows processing
> > > of I/O to proceed in parallel.
> > >
> > > The patch also adds tracepoints into xen_disk to make it possible to
> > > follow the state transitions of an instance in the log.
> > >
> > > Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
> > > ---
> > > Cc: Stefano Stabellini <sstabellini@kernel.org>
> > > Cc: Anthony Perard <anthony.perard@citrix.com>
> > > Cc: Kevin Wolf <kwolf@redhat.com>
> > > Cc: Max Reitz <mreitz@redhat.com>
> > >
> > > v2:
> > > - explicitly acquire and release AIO context in qemu_aio_complete() and
> > > blk_bh()
> > > ---
> > > hw/block/trace-events | 7 ++++++
> > > hw/block/xen_disk.c | 69
> > ++++++++++++++++++++++++++++++++++++++++++++-------
> > > 2 files changed, 67 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/hw/block/trace-events b/hw/block/trace-events
> > > index 65e83dc258..608b24ba66 100644
> > > --- a/hw/block/trace-events
> > > +++ b/hw/block/trace-events
> > > @@ -10,3 +10,10 @@ virtio_blk_submit_multireq(void *mrb, int start, int
> > num_reqs, uint64_t offset,
> > > # hw/block/hd-geometry.c
> > > hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p
> > LCHS %d %d %d"
> > > hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t
> > secs, int trans) "blk %p CHS %u %u %u trans %d"
> > > +
> > > +# hw/block/xen_disk.c
> > > +xen_disk_alloc(char *name) "%s"
> > > +xen_disk_init(char *name) "%s"
> > > +xen_disk_connect(char *name) "%s"
> > > +xen_disk_disconnect(char *name) "%s"
> > > +xen_disk_free(char *name) "%s"
> > > diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
> > > index 0e6513708e..8548195195 100644
> > > --- a/hw/block/xen_disk.c
> > > +++ b/hw/block/xen_disk.c
> > > @@ -27,10 +27,13 @@
> > > #include "hw/xen/xen_backend.h"
> > > #include "xen_blkif.h"
> > > #include "sysemu/blockdev.h"
> > > +#include "sysemu/iothread.h"
> > > #include "sysemu/block-backend.h"
> > > #include "qapi/error.h"
> > > #include "qapi/qmp/qdict.h"
> > > #include "qapi/qmp/qstring.h"
> > > +#include "qom/object_interfaces.h"
> > > +#include "trace.h"
> > >
> > > /* ------------------------------------------------------------- */
> > >
> > > @@ -128,6 +131,9 @@ struct XenBlkDev {
> > > DriveInfo *dinfo;
> > > BlockBackend *blk;
> > > QEMUBH *bh;
> > > +
> > > + IOThread *iothread;
> > > + AioContext *ctx;
> > > };
> > >
> > > /* ------------------------------------------------------------- */
> > > @@ -599,9 +605,12 @@ static int ioreq_runio_qemu_aio(struct ioreq
> > *ioreq);
> > > static void qemu_aio_complete(void *opaque, int ret)
> > > {
> > > struct ioreq *ioreq = opaque;
> > > + struct XenBlkDev *blkdev = ioreq->blkdev;
> > > +
> > > + aio_context_acquire(blkdev->ctx);
> >
> > I think that Paolo was right that we need an aio_context_acquire here;
> > however, the issue is that with the current code:
> >
> > blk_handle_requests -> ioreq_runio_qemu_aio -> qemu_aio_complete
> >
> > leading to aio_context_acquire being called twice on the same lock,
> > which I don't think is allowed?
>
> It resolves to a qemu_rec_mutex_lock(), which I believe is a recursive lock, so I think that's ok.
On Linux it becomes pthread_mutex_lock. The lock is created by
qemu_rec_mutex_init, which specifies PTHREAD_MUTEX_RECURSIVE, so yes, it
should be recursive. Good.
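A quick standalone check of that behaviour (illustration only, not QEMU
code):

    #include <pthread.h>
    #include <stdio.h>

    int main(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutex_t lock;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
        pthread_mutex_init(&lock, &attr);

        printf("outer lock: %d\n", pthread_mutex_lock(&lock)); /* 0 */
        printf("inner lock: %d\n", pthread_mutex_lock(&lock)); /* 0, no deadlock */

        /* must unlock once per successful lock */
        pthread_mutex_unlock(&lock);
        pthread_mutex_unlock(&lock);
        return 0;
    }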
> >
> > I think we need to get rid of the qemu_aio_complete call from
> > ioreq_runio_qemu_aio, but to do that we need to be careful with the
> > accounting of aio_inflight (today it's incremented unconditionally at
> > the beginning of ioreq_runio_qemu_aio, I think we would have to change
> > that to increment it only if presync).
> >
>
> If the lock is indeed recursive then I think we can avoid this complication.
OK
> >
> > > if (ret != 0) {
> > > - xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
> > > + xen_pv_printf(&blkdev->xendev, 0, "%s I/O error\n",
> > > ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
> > > ioreq->aio_errors++;
> > > }
> > > @@ -610,13 +619,13 @@ static void qemu_aio_complete(void *opaque, int
> > ret)
> > > if (ioreq->presync) {
> > > ioreq->presync = 0;
> > > ioreq_runio_qemu_aio(ioreq);
> > > - return;
> > > + goto done;
> > > }
> > > if (ioreq->aio_inflight > 0) {
> > > - return;
> > > + goto done;
> > > }
> > >
> > > - if (ioreq->blkdev->feature_grant_copy) {
> > > + if (blkdev->feature_grant_copy) {
> > > switch (ioreq->req.operation) {
> > > case BLKIF_OP_READ:
> > > /* in case of failure ioreq->aio_errors is increased */
> > > @@ -638,7 +647,7 @@ static void qemu_aio_complete(void *opaque, int
> > ret)
> > > }
> > >
> > > ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR :
> > BLKIF_RSP_OKAY;
> > > - if (!ioreq->blkdev->feature_grant_copy) {
> > > + if (!blkdev->feature_grant_copy) {
> > > ioreq_unmap(ioreq);
> > > }
> > > ioreq_finish(ioreq);
> > > @@ -650,16 +659,19 @@ static void qemu_aio_complete(void *opaque, int
> > ret)
> > > }
> > > case BLKIF_OP_READ:
> > > if (ioreq->status == BLKIF_RSP_OKAY) {
> > > - block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
> > > + block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
> > > } else {
> > > - block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
> > > + block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
> > > }
> > > break;
> > > case BLKIF_OP_DISCARD:
> > > default:
> > > break;
> > > }
> > > - qemu_bh_schedule(ioreq->blkdev->bh);
> > > + qemu_bh_schedule(blkdev->bh);
> > > +
> > > +done:
> > > + aio_context_release(blkdev->ctx);
> > > }
> > >
> > > static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t
> > sector_number,
> > > @@ -917,17 +929,40 @@ static void blk_handle_requests(struct XenBlkDev
> > *blkdev)
> > > static void blk_bh(void *opaque)
> > > {
> > > struct XenBlkDev *blkdev = opaque;
> > > +
> > > + aio_context_acquire(blkdev->ctx);
> > > blk_handle_requests(blkdev);
> > > + aio_context_release(blkdev->ctx);
> > > }
> > >
> > > static void blk_alloc(struct XenDevice *xendev)
> > > {
> > > struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev,
> > xendev);
> > > + Object *obj;
> > > + char *name;
> > > + Error *err = NULL;
> > > +
> > > + trace_xen_disk_alloc(xendev->name);
> > >
> > > QLIST_INIT(&blkdev->inflight);
> > > QLIST_INIT(&blkdev->finished);
> > > QLIST_INIT(&blkdev->freelist);
> > > - blkdev->bh = qemu_bh_new(blk_bh, blkdev);
> > > +
> > > + obj = object_new(TYPE_IOTHREAD);
> > > + name = g_strdup_printf("iothread-%s", xendev->name);
> > > +
> > > + object_property_add_child(object_get_objects_root(), name, obj,
> > &err);
> > > + assert(!err);
> >
> > Would it be enough to call object_ref?
> >
>
> You mean to avoid the assert? I guess so, but I think any failure here would be indicative of a larger problem.
No, I meant calling object_ref instead of object_property_add_child.
> >
> > > + g_free(name);
> > > +
> > > + user_creatable_complete(obj, &err);
> >
> > Why do we need to call this?
> >
>
> I'm not entirely sure, but looking around the object code it seemed to be a necessary part of instantiation. Maybe it is not required for iothread objects, but I could not figure that out from looking at the code, and the comments in the header suggest it is harmless if it is not required.
>
> > > + assert(!err);
> > > +
> > > + blkdev->iothread = (IOThread *)object_dynamic_cast(obj,
> > TYPE_IOTHREAD);
> > > + blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
> > > + blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
> > > +
> > > if (xen_mode != XEN_EMULATE) {
> > > batch_maps = 1;
> > > }
> > > @@ -1288,6 +1327,8 @@ static int blk_connect(struct XenDevice *xendev)
> > > blkdev->persistent_gnt_count = 0;
> > > }
> > >
> > > + blk_set_aio_context(blkdev->blk, blkdev->ctx);
> > > +
> > > xen_be_bind_evtchn(&blkdev->xendev);
> > >
> > > xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, nr-ring-ref %u, "
> > > @@ -1301,13 +1342,20 @@ static void blk_disconnect(struct XenDevice
> > *xendev)
> > > {
> > > struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev,
> > xendev);
> > >
> > > + trace_xen_disk_disconnect(xendev->name);
> > > +
> > > + aio_context_acquire(blkdev->ctx);
> > > +
> > > if (blkdev->blk) {
> > > + blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
> > > blk_detach_dev(blkdev->blk, blkdev);
> > > blk_unref(blkdev->blk);
> > > blkdev->blk = NULL;
> > > }
> > > xen_pv_unbind_evtchn(&blkdev->xendev);
> > >
> > > + aio_context_release(blkdev->ctx);
> > > +
> > > if (blkdev->sring) {
> > > xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring,
> > > blkdev->nr_ring_ref);
> > > @@ -1358,6 +1408,7 @@ static int blk_free(struct XenDevice *xendev)
> > > g_free(blkdev->dev);
> > > g_free(blkdev->devtype);
> > > qemu_bh_delete(blkdev->bh);
> > > + object_unparent(OBJECT(blkdev->iothread));
> >
> > Shouldn't this be object_unref?
> >
>
> I don't think so. I think this is required to undo what was done by calling object_property_add_child() on the root object.
Right, so if object_property_add_child is not actually required, then
you might be able to turn object_unparent into object_unref.
Unfortunately I don't know enough about QOM to be able to tell which is
the right way of doing things, but looking at
hw/block/dataplane/virtio-blk.c, it would seem that only object_ref and
object_unref are required?
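An untested paraphrase of that style applied here, relying only on the
reference returned by object_new() (I am not sure whether an IOThread can
be completed without a QOM parent, so take this as a sketch):

    /* blk_alloc() */
    obj = object_new(TYPE_IOTHREAD);   /* refcount == 1, owned by blkdev */
    user_creatable_complete(obj, &err);
    assert(!err);
    blkdev->iothread = (IOThread *)object_dynamic_cast(obj, TYPE_IOTHREAD);
    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);

    /* blk_free() */
    object_unref(OBJECT(blkdev->iothread));   /* instead of object_unparent() */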
> Looking at other code such as object_new_with_propv(), it looks like the right thing to do is to call object_unref() after calling object_property_add_child() to drop the implicit ref taken by object_new(), so I'd need to add that call in blk_alloc().
> -----Original Message-----
[snip]
> > > > + object_unparent(OBJECT(blkdev->iothread));
> > >
> > > Shouldn't this be object_unref?
> > >
> >
> > I don't think so. I think this is required to undo what was done by calling
> > object_property_add_child() on the root object.
>
> Right, so if object_property_add_child is not actually required, then
> you might be able to turn object_unparent into object_unref.
>
> Unfortunately I don't know enough about QOM to be able to tell which is
> the right way of doing things, but looking at
> hw/block/dataplane/virtio-blk.c, it would seem that only object_ref and
> object_unref are required?

I guess I can give it a try. I was working on the assumption that all
objects were required to have a parent, but maybe that's not true. Can
someone more familiar with QOM comment?

Cheers,

Paul

> > Looking at other code such as object_new_with_propv(), it looks like the
> > right thing to do is to call object_unref() after calling
> > object_property_add_child() to drop the implicit ref taken by object_new(),
> > so I'd need to add that call in blk_alloc().