From: "Richard W.M. Jones" <rjones@redhat.com>
Enable NBD multi-conn by spreading operations across multiple
connections.
(XXX) This uses a naive round-robin approach which could be improved.
For example we could look at how many requests are in flight and
assign operations to the connections with fewest. Or we could try to
estimate (based on size of requests outstanding) the load on each
connection. But this implementation doesn't do any of that.
Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
Message-ID: <20230309113946.1528247-5-rjones@redhat.com>
---
block/nbd.c | 67 +++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 49 insertions(+), 18 deletions(-)
diff --git a/block/nbd.c b/block/nbd.c
index 19da1a7a1fe..bf5bc57569c 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -1264,6 +1264,26 @@ nbd_co_request(NBDConnState *cs, NBDRequest *request,
return ret ? ret : request_ret;
}
+/*
+ * If multi-conn, choose a connection for this operation.
+ */
+static NBDConnState *choose_connection(BDRVNBDState *s)
+{
+ static size_t next;
+ size_t i;
+
+ if (s->multi_conn <= 1) {
+ return s->conns[0];
+ }
+
+ /* XXX Stupid simple round robin. */
+ i = qatomic_fetch_inc(&next);
+ i %= s->multi_conn;
+
+ assert(s->conns[i] != NULL);
+ return s->conns[i];
+}
+
static int coroutine_fn GRAPH_RDLOCK
nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
QEMUIOVector *qiov, BdrvRequestFlags flags)
@@ -1276,7 +1296,7 @@ nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
.from = offset,
.len = bytes,
};
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
assert(bytes <= NBD_MAX_BUFFER_SIZE);
@@ -1333,7 +1353,7 @@ nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
.from = offset,
.len = bytes,
};
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
assert(!(cs->info.flags & NBD_FLAG_READ_ONLY));
if (flags & BDRV_REQ_FUA) {
@@ -1359,7 +1379,7 @@ nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
.from = offset,
.len = bytes,
};
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
/* rely on max_pwrite_zeroes */
assert(bytes <= UINT32_MAX || cs->info.mode >= NBD_MODE_EXTENDED);
@@ -1391,7 +1411,13 @@ static int coroutine_fn GRAPH_RDLOCK nbd_client_co_flush(BlockDriverState *bs)
{
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
NBDRequest request = { .type = NBD_CMD_FLUSH };
- NBDConnState * const cs = s->conns[0];
+
+ /*
+ * Multi-conn (if used) guarantees that flushing on any connection
+ * flushes caches on all connections, so we can perform this
+ * operation on any.
+ */
+ NBDConnState * const cs = choose_connection(s);
if (!(cs->info.flags & NBD_FLAG_SEND_FLUSH)) {
return 0;
@@ -1412,7 +1438,7 @@ nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
.from = offset,
.len = bytes,
};
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
/* rely on max_pdiscard */
assert(bytes <= UINT32_MAX || cs->info.mode >= NBD_MODE_EXTENDED);
@@ -1433,7 +1459,7 @@ static int coroutine_fn GRAPH_RDLOCK nbd_client_co_block_status(
NBDExtent64 extent = { 0 };
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
Error *local_err = NULL;
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
NBDRequest request = {
.type = NBD_CMD_BLOCK_STATUS,
@@ -2058,7 +2084,7 @@ fail:
static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
uint32_t min = cs->info.min_block;
uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, cs->info.max_block);
@@ -2124,7 +2150,7 @@ static int coroutine_fn nbd_co_truncate(BlockDriverState *bs, int64_t offset,
BdrvRequestFlags flags, Error **errp)
{
BDRVNBDState *s = bs->opaque;
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
if (offset != cs->info.size && exact) {
error_setg(errp, "Cannot resize NBD nodes");
@@ -2207,24 +2233,29 @@ static const char *const nbd_strong_runtime_opts[] = {
static void nbd_cancel_in_flight(BlockDriverState *bs)
{
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
- NBDConnState * const cs = s->conns[0];
+ size_t i;
+ NBDConnState *cs;
- reconnect_delay_timer_del(cs);
+ for (i = 0; i < MAX_MULTI_CONN; ++i) {
+ cs = s->conns[i];
- qemu_mutex_lock(&cs->requests_lock);
- if (cs->state == NBD_CLIENT_CONNECTING_WAIT) {
- cs->state = NBD_CLIENT_CONNECTING_NOWAIT;
+ reconnect_delay_timer_del(cs);
+
+ qemu_mutex_lock(&cs->requests_lock);
+ if (cs->state == NBD_CLIENT_CONNECTING_WAIT) {
+ cs->state = NBD_CLIENT_CONNECTING_NOWAIT;
+ }
+ qemu_mutex_unlock(&cs->requests_lock);
+
+ nbd_co_establish_connection_cancel(cs->conn);
}
- qemu_mutex_unlock(&cs->requests_lock);
-
- nbd_co_establish_connection_cancel(cs->conn);
}
static void nbd_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
{
BDRVNBDState *s = bs->opaque;
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
/* The open_timer is used only during nbd_open() */
assert(!cs->open_timer);
@@ -2244,7 +2275,7 @@ static void nbd_attach_aio_context(BlockDriverState *bs,
static void nbd_detach_aio_context(BlockDriverState *bs)
{
BDRVNBDState *s = bs->opaque;
- NBDConnState * const cs = s->conns[0];
+ NBDConnState * const cs = choose_connection(s);
assert(!cs->open_timer);
assert(!cs->reconnect_delay_timer);
--
2.49.0
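
The commit message above suggests choosing the connection with the
fewest requests in flight instead of plain round robin. A rough,
untested sketch of what that could look like, assuming a hypothetical
per-connection in_flight counter that this patch does not actually add:

/*
 * Hypothetical alternative to choose_connection(): pick the connection
 * with the fewest requests currently in flight.  The in_flight counter
 * read here is an assumed field, not something the patch above adds.
 */
static NBDConnState *choose_least_loaded(BDRVNBDState *s)
{
    size_t i, best = 0;
    unsigned int best_load;

    if (s->multi_conn <= 1) {
        return s->conns[0];
    }

    best_load = qatomic_read(&s->conns[0]->in_flight);
    for (i = 1; i < s->multi_conn; i++) {
        unsigned int load = qatomic_read(&s->conns[i]->in_flight);

        if (load < best_load) {
            best_load = load;
            best = i;
        }
    }
    return s->conns[best];
}

Like choose_connection() in the patch, this assumes all s->multi_conn
slots of s->conns[] are populated; it also ignores request size, a
limitation the commit message itself points out.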
On 4/28/25 9:46 PM, Eric Blake wrote:
> From: "Richard W.M. Jones" <rjones@redhat.com>
>
> Enable NBD multi-conn by spreading operations across multiple
> connections.
>
> (XXX) This uses a naive round-robin approach which could be improved.
> For example we could look at how many requests are in flight and
> assign operations to the connections with fewest. Or we could try to
> estimate (based on size of requests outstanding) the load on each
> connection. But this implementation doesn't do any of that.
>
> Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
> Message-ID: <20230309113946.1528247-5-rjones@redhat.com>
> ---
>  block/nbd.c | 67 +++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 49 insertions(+), 18 deletions(-)
>
> diff --git a/block/nbd.c b/block/nbd.c
> index 19da1a7a1fe..bf5bc57569c 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
>
> [...]
>
> @@ -2207,24 +2233,29 @@ static const char *const nbd_strong_runtime_opts[] = {
>  static void nbd_cancel_in_flight(BlockDriverState *bs)
>  {
>      BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
> -    NBDConnState * const cs = s->conns[0];
> +    size_t i;
> +    NBDConnState *cs;
>
> -    reconnect_delay_timer_del(cs);
> +    for (i = 0; i < MAX_MULTI_CONN; ++i) {
> +        cs = s->conns[i];
>
> -    qemu_mutex_lock(&cs->requests_lock);
> -    if (cs->state == NBD_CLIENT_CONNECTING_WAIT) {
> -        cs->state = NBD_CLIENT_CONNECTING_NOWAIT;
> +        reconnect_delay_timer_del(cs);
> +

This code is causing iotests/{185,264,281} to segfault. E.g.:

> (gdb) bt
> #0  0x000055bbaec58119 in reconnect_delay_timer_del (cs=0x0) at ../block/nbd.c:205
> #1  0x000055bbaec5d8e4 in nbd_cancel_in_flight (bs=0x55bbb1458020) at ../block/nbd.c:2242
> #2  0x000055bbaec4ff16 in bdrv_cancel_in_flight (bs=0x55bbb1458020) at ../block/io.c:3737
> #3  0x000055bbaec54ec1 in mirror_cancel (job=0x55bbb21ce800, force=true) at ../block/mirror.c:1335
> #4  0x000055bbaec18278 in job_cancel_async_locked (job=0x55bbb21ce800, force=true) at ../job.c:893
> #5  0x000055bbaec18df2 in job_cancel_locked (job=0x55bbb21ce800, force=true) at ../job.c:1143
> #6  0x000055bbaec18ef3 in job_force_cancel_err_locked (job=0x55bbb21ce800, errp=0x7fff44f247a0) at ../job.c:1190
> #7  0x000055bbaec192a4 in job_finish_sync_locked (job=0x55bbb21ce800, finish=0x55bbaec18ed2 <job_force_cancel_err_locked>, errp=0x0) at ../job.c:1253
> #8  0x000055bbaec18f2e in job_cancel_sync_locked (job=0x55bbb21ce800, force=true) at ../job.c:1196
> #9  0x000055bbaec19086 in job_cancel_sync_all () at ../job.c:1214
> #10 0x000055bbaed55177 in qemu_cleanup (status=0) at ../system/runstate.c:949
> #11 0x000055bbaedd0aad in qemu_default_main (opaque=0x0) at ../system/main.c:51
> #12 0x000055bbaedd0b4f in main (argc=21, argv=0x7fff44f249d8) at ../system/main.c:80

We're dereferencing a pointer to NBDConnState that was never
initialized. So it should either be

> -    for (i = 0; i < MAX_MULTI_CONN; ++i) {
> +    for (i = 0; i < s->multi_conn; ++i) {

or

> -        cs = s->conns[i];
> +        if (s->conns[i]) {
> +            cs = s->conns[i];
> +            ...

[...]

Andrey
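
For illustration, a minimal sketch of nbd_cancel_in_flight() with both
of Andrey's suggestions folded in (bounding the loop by s->multi_conn
and skipping slots that were never opened); the actual fix queued for
the next revision may differ:

static void nbd_cancel_in_flight(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    size_t i;

    /* Only walk the connections that were actually requested... */
    for (i = 0; i < s->multi_conn; ++i) {
        NBDConnState *cs = s->conns[i];

        /* ...and skip any slot that never got an NBDConnState. */
        if (!cs) {
            continue;
        }

        reconnect_delay_timer_del(cs);

        qemu_mutex_lock(&cs->requests_lock);
        if (cs->state == NBD_CLIENT_CONNECTING_WAIT) {
            cs->state = NBD_CLIENT_CONNECTING_NOWAIT;
        }
        qemu_mutex_unlock(&cs->requests_lock);

        nbd_co_establish_connection_cancel(cs->conn);
    }
}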
On Thu, May 22, 2025 at 08:37:58PM +0300, Andrey Drobyshev wrote:
> On 4/28/25 9:46 PM, Eric Blake wrote:
> > From: "Richard W.M. Jones" <rjones@redhat.com>
> >
> > Enable NBD multi-conn by spreading operations across multiple
> > connections.
> >
> > (XXX) This uses a naive round-robin approach which could be improved.
> > For example we could look at how many requests are in flight and
> > assign operations to the connections with fewest. Or we could try to
> > estimate (based on size of requests outstanding) the load on each
> > connection. But this implementation doesn't do any of that.
> >
> > Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
> > Message-ID: <20230309113946.1528247-5-rjones@redhat.com>
> > ---
> >  block/nbd.c | 67 +++++++++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 49 insertions(+), 18 deletions(-)
> >
> > diff --git a/block/nbd.c b/block/nbd.c
> > index 19da1a7a1fe..bf5bc57569c 100644
> > --- a/block/nbd.c
> > +++ b/block/nbd.c
> >
> > [...]
> >
> > @@ -2207,24 +2233,29 @@ static const char *const nbd_strong_runtime_opts[] = {
> >  static void nbd_cancel_in_flight(BlockDriverState *bs)
> >  {
> >      BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
> > -    NBDConnState * const cs = s->conns[0];
> > +    size_t i;
> > +    NBDConnState *cs;
> >
> > -    reconnect_delay_timer_del(cs);
> > +    for (i = 0; i < MAX_MULTI_CONN; ++i) {
> > +        cs = s->conns[i];
> >
> > -    qemu_mutex_lock(&cs->requests_lock);
> > -    if (cs->state == NBD_CLIENT_CONNECTING_WAIT) {
> > -        cs->state = NBD_CLIENT_CONNECTING_NOWAIT;
> > +        reconnect_delay_timer_del(cs);
> > +
>
> This code is causing iotests/{185,264,281} to segfault. E.g.:
>
> > (gdb) bt
> > #0  0x000055bbaec58119 in reconnect_delay_timer_del (cs=0x0) at ../block/nbd.c:205
> > #1  0x000055bbaec5d8e4 in nbd_cancel_in_flight (bs=0x55bbb1458020) at ../block/nbd.c:2242
> > #2  0x000055bbaec4ff16 in bdrv_cancel_in_flight (bs=0x55bbb1458020) at ../block/io.c:3737
> > #3  0x000055bbaec54ec1 in mirror_cancel (job=0x55bbb21ce800, force=true) at ../block/mirror.c:1335
> > #4  0x000055bbaec18278 in job_cancel_async_locked (job=0x55bbb21ce800, force=true) at ../job.c:893
> > #5  0x000055bbaec18df2 in job_cancel_locked (job=0x55bbb21ce800, force=true) at ../job.c:1143
> > #6  0x000055bbaec18ef3 in job_force_cancel_err_locked (job=0x55bbb21ce800, errp=0x7fff44f247a0) at ../job.c:1190
> > #7  0x000055bbaec192a4 in job_finish_sync_locked (job=0x55bbb21ce800, finish=0x55bbaec18ed2 <job_force_cancel_err_locked>, errp=0x0) at ../job.c:1253
> > #8  0x000055bbaec18f2e in job_cancel_sync_locked (job=0x55bbb21ce800, force=true) at ../job.c:1196
> > #9  0x000055bbaec19086 in job_cancel_sync_all () at ../job.c:1214
> > #10 0x000055bbaed55177 in qemu_cleanup (status=0) at ../system/runstate.c:949
> > #11 0x000055bbaedd0aad in qemu_default_main (opaque=0x0) at ../system/main.c:51
> > #12 0x000055bbaedd0b4f in main (argc=21, argv=0x7fff44f249d8) at ../system/main.c:80
>
> We're dereferencing a pointer to NBDConnState that was never
> initialized. So it should either be
>
> > -    for (i = 0; i < MAX_MULTI_CONN; ++i) {
> > +    for (i = 0; i < s->multi_conn; ++i) {

Thanks for pointing it out; I'll fix that.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization: qemu.org | libguestfs.org
On Mon, Apr 28, 2025 at 01:46:47PM -0500, Eric Blake wrote:
[...]

This all looks similar to when I posted it before. However I noted a
couple of problems ...

> (XXX) The strategy here is very naive. Firstly if you were going to
> open them, you'd probably want to open them in parallel. Secondly it
> would make sense to delay opening until multiple parallel requests are
> seen (perhaps above some threshold), so that simple or shortlived NBD
> operations do not require multiple connections to be made.

> (XXX) This uses a naive round-robin approach which could be improved.
> For example we could look at how many requests are in flight and
> assign operations to the connections with fewest. Or we could try to
> estimate (based on size of requests outstanding) the load on each
> connection. But this implementation doesn't do any of that.

Plus there was a third rather more fundamental problem that apparently
I didn't write about. That is that connections were serialised on a
single thread (called from many coroutines). This bottleneck meant
that there wasn't very much advantage to multi-conn, compared to what
we get in libnbd / nbdcopy.

Are these fixed / planned to be fixed, especially the third?

Rich.

--
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-top is 'top' for virtual machines. Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://people.redhat.com/~rjones/virt-top
On Mon, Apr 28, 2025 at 08:27:54PM +0100, Richard W.M. Jones wrote:
> On Mon, Apr 28, 2025 at 01:46:47PM -0500, Eric Blake wrote:
> [...]
>
> This all looks similar to when I posted it before. However I noted a
> couple of problems ...
>
> > (XXX) The strategy here is very naive. Firstly if you were going to
> > open them, you'd probably want to open them in parallel. Secondly it
> > would make sense to delay opening until multiple parallel requests are
> > seen (perhaps above some threshold), so that simple or shortlived NBD
> > operations do not require multiple connections to be made.
>
> > (XXX) This uses a naive round-robin approach which could be improved.
> > For example we could look at how many requests are in flight and
> > assign operations to the connections with fewest. Or we could try to
> > estimate (based on size of requests outstanding) the load on each
> > connection. But this implementation doesn't do any of that.
>
> Plus there was a third rather more fundamental problem that apparently
> I didn't write about. That is that connections were serialised on a
> single thread (called from many coroutines). This bottleneck meant
> that there wasn't very much advantage to multi-conn, compared to what
> we get in libnbd / nbdcopy.
>
> Are these fixed / planned to be fixed, especially the third?

That indeed is what I hope to address - to provide additional patches
on top of this rebase to make it possible to specify a pool of more
than one thread using the recent multiqueue work in the block layer.
It's taking me longer than I want to get something that I'm happy with
posting.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization: qemu.org | libguestfs.org