[PATCH] block/rbd: support driver-specific reopen

Raphael Pour posted 1 patch 2 years ago
Test checkpatch passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20220413122656.3070251-1-raphael.pour@hetzner.com
Maintainers: Ilya Dryomov <idryomov@gmail.com>, Peter Lieven <pl@kamp.de>, Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>
block/rbd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 201 insertions(+), 5 deletions(-)
[PATCH] block/rbd: support driver-specific reopen
Posted by Raphael Pour 2 years ago
This patch completes the reopen functionality for an attached RBD where altered
driver options can be passed to. This is necessary to move RBDs between ceph
clusters without interrupting QEMU, where some ceph settings need to be adjusted.

The reopen_prepare method early returns if no rbd-specific driver options are
given to maintain compatible with the previous behavior by dropping all
generic block layer options. Otherwise the reopen acts similar to qemu_rbd_open.

The reopen_commit tears down the old state and replaces it with the new
one.

The reopen_abort drops an ongoing reopen.

Signed-off-by: Raphael Pour <raphael.pour@hetzner.com>
---
 block/rbd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 201 insertions(+), 5 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 6caf35cbba..e7b45d1c50 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1029,19 +1029,213 @@ out:
 static int qemu_rbd_reopen_prepare(BDRVReopenState *state,
                                    BlockReopenQueue *queue, Error **errp)
 {
-    BDRVRBDState *s = state->bs->opaque;
-    int ret = 0;
+    BDRVRBDState *new_s = state->bs->opaque;
+    BlockdevOptionsRbd *opts = NULL;
+    const QDictEntry *e;
+    Error *local_err = NULL;
+    char *keypairs, *secretid;
+    rbd_image_info_t info;
+    int r = 0;
 
-    if (s->snap && state->flags & BDRV_O_RDWR) {
+    if (new_s->snap && state->flags & BDRV_O_RDWR) {
         error_setg(errp,
                    "Cannot change node '%s' to r/w when using RBD snapshot",
                    bdrv_get_device_or_node_name(state->bs));
-        ret = -EINVAL;
+        r = -EINVAL;
     }
 
-    return ret;
+    /*
+     * Remove all keys from the generic layer which
+     * can't be converted by rbd
+     */
+    qdict_del(state->options, "driver");
+    qdict_del(state->options, "node-name");
+    qdict_del(state->options, "auto-read-only");
+    qdict_del(state->options, "discard");
+    qdict_del(state->options, "cache");
+
+    /*
+     * To maintain the compatibility prior the rbd-reopen,
+     * where the generic layer can be altered without any
+     * rbd argument given, we must early return if there
+     * aren't any rbd-specific options left.
+     */
+    if (qdict_size(state->options) == 0) {
+        return r;
+    }
+
+    new_s = state->opaque = g_new0(BDRVReopenState, 1);
+
+    keypairs = g_strdup(qdict_get_try_str(state->options, "=keyvalue-pairs"));
+    if (keypairs) {
+        qdict_del(state->options, "=keyvalue-pairs");
+    }
+
+    secretid = g_strdup(qdict_get_try_str(state->options, "password-secret"));
+    if (secretid) {
+        qdict_del(state->options, "password-secret");
+    }
+
+    r = qemu_rbd_convert_options(state->options, &opts, &local_err);
+    if (local_err) {
+        /*
+         * If keypairs are present, that means some options are present in
+         * the modern option format.  Don't attempt to parse legacy option
+         * formats, as we won't support mixed usage.
+         */
+        if (keypairs) {
+            error_propagate(errp, local_err);
+            goto out;
+        }
+
+        /*
+         * If the initial attempt to convert and process the options failed,
+         * we may be attempting to open an image file that has the rbd options
+         * specified in the older format consisting of all key/value pairs
+         * encoded in the filename.  Go ahead and attempt to parse the
+         * filename, and see if we can pull out the required options.
+         */
+        r = qemu_rbd_attempt_legacy_options(state->options, &opts, &keypairs);
+        if (r < 0) {
+            /*
+             * Propagate the original error, not the legacy parsing fallback
+             * error, as the latter was just a best-effort attempt.
+             */
+            error_propagate(errp, local_err);
+            goto out;
+        }
+        /*
+         * Take care whenever deciding to actually deprecate; once this ability
+         * is removed, we will not be able to open any images with legacy-styled
+         * backing image strings.
+         */
+        warn_report("RBD options encoded in the filename as keyvalue pairs "
+                    "is deprecated");
+    }
+
+    /*
+     * Remove the processed options from the QDict (the visitor processes
+     * _all_ options in the QDict)
+     */
+    while ((e = qdict_first(state->options))) {
+        qdict_del(state->options, e->key);
+    }
+
+    r = qemu_rbd_connect(&new_s->cluster, &new_s->io_ctx, opts,
+                         !(state->flags & BDRV_O_NOCACHE), keypairs,
+                         secretid, errp);
+    if (r < 0) {
+        goto out;
+    }
+
+    new_s->snap = g_strdup(opts->snapshot);
+    new_s->image_name = g_strdup(opts->image);
+
+    /* rbd_open is always r/w */
+    r = rbd_open(new_s->io_ctx, new_s->image_name, &new_s->image, new_s->snap);
+    if (r < 0) {
+        error_setg_errno(errp, -r, "error reading header from %s",
+                         new_s->image_name);
+        goto failed_open;
+    }
+
+    if (opts->has_encrypt) {
+#ifdef LIBRBD_SUPPORTS_ENCRYPTION
+        r = qemu_rbd_encryption_load(new_s->image, opts->encrypt, errp);
+        if (r < 0) {
+            goto failed_post_open;
+        }
+#else
+        r = -ENOTSUP;
+        error_setg(errp, "RBD library does not support image encryption");
+        goto failed_post_open;
+#endif
+    }
+
+    r = rbd_stat(new_s->image, &info, sizeof(info));
+    if (r < 0) {
+        error_setg_errno(errp, -r, "error getting image info from %s",
+                         new_s->image_name);
+        goto failed_post_open;
+    }
+    new_s->image_size = info.size;
+    new_s->object_size = info.obj_size;
+
+    /*
+     * If we are using an rbd snapshot, we must be r/o, otherwise
+     * leave as-is
+     */
+    if (new_s->snap != NULL) {
+        r = bdrv_apply_auto_read_only(state->bs, "rbd snapshots are read-only",
+                                      errp);
+        if (r < 0) {
+            goto failed_post_open;
+        }
+    }
+
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    state->bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+#endif
+
+    /* When extending regular files, we get zeros from the OS */
+    state->bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
+
+    r = 0;
+    goto out;
+
+failed_post_open:
+    rbd_close(new_s->image);
+failed_open:
+    rados_ioctx_destroy(new_s->io_ctx);
+    g_free(new_s->snap);
+    g_free(new_s->image_name);
+    rados_shutdown(new_s->cluster);
+out:
+    qapi_free_BlockdevOptionsRbd(opts);
+    g_free(keypairs);
+    g_free(secretid);
+    return r;
 }
 
+static void qemu_rbd_reopen_abort(BDRVReopenState *reopen_state)
+{
+    BDRVRBDState *s = reopen_state->bs->opaque;
+
+    if (s->io_ctx) {
+        rados_ioctx_destroy(s->io_ctx);
+    }
+
+   if (s->cluster) {
+        rados_shutdown(s->cluster);
+    }
+
+    g_free(s->snap);
+    g_free(reopen_state->opaque);
+    reopen_state->opaque = NULL;
+}
+
+static void qemu_rbd_reopen_commit(BDRVReopenState *reopen_state)
+{
+    BDRVRBDState *s = reopen_state->bs->opaque;
+    BDRVRBDState *new_s = reopen_state->opaque;
+
+    rados_aio_flush(s->io_ctx);
+
+    rbd_close(s->image);
+    rados_ioctx_destroy(s->io_ctx);
+    g_free(s->snap);
+    rados_shutdown(s->cluster);
+
+    s->io_ctx = new_s->io_ctx;
+    s->cluster = new_s->cluster;
+    s->image = new_s->image;
+    s->snap = new_s->snap;
+
+    g_free(reopen_state->opaque);
+    reopen_state->opaque = NULL;
+}
+
+
 static void qemu_rbd_close(BlockDriverState *bs)
 {
     BDRVRBDState *s = bs->opaque;
@@ -1628,6 +1822,8 @@ static BlockDriver bdrv_rbd = {
     .bdrv_file_open         = qemu_rbd_open,
     .bdrv_close             = qemu_rbd_close,
     .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
+    .bdrv_reopen_commit     = qemu_rbd_reopen_commit,
+    .bdrv_reopen_abort     = qemu_rbd_reopen_abort,
     .bdrv_co_create         = qemu_rbd_co_create,
     .bdrv_co_create_opts    = qemu_rbd_co_create_opts,
     .bdrv_has_zero_init     = bdrv_has_zero_init_1,
-- 
2.35.1
Re: [PATCH] block/rbd: support driver-specific reopen
Posted by Hanna Reitz 1 year, 10 months ago
On 13.04.22 14:26, Raphael Pour wrote:
> This patch completes the reopen functionality for an attached RBD where altered
> driver options can be passed to. This is necessary to move RBDs between ceph
> clusters without interrupting QEMU, where some ceph settings need to be adjusted.
>
> The reopen_prepare method early returns if no rbd-specific driver options are
> given to maintain compatible with the previous behavior by dropping all
> generic block layer options. Otherwise the reopen acts similar to qemu_rbd_open.
>
> The reopen_commit tears down the old state and replaces it with the new
> one.
>
> The reopen_abort drops an ongoing reopen.
>
> Signed-off-by: Raphael Pour <raphael.pour@hetzner.com>
> ---
>   block/rbd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 201 insertions(+), 5 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 6caf35cbba..e7b45d1c50 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -1029,19 +1029,213 @@ out:

I think the comment above this point (“Since RBD is currently...”) 
should either be dropped now or moved above the `if (new_s->snap && 
...)` condition.

>   static int qemu_rbd_reopen_prepare(BDRVReopenState *state,
>                                      BlockReopenQueue *queue, Error **errp)
>   {
> -    BDRVRBDState *s = state->bs->opaque;
> -    int ret = 0;
> +    BDRVRBDState *new_s = state->bs->opaque;
> +    BlockdevOptionsRbd *opts = NULL;
> +    const QDictEntry *e;
> +    Error *local_err = NULL;
> +    char *keypairs, *secretid;
> +    rbd_image_info_t info;
> +    int r = 0;
>   
> -    if (s->snap && state->flags & BDRV_O_RDWR) {
> +    if (new_s->snap && state->flags & BDRV_O_RDWR) {
>           error_setg(errp,
>                      "Cannot change node '%s' to r/w when using RBD snapshot",
>                      bdrv_get_device_or_node_name(state->bs));

Is this still the case?  I understand next to nothing about RBD, but can 
the user not make it R/W if they simultaneously decide to switch from 
snapshot to not-snapshot?

(I.e. shouldn’t we just let the generic code below figure out whether 
we’ll get an error with the whole new configuration?)

> -        ret = -EINVAL;
> +        r = -EINVAL;

If it is still relevant: Why not return the error immediately here?

If we don’t, it looks like a couple of bad things might happen below; 
like `r` getting overwritten, or `errp` getting set twice (which would 
cause an assertion failure).

>       }
>   
> -    return ret;
> +    /*
> +     * Remove all keys from the generic layer which
> +     * can't be converted by rbd
> +     */

Does any other driver do this?  Shouldn’t we leave them there so that 
the generic layer can verify that they aren’t changed?

> +    qdict_del(state->options, "driver");
> +    qdict_del(state->options, "node-name");
> +    qdict_del(state->options, "auto-read-only");
> +    qdict_del(state->options, "discard");
> +    qdict_del(state->options, "cache");

Because AFAIU this would mean that users could attempt to change e.g. 
the @cache option, and wouldn’t receive an error back, even though there 
is no support for changing it.

> +
> +    /*
> +     * To maintain the compatibility prior the rbd-reopen,
> +     * where the generic layer can be altered without any
> +     * rbd argument given,

What does “the generic layer can be altered” mean?  As far as I 
understand, it was only possible to change between read/write and 
read-only access.

>                              we must early return if there
> +     * aren't any rbd-specific options left.
> +     */
> +    if (qdict_size(state->options) == 0) {
> +        return r;
> +    }
> +
> +    new_s = state->opaque = g_new0(BDRVReopenState, 1);

This seems like it’s only “new” from this point on, but before that, it 
was the old state.  I find it confusing that a variable named “new_s” 
apparently stored the old state before this point, so if that were the 
case, I’d use a different variable (e.g. the previously existing `s`) 
for `state->bs->opaque`.

> +
> +    keypairs = g_strdup(qdict_get_try_str(state->options, "=keyvalue-pairs"));
> +    if (keypairs) {
> +        qdict_del(state->options, "=keyvalue-pairs");
> +    }
> +
> +    secretid = g_strdup(qdict_get_try_str(state->options, "password-secret"));
> +    if (secretid) {
> +        qdict_del(state->options, "password-secret");
> +    }
> +
> +    r = qemu_rbd_convert_options(state->options, &opts, &local_err);
> +    if (local_err) {
> +        /*
> +         * If keypairs are present, that means some options are present in
> +         * the modern option format.  Don't attempt to parse legacy option
> +         * formats, as we won't support mixed usage.
> +         */
> +        if (keypairs) {
> +            error_propagate(errp, local_err);
> +            goto out;
> +        }
> +
> +        /*
> +         * If the initial attempt to convert and process the options failed,
> +         * we may be attempting to open an image file that has the rbd options
> +         * specified in the older format consisting of all key/value pairs
> +         * encoded in the filename.  Go ahead and attempt to parse the
> +         * filename, and see if we can pull out the required options.
> +         */
> +        r = qemu_rbd_attempt_legacy_options(state->options, &opts, &keypairs);
> +        if (r < 0) {
> +            /*
> +             * Propagate the original error, not the legacy parsing fallback
> +             * error, as the latter was just a best-effort attempt.
> +             */
> +            error_propagate(errp, local_err);
> +            goto out;
> +        }
> +        /*
> +         * Take care whenever deciding to actually deprecate; once this ability
> +         * is removed, we will not be able to open any images with legacy-styled
> +         * backing image strings.
> +         */
> +        warn_report("RBD options encoded in the filename as keyvalue pairs "
> +                    "is deprecated");
> +    }
> +
> +    /*
> +     * Remove the processed options from the QDict (the visitor processes
> +     * _all_ options in the QDict)
> +     */
> +    while ((e = qdict_first(state->options))) {
> +        qdict_del(state->options, e->key);
> +    }

OK, I see why you’d then want to remove all non-RBD options before this 
point.  Other block drivers seem to solve this by having an 
X_runtime_opts QemuOptsList, which contains all driver-handled options, 
so they can then use qemu_opts_absorb_qdict() to extract the options 
they can handle.  (Compare e.g. qcow2_update_options_prepare().)

> +
> +    r = qemu_rbd_connect(&new_s->cluster, &new_s->io_ctx, opts,
> +                         !(state->flags & BDRV_O_NOCACHE), keypairs,
> +                         secretid, errp);

I assume that’s possible without causing issues while we still have the 
old connection open?  (I can’t test this, but I assume you did, so I’m 
just asking back for confirmation :))

> +    if (r < 0) {
> +        goto out;
> +    }
> +
> +    new_s->snap = g_strdup(opts->snapshot);
> +    new_s->image_name = g_strdup(opts->image);
> +
> +    /* rbd_open is always r/w */
> +    r = rbd_open(new_s->io_ctx, new_s->image_name, &new_s->image, new_s->snap);
> +    if (r < 0) {
> +        error_setg_errno(errp, -r, "error reading header from %s",
> +                         new_s->image_name);
> +        goto failed_open;
> +    }
> +
> +    if (opts->has_encrypt) {
> +#ifdef LIBRBD_SUPPORTS_ENCRYPTION
> +        r = qemu_rbd_encryption_load(new_s->image, opts->encrypt, errp);
> +        if (r < 0) {
> +            goto failed_post_open;
> +        }
> +#else
> +        r = -ENOTSUP;
> +        error_setg(errp, "RBD library does not support image encryption");
> +        goto failed_post_open;
> +#endif
> +    }
> +
> +    r = rbd_stat(new_s->image, &info, sizeof(info));
> +    if (r < 0) {
> +        error_setg_errno(errp, -r, "error getting image info from %s",
> +                         new_s->image_name);
> +        goto failed_post_open;
> +    }
> +    new_s->image_size = info.size;
> +    new_s->object_size = info.obj_size;
> +
> +    /*
> +     * If we are using an rbd snapshot, we must be r/o, otherwise
> +     * leave as-is
> +     */
> +    if (new_s->snap != NULL) {
> +        r = bdrv_apply_auto_read_only(state->bs, "rbd snapshots are read-only",
> +                                      errp);
> +        if (r < 0) {
> +            goto failed_post_open;
> +        }
> +    }
> +
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    state->bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
> +#endif
> +
> +    /* When extending regular files, we get zeros from the OS */
> +    state->bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
> +
> +    r = 0;
> +    goto out;

It seems to me like all of this code comes directly from 
qemu_rbd_open().  I think it should therefore be put into a new function 
that’s used by both qemu_rbd_open() and qemu_rbd_reopen_prepare().

> +
> +failed_post_open:
> +    rbd_close(new_s->image);
> +failed_open:
> +    rados_ioctx_destroy(new_s->io_ctx);
> +    g_free(new_s->snap);
> +    g_free(new_s->image_name);
> +    rados_shutdown(new_s->cluster);
> +out:
> +    qapi_free_BlockdevOptionsRbd(opts);
> +    g_free(keypairs);
> +    g_free(secretid);
> +    return r;
>   }
>   
> +static void qemu_rbd_reopen_abort(BDRVReopenState *reopen_state)
> +{
> +    BDRVRBDState *s = reopen_state->bs->opaque;

Should this not be `reopen_state->opaque`, i.e. the new state?  I 
would’ve thought in case of abort we need to leave the old state intact.

> +
> +    if (s->io_ctx) {
> +        rados_ioctx_destroy(s->io_ctx);
> +    }
> +
> +   if (s->cluster) {
> +        rados_shutdown(s->cluster);
> +    }
> +
> +    g_free(s->snap);
> +    g_free(reopen_state->opaque);
> +    reopen_state->opaque = NULL;

These two lines look as I’d’ve expected them, but that makes the 
preceding code more suspicious (i.e. we close the old state, then free 
the new one).

> +}
> +
> +static void qemu_rbd_reopen_commit(BDRVReopenState *reopen_state)
> +{
> +    BDRVRBDState *s = reopen_state->bs->opaque;
> +    BDRVRBDState *new_s = reopen_state->opaque;
> +
> +    rados_aio_flush(s->io_ctx);
> +
> +    rbd_close(s->image);
> +    rados_ioctx_destroy(s->io_ctx);
> +    g_free(s->snap);
> +    rados_shutdown(s->cluster);
> +
> +    s->io_ctx = new_s->io_ctx;
> +    s->cluster = new_s->cluster;
> +    s->image = new_s->image;
> +    s->snap = new_s->snap;
> +
> +    g_free(reopen_state->opaque);
> +    reopen_state->opaque = NULL;
> +}

This looks OK.

> +
> +
>   static void qemu_rbd_close(BlockDriverState *bs)
>   {
>       BDRVRBDState *s = bs->opaque;
> @@ -1628,6 +1822,8 @@ static BlockDriver bdrv_rbd = {
>       .bdrv_file_open         = qemu_rbd_open,
>       .bdrv_close             = qemu_rbd_close,
>       .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
> +    .bdrv_reopen_commit     = qemu_rbd_reopen_commit,
> +    .bdrv_reopen_abort     = qemu_rbd_reopen_abort,
>       .bdrv_co_create         = qemu_rbd_co_create,
>       .bdrv_co_create_opts    = qemu_rbd_co_create_opts,
>       .bdrv_has_zero_init     = bdrv_has_zero_init_1,


Re: [PATCH] block/rbd: support driver-specific reopen
Posted by Raphael Pour 1 year, 9 months ago
On 7/1/22 11:41, Hanna Reitz wrote:
> On 13.04.22 14:26, Raphael Pour wrote:
>  >>       }
>> -    return ret;
>> +    /*
>> +     * Remove all keys from the generic layer which
>> +     * can't be converted by rbd
>> +     * >
> Does any other driver do this?  Shouldn’t we leave them there so that 
> the generic layer can verify that they aren’t changed?
> 
>> +    qdict_del(state->options, "driver");
>> +    qdict_del(state->options, "node-name");
>> +    qdict_del(state->options, "auto-read-only");
>> +    qdict_del(state->options, "discard");
>> +    qdict_del(state->options, "cache");
> 
> Because AFAIU this would mean that users could attempt to change e.g. 
> the @cache option, and wouldn’t receive an error back, even though there 
> is no support for changing it.
> 
>> +
>> +    /*
>> +     * To maintain the compatibility prior the rbd-reopen,
>> +     * where the generic layer can be altered without any
>> +     * rbd argument given,
> 
> What does “the generic layer can be altered” mean?  As far as I 
> understand, it was only possible to change between read/write and 
> read-only access.
> 
>> +
>> +    keypairs = g_strdup(qdict_get_try_str(state->options, 
>> "=keyvalue-pairs"));
>> +    if (keypairs) {
>> +        qdict_del(state->options, "=keyvalue-pairs");
>> +    }
>> +
>> +    secretid = g_strdup(qdict_get_try_str(state->options, 
>> "password-secret"));
>> +    if (secretid) {
>> +        qdict_del(state->options, "password-secret");
>> +    }
>> +
>> +    r = qemu_rbd_convert_options(state->options, &opts, &local_err);
>> +    if (local_err) {
>> +        /*
>> +         * If keypairs are present, that means some options are 
>> present in
>> +         * the modern option format.  Don't attempt to parse legacy 
>> option
>> +         * formats, as we won't support mixed usage.
>> +         */
>> +        if (keypairs) {
>> +            error_propagate(errp, local_err);
>> +            goto out;
>> +        }
>> +
>> +        /*
>> +         * If the initial attempt to convert and process the options 
>> failed,
>> +         * we may be attempting to open an image file that has the 
>> rbd options
>> +         * specified in the older format consisting of all key/value 
>> pairs
>> +         * encoded in the filename.  Go ahead and attempt to parse the
>> +         * filename, and see if we can pull out the required options.
>> +         */
>> +        r = qemu_rbd_attempt_legacy_options(state->options, &opts, 
>> &keypairs);
>> +        if (r < 0) {
>> +            /*
>> +             * Propagate the original error, not the legacy parsing 
>> fallback
>> +             * error, as the latter was just a best-effort attempt.
>> +             */
>> +            error_propagate(errp, local_err);
>> +            goto out;
>> +        }
>> +        /*
>> +         * Take care whenever deciding to actually deprecate; once 
>> this ability
>> +         * is removed, we will not be able to open any images with 
>> legacy-styled
>> +         * backing image strings.
>> +         */
>> +        warn_report("RBD options encoded in the filename as keyvalue 
>> pairs "
>> +                    "is deprecated");
>> +    }
>> +
>> +    /*
>> +     * Remove the processed options from the QDict (the visitor 
>> processes
>> +     * _all_ options in the QDict)
>> +     */
>> +    while ((e = qdict_first(state->options))) {
>> +        qdict_del(state->options, e->key);
>> +    }
 >
 > OK, I see why you’d then want to remove all non-RBD options before this
 > point.  Other block drivers seem to solve this by having an
 > X_runtime_opts QemuOptsList, which contains all driver-handled options,
 > so they can then use qemu_opts_absorb_qdict() to extract the options
 > they can handle.  (Compare e.g. qcow2_update_options_prepare().)
 >

Looking through all the drivers, rbd seems to be the only one not 
absorbing its options via runtime_opts but instead using 
qemu_rbd_convert_options. The converter visits all the options from 
BlockdevOptionsRbd defined in block-core.json. If there is an unknown 
option the conversion fails with EINVAL.

The runtime_opts in contrast to the already defined json schema with the 
visitor-code-generation seem to be redundant. Do you have some insights 
why and when this redundancy is reasonable?

I came up with several alternatives:

1. add own runtime_opts:
   - pro: "the qemu way", it behaves like most of the drivers
   - con: redundancy to qemu_rbd_convert_options which does the same 
except skipping the generic block layer options and adds +1k lines
   - con: I couldn't figure out how to add non-primitive options to the 
runtime_opts like encrypt or server
2. tell visitor to ignore unknown keys (?)
3. parse rbd options manually (opposite of deleting the generic block keys)

I agree deleting the generic block opts isn't optimal.

What do you think?

Your remaining points are also reasonable and I'll submit their fix 
along with the solution to the issue above.
Re: [PATCH] block/rbd: support driver-specific reopen
Posted by Raphael Pour 1 year, 10 months ago
Hello everyone,

what do you think? Please tell me if something needs to be clarified or 
improved.

Raphael

PS: Hopefully this second reply attempt isn't messed up (first: 
https://lists.nongnu.org/archive/html/qemu-block/2022-06/msg00344.html)

On 4/13/22 14:26, Raphael Pour wrote:
> This patch completes the reopen functionality for an attached RBD where altered
> driver options can be passed to. This is necessary to move RBDs between ceph
> clusters without interrupting QEMU, where some ceph settings need to be adjusted.
> 
> The reopen_prepare method early returns if no rbd-specific driver options are
> given to maintain compatible with the previous behavior by dropping all
> generic block layer options. Otherwise the reopen acts similar to qemu_rbd_open.
> 
> The reopen_commit tears down the old state and replaces it with the new
> one.
> 
> The reopen_abort drops an ongoing reopen.
> 
> Signed-off-by: Raphael Pour <raphael.pour@hetzner.com>
> ---
>   block/rbd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 201 insertions(+), 5 deletions(-)
> 
> diff --git a/block/rbd.c b/block/rbd.c
> index 6caf35cbba..e7b45d1c50 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -1029,19 +1029,213 @@ out:
>   static int qemu_rbd_reopen_prepare(BDRVReopenState *state,
>                                      BlockReopenQueue *queue, Error **errp)
>   {
> -    BDRVRBDState *s = state->bs->opaque;
> -    int ret = 0;
> +    BDRVRBDState *new_s = state->bs->opaque;
> +    BlockdevOptionsRbd *opts = NULL;
> +    const QDictEntry *e;
> +    Error *local_err = NULL;
> +    char *keypairs, *secretid;
> +    rbd_image_info_t info;
> +    int r = 0;
>   
> -    if (s->snap && state->flags & BDRV_O_RDWR) {
> +    if (new_s->snap && state->flags & BDRV_O_RDWR) {
>           error_setg(errp,
>                      "Cannot change node '%s' to r/w when using RBD snapshot",
>                      bdrv_get_device_or_node_name(state->bs));
> -        ret = -EINVAL;
> +        r = -EINVAL;
>       }
>   
> -    return ret;
> +    /*
> +     * Remove all keys from the generic layer which
> +     * can't be converted by rbd
> +     */
> +    qdict_del(state->options, "driver");
> +    qdict_del(state->options, "node-name");
> +    qdict_del(state->options, "auto-read-only");
> +    qdict_del(state->options, "discard");
> +    qdict_del(state->options, "cache");
> +
> +    /*
> +     * To maintain the compatibility prior the rbd-reopen,
> +     * where the generic layer can be altered without any
> +     * rbd argument given, we must early return if there
> +     * aren't any rbd-specific options left.
> +     */
> +    if (qdict_size(state->options) == 0) {
> +        return r;
> +    }
> +
> +    new_s = state->opaque = g_new0(BDRVReopenState, 1);
> +
> +    keypairs = g_strdup(qdict_get_try_str(state->options, "=keyvalue-pairs"));
> +    if (keypairs) {
> +        qdict_del(state->options, "=keyvalue-pairs");
> +    }
> +
> +    secretid = g_strdup(qdict_get_try_str(state->options, "password-secret"));
> +    if (secretid) {
> +        qdict_del(state->options, "password-secret");
> +    }
> +
> +    r = qemu_rbd_convert_options(state->options, &opts, &local_err);
> +    if (local_err) {
> +        /*
> +         * If keypairs are present, that means some options are present in
> +         * the modern option format.  Don't attempt to parse legacy option
> +         * formats, as we won't support mixed usage.
> +         */
> +        if (keypairs) {
> +            error_propagate(errp, local_err);
> +            goto out;
> +        }
> +
> +        /*
> +         * If the initial attempt to convert and process the options failed,
> +         * we may be attempting to open an image file that has the rbd options
> +         * specified in the older format consisting of all key/value pairs
> +         * encoded in the filename.  Go ahead and attempt to parse the
> +         * filename, and see if we can pull out the required options.
> +         */
> +        r = qemu_rbd_attempt_legacy_options(state->options, &opts, &keypairs);
> +        if (r < 0) {
> +            /*
> +             * Propagate the original error, not the legacy parsing fallback
> +             * error, as the latter was just a best-effort attempt.
> +             */
> +            error_propagate(errp, local_err);
> +            goto out;
> +        }
> +        /*
> +         * Take care whenever deciding to actually deprecate; once this ability
> +         * is removed, we will not be able to open any images with legacy-styled
> +         * backing image strings.
> +         */
> +        warn_report("RBD options encoded in the filename as keyvalue pairs "
> +                    "is deprecated");
> +    }
> +
> +    /*
> +     * Remove the processed options from the QDict (the visitor processes
> +     * _all_ options in the QDict)
> +     */
> +    while ((e = qdict_first(state->options))) {
> +        qdict_del(state->options, e->key);
> +    }
> +
> +    r = qemu_rbd_connect(&new_s->cluster, &new_s->io_ctx, opts,
> +                         !(state->flags & BDRV_O_NOCACHE), keypairs,
> +                         secretid, errp);
> +    if (r < 0) {
> +        goto out;
> +    }
> +
> +    new_s->snap = g_strdup(opts->snapshot);
> +    new_s->image_name = g_strdup(opts->image);
> +
> +    /* rbd_open is always r/w */
> +    r = rbd_open(new_s->io_ctx, new_s->image_name, &new_s->image, new_s->snap);
> +    if (r < 0) {
> +        error_setg_errno(errp, -r, "error reading header from %s",
> +                         new_s->image_name);
> +        goto failed_open;
> +    }
> +
> +    if (opts->has_encrypt) {
> +#ifdef LIBRBD_SUPPORTS_ENCRYPTION
> +        r = qemu_rbd_encryption_load(new_s->image, opts->encrypt, errp);
> +        if (r < 0) {
> +            goto failed_post_open;
> +        }
> +#else
> +        r = -ENOTSUP;
> +        error_setg(errp, "RBD library does not support image encryption");
> +        goto failed_post_open;
> +#endif
> +    }
> +
> +    r = rbd_stat(new_s->image, &info, sizeof(info));
> +    if (r < 0) {
> +        error_setg_errno(errp, -r, "error getting image info from %s",
> +                         new_s->image_name);
> +        goto failed_post_open;
> +    }
> +    new_s->image_size = info.size;
> +    new_s->object_size = info.obj_size;
> +
> +    /*
> +     * If we are using an rbd snapshot, we must be r/o, otherwise
> +     * leave as-is
> +     */
> +    if (new_s->snap != NULL) {
> +        r = bdrv_apply_auto_read_only(state->bs, "rbd snapshots are read-only",
> +                                      errp);
> +        if (r < 0) {
> +            goto failed_post_open;
> +        }
> +    }
> +
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    state->bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
> +#endif
> +
> +    /* When extending regular files, we get zeros from the OS */
> +    state->bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
> +
> +    r = 0;
> +    goto out;
> +
> +failed_post_open:
> +    rbd_close(new_s->image);
> +failed_open:
> +    rados_ioctx_destroy(new_s->io_ctx);
> +    g_free(new_s->snap);
> +    g_free(new_s->image_name);
> +    rados_shutdown(new_s->cluster);
> +out:
> +    qapi_free_BlockdevOptionsRbd(opts);
> +    g_free(keypairs);
> +    g_free(secretid);
> +    return r;
>   }
>   
> +static void qemu_rbd_reopen_abort(BDRVReopenState *reopen_state)
> +{
> +    BDRVRBDState *s = reopen_state->bs->opaque;
> +
> +    if (s->io_ctx) {
> +        rados_ioctx_destroy(s->io_ctx);
> +    }
> +
> +   if (s->cluster) {
> +        rados_shutdown(s->cluster);
> +    }
> +
> +    g_free(s->snap);
> +    g_free(reopen_state->opaque);
> +    reopen_state->opaque = NULL;
> +}
> +
> +static void qemu_rbd_reopen_commit(BDRVReopenState *reopen_state)
> +{
> +    BDRVRBDState *s = reopen_state->bs->opaque;
> +    BDRVRBDState *new_s = reopen_state->opaque;
> +
> +    rados_aio_flush(s->io_ctx);
> +
> +    rbd_close(s->image);
> +    rados_ioctx_destroy(s->io_ctx);
> +    g_free(s->snap);
> +    rados_shutdown(s->cluster);
> +
> +    s->io_ctx = new_s->io_ctx;
> +    s->cluster = new_s->cluster;
> +    s->image = new_s->image;
> +    s->snap = new_s->snap;
> +
> +    g_free(reopen_state->opaque);
> +    reopen_state->opaque = NULL;
> +}
> +
> +
>   static void qemu_rbd_close(BlockDriverState *bs)
>   {
>       BDRVRBDState *s = bs->opaque;
> @@ -1628,6 +1822,8 @@ static BlockDriver bdrv_rbd = {
>       .bdrv_file_open         = qemu_rbd_open,
>       .bdrv_close             = qemu_rbd_close,
>       .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
> +    .bdrv_reopen_commit     = qemu_rbd_reopen_commit,
> +    .bdrv_reopen_abort     = qemu_rbd_reopen_abort,
>       .bdrv_co_create         = qemu_rbd_co_create,
>       .bdrv_co_create_opts    = qemu_rbd_co_create_opts,
>       .bdrv_has_zero_init     = bdrv_has_zero_init_1,
[PATCH] block/rbd: support driver-specific reopen
Posted by Raphael Pour 1 year, 10 months ago
Hello everyone,

what do you think? Tell me if something needs to be clarified or improved.

Raphael