Minimal realization: only one extent in server answer is supported.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
include/block/nbd.h | 33 ++++++
nbd/common.c | 10 ++
nbd/server.c | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 352 insertions(+), 1 deletion(-)
diff --git a/include/block/nbd.h b/include/block/nbd.h
index ef1698914b..b16215d17a 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -41,6 +41,12 @@ struct NBDOptionReply {
} QEMU_PACKED;
typedef struct NBDOptionReply NBDOptionReply;
+typedef struct NBDOptionReplyMetaContext {
+ NBDOptionReply h; /* h.type = NBD_REP_META_CONTEXT, h.length > 4 */
+ uint32_t context_id;
+ /* meta context name follows */
+} QEMU_PACKED NBDOptionReplyMetaContext;
+
/* Transmission phase structs
*
* Note: these are _NOT_ the same as the network representation of an NBD
@@ -105,6 +111,19 @@ typedef struct NBDStructuredError {
uint16_t message_length;
} QEMU_PACKED NBDStructuredError;
+/* Header of NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDStructuredMeta {
+ NBDStructuredReplyChunk h; /* h.length >= 12 (at least one extent) */
+ uint32_t context_id;
+ /* extents follows */
+} QEMU_PACKED NBDStructuredMeta;
+
+/* Extent chunk for NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDExtent {
+ uint32_t length;
+ uint32_t flags; /* NBD_STATE_* */
+} QEMU_PACKED NBDExtent;
+
/* Transmission (export) flags: sent from server to client during handshake,
but describe what will happen during transmission */
#define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */
@@ -136,6 +155,8 @@ typedef struct NBDStructuredError {
#define NBD_OPT_INFO (6)
#define NBD_OPT_GO (7)
#define NBD_OPT_STRUCTURED_REPLY (8)
+#define NBD_OPT_LIST_META_CONTEXT (9)
+#define NBD_OPT_SET_META_CONTEXT (10)
/* Option reply types. */
#define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value))
@@ -143,6 +164,7 @@ typedef struct NBDStructuredError {
#define NBD_REP_ACK (1) /* Data sending finished. */
#define NBD_REP_SERVER (2) /* Export description. */
#define NBD_REP_INFO (3) /* NBD_OPT_INFO/GO. */
+#define NBD_REP_META_CONTEXT (4) /* NBD_OPT_{LIST,SET}_META_CONTEXT */
#define NBD_REP_ERR_UNSUP NBD_REP_ERR(1) /* Unknown option */
#define NBD_REP_ERR_POLICY NBD_REP_ERR(2) /* Server denied */
@@ -163,6 +185,10 @@ typedef struct NBDStructuredError {
#define NBD_CMD_FLAG_FUA (1 << 0) /* 'force unit access' during write */
#define NBD_CMD_FLAG_NO_HOLE (1 << 1) /* don't punch hole on zero run */
#define NBD_CMD_FLAG_DF (1 << 2) /* don't fragment structured read */
+#define NBD_CMD_FLAG_REQ_ONE (1 << 3) /* only one extent in BLOCK_STATUS
+ * reply chunk */
+
+#define NBD_META_ID_BASE_ALLOCATION 0
/* Supported request types */
enum {
@@ -173,6 +199,7 @@ enum {
NBD_CMD_TRIM = 4,
/* 5 reserved for failed experiment NBD_CMD_CACHE */
NBD_CMD_WRITE_ZEROES = 6,
+ NBD_CMD_BLOCK_STATUS = 7,
};
#define NBD_DEFAULT_PORT 10809
@@ -200,9 +227,15 @@ enum {
#define NBD_REPLY_TYPE_NONE 0
#define NBD_REPLY_TYPE_OFFSET_DATA 1
#define NBD_REPLY_TYPE_OFFSET_HOLE 2
+#define NBD_REPLY_TYPE_BLOCK_STATUS 5
#define NBD_REPLY_TYPE_ERROR NBD_REPLY_ERR(1)
#define NBD_REPLY_TYPE_ERROR_OFFSET NBD_REPLY_ERR(2)
+/* Flags for extents (NBDExtent.flags) of NBD_REPLY_TYPE_BLOCK_STATUS,
+ * for base:allocation meta context */
+#define NBD_STATE_HOLE (1 << 0)
+#define NBD_STATE_ZERO (1 << 1)
+
static inline bool nbd_reply_type_is_error(int type)
{
return type & (1 << 15);
diff --git a/nbd/common.c b/nbd/common.c
index 6295526dd1..8c95c1d606 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -75,6 +75,10 @@ const char *nbd_opt_lookup(uint32_t opt)
return "go";
case NBD_OPT_STRUCTURED_REPLY:
return "structured reply";
+ case NBD_OPT_LIST_META_CONTEXT:
+ return "list meta context";
+ case NBD_OPT_SET_META_CONTEXT:
+ return "set meta context";
default:
return "<unknown>";
}
@@ -90,6 +94,8 @@ const char *nbd_rep_lookup(uint32_t rep)
return "server";
case NBD_REP_INFO:
return "info";
+ case NBD_REP_META_CONTEXT:
+ return "meta context";
case NBD_REP_ERR_UNSUP:
return "unsupported";
case NBD_REP_ERR_POLICY:
@@ -144,6 +150,8 @@ const char *nbd_cmd_lookup(uint16_t cmd)
return "trim";
case NBD_CMD_WRITE_ZEROES:
return "write zeroes";
+ case NBD_CMD_BLOCK_STATUS:
+ return "block status";
default:
return "<unknown>";
}
@@ -159,6 +167,8 @@ const char *nbd_reply_type_lookup(uint16_t type)
return "data";
case NBD_REPLY_TYPE_OFFSET_HOLE:
return "hole";
+ case NBD_REPLY_TYPE_BLOCK_STATUS:
+ return "block status";
case NBD_REPLY_TYPE_ERROR:
return "generic error";
case NBD_REPLY_TYPE_ERROR_OFFSET:
diff --git a/nbd/server.c b/nbd/server.c
index b9860a6dcf..f2a750eb97 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -82,6 +82,15 @@ struct NBDExport {
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
+/* NBDExportMetaContexts represents list of selected by
+ * NBD_OPT_SET_META_CONTEXT contexts to be exported. */
+typedef struct NBDExportMetaContexts {
+ char export_name[NBD_MAX_NAME_SIZE + 1];
+ bool valid; /* means that negotiation of the option finished without
+ errors */
+ bool base_allocation; /* export base:allocation context (block status) */
+} NBDExportMetaContexts;
+
struct NBDClient {
int refcount;
void (*close_fn)(NBDClient *client, bool negotiated);
@@ -102,6 +111,7 @@ struct NBDClient {
bool closing;
bool structured_reply;
+ NBDExportMetaContexts export_meta;
uint32_t opt; /* Current option being negotiated */
uint32_t optlen; /* remaining length of data in ioc for the option being
@@ -636,6 +646,201 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
return QIO_CHANNEL(tioc);
}
+/* nbd_alloc_read_size_string
+ *
+ * Read string in format
+ * uint32_t len
+ * len bytes string (not 0-terminated)
+ * String is allocated and pointer returned as @buf
+ *
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_alloc_read_size_string(NBDClient *client, char **buf,
+ Error **errp)
+{
+ int ret;
+ uint32_t len;
+
+ ret = nbd_opt_read(client, &len, sizeof(len), errp);
+ if (ret <= 0) {
+ return ret;
+ }
+ cpu_to_be32s(&len);
+
+ *buf = g_try_malloc(len + 1);
+ if (*buf == NULL) {
+ error_setg(errp, "No memory");
+ return -ENOMEM;
+ }
+ (*buf)[len] = '\0';
+
+ ret = nbd_opt_read(client, *buf, len, errp);
+ if (ret <= 0) {
+ g_free(*buf);
+ *buf = NULL;
+ }
+
+ return ret;
+}
+
+/* nbd_read_size_string
+ *
+ * Read string in format
+ * uint32_t len
+ * len bytes string (not 0-terminated)
+ *
+ * @buf should be enough to store @max_len+1
+ *
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_read_size_string(NBDClient *client, char *buf,
+ uint32_t max_len, Error **errp)
+{
+ int ret;
+ uint32_t len;
+
+ ret = nbd_opt_read(client, &len, sizeof(len), errp);
+ if (ret <= 0) {
+ return ret;
+ }
+ cpu_to_be32s(&len);
+
+ if (len > max_len) {
+ return nbd_opt_invalid(client, errp, "Invalid string length: %u", len);
+ }
+
+ ret = nbd_opt_read(client, buf, len, errp);
+ if (ret <= 0) {
+ return ret;
+ }
+ buf[len] = '\0';
+
+ return 1;
+}
+
+static int nbd_negotiate_send_meta_context(NBDClient *client,
+ const char *context,
+ uint32_t context_id,
+ Error **errp)
+{
+ NBDOptionReplyMetaContext opt;
+ struct iovec iov[] = {
+ {.iov_base = &opt, .iov_len = sizeof(opt)},
+ {.iov_base = (void *)context, .iov_len = strlen(context)}
+ };
+
+ set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
+ sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
+ stl_be_p(&opt.context_id, context_id);
+
+ return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
+}
+
+static void nbd_meta_base_query(NBDExportMetaContexts *meta, const char *query)
+{
+ if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
+ /* Note: empty query should select all contexts within base
+ * namespace. */
+ meta->base_allocation = true;
+ }
+}
+
+/* nbd_negotiate_meta_query
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_negotiate_meta_query(NBDClient *client,
+ NBDExportMetaContexts *meta, Error **errp)
+{
+ int ret;
+ char *query, *colon, *namespace, *subquery;
+
+ ret = nbd_alloc_read_size_string(client, &query, errp);
+ if (ret <= 0) {
+ return ret;
+ }
+
+ colon = strchr(query, ':');
+ if (colon == NULL) {
+ ret = nbd_opt_invalid(client, errp, "no colon in query");
+ goto out;
+ }
+ *colon = '\0';
+ namespace = query;
+ subquery = colon + 1;
+
+ if (strcmp(namespace, "base") == 0) {
+ nbd_meta_base_query(meta, subquery);
+ }
+
+out:
+ g_free(query);
+ return ret;
+}
+
+/* nbd_negotiate_meta_queries
+ * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
+ *
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_negotiate_meta_queries(NBDClient *client,
+ NBDExportMetaContexts *meta, Error **errp)
+{
+ int ret;
+ NBDExport *exp;
+ NBDExportMetaContexts local_meta;
+ uint32_t nb_queries;
+ int i;
+
+ assert(client->structured_reply);
+
+ if (meta == NULL) {
+ meta = &local_meta;
+ }
+
+ memset(meta, 0, sizeof(*meta));
+
+ ret = nbd_read_size_string(client, meta->export_name,
+ NBD_MAX_NAME_SIZE, errp);
+ if (ret <= 0) {
+ return ret;
+ }
+
+ exp = nbd_export_find(meta->export_name);
+ if (exp == NULL) {
+ return nbd_opt_invalid(client, errp,
+ "export '%s' not present", meta->export_name);
+ }
+
+ ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
+ if (ret <= 0) {
+ return ret;
+ }
+ cpu_to_be32s(&nb_queries);
+
+ for (i = 0; i < nb_queries; ++i) {
+ ret = nbd_negotiate_meta_query(client, meta, errp);
+ if (ret <= 0) {
+ return ret;
+ }
+ }
+
+ if (meta->base_allocation) {
+ ret = nbd_negotiate_send_meta_context(client, "base:allocation",
+ NBD_META_ID_BASE_ALLOCATION,
+ errp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
+ if (ret == 0) {
+ meta->valid = true;
+ }
+
+ return ret;
+}
+
/* nbd_negotiate_options
* Process all NBD_OPT_* client option commands, during fixed newstyle
* negotiation.
@@ -826,6 +1031,22 @@ static int nbd_negotiate_options(NBDClient *client, uint16_t myflags,
}
break;
+ case NBD_OPT_LIST_META_CONTEXT:
+ case NBD_OPT_SET_META_CONTEXT:
+ if (!client->structured_reply) {
+ ret = nbd_opt_invalid(
+ client, errp,
+ "request option '%s' when structured reply "
+ "is not negotiated", nbd_opt_lookup(option));
+ } else if (option == NBD_OPT_LIST_META_CONTEXT) {
+ ret = nbd_negotiate_meta_queries(client, NULL, errp);
+ } else {
+ ret = nbd_negotiate_meta_queries(client,
+ &client->export_meta,
+ errp);
+ }
+ break;
+
default:
ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
"Unsupported option 0x%" PRIx32 " (%s)",
@@ -1446,6 +1667,78 @@ static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
}
+static int blockstatus_to_extent_be(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, NBDExtent *extent)
+{
+ uint64_t tail_bytes = bytes;
+
+ while (tail_bytes) {
+ uint32_t flags;
+ int64_t num;
+ int ret = bdrv_block_status_above(bs, NULL, offset, tail_bytes, &num,
+ NULL, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
+ (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
+
+ if (tail_bytes == bytes) {
+ extent->flags = flags;
+ }
+
+ if (flags != extent->flags) {
+ break;
+ }
+
+ offset += num;
+ tail_bytes -= num;
+ }
+
+ cpu_to_be32s(&extent->flags);
+ extent->length = cpu_to_be32(bytes - tail_bytes);
+
+ return 0;
+}
+
+/* nbd_co_send_extents
+ * @extents should be in big-endian */
+static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
+ NBDExtent *extents, unsigned nb_extents,
+ uint32_t context_id, Error **errp)
+{
+ NBDStructuredMeta chunk;
+
+ struct iovec iov[] = {
+ {.iov_base = &chunk, .iov_len = sizeof(chunk)},
+ {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])}
+ };
+
+ set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_BLOCK_STATUS,
+ handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
+ stl_be_p(&chunk.context_id, context_id);
+
+ return nbd_co_send_iov(client, iov, 2, errp);
+}
+
+static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
+ BlockDriverState *bs, uint64_t offset,
+ uint64_t length, uint32_t context_id,
+ Error **errp)
+{
+ int ret;
+ NBDExtent extent;
+
+ ret = blockstatus_to_extent_be(bs, offset, length, &extent);
+ if (ret < 0) {
+ return nbd_co_send_structured_error(
+ client, handle, -ret, "can't get block status", errp);
+ }
+
+ return nbd_co_send_extents(client, handle, &extent, 1, context_id, errp);
+}
+
/* nbd_co_receive_request
* Collect a client request. Return 0 if request looks valid, -EIO to drop
* connection right away, and any other negative value to report an error to
@@ -1523,6 +1816,8 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
valid_flags |= NBD_CMD_FLAG_DF;
} else if (request->type == NBD_CMD_WRITE_ZEROES) {
valid_flags |= NBD_CMD_FLAG_NO_HOLE;
+ } else if (request->type == NBD_CMD_BLOCK_STATUS) {
+ valid_flags |= NBD_CMD_FLAG_REQ_ONE;
}
if (request->flags & ~valid_flags) {
error_setg(errp, "unsupported flags for command %s (got 0x%x)",
@@ -1650,6 +1945,19 @@ static coroutine_fn void nbd_trip(void *opaque)
}
break;
+ case NBD_CMD_BLOCK_STATUS:
+ if (client->export_meta.base_allocation) {
+ ret = nbd_co_send_block_status(req->client, request.handle,
+ blk_bs(exp->blk), request.from,
+ request.len,
+ NBD_META_ID_BASE_ALLOCATION,
+ &local_err);
+ } else {
+ ret = -EINVAL;
+ error_setg(&local_err, "CMD_BLOCK_STATUS not negotiated");
+ }
+
+ break;
default:
error_setg(&local_err, "invalid request type (%" PRIu32 ") received",
request.type);
@@ -1680,7 +1988,7 @@ reply:
ret = nbd_co_send_structured_done(req->client, request.handle,
&local_err);
}
- } else {
+ } else if (request.type != NBD_CMD_BLOCK_STATUS) {
ret = nbd_co_send_simple_reply(req->client, request.handle,
ret < 0 ? -ret : 0,
req->data, reply_data_len, &local_err);
--
2.11.1
On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
> Minimal realization: only one extent in server answer is supported.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> include/block/nbd.h | 33 ++++++
> nbd/common.c | 10 ++
> nbd/server.c | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 352 insertions(+), 1 deletion(-)
>
> @@ -200,9 +227,15 @@ enum {
> #define NBD_REPLY_TYPE_NONE 0
> #define NBD_REPLY_TYPE_OFFSET_DATA 1
> #define NBD_REPLY_TYPE_OFFSET_HOLE 2
> +#define NBD_REPLY_TYPE_BLOCK_STATUS 5
Stale; see nbd.git commit 56c77720 which changed this to 3.
> +++ b/nbd/server.c
> @@ -82,6 +82,15 @@ struct NBDExport {
>
> static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
>
> +/* NBDExportMetaContexts represents list of selected by
> + * NBD_OPT_SET_META_CONTEXT contexts to be exported. */
represents a list of contexts to be exported, as selected by
NBD_OPT_SET_META_CONTEXT.
> +typedef struct NBDExportMetaContexts {
> + char export_name[NBD_MAX_NAME_SIZE + 1];
Would this work as const char * pointing at some other storage, instead
of having to copy into this storage?
> + bool valid; /* means that negotiation of the option finished without
> + errors */
> + bool base_allocation; /* export base:allocation context (block status) */
> +} NBDExportMetaContexts;
> +
> struct NBDClient {
> @@ -636,6 +646,201 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
> return QIO_CHANNEL(tioc);
> }
>
> +/* nbd_alloc_read_size_string
> + *
> + * Read string in format
> + * uint32_t len
> + * len bytes string (not 0-terminated)
> + * String is allocated and pointer returned as @buf
> + *
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_alloc_read_size_string(NBDClient *client, char **buf,
> + Error **errp)
> +{
> + int ret;
> + uint32_t len;
> +
> + ret = nbd_opt_read(client, &len, sizeof(len), errp);
> + if (ret <= 0) {
> + return ret;
> + }
> + cpu_to_be32s(&len);
> +
> + *buf = g_try_malloc(len + 1);
I'd rather check that len is sane prior to trying to malloc. Otherwise,
a malicious client can convince us to waste time/space doing a large
malloc before we finally realize that we can't read that many bytes
after all. And in fact, if you check len in advance, you can probably
just use g_malloc() instead of g_try_malloc() (g_try_malloc() makes
sense on a 1M allocation, where we can still allocate smaller stuff in
reporting the error; but since NBD limits strings to 4k, if we fail at
allocating 4k, we are probably already so hosed that our attempts to
report the failure will also run out of memory and abort, so why not
just abort now).
> + if (*buf == NULL) {
> + error_setg(errp, "No memory");
> + return -ENOMEM;
> + }
> + (*buf)[len] = '\0';
> +
> + ret = nbd_opt_read(client, *buf, len, errp);
> + if (ret <= 0) {
> + g_free(*buf);
> + *buf = NULL;
> + }
> +
> + return ret;
> +}
> +
> +/* nbd_read_size_string
Will resume review here...
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
16.02.2018 02:02, Eric Blake wrote:
> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal realization: only one extent in server answer is supported.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>> include/block/nbd.h | 33 ++++++
>> nbd/common.c | 10 ++
>> nbd/server.c | 310
>> +++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 3 files changed, 352 insertions(+), 1 deletion(-)
>>
>
>> @@ -200,9 +227,15 @@ enum {
>> #define NBD_REPLY_TYPE_NONE 0
>> #define NBD_REPLY_TYPE_OFFSET_DATA 1
>> #define NBD_REPLY_TYPE_OFFSET_HOLE 2
>> +#define NBD_REPLY_TYPE_BLOCK_STATUS 5
>
> Stale; see nbd.git commit 56c77720 which changed this to 3.
Very unpleasant surprise. I understand that this is still experimental
extension, but actually we use it =5 in production about one year. Can
we revert it to 5?
>
>> +++ b/nbd/server.c
>> @@ -82,6 +82,15 @@ struct NBDExport {
>> static QTAILQ_HEAD(, NBDExport) exports =
>> QTAILQ_HEAD_INITIALIZER(exports);
>> +/* NBDExportMetaContexts represents list of selected by
>> + * NBD_OPT_SET_META_CONTEXT contexts to be exported. */
>
> represents a list of contexts to be exported, as selected by
> NBD_OPT_SET_META_CONTEXT.
>
>> +typedef struct NBDExportMetaContexts {
>> + char export_name[NBD_MAX_NAME_SIZE + 1];
>
> Would this work as const char * pointing at some other storage,
> instead of having to copy into this storage?
I'll think about
>
>> + bool valid; /* means that negotiation of the option finished
>> without
>> + errors */
>> + bool base_allocation; /* export base:allocation context (block
>> status) */
>> +} NBDExportMetaContexts;
>> +
>> struct NBDClient {
>
>> @@ -636,6 +646,201 @@ static QIOChannel
>> *nbd_negotiate_handle_starttls(NBDClient *client,
>> return QIO_CHANNEL(tioc);
>> }
>> +/* nbd_alloc_read_size_string
>> + *
>> + * Read string in format
>> + * uint32_t len
>> + * len bytes string (not 0-terminated)
>> + * String is allocated and pointer returned as @buf
>> + *
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_alloc_read_size_string(NBDClient *client, char **buf,
>> + Error **errp)
>> +{
>> + int ret;
>> + uint32_t len;
>> +
>> + ret = nbd_opt_read(client, &len, sizeof(len), errp);
>> + if (ret <= 0) {
>> + return ret;
>> + }
>> + cpu_to_be32s(&len);
>> +
>> + *buf = g_try_malloc(len + 1);
>
> I'd rather check that len is sane prior to trying to malloc.
> Otherwise, a malicious client can convince us to waste time/space
> doing a large malloc before we finally realize that we can't read that
> many bytes after all. And in fact, if you check len in advance, you
> can probably just use g_malloc() instead of g_try_malloc()
> (g_try_malloc() makes sense on a 1M allocation, where we can still
> allocate smaller stuff in reporting the error; but since NBD limits
> strings to 4k, if we fail at allocating 4k, we are probably already so
> hosed that our attempts to report the failure will also run out of
> memory and abort, so why not just abort now).
reasonable, will do
>
>> + if (*buf == NULL) {
>> + error_setg(errp, "No memory");
>> + return -ENOMEM;
>> + }
>> + (*buf)[len] = '\0';
>> +
>> + ret = nbd_opt_read(client, *buf, len, errp);
>> + if (ret <= 0) {
>> + g_free(*buf);
>> + *buf = NULL;
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/* nbd_read_size_string
>
> Will resume review here...
>
--
Best regards,
Vladimir
On 02/16/2018 05:09 AM, Vladimir Sementsov-Ogievskiy wrote:
> 16.02.2018 02:02, Eric Blake wrote:
>> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> Minimal realization: only one extent in server answer is supported.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>>> include/block/nbd.h | 33 ++++++
>>> nbd/common.c | 10 ++
>>> nbd/server.c | 310
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>> 3 files changed, 352 insertions(+), 1 deletion(-)
>>>
>>
>>> @@ -200,9 +227,15 @@ enum {
>>> #define NBD_REPLY_TYPE_NONE 0
>>> #define NBD_REPLY_TYPE_OFFSET_DATA 1
>>> #define NBD_REPLY_TYPE_OFFSET_HOLE 2
>>> +#define NBD_REPLY_TYPE_BLOCK_STATUS 5
>>
>> Stale; see nbd.git commit 56c77720 which changed this to 3.
>
> Very unpleasant surprise. I understand that this is still experimental
> extension, but actually we use it =5 in production about one year. Can
> we revert it to 5?
For that, you'll have to ask on the upstream NBD list. There may be
other things that turn out to need tweaking as I get through the rest of
your initial implementation, vs. what works well for interoperability,
so having a distinct number for experimental vs. actual may be
worthwhile for other reasons as well, even if it requires some glue code
on your side to handle an older server sending 5 instead of 3 to a newer
client.
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
> Minimal realization: only one extent in server answer is supported.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
continuing where I left off,
> +++ b/nbd/common.c
> @@ -75,6 +75,10 @@ const char *nbd_opt_lookup(uint32_t opt)
> return "go";
> case NBD_OPT_STRUCTURED_REPLY:
> return "structured reply";
> + case NBD_OPT_LIST_META_CONTEXT:
> + return "list meta context";
> + case NBD_OPT_SET_META_CONTEXT:
> + return "set meta context";
> default:
Should the changes to the header for new macros and to common.c for
mapping bits to names be split into a separate patch, so that someone
could backport just the new constants and then the client-side
implementation, rather than being forced to backport the rest of the
server implementation at the same time?
> +
> +/* nbd_read_size_string
> + *
> + * Read string in format
> + * uint32_t len
> + * len bytes string (not 0-terminated)
> + *
> + * @buf should be enough to store @max_len+1
> + *
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_read_size_string(NBDClient *client, char *buf,
> + uint32_t max_len, Error **errp)
Would existing code benefit from using this helper? If so, splitting it
into a separate patch, plus converting initial clients to use it, would
be worthwhile.
> +
> +static int nbd_negotiate_send_meta_context(NBDClient *client,
> + const char *context,
> + uint32_t context_id,
> + Error **errp)
No comment documenting this function?
> +{
> + NBDOptionReplyMetaContext opt;
> + struct iovec iov[] = {
> + {.iov_base = &opt, .iov_len = sizeof(opt)},
> + {.iov_base = (void *)context, .iov_len = strlen(context)}
Casting to void* looks suspicious, but I see that it is casting away
const. Okay.
> + };
> +
> + set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
> + sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
> + stl_be_p(&opt.context_id, context_id);
> +
> + return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
> +}
> +
> +static void nbd_meta_base_query(NBDExportMetaContexts *meta, const char *query)
> +{
Again, function comments are useful.
> + if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
> + /* Note: empty query should select all contexts within base
> + * namespace. */
> + meta->base_allocation = true;
From the client perspective, this handling of the empty leaf-name works
well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf nodes the
server supports), but not so well for NBD_OPT_SET_META_CONTEXT (asking
the server to send ALL base allocations, even when I don't necessarily
know how to interpret anything other than base:allocation, is a waste).
So this function needs a bool parameter that says whether it is being
invoked from _LIST (empty string okay, to advertise ALL base leaf names
back to client, which for now is just base:allocation) or from _SET
(empty string is ignored as invalid; client has to specifically ask for
base:allocation by name).
> + }
> +}
> +
> +/* nbd_negotiate_meta_query
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_negotiate_meta_query(NBDClient *client,
> + NBDExportMetaContexts *meta, Error **errp)
> +{
> + int ret;
> + char *query, *colon, *namespace, *subquery;
Is it worth stack-allocating query here, so you don't have to g_free()
it later? NBD limits the maximum string to 4k, which is a little bit
big for stack allocation (on an operating system with 4k pages,
allocating more than 4k on the stack in one function risks missing the
guard page on stack overflow), but we also have the benefit that we KNOW
that the set of meta-context namespaces that we support have a much
smaller maximum limit of what we even care about.
> +
> + ret = nbd_alloc_read_size_string(client, &query, errp);
> + if (ret <= 0) {
> + return ret;
> + }
> +
> + colon = strchr(query, ':');
> + if (colon == NULL) {
> + ret = nbd_opt_invalid(client, errp, "no colon in query");
> + goto out;
Hmm, that puts a slight wrinkle into my proposal, or else maybe it is
something I should bring up on the NBD list. If we only read 5
characters (because the max namespace WE support is "base:"), but a
client asks for namespace "X-longname:", we should gracefully ignore the
client's request; while we still want to reply with an error to a client
that asks for "garbage" with no colon at all. The question for the NBD
spec, then, is whether detecting bad client requests that didn't use
colon is mandatory for the server (meaning we MUST read the entire
namespace request, and search for the colon) or merely best effort (we
only have to read 5 characters, and if we silently ignore instead of
diagnose a bad namespace request that was longer than that, oh well).
Worded from the client, it switches to a question of whether the client
should expect the server to diagnose all requests, or must be prepared
for the server to ignore requests even where those requests are bogus.
Or, the NBD spec may change slightly to pass namespace and leafname as
separate fields, both with lengths, rather than a colon, to make it
easier for the server to skip over an unknown namespace/leaf pair
without having to parse whether a colon was present. I'll send that in
a separate email (the upstream NBD list doesn't need to see all my
review comments on this thread).
> + }
> + *colon = '\0';
> + namespace = query;
> + subquery = colon + 1;
> +
> + if (strcmp(namespace, "base") == 0) {
> + nbd_meta_base_query(meta, subquery);
> + }
> +
> +out:
> + g_free(query);
> + return ret;
> +}
> +
> +/* nbd_negotiate_meta_queries
> + * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
> + *
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_negotiate_meta_queries(NBDClient *client,
> + NBDExportMetaContexts *meta, Error **errp)
> +{
The two options can mostly share code, but see my earlier comments about
how I think you need to distinguish between list ('base:' is a valid
query) and set ('base:' is an invalid request).
> + int ret;
> + NBDExport *exp;
> + NBDExportMetaContexts local_meta;
> + uint32_t nb_queries;
> + int i;
> +
> + assert(client->structured_reply);
Hmm. Asserting here is bad if the client can get here without
negotiating structured reply (I'll have to see if you checked that in
the caller... [1])
> +
> + if (meta == NULL) {
Bikeshedding - I like writing 'if (!meta) {' as it is fewer characters.
The function documentation should mention that 'meta' may be NULL from
the caller. But (without seeing the callers yet), why? Are you arguing
that _LIST uses local_meta (because it only needs something for the
duration of the reply) while _SET supplies a pre-existing meta (that is
associated with the server state, and used when the client finally calls
NBD_OPT_GO)?
> + meta = &local_meta;
> + }
> +
> + memset(meta, 0, sizeof(*meta));
> +
> + ret = nbd_read_size_string(client, meta->export_name,
> + NBD_MAX_NAME_SIZE, errp);
Revisiting a question I raised in my first half review - you saved the
name as part of the struct because we have to later compare that the
final OPT_SET export name matches the request during OPT_GO (if they
don't match, then we have no contexts to serve after all). So a 'const
char *' won't work, but maybe the struct could use a 'char *' pointing
to malloc'd storage rather than char[MAX_NAME] that reserves array space
that is mostly unused for the typical name that is much shorter than the
maximum name length.
> + if (ret <= 0) {
> + return ret;
> + }
> +
> + exp = nbd_export_find(meta->export_name);
> + if (exp == NULL) {
> + return nbd_opt_invalid(client, errp,
> + "export '%s' not present", meta->export_name);
Wrong error; I think NBD_REP_ERR_UNKNOWN is better here.
> + }
> +
> + ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
> + if (ret <= 0) {
> + return ret;
> + }
> + cpu_to_be32s(&nb_queries);
> +
> + for (i = 0; i < nb_queries; ++i) {
> + ret = nbd_negotiate_meta_query(client, meta, errp);
> + if (ret <= 0) {
> + return ret;
> + }
> + }
Interesting choice - you parse all queries prior to preparing any
response. I guess you could also reply as you read, but it doesn't
change the fact that you have to remember what the final _SET selected
for use later on. So this works.
> +
> + if (meta->base_allocation) {
> + ret = nbd_negotiate_send_meta_context(client, "base:allocation",
> + NBD_META_ID_BASE_ALLOCATION,
> + errp);
Since earlier you have #define NBD_META_ID_BASE_ALLOCATION 0, that means
you use the context ID of 0 in response to both _LIST (where the spec
says the server SHOULD send 0, but the client SHOULD ignore the id
field) and for _SET (where the client needs a unique ID for every
separate context that actually got selected - but since we only select a
single context, we get away with it). I suspect later patches will add
additional contexts where you'll have to actually worry about context
ids (and always sending 0 in response to _LIST), but for now, this works :)
> + if (ret < 0) {
> + return ret;
> + }
> + }
> +
> + ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
> + if (ret == 0) {
> + meta->valid = true;
> + }
Interesting - meta->valid can be true even if 0 contexts were selected;
I presume we use this later on to decide whether to reply with EINVAL
vs. an empty list if a client uses NBD_CMD_BLOCK_STATUS without a valid
_SET call. Then again, the NBD spec currently states that
NBD_CMD_BLOCK_STATUS should not be used even if _SET succeeded, if the
_SET didn't return at least one context.
> +
> + return ret;
> +}
> +
> /* nbd_negotiate_options
> * Process all NBD_OPT_* client option commands, during fixed newstyle
> * negotiation.
> @@ -826,6 +1031,22 @@ static int nbd_negotiate_options(NBDClient *client, uint16_t myflags,
> }
> break;
>
> + case NBD_OPT_LIST_META_CONTEXT:
> + case NBD_OPT_SET_META_CONTEXT:
> + if (!client->structured_reply) {
> + ret = nbd_opt_invalid(
> + client, errp,
> + "request option '%s' when structured reply "
> + "is not negotiated", nbd_opt_lookup(option));
[1] okay, you did filter out clients that request out of order.
> + } else if (option == NBD_OPT_LIST_META_CONTEXT) {
> + ret = nbd_negotiate_meta_queries(client, NULL, errp);
> + } else {
> + ret = nbd_negotiate_meta_queries(client,
> + &client->export_meta,
> + errp);
Okay, so you DO have a difference on whether you pass in 'meta' based on
whether it is a local response vs. setting things for later.
> + }
> + break;
> +
> default:
> ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
> "Unsupported option 0x%" PRIx32 " (%s)",
> @@ -1446,6 +1667,78 @@ static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
> return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
> }
>
> +static int blockstatus_to_extent_be(BlockDriverState *bs, uint64_t offset,
> + uint64_t bytes, NBDExtent *extent)
> +{
> + uint64_t tail_bytes = bytes;
> +
> + while (tail_bytes) {
I might have named this 'remaining'
> + uint32_t flags;
> + int64_t num;
> + int ret = bdrv_block_status_above(bs, NULL, offset, tail_bytes, &num,
> + NULL, NULL);
> + if (ret < 0) {
> + return ret;
> + }
> +
> + flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
> + (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
Looks like the correct bit mapping to me.
> +
> + if (tail_bytes == bytes) {
> + extent->flags = flags;
> + }
> +
> + if (flags != extent->flags) {
> + break;
> + }
> +
> + offset += num;
> + tail_bytes -= num;
> + }
> +
> + cpu_to_be32s(&extent->flags);
> + extent->length = cpu_to_be32(bytes - tail_bytes);
This only computes one extent, but tries to find the longest extent
possible (if two consecutive bdrv_block_status calls return the same
status, perhaps because of fragmentation). Presumably this is called in
a loop to find a full list of extents covering the client's entire
request? [2]
> +
> + return 0;
> +}
> +
> +/* nbd_co_send_extents
> + * @extents should be in big-endian */
> +static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
> + NBDExtent *extents, unsigned nb_extents,
> + uint32_t context_id, Error **errp)
> +{
> + NBDStructuredMeta chunk;
> +
> + struct iovec iov[] = {
> + {.iov_base = &chunk, .iov_len = sizeof(chunk)},
> + {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])}
> + };
> +
> + set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_BLOCK_STATUS,
> + handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
This says you can only send a single chunk, which matches the fact that
right now you support exactly one context type. Presumably later
patches tweak this.
> + stl_be_p(&chunk.context_id, context_id);
> +
> + return nbd_co_send_iov(client, iov, 2, errp);
> +}
> +
> +static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
> + BlockDriverState *bs, uint64_t offset,
> + uint64_t length, uint32_t context_id,
> + Error **errp)
No function comment?
> +{
> + int ret;
> + NBDExtent extent;
> +
> + ret = blockstatus_to_extent_be(bs, offset, length, &extent);
> + if (ret < 0) {
> + return nbd_co_send_structured_error(
> + client, handle, -ret, "can't get block status", errp);
> + }
> +
> + return nbd_co_send_extents(client, handle, &extent, 1, context_id, errp);
[2] Okay, no looping to answer the client's entire request, so that may
be something you add in a later patch, to keep this initial patch
minimal. But at least it matches your commit message.
> +}
> +
> /* nbd_co_receive_request
> * Collect a client request. Return 0 if request looks valid, -EIO to drop
> * connection right away, and any other negative value to report an error to
> @@ -1523,6 +1816,8 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
> valid_flags |= NBD_CMD_FLAG_DF;
> } else if (request->type == NBD_CMD_WRITE_ZEROES) {
> valid_flags |= NBD_CMD_FLAG_NO_HOLE;
> + } else if (request->type == NBD_CMD_BLOCK_STATUS) {
> + valid_flags |= NBD_CMD_FLAG_REQ_ONE;
> }
> if (request->flags & ~valid_flags) {
> error_setg(errp, "unsupported flags for command %s (got 0x%x)",
> @@ -1650,6 +1945,19 @@ static coroutine_fn void nbd_trip(void *opaque)
> }
>
> break;
> + case NBD_CMD_BLOCK_STATUS:
> + if (client->export_meta.base_allocation) {
> + ret = nbd_co_send_block_status(req->client, request.handle,
> + blk_bs(exp->blk), request.from,
> + request.len,
> + NBD_META_ID_BASE_ALLOCATION,
> + &local_err);
No check of client->export_meta.valid?
> + } else {
> + ret = -EINVAL;
> + error_setg(&local_err, "CMD_BLOCK_STATUS not negotiated");
> + }
At this point, we've either sent the final structured chunk, or we've
sent nothing and have ret < 0;...
> +
> + break;
> default:
> error_setg(&local_err, "invalid request type (%" PRIu32 ") received",
> request.type);
> @@ -1680,7 +1988,7 @@ reply:
if (client->structured_reply &&
(ret < 0 || request.type == NBD_CMD_READ)) {
> ret = nbd_co_send_structured_done(req->client, request.handle,
> &local_err);
> }
...iIf the client negotiated structured reply, then we've sent the error
message. But if the client did NOT negotiate structured reply, then...
> - } else {
> + } else if (request.type != NBD_CMD_BLOCK_STATUS) {
...we fail to reply at all. Oops. That's not good for
interoperability. You'll need to make sure that we reply with EINVAL
even if the client (mistakenly) calls NBD_CMD_BLOCK_STATUS without
negotiating structured replies.
> ret = nbd_co_send_simple_reply(req->client, request.handle,
> ret < 0 ? -ret : 0,
> req->data, reply_data_len, &local_err);
>
Also missing from the patch - where do you validate that the export name
in client->export_meta matches the export name in NBD_OPT_GO? If they
don't match, then export_meta should be discarded.
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
16.02.2018 16:21, Eric Blake wrote:
> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal realization: only one extent in server answer is supported.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>
> continuing where I left off,
>
>
>> +++ b/nbd/common.c
>> @@ -75,6 +75,10 @@ const char *nbd_opt_lookup(uint32_t opt)
>> return "go";
>> case NBD_OPT_STRUCTURED_REPLY:
>> return "structured reply";
>> + case NBD_OPT_LIST_META_CONTEXT:
>> + return "list meta context";
>> + case NBD_OPT_SET_META_CONTEXT:
>> + return "set meta context";
>> default:
>
> Should the changes to the header for new macros and to common.c for
> mapping bits to names be split into a separate patch, so that someone
> could backport just the new constants and then the client-side
> implementation, rather than being forced to backport the rest of the
> server implementation at the same time?
Not a problem, I can split. I've thought about it, but decided for first
roll send it as one patch.
>
>> +
>> +/* nbd_read_size_string
>> + *
>> + * Read string in format
>> + * uint32_t len
>> + * len bytes string (not 0-terminated)
>> + *
>> + * @buf should be enough to store @max_len+1
>> + *
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_read_size_string(NBDClient *client, char *buf,
>> + uint32_t max_len, Error **errp)
>
> Would existing code benefit from using this helper? If so, splitting
> it into a separate patch, plus converting initial clients to use it,
> would be worthwhile.
>
>
>> +
>> +static int nbd_negotiate_send_meta_context(NBDClient *client,
>> + const char *context,
>> + uint32_t context_id,
>> + Error **errp)
>
> No comment documenting this function?
>
>> +{
>> + NBDOptionReplyMetaContext opt;
>> + struct iovec iov[] = {
>> + {.iov_base = &opt, .iov_len = sizeof(opt)},
>> + {.iov_base = (void *)context, .iov_len = strlen(context)}
>
> Casting to void* looks suspicious, but I see that it is casting away
> const. Okay.
>
>> + };
>> +
>> + set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
>> + sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
>> + stl_be_p(&opt.context_id, context_id);
>> +
>> + return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ?
>> -EIO : 0;
>> +}
>> +
>> +static void nbd_meta_base_query(NBDExportMetaContexts *meta, const
>> char *query)
>> +{
>
> Again, function comments are useful.
>
>> + if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
>> + /* Note: empty query should select all contexts within base
>> + * namespace. */
>> + meta->base_allocation = true;
>
> From the client perspective, this handling of the empty leaf-name
> works well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf
> nodes the server supports), but not so well for
> NBD_OPT_SET_META_CONTEXT (asking the server to send ALL base
> allocations, even when I don't necessarily know how to interpret
> anything other than base:allocation, is a waste). So this function
> needs a bool parameter that says whether it is being invoked from
> _LIST (empty string okay, to advertise ALL base leaf names back to
> client, which for now is just base:allocation) or from _SET (empty
> string is ignored as invalid; client has to specifically ask for
> base:allocation by name).
"empty string is ignored as invalid", hm, do we have this in spec? I
think SET and LIST must select exactly same sets of contexts. It is
strange behavior of client to set "base:", but it is its decision. And I
don't thing that it is invalid.
Formally we may answer with NBD_REP_ERR_TOO_BIG, but it will look weird,
as client see that both base: and base:allocation returns _one_
context, but in one case it is too big. But if we will have several
base: contextes, server may fairly answer with NBD_REP_ERR_TOO_BIG.
So, I think for now the code is ok.
Also, I don't see NBD_REP_ERR_TOO_BIG possible reply in
NBD_OPT_LIST_META_CONTEXT description. Should it be here?
>
>> + }
>> +}
>> +
>> +/* nbd_negotiate_meta_query
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_negotiate_meta_query(NBDClient *client,
>> + NBDExportMetaContexts *meta,
>> Error **errp)
>> +{
>> + int ret;
>> + char *query, *colon, *namespace, *subquery;
>
> Is it worth stack-allocating query here, so you don't have to g_free()
> it later? NBD limits the maximum string to 4k, which is a little bit
> big for stack allocation (on an operating system with 4k pages,
> allocating more than 4k on the stack in one function risks missing the
> guard page on stack overflow), but we also have the benefit that we
> KNOW that the set of meta-context namespaces that we support have a
> much smaller maximum limit of what we even care about.
it is not stack allocated, nbd_alloc_read_size_string calls g_malloc.
>
>> +
>> + ret = nbd_alloc_read_size_string(client, &query, errp);
>> + if (ret <= 0) {
>> + return ret;
>> + }
>> +
>> + colon = strchr(query, ':');
>> + if (colon == NULL) {
>> + ret = nbd_opt_invalid(client, errp, "no colon in query");
>> + goto out;
>
> Hmm, that puts a slight wrinkle into my proposal, or else maybe it is
> something I should bring up on the NBD list. If we only read 5
> characters (because the max namespace WE support is "base:"), but a
> client asks for namespace "X-longname:", we should gracefully ignore
> the client's request; while we still want to reply with an error to a
> client that asks for "garbage" with no colon at all. The question for
> the NBD spec, then, is whether detecting bad client requests that
> didn't use colon is mandatory for the server (meaning we MUST read the
> entire namespace request, and search for the colon) or merely best
> effort (we only have to read 5 characters, and if we silently ignore
> instead of diagnose a bad namespace request that was longer than that,
> oh well). Worded from the client, it switches to a question of whether
> the client should expect the server to diagnose all requests, or must
> be prepared for the server to ignore requests even where those
> requests are bogus. Or, the NBD spec may change slightly to pass
> namespace and leafname as separate fields, both with lengths, rather
> than a colon, to make it easier for the server to skip over an unknown
> namespace/leaf pair without having to parse whether a colon was
> present. I'll send that in a separate email (the upstream NBD list
> doesn't need to see all my review comments on this thread).
>
>> + }
>> + *colon = '\0';
>> + namespace = query;
>> + subquery = colon + 1;
>> +
>> + if (strcmp(namespace, "base") == 0) {
>> + nbd_meta_base_query(meta, subquery);
>> + }
>> +
>> +out:
>> + g_free(query);
>> + return ret;
>> +}
>> +
>> +/* nbd_negotiate_meta_queries
>> + * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
>> + *
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_negotiate_meta_queries(NBDClient *client,
>> + NBDExportMetaContexts *meta,
>> Error **errp)
>> +{
>
> The two options can mostly share code, but see my earlier comments
> about how I think you need to distinguish between list ('base:' is a
> valid query) and set ('base:' is an invalid request).
>
>> + int ret;
>> + NBDExport *exp;
>> + NBDExportMetaContexts local_meta;
>> + uint32_t nb_queries;
>> + int i;
>> +
>> + assert(client->structured_reply);
>
> Hmm. Asserting here is bad if the client can get here without
> negotiating structured reply (I'll have to see if you checked that in
> the caller... [1])
>
>> +
>> + if (meta == NULL) {
>
> Bikeshedding - I like writing 'if (!meta) {' as it is fewer characters.
>
> The function documentation should mention that 'meta' may be NULL from
> the caller. But (without seeing the callers yet), why? Are you
> arguing that _LIST uses local_meta (because it only needs something
> for the duration of the reply) while _SET supplies a pre-existing meta
> (that is associated with the server state, and used when the client
> finally calls NBD_OPT_GO)?
yes.
>
>> + meta = &local_meta;
>> + }
>> +
>> + memset(meta, 0, sizeof(*meta));
>> +
>> + ret = nbd_read_size_string(client, meta->export_name,
>> + NBD_MAX_NAME_SIZE, errp);
>
> Revisiting a question I raised in my first half review - you saved the
> name as part of the struct because we have to later compare that the
> final OPT_SET export name matches the request during OPT_GO (if they
> don't match, then we have no contexts to serve after all). So a
> 'const char *' won't work, but maybe the struct could use a 'char *'
> pointing to malloc'd storage rather than char[MAX_NAME] that reserves
> array space that is mostly unused for the typical name that is much
> shorter than the maximum name length.
>
>> + if (ret <= 0) {
>> + return ret;
>> + }
>> +
>> + exp = nbd_export_find(meta->export_name);
>> + if (exp == NULL) {
>> + return nbd_opt_invalid(client, errp,
>> + "export '%s' not present",
>> meta->export_name);
>
> Wrong error; I think NBD_REP_ERR_UNKNOWN is better here.
>
>> + }
>> +
>> + ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
>> + if (ret <= 0) {
>> + return ret;
>> + }
>> + cpu_to_be32s(&nb_queries);
>> +
>> + for (i = 0; i < nb_queries; ++i) {
>> + ret = nbd_negotiate_meta_query(client, meta, errp);
>> + if (ret <= 0) {
>> + return ret;
>> + }
>> + }
>
> Interesting choice - you parse all queries prior to preparing any
> response. I guess you could also reply as you read, but it doesn't
> change the fact that you have to remember what the final _SET selected
> for use later on. So this works.
>
>> +
>> + if (meta->base_allocation) {
>> + ret = nbd_negotiate_send_meta_context(client,
>> "base:allocation",
>> + NBD_META_ID_BASE_ALLOCATION,
>> + errp);
>
> Since earlier you have #define NBD_META_ID_BASE_ALLOCATION 0, that
> means you use the context ID of 0 in response to both _LIST (where the
> spec says the server SHOULD send 0, but the client SHOULD ignore the
> id field) and for _SET (where the client needs a unique ID for every
> separate context that actually got selected - but since we only select
> a single context, we get away with it). I suspect later patches will
> add additional contexts where you'll have to actually worry about
> context ids (and always sending 0 in response to _LIST), but for now,
> this works :)
>
>> + if (ret < 0) {
>> + return ret;
>> + }
>> + }
>> +
>> + ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
>> + if (ret == 0) {
>> + meta->valid = true;
>> + }
>
> Interesting - meta->valid can be true even if 0 contexts were
> selected; I presume we use this later on to decide whether to reply
> with EINVAL vs. an empty list if a client uses NBD_CMD_BLOCK_STATUS
> without a valid _SET call. Then again, the NBD spec currently states
> that NBD_CMD_BLOCK_STATUS should not be used even if _SET succeeded,
> if the _SET didn't return at least one context.
>
>> +
>> + return ret;
>> +}
>> +
>> /* nbd_negotiate_options
>> * Process all NBD_OPT_* client option commands, during fixed newstyle
>> * negotiation.
>> @@ -826,6 +1031,22 @@ static int nbd_negotiate_options(NBDClient
>> *client, uint16_t myflags,
>> }
>> break;
>> + case NBD_OPT_LIST_META_CONTEXT:
>> + case NBD_OPT_SET_META_CONTEXT:
>> + if (!client->structured_reply) {
>> + ret = nbd_opt_invalid(
>> + client, errp,
>> + "request option '%s' when structured
>> reply "
>> + "is not negotiated",
>> nbd_opt_lookup(option));
>
> [1] okay, you did filter out clients that request out of order.
>
>> + } else if (option == NBD_OPT_LIST_META_CONTEXT) {
>> + ret = nbd_negotiate_meta_queries(client, NULL,
>> errp);
>> + } else {
>> + ret = nbd_negotiate_meta_queries(client,
>> + &client->export_meta,
>> + errp);
>
> Okay, so you DO have a difference on whether you pass in 'meta' based
> on whether it is a local response vs. setting things for later.
>
>> + }
>> + break;
>> +
>> default:
>> ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
>> "Unsupported option 0x%" PRIx32
>> " (%s)",
>> @@ -1446,6 +1667,78 @@ static int coroutine_fn
>> nbd_co_send_structured_error(NBDClient *client,
>> return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
>> }
>> +static int blockstatus_to_extent_be(BlockDriverState *bs, uint64_t
>> offset,
>> + uint64_t bytes, NBDExtent *extent)
>> +{
>> + uint64_t tail_bytes = bytes;
>> +
>> + while (tail_bytes) {
>
> I might have named this 'remaining'
>
>> + uint32_t flags;
>> + int64_t num;
>> + int ret = bdrv_block_status_above(bs, NULL, offset,
>> tail_bytes, &num,
>> + NULL, NULL);
>> + if (ret < 0) {
>> + return ret;
>> + }
>> +
>> + flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
>> + (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
>
> Looks like the correct bit mapping to me.
>
>> +
>> + if (tail_bytes == bytes) {
>> + extent->flags = flags;
>> + }
>> +
>> + if (flags != extent->flags) {
>> + break;
>> + }
>> +
>> + offset += num;
>> + tail_bytes -= num;
>> + }
>> +
>> + cpu_to_be32s(&extent->flags);
>> + extent->length = cpu_to_be32(bytes - tail_bytes);
>
> This only computes one extent, but tries to find the longest extent
> possible (if two consecutive bdrv_block_status calls return the same
> status, perhaps because of fragmentation). Presumably this is called
> in a loop to find a full list of extents covering the client's entire
> request? [2]
yes. And tiny overhead is one extra found extent of other type, so this
function can't be efficiently reused to implement several extents.
However, I don't see real usage for this now, as we don't have multi
extent blockstatus support in block layer.. And code for dirty bitmap
would be separate anyway.
>
>> +
>> + return 0;
>> +}
>> +
>> +/* nbd_co_send_extents
>> + * @extents should be in big-endian */
>> +static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
>> + NBDExtent *extents, unsigned nb_extents,
>> + uint32_t context_id, Error **errp)
>> +{
>> + NBDStructuredMeta chunk;
>> +
>> + struct iovec iov[] = {
>> + {.iov_base = &chunk, .iov_len = sizeof(chunk)},
>> + {.iov_base = extents, .iov_len = nb_extents *
>> sizeof(extents[0])}
>> + };
>> +
>> + set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE,
>> NBD_REPLY_TYPE_BLOCK_STATUS,
>> + handle, sizeof(chunk) - sizeof(chunk.h) +
>> iov[1].iov_len);
>
> This says you can only send a single chunk, which matches the fact
> that right now you support exactly one context type. Presumably later
> patches tweak this.
>
>> + stl_be_p(&chunk.context_id, context_id);
>> +
>> + return nbd_co_send_iov(client, iov, 2, errp);
>> +}
>> +
>> +static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
>> + BlockDriverState *bs, uint64_t
>> offset,
>> + uint64_t length, uint32_t
>> context_id,
>> + Error **errp)
>
> No function comment?
>
>> +{
>> + int ret;
>> + NBDExtent extent;
>> +
>> + ret = blockstatus_to_extent_be(bs, offset, length, &extent);
>> + if (ret < 0) {
>> + return nbd_co_send_structured_error(
>> + client, handle, -ret, "can't get block status", errp);
>> + }
>> +
>> + return nbd_co_send_extents(client, handle, &extent, 1,
>> context_id, errp);
>
> [2] Okay, no looping to answer the client's entire request, so that
> may be something you add in a later patch, to keep this initial patch
> minimal. But at least it matches your commit message.
>
>> +}
>> +
>> /* nbd_co_receive_request
>> * Collect a client request. Return 0 if request looks valid, -EIO
>> to drop
>> * connection right away, and any other negative value to report an
>> error to
>> @@ -1523,6 +1816,8 @@ static int
>> nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
>> valid_flags |= NBD_CMD_FLAG_DF;
>> } else if (request->type == NBD_CMD_WRITE_ZEROES) {
>> valid_flags |= NBD_CMD_FLAG_NO_HOLE;
>> + } else if (request->type == NBD_CMD_BLOCK_STATUS) {
>> + valid_flags |= NBD_CMD_FLAG_REQ_ONE;
>> }
>> if (request->flags & ~valid_flags) {
>> error_setg(errp, "unsupported flags for command %s (got
>> 0x%x)",
>> @@ -1650,6 +1945,19 @@ static coroutine_fn void nbd_trip(void *opaque)
>> }
>> break;
>> + case NBD_CMD_BLOCK_STATUS:
>> + if (client->export_meta.base_allocation) {
>> + ret = nbd_co_send_block_status(req->client, request.handle,
>> + blk_bs(exp->blk),
>> request.from,
>> + request.len,
>> + NBD_META_ID_BASE_ALLOCATION,
>> + &local_err);
>
> No check of client->export_meta.valid?
hmm, strange, looks like I don't use .valid at all. worth add.
>
>> + } else {
>> + ret = -EINVAL;
>> + error_setg(&local_err, "CMD_BLOCK_STATUS not negotiated");
>> + }
>
> At this point, we've either sent the final structured chunk, or we've
> sent nothing and have ret < 0;...
>
>> +
>> + break;
>> default:
>> error_setg(&local_err, "invalid request type (%" PRIu32 ")
>> received",
>> request.type);
>> @@ -1680,7 +1988,7 @@ reply:
>
> if (client->structured_reply &&
> (ret < 0 || request.type == NBD_CMD_READ)) {
>
>> ret = nbd_co_send_structured_done(req->client,
>> request.handle,
>> &local_err);
>> }
>
> ...iIf the client negotiated structured reply, then we've sent the
> error message. But if the client did NOT negotiate structured reply,
> then...
>
>> - } else {
>> + } else if (request.type != NBD_CMD_BLOCK_STATUS) {
>
> ...we fail to reply at all. Oops. That's not good for
> interoperability. You'll need to make sure that we reply with EINVAL
> even if the client (mistakenly) calls NBD_CMD_BLOCK_STATUS without
> negotiating structured replies.
good caught! I felt that I'm doing something ugly, when I was writing
this if. I'll think about how to refactor this a bit.
>
>> ret = nbd_co_send_simple_reply(req->client, request.handle,
>> ret < 0 ? -ret : 0,
>> req->data, reply_data_len,
>> &local_err);
>>
>
> Also missing from the patch - where do you validate that the export
> name in client->export_meta matches the export name in NBD_OPT_GO? If
> they don't match, then export_meta should be discarded.
>
hm, me too.
Thank you for careful review!
--
Best regards,
Vladimir
On 02/16/2018 08:43 AM, Vladimir Sementsov-Ogievskiy wrote:
> 16.02.2018 16:21, Eric Blake wrote:
>> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> Minimal realization: only one extent in server answer is supported.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>> Again, function comments are useful.
>>
>>> + if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
>>> + /* Note: empty query should select all contexts within base
>>> + * namespace. */
>>> + meta->base_allocation = true;
>>
>> From the client perspective, this handling of the empty leaf-name
>> works well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf
>> nodes the server supports), but not so well for
>> NBD_OPT_SET_META_CONTEXT (asking the server to send ALL base
>> allocations, even when I don't necessarily know how to interpret
>> anything other than base:allocation, is a waste). So this function
>> needs a bool parameter that says whether it is being invoked from
>> _LIST (empty string okay, to advertise ALL base leaf names back to
>> client, which for now is just base:allocation) or from _SET (empty
>> string is ignored as invalid; client has to specifically ask for
>> base:allocation by name).
>
> "empty string is ignored as invalid", hm, do we have this in spec? I
> think SET and LIST must select exactly same sets of contexts.
No, it is specifically NOT intended that SET and LIST have to produce
the same set of contexts; although I do see your point that as currently
written, it does not appear to require SET to ignore "base:" or to treat
it as an error. At any rate, the spec gives the example of:
> In either case, however, for any given namespace the server MAY, instead of exhaustively listing every matching context available to select (or every context available to select where no query is given), send sufficient context records back to allow a client with knowledge of the namespace to select any context. This may be helpful where a client can construct algorithmic queries. For instance, a client might reply simply with the namespace with no leaf-name (e.g. 'X-FooBar:') or with a range of values (e.g. 'X-ModifiedDate:20160310-20161214'). The semantics of such a reply are a matter for the definition of the namespace. However each namespace returned MUST begin with the relevant namespace, followed by a colon, and then other UTF-8 characters, with the entire string following the restrictions for strings set out earlier in this document.
with the intent being that for some namespaces, it may be easy to
perform an algorithmic query via _LIST to see what ranges are supported,
but that you cannot select ALL elements in the range simultaneously.
The empty query for _LIST exists to enumerate what is supported, but
does not have to equate to an empty query for _SET selecting everything
possible. I could even see it being possible to have some round-trips,
depending on the namespace (of course, any namespace other than "base:"
will be tightly coordinated between both client and server, so they
understand each other - the point was that the NBD spec didn't want to
constrain what a client and server could do as long as they stay within
the generic framework):
C> LIST ""
S> REPLY "base:allocation" id 0
S> REPLY "X-FooBar:" id 0
S> ACK
C> LIST "X-FooBar:"
S> REPLY "X-FooBar:A_Required", id 0
S> REPLY "X-FooBar:A_Min=100", id 0
S> REPLY "X-FooBar:A_Max=200", id 0
S> REPLY "X-FooBar:B_Default=300", id 0
S> REPLY "X-FooBar:B_Min=300", id 0
S> REPLY "X-FooBar:B_Max=400", id 0
S> ACK
C> SET "X-FooBar:A=150" "base:allocation"
S> REPLY "X-FooBar:A=150,B=300", id 1
S> REPLY "base:allocation", id 2
S> ACK
where the global query of all available contexts merely lists that
X-FooBar: is understood, but that a specific query is needed for more
details (to avoid the client having to parse those specifics if it
doesn't care about X-FooBar:), and the specific query sets up the
algorithmic description (parameter A is required, between 100 and 200;
parameter B is optional, between 300 and 400, defaulting to 300), and
the final SET gives the actual request (A given a value, B left to its
default; but the reply names the entire context rather than repeating
the shorter request). So the spec is written to permit something like
that for third-party namespaces, while also trying to be very specific
about the "base:" context as that is the one that needs the most
interoperability.
> It is
> strange behavior of client to set "base:", but it is its decision. And I
> don't thing that it is invalid.
For LIST, querying "base:" makes total sense (for the sake of example,
we may add "base:frob" down the road that does something new. Being
able to LIST whether "base:" turns into just "base:allocation" or into
"base:allocation"+"base:frob" may be useful to a client that understands
both formats and wants to probe if the server is new; and even for a
client right now, the client can gracefully note that it doesn't want to
select "base:frob"). But for SET, it does not (if "base:" turns into
"base:allocation" + "base:frob" down the road, then the server is
wasting time preparing the response to "base:frob" for every
NBD_CMD_BLOCK_STATUS, and the client is wasting time unpacking from the
wire and ignoring it), so having the empty query work on LIST but not on
SET makes sense.
> Formally we may answer with NBD_REP_ERR_TOO_BIG, but it will look weird,
> as client see that both base: and base:allocation returns _one_ context,
> but in one case it is too big. But if we will have several base:
> contextes, server may fairly answer with NBD_REP_ERR_TOO_BIG.
Hmm, you have a point that while a client can ask for "namespace:", the
server should always respond with "namespace:leaf" for the actual
contexts that it supports/selects, so that the client knows which leaves
it actually got, if it does not fail with ERR_TOO_BIG. You are also
right that failing with ERR_TOO_BIG for "base:" seems odd, but it may
make more sense for other namespaces.
>
> So, I think for now the code is ok.
Then this is probably worth something to bring up on the NBD list, if we
need to tweak wording to be more explicit (whether we should
allow/forbid wildcards during SET, or if wildcard queries are intended
only for LIST). Sounds like I have more spec emails to write to the NBD
list.
>
> Also, I don't see NBD_REP_ERR_TOO_BIG possible reply in
> NBD_OPT_LIST_META_CONTEXT description. Should it be here?
Yeah, that's probably also worth adding to the upstream spec, even
though it already encourages LIST results to send compressed information
back that allows a client to contruct valid specific queries, rather
than an exhaustive list of selecting everything possible.
>>> +/* nbd_negotiate_meta_query
>>> + * Return -errno on I/O error, 0 if option was completely handled by
>>> + * sending a reply about inconsistent lengths, or 1 on success. */
>>> +static int nbd_negotiate_meta_query(NBDClient *client,
>>> + NBDExportMetaContexts *meta,
>>> Error **errp)
>>> +{
>>> + int ret;
>>> + char *query, *colon, *namespace, *subquery;
>>
>> Is it worth stack-allocating query here, so you don't have to g_free()
>> it later? NBD limits the maximum string to 4k, which is a little bit
>> big for stack allocation (on an operating system with 4k pages,
>> allocating more than 4k on the stack in one function risks missing the
>> guard page on stack overflow), but we also have the benefit that we
>> KNOW that the set of meta-context namespaces that we support have a
>> much smaller maximum limit of what we even care about.
>
> it is not stack allocated, nbd_alloc_read_size_string calls g_malloc.
Hence my question - do we NEED the malloc'd version, or can we get away
with a stack-allocated space? Although I then revised my question...
>
>>
>>> +
>>> + ret = nbd_alloc_read_size_string(client, &query, errp);
>>> + if (ret <= 0) {
>>> + return ret;
>>> + }
>>> +
>>> + colon = strchr(query, ':');
>>> + if (colon == NULL) {
>>> + ret = nbd_opt_invalid(client, errp, "no colon in query");
>>> + goto out;
>>
>> Hmm, that puts a slight wrinkle into my proposal, or else maybe it is
>> something I should bring up on the NBD list. If we only read 5
>> characters (because the max namespace WE support is "base:"), but a
>> client asks for namespace "X-longname:", we should gracefully ignore
>> the client's request; while we still want to reply with an error to a
>> client that asks for "garbage" with no colon at all. The question for
>> the NBD spec, then, is whether detecting bad client requests that
>> didn't use colon is mandatory for the server (meaning we MUST read the
>> entire namespace request, and search for the colon) or merely best
>> effort (we only have to read 5 characters, and if we silently ignore
>> instead of diagnose a bad namespace request that was longer than that,
>> oh well). Worded from the client, it switches to a question of whether
>> the client should expect the server to diagnose all requests, or must
>> be prepared for the server to ignore requests even where those
>> requests are bogus. Or, the NBD spec may change slightly to pass
>> namespace and leafname as separate fields, both with lengths, rather
>> than a colon, to make it easier for the server to skip over an unknown
>> namespace/leaf pair without having to parse whether a colon was
>> present. I'll send that in a separate email (the upstream NBD list
>> doesn't need to see all my review comments on this thread).
... in light of this thread now on the NBD list.
>
> Thank you for careful review!
No problem. We still have some things to sort out on the NBD list as
well, but I want to make sure we get something that is likely to work
well with other implementations (I'm also trying, on the side, to get
nbdkit to support structured reads so I have something available for
testing cross-implementation support, but it is slow going).
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
16.02.2018 20:01, Eric Blake wrote:
> On 02/16/2018 08:43 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 16.02.2018 16:21, Eric Blake wrote:
>>> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> Minimal realization: only one extent in server answer is supported.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>> ---
>
>>> Again, function comments are useful.
>>>
>>>> + if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
>>>> + /* Note: empty query should select all contexts within base
>>>> + * namespace. */
>>>> + meta->base_allocation = true;
>>>
>>> From the client perspective, this handling of the empty leaf-name
>>> works well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf
>>> nodes the server supports), but not so well for
>>> NBD_OPT_SET_META_CONTEXT (asking the server to send ALL base
>>> allocations, even when I don't necessarily know how to interpret
>>> anything other than base:allocation, is a waste). So this function
>>> needs a bool parameter that says whether it is being invoked from
>>> _LIST (empty string okay, to advertise ALL base leaf names back to
>>> client, which for now is just base:allocation) or from _SET (empty
>>> string is ignored as invalid; client has to specifically ask for
>>> base:allocation by name).
>>
>> "empty string is ignored as invalid", hm, do we have this in spec? I
>> think SET and LIST must select exactly same sets of contexts.
>
> No, it is specifically NOT intended that SET and LIST have to produce
> the same set of contexts; although I do see your point that as
> currently written, it does not appear to require SET to ignore "base:"
> or to treat it as an error. At any rate, the spec gives the example of:
>
>> In either case, however, for any given namespace the server MAY,
>> instead of exhaustively listing every matching context available to
>> select (or every context available to select where no query is
>> given), send sufficient context records back to allow a client with
>> knowledge of the namespace to select any context. This may be helpful
>> where a client can construct algorithmic queries. For instance, a
>> client might reply simply with the namespace with no leaf-name (e.g.
>> 'X-FooBar:') or with a range of values (e.g.
>> 'X-ModifiedDate:20160310-20161214'). The semantics of such a reply
>> are a matter for the definition of the namespace. However each
>> namespace returned MUST begin with the relevant namespace, followed
>> by a colon, and then other UTF-8 characters, with the entire string
>> following the restrictions for strings set out earlier in this document.
>
> with the intent being that for some namespaces, it may be easy to
> perform an algorithmic query via _LIST to see what ranges are
> supported, but that you cannot select ALL elements in the range
> simultaneously. The empty query for _LIST exists to enumerate what is
> supported, but does not have to equate to an empty query for _SET
> selecting everything possible. I could even see it being possible to
> have some round-trips, depending on the namespace (of course, any
> namespace other than "base:" will be tightly coordinated between both
> client and server, so they understand each other - the point was that
> the NBD spec didn't want to constrain what a client and server could
> do as long as they stay within the generic framework):
>
> C> LIST ""
> S> REPLY "base:allocation" id 0
> S> REPLY "X-FooBar:" id 0
> S> ACK
> C> LIST "X-FooBar:"
> S> REPLY "X-FooBar:A_Required", id 0
> S> REPLY "X-FooBar:A_Min=100", id 0
> S> REPLY "X-FooBar:A_Max=200", id 0
> S> REPLY "X-FooBar:B_Default=300", id 0
> S> REPLY "X-FooBar:B_Min=300", id 0
> S> REPLY "X-FooBar:B_Max=400", id 0
> S> ACK
> C> SET "X-FooBar:A=150" "base:allocation"
> S> REPLY "X-FooBar:A=150,B=300", id 1
> S> REPLY "base:allocation", id 2
> S> ACK
>
> where the global query of all available contexts merely lists that
> X-FooBar: is understood, but that a specific query is needed for more
> details (to avoid the client having to parse those specifics if it
> doesn't care about X-FooBar:), and the specific query sets up the
> algorithmic description (parameter A is required, between 100 and 200;
> parameter B is optional, between 300 and 400, defaulting to 300), and
> the final SET gives the actual request (A given a value, B left to its
> default; but the reply names the entire context rather than repeating
> the shorter request). So the spec is written to permit something like
> that for third-party namespaces, while also trying to be very specific
> about the "base:" context as that is the one that needs the most
> interoperability.
>
>> It is strange behavior of client to set "base:", but it is its
>> decision. And I don't thing that it is invalid.
>
> For LIST, querying "base:" makes total sense (for the sake of example,
> we may add "base:frob" down the road that does something new. Being
> able to LIST whether "base:" turns into just "base:allocation" or into
> "base:allocation"+"base:frob" may be useful to a client that
> understands both formats and wants to probe if the server is new; and
> even for a client right now, the client can gracefully note that it
> doesn't want to select "base:frob"). But for SET, it does not (if
> "base:" turns into "base:allocation" + "base:frob" down the road, then
> the server is wasting time preparing the response to "base:frob" for
> every NBD_CMD_BLOCK_STATUS, and the client is wasting time unpacking
> from the wire and ignoring it), so having the empty query work on LIST
> but not on SET makes sense.
Ok, understand, you are right, and, correspondingly,
>
>> Formally we may answer with NBD_REP_ERR_TOO_BIG, but it will look
>> weird, as client see that both base: and base:allocation returns
>> _one_ context, but in one case it is too big. But if we will have
>> several base: contextes, server may fairly answer with
>> NBD_REP_ERR_TOO_BIG.
>
> Hmm, you have a point that while a client can ask for "namespace:",
> the server should always respond with "namespace:leaf" for the actual
> contexts that it supports/selects, so that the client knows which
> leaves it actually got, if it does not fail with ERR_TOO_BIG. You are
> also right that failing with ERR_TOO_BIG for "base:" seems odd, but it
> may make more sense for other namespaces.
_INVALID is ok here.
>
>>
>> So, I think for now the code is ok.
>
> Then this is probably worth something to bring up on the NBD list, if
> we need to tweak wording to be more explicit (whether we should
> allow/forbid wildcards during SET, or if wildcard queries are intended
> only for LIST). Sounds like I have more spec emails to write to the
> NBD list.
So, code is not ok, I'll fix)
>
>>
>> Also, I don't see NBD_REP_ERR_TOO_BIG possible reply in
>> NBD_OPT_LIST_META_CONTEXT description. Should it be here?
>
> Yeah, that's probably also worth adding to the upstream spec, even
> though it already encourages LIST results to send compressed
> information back that allows a client to contruct valid specific
> queries, rather than an exhaustive list of selecting everything possible.
>
>>>> +/* nbd_negotiate_meta_query
>>>> + * Return -errno on I/O error, 0 if option was completely handled by
>>>> + * sending a reply about inconsistent lengths, or 1 on success. */
>>>> +static int nbd_negotiate_meta_query(NBDClient *client,
>>>> + NBDExportMetaContexts *meta,
>>>> Error **errp)
>>>> +{
>>>> + int ret;
>>>> + char *query, *colon, *namespace, *subquery;
>>>
>>> Is it worth stack-allocating query here, so you don't have to
>>> g_free() it later? NBD limits the maximum string to 4k, which is a
>>> little bit big for stack allocation (on an operating system with 4k
>>> pages, allocating more than 4k on the stack in one function risks
>>> missing the guard page on stack overflow), but we also have the
>>> benefit that we KNOW that the set of meta-context namespaces that we
>>> support have a much smaller maximum limit of what we even care about.
>>
>> it is not stack allocated, nbd_alloc_read_size_string calls g_malloc.
>
> Hence my question - do we NEED the malloc'd version, or can we get
> away with a stack-allocated space? Although I then revised my
> question...
ohm, just misread, will try.
>
>>
>>>
>>>> +
>>>> + ret = nbd_alloc_read_size_string(client, &query, errp);
>>>> + if (ret <= 0) {
>>>> + return ret;
>>>> + }
>>>> +
>>>> + colon = strchr(query, ':');
>>>> + if (colon == NULL) {
>>>> + ret = nbd_opt_invalid(client, errp, "no colon in query");
>>>> + goto out;
>>>
>>> Hmm, that puts a slight wrinkle into my proposal, or else maybe it
>>> is something I should bring up on the NBD list. If we only read 5
>>> characters (because the max namespace WE support is "base:"), but a
>>> client asks for namespace "X-longname:", we should gracefully ignore
>>> the client's request; while we still want to reply with an error to
>>> a client that asks for "garbage" with no colon at all. The question
>>> for the NBD spec, then, is whether detecting bad client requests
>>> that didn't use colon is mandatory for the server (meaning we MUST
>>> read the entire namespace request, and search for the colon) or
>>> merely best effort (we only have to read 5 characters, and if we
>>> silently ignore instead of diagnose a bad namespace request that was
>>> longer than that, oh well). Worded from the client, it switches to a
>>> question of whether the client should expect the server to diagnose
>>> all requests, or must be prepared for the server to ignore requests
>>> even where those requests are bogus. Or, the NBD spec may change
>>> slightly to pass namespace and leafname as separate fields, both
>>> with lengths, rather than a colon, to make it easier for the server
>>> to skip over an unknown namespace/leaf pair without having to parse
>>> whether a colon was present. I'll send that in a separate email (the
>>> upstream NBD list doesn't need to see all my review comments on this
>>> thread).
>
> ... in light of this thread now on the NBD list.
>
>>
>> Thank you for careful review!
>
> No problem. We still have some things to sort out on the NBD list as
> well, but I want to make sure we get something that is likely to work
> well with other implementations (I'm also trying, on the side, to get
> nbdkit to support structured reads so I have something available for
> testing cross-implementation support, but it is slow going).
>
--
Best regards,
Vladimir
16.02.2018 16:21, Eric Blake wrote:
> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal realization: only one extent in server answer is supported.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>
>
[...]
>> + meta = &local_meta;
>> + }
>> +
>> + memset(meta, 0, sizeof(*meta));
>> +
>> + ret = nbd_read_size_string(client, meta->export_name,
>> + NBD_MAX_NAME_SIZE, errp);
>
> Revisiting a question I raised in my first half review - you saved the
> name as part of the struct because we have to later compare that the
> final OPT_SET export name matches the request during OPT_GO (if they
> don't match, then we have no contexts to serve after all). So a
> 'const char *' won't work, but maybe the struct could use a 'char *'
> pointing to malloc'd storage rather than char[MAX_NAME] that reserves
> array space that is mostly unused for the typical name that is much
> shorter than the maximum name length.
Do these 256 bytes worth creating nbd_clear_meta function, to call it
from client_put and on NBD_OPT_SET_META_CONTEXT, to clear previous meta?
If yes, I'd prefer to postpone it to continuation about BLOCKSTATUS for
dirty bitmaps, when I'll have to upgrade meta structure, to maintain
dirty bitmaps.
>
>> + if (ret <= 0) {
>> + return ret;
>> + }
>> +
>> + exp = nbd_export_find(meta->export_name);
>> + if (exp == NULL) {
>> + return nbd_opt_invalid(client, errp,
>> + "export '%s' not present",
>> meta->export_name);
>
--
Best regards,
Vladimir
© 2016 - 2025 Red Hat, Inc.