This patch adds an optional qemu device parameter "vii" (very important
inode[s]), which accepts a colon-separated list of paths relative to
the 9p export root, which shall be given the smallest inode suffix (and
hence largest possible inode namespace on guest) according to the given
order. That way host admins do have a way to fine-tune the automatic inode
remapping algorithm of 9pfs.
Example: consider a file server running on guest with 9p export root /vm/fs
on host and the mass storage file system being mounted on host on
/vm/fs/var/shares and further consider some very frequently accessed temp
file system being mounted as /vm/fs/tmp on host, then host admin could give
the mass storage file system the highest inode namespace and the temp file
system the 2nd highest inode namespace with qemu virtfs argument:
vii=/var/shares:/tmp
The root file system (/vm/fs on host, / on guest) would then hence get the
3rd highest inode namespace on guest automatically (since not mentioned in
vii list).
For completeness I will also send a (6th) patch against libvirt which
allows to configure this feature via libvirt/virsh xml config instead.
Additionally this patch addresses:
* Removes unnecessary header includes in 9p.c
(that I used for debugging).
* Fixes a compiler error of previous patch in 9p.c if macro
QPP_TABLE_PERSISTENCY_LIMIT was enabled and latest git master head
version used.
This patch probably needs some cleanup. But I will wait for your coarse
feedback before doing anything else. For instance the device property that
I added in virtio-9p-device.c is actually not used ATM. My initial
assumption was this to be used by the qemu options visitors automatically
for command line arguments, then I noticed that these properties are rather
intended for passing parameters from guest instead. I am unresolved though
whether this parameter shall be configurable from guest or not, so I have
not finished that yet. Then the user manual changes need to be finished and
obviously you might have a better idea for an argument name than "vii" ;-)
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
fsdev/file-op-9p.h | 1 +
fsdev/qemu-fsdev-opts.c | 6 ++
fsdev/qemu-fsdev.c | 3 +
hw/9pfs/9p.c | 138 +++++++++++++++++++++++++++++++--------------
hw/9pfs/9p.h | 6 ++
hw/9pfs/virtio-9p-device.c | 1 +
qemu-options.hx | 5 +-
vl.c | 9 ++-
8 files changed, 124 insertions(+), 45 deletions(-)
diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
index 3fa062b39f..2645faa113 100644
--- a/fsdev/file-op-9p.h
+++ b/fsdev/file-op-9p.h
@@ -77,6 +77,7 @@ typedef struct FsDriverEntry {
FsThrottle fst;
mode_t fmode;
mode_t dmode;
+ char *vii;
} FsDriverEntry;
struct FsContext {
diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
index 7c31ffffaf..5ae02887a5 100644
--- a/fsdev/qemu-fsdev-opts.c
+++ b/fsdev/qemu-fsdev-opts.c
@@ -44,6 +44,9 @@ static QemuOptsList qemu_fsdev_opts = {
}, {
.name = "dmode",
.type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "vii",
+ .type = QEMU_OPT_STRING,
},
THROTTLE_OPTS,
@@ -87,6 +90,9 @@ static QemuOptsList qemu_virtfs_opts = {
}, {
.name = "dmode",
.type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "vii",
+ .type = QEMU_OPT_STRING,
},
{ /*End of list */ }
diff --git a/fsdev/qemu-fsdev.c b/fsdev/qemu-fsdev.c
index 54cb36a212..29133b4a15 100644
--- a/fsdev/qemu-fsdev.c
+++ b/fsdev/qemu-fsdev.c
@@ -34,6 +34,7 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
const char *fsdev_id = qemu_opts_id(opts);
const char *fsdriver = qemu_opt_get(opts, "fsdriver");
const char *writeout = qemu_opt_get(opts, "writeout");
+ const char *vii = qemu_opt_get(opts, "vii");
bool ro = qemu_opt_get_bool(opts, "readonly", 0);
if (!fsdev_id) {
@@ -59,6 +60,7 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
fsle = g_malloc0(sizeof(*fsle));
fsle->fse.fsdev_id = g_strdup(fsdev_id);
+ fsle->fse.vii = g_strdup(vii);
fsle->fse.ops = FsDrivers[i].ops;
if (writeout) {
if (!strcmp(writeout, "immediate")) {
@@ -74,6 +76,7 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
if (fsle->fse.ops->parse_opts) {
if (fsle->fse.ops->parse_opts(opts, &fsle->fse, errp)) {
g_free(fsle->fse.fsdev_id);
+ g_free(fsle->fse.vii);
g_free(fsle);
return -1;
}
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 003901a1e0..b4c4c66a3b 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -30,10 +30,8 @@
#if defined(__linux__) /* TODO: This should probably go into osdep.h instead */
# include <linux/limits.h> /* for XATTR_SIZE_MAX */
#endif
-#include <sys/types.h>
-#include <unistd.h>
-#include <sys/syscall.h>
#include <math.h>
+#include "qemu/cutils.h"
/*
* How many bytes may we store to fs per extended attribute value?
@@ -589,6 +587,8 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
P9_STAT_MODE_NAMED_PIPE | \
P9_STAT_MODE_SOCKET)
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
static inline uint8_t mirror8bit(uint8_t byte) {
return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
@@ -606,8 +606,6 @@ static inline uint64_t mirror64bit(uint64_t value) {
((uint64_t)mirror8bit((value >> 56) & 0xff) ) ;
}
-#if P9_VARI_LENGTH_INODE_SUFFIXES
-
/* Parameter k for the Exponential Golomb algorihm to be used.
*
* The smaller this value, the smaller the minimum bit count for the Exp.
@@ -770,7 +768,7 @@ static int v9fs_store_qpd_table(V9fsState *s);
* generating hash values for the purpose of accessing qpp_table in order
* get consistent behaviour when accessing qpp_table.
*/
-static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
+static int qid_inode_prefix_hash_bits(V9fsState *s, dev_t dev)
{
QpdEntry lookup = {
.dev = dev
@@ -778,14 +776,14 @@ static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
uint32_t hash = dev;
VariLenAffix affix;
- val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
+ val = qht_lookup(&s->qpd_table, &lookup, hash);
if (!val) {
val = g_malloc0(sizeof(QpdEntry));
*val = lookup;
- affix = affixForIndex(pdu->s->qp_affix_next);
+ affix = affixForIndex(s->qp_affix_next);
val->prefix_bits = affix.bits;
- qht_insert(&pdu->s->qpd_table, val, hash, NULL);
- pdu->s->qp_ndevices++;
+ qht_insert(&s->qpd_table, val, hash, NULL);
+ s->qp_ndevices++;
/*
* Store qpd_table as extended attribute(s) to file system.
@@ -794,7 +792,7 @@ static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
* suspend handler.
*/
#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
- v9fs_store_qpd_table(pdu->s);
+ v9fs_store_qpd_table(s);
#endif
}
return val->prefix_bits;
@@ -817,7 +815,7 @@ static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
* @see qid_path_prefixmap() for details
*
*/
-static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
+static int qid_path_fullmap(V9fsState *s, const struct stat *stbuf,
uint64_t *path)
{
QpfEntry lookup = {
@@ -829,10 +827,10 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
VariLenAffix affix;
#endif
- val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
+ val = qht_lookup(&s->qpf_table, &lookup, hash);
if (!val) {
- if (pdu->s->qp_fullpath_next == 0) {
+ if (s->qp_fullpath_next == 0) {
/* no more files can be mapped :'( */
return -ENFILE;
}
@@ -843,15 +841,15 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
/* new unique inode and device combo */
#if P9_VARI_LENGTH_INODE_SUFFIXES
affix = affixForIndex(
- 1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
+ 1ULL << (sizeof(s->qp_affix_next) * 8)
);
- val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
- pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
+ val->path = (s->qp_fullpath_next++ << affix.bits) | affix.value;
+ s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
#else
- val->path = pdu->s->qp_fullpath_next++;
- pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+ val->path = s->qp_fullpath_next++;
+ s->qp_fullpath_next &= QPATH_INO_MASK;
#endif
- qht_insert(&pdu->s->qpf_table, val, hash, NULL);
+ qht_insert(&s->qpf_table, val, hash, NULL);
}
*path = val->path;
@@ -914,7 +912,7 @@ static void remove_qp_tables_xattrs(FsContext *ctx) {
# if P9_VARI_LENGTH_INODE_SUFFIXES
/* Used to convert qpd hash table into continuous stream. */
-static void qpd_table_serialize(struct qht *ht, void *p, uint32_t h, void *up)
+static void qpd_table_serialize(void *p, uint32_t h, void *up)
{
const QpdEntry *entry = (const QpdEntry*) p;
QpdSerialize *ser = (QpdSerialize*) up;
@@ -1540,11 +1538,11 @@ static void v9fs_store_qp_tables(V9fsState *s)
* suffixes instead. See comment on P9_VARI_LENGTH_INODE_SUFFIXES for
* details.
*/
-static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
+static int qid_path_prefixmap(V9fsState *s, const struct stat *stbuf,
uint64_t *path)
{
#if P9_VARI_LENGTH_INODE_SUFFIXES
- const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
+ const int ino_hash_bits = qid_inode_prefix_hash_bits(s, stbuf->st_dev);
#endif
QppEntry lookup = {
.dev = stbuf->st_dev,
@@ -1556,10 +1554,10 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
}, *val;
uint32_t hash = qpp_hash(lookup);
- val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
+ val = qht_lookup(&s->qpp_table, &lookup, hash);
if (!val) {
- if (pdu->s->qp_affix_next == 0) {
+ if (s->qp_affix_next == 0) {
/* we ran out of affixes */
return -ENFILE;
}
@@ -1569,12 +1567,12 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
/* new unique inode affix and device combo */
#if P9_VARI_LENGTH_INODE_SUFFIXES
- val->qp_affix_index = pdu->s->qp_affix_next++;
+ val->qp_affix_index = s->qp_affix_next++;
val->qp_affix = affixForIndex(val->qp_affix_index);
#else
- val->qp_affix = pdu->s->qp_affix_next++;
+ val->qp_affix = s->qp_affix_next++;
#endif
- qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+ qht_insert(&s->qpp_table, val, hash, NULL);
/*
* Store qpp_table as extended attribute(s) to file system.
@@ -1583,7 +1581,7 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
* suspend handler.
*/
#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
- v9fs_store_qpp_table(pdu->s);
+ v9fs_store_qpp_table(s);
#endif
}
#if P9_VARI_LENGTH_INODE_SUFFIXES
@@ -1595,15 +1593,15 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
return 0;
}
-static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
+static int stat_to_qid(V9fsState *s, const struct stat *stbuf, V9fsQID *qidp)
{
int err;
/* map inode+device to qid path (fast path) */
- err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+ err = qid_path_prefixmap(s, stbuf, &qidp->path);
if (err == -ENFILE) {
/* fast path didn't work, fall back to full map */
- err = qid_path_fullmap(pdu, stbuf, &qidp->path);
+ err = qid_path_fullmap(s, stbuf, &qidp->path);
}
if (err) {
return err;
@@ -1621,6 +1619,57 @@ static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
return 0;
}
+/** @brief Handle "vii" device config parameter.
+ *
+ * "vii" (very important inode[s]) is a colon separated list of pathes
+ * passed as optional virtfs command line argument "vii" to qemu, which
+ * shall have precedence in their given order when emitting inode nr
+ * remapping suffixes on guest. So the very first vii path will get the
+ * smallest inode nr prefix (i.e. 1 bit) and hence the largest inode
+ * namespace on guest, the next path in the list will get the next higher
+ * inode prefix (i.e. 3 bit), and so on.
+ *
+ * @see qid_path_prefixmap()
+ */
+static void map_important_inodes(FsDriverEntry *fse, V9fsState *s, V9fsPath *path) {
+#if !P9_VARI_LENGTH_INODE_SUFFIXES
+ error_printf("virtfs-9p: ignoring argument vii='%s': virtfs-9p compiled without variable length inode suffixes", fse->vii);
+#else
+ struct stat stbuf;
+ char *buf, *token;
+ V9fsPath vii;
+ V9fsQID qid;
+
+ if (!fse->vii)
+ return; /* no "vii" command line argument passed to qemu */
+
+ v9fs_path_init(&vii);
+
+ buf = g_strdup(fse->vii);
+ while ((token = qemu_strsep(&buf, ":"))) {
+ if (token[0] != '/') {
+ error_printf("virtfs-9p: ignoring vii path '%s': misses mandatory leading slash", token);
+ continue;
+ }
+ /* token+1 : slash must be ripped off to avoid double slash in
+ * in vii and its subsequent assertion fault in lstat() */
+ if (s->ops->name_to_path(&s->ctx, path, token+1, &vii) < 0) {
+ error_printf("virtfs-9p: ignoring vii path '%s': error converting name to path: %s",
+ token, strerror(errno));
+ continue;
+ }
+ if (s->ops->lstat(&s->ctx, &vii, &stbuf) < 0) {
+ error_printf("virtfs-9p: ignoring vii path '%s': lstat failed", token);
+ goto next;
+ }
+ stat_to_qid(s, &stbuf, &qid);
+ next:
+ v9fs_path_free(&vii);
+ }
+ g_free(buf);
+#endif
+}
+
static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
V9fsQID *qidp)
{
@@ -1631,7 +1680,7 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
if (err < 0) {
return err;
}
- err = stat_to_qid(pdu, &stbuf, qidp);
+ err = stat_to_qid(pdu->s, &stbuf, qidp);
if (err < 0) {
return err;
}
@@ -1865,7 +1914,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
memset(v9stat, 0, sizeof(*v9stat));
- err = stat_to_qid(pdu, stbuf, &v9stat->qid);
+ err = stat_to_qid(pdu->s, stbuf, &v9stat->qid);
if (err < 0) {
return err;
}
@@ -1951,7 +2000,7 @@ static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
/* Currently we only support BASIC fields in stat */
v9lstat->st_result_mask = P9_STATS_BASIC;
- return stat_to_qid(pdu, stbuf, &v9lstat->qid);
+ return stat_to_qid(pdu->s, stbuf, &v9lstat->qid);
}
static void print_sg(struct iovec *sg, int cnt)
@@ -2416,7 +2465,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
if (err < 0) {
goto out;
}
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -2521,7 +2570,7 @@ static void coroutine_fn v9fs_open(void *opaque)
if (err < 0) {
goto out;
}
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -2634,7 +2683,7 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
fidp->flags |= FID_NON_RECLAIMABLE;
}
iounit = get_iounit(pdu, &fidp->path);
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -3371,7 +3420,7 @@ static void coroutine_fn v9fs_create(void *opaque)
}
}
iounit = get_iounit(pdu, &fidp->path);
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -3431,7 +3480,7 @@ static void coroutine_fn v9fs_symlink(void *opaque)
if (err < 0) {
goto out;
}
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -4114,7 +4163,7 @@ static void coroutine_fn v9fs_mknod(void *opaque)
if (err < 0) {
goto out;
}
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -4275,7 +4324,7 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
if (err < 0) {
goto out;
}
- err = stat_to_qid(pdu, &stbuf, &qid);
+ err = stat_to_qid(pdu->s, &stbuf, &qid);
if (err < 0) {
goto out;
}
@@ -4701,8 +4750,13 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
#endif
s->qp_affix_next = 1; /* reserve 0 to detect overflow */
s->qp_fullpath_next = 1;
+
+ /* handle "very important inodes" */
+ map_important_inodes(fse, s, &path);
+
/* try to load and restore previous qpd_table and qpp_table */
#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+ /*TODO: call will be ignored ATM if virfs "vii" argument was passed as well */
v9fs_load_qp_tables(s);
#endif
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 78c29fb916..d78e7eff49 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -168,6 +168,12 @@ typedef struct V9fsConf
/* tag name for the device */
char *tag;
char *fsdev_id;
+ /*
+ * very important inodes: colon separated list of pathes (relative to
+ * export root) which should be preferred for getting a larger inode
+ * namespace on guest
+ */
+ char *viis;
} V9fsConf;
/* 9p2000.L xattr flags (matches Linux values) */
diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 775e8ff766..cdbe157274 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -232,6 +232,7 @@ static const VMStateDescription vmstate_virtio_9p = {
static Property virtio_9p_properties[] = {
DEFINE_PROP_STRING("mount_tag", V9fsVirtioState, state.fsconf.tag),
DEFINE_PROP_STRING("fsdev", V9fsVirtioState, state.fsconf.fsdev_id),
+ DEFINE_PROP_STRING("vii", V9fsVirtioState, state.fsconf.viis),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/qemu-options.hx b/qemu-options.hx
index 08749a3391..4fd56f367e 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1306,12 +1306,13 @@ ETEXI
DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
"-virtfs local,path=path,mount_tag=tag,security_model=[mapped-xattr|mapped-file|passthrough|none]\n"
- " [,id=id][,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n",
+ " [,id=id][,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode]\n"
+ " [,dmode=dmode][,vii=vii-path]\n",
QEMU_ARCH_ALL)
STEXI
-@item -virtfs @var{fsdriver}[,path=@var{path}],mount_tag=@var{mount_tag}[,security_model=@var{security_model}][,writeout=@var{writeout}][,readonly][,socket=@var{socket}|sock_fd=@var{sock_fd}][,fmode=@var{fmode}][,dmode=@var{dmode}]
+@item -virtfs @var{fsdriver}[,path=@var{path}],mount_tag=@var{mount_tag}[,security_model=@var{security_model}][,writeout=@var{writeout}][,readonly][,socket=@var{socket}|sock_fd=@var{sock_fd}][,fmode=@var{fmode}][,dmode=@var{dmode}][,vii=@var{vii-path}]
@findex -virtfs
The general form of a Virtual File system pass-through options are:
diff --git a/vl.c b/vl.c
index c696ad2a13..55d3f079dd 100644
--- a/vl.c
+++ b/vl.c
@@ -3447,7 +3447,8 @@ int main(int argc, char **argv, char **envp)
case QEMU_OPTION_virtfs: {
QemuOpts *fsdev;
QemuOpts *device;
- const char *writeout, *sock_fd, *socket, *path, *security_model;
+ const char *writeout, *sock_fd, *socket, *path, *security_model,
+ *vii;
olist = qemu_find_opts("virtfs");
if (!olist) {
@@ -3503,6 +3504,10 @@ int main(int argc, char **argv, char **envp)
if (sock_fd) {
qemu_opt_set(fsdev, "sock_fd", sock_fd, &error_abort);
}
+ vii = qemu_opt_get(opts, "vii");
+ if (vii) {
+ qemu_opt_set(fsdev, "vii", vii, &error_abort);
+ }
qemu_opt_set_bool(fsdev, "readonly",
qemu_opt_get_bool(opts, "readonly", 0),
@@ -3514,6 +3519,8 @@ int main(int argc, char **argv, char **envp)
qemu_opts_id(fsdev), &error_abort);
qemu_opt_set(device, "mount_tag",
qemu_opt_get(opts, "mount_tag"), &error_abort);
+ qemu_opt_set(device, "vii",
+ qemu_opt_get(opts, "vii"), &error_abort);
break;
}
case QEMU_OPTION_virtfs_synth: {
--
2.11.0
This is the counterpart patch against latest libvirt git master head to
support the 'vii' feature of patch 5, which introduces the XML config
tag "important" on libvirt side.
To stick with the previous example mentioned with patch 5, likewise
libvirt XML configuration might then look like this:
<domain type='kvm'>
...
<devices>
...
<filesystem type='mount' accessmode='mapped'>
<source dir='/vm/fs'/>
<target dir='root'/>
<important path='/var/shares'/>
<important path='/tmp'/>
</filesystem>
</devices>
</domain>
Like with the vii qemu virtfs command line argument, the order of the
"important" tag defines which one gets the highest inode namespace
(smallest generated suffix) on guest side.
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
docs/schemas/domaincommon.rng | 6 ++++++
src/conf/domain_conf.c | 30 ++++++++++++++++++++++++++++++
src/conf/domain_conf.h | 3 +++
src/qemu/qemu_command.c | 10 ++++++++++
4 files changed, 49 insertions(+)
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 111b85c36f..c75edfc4d3 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -2515,6 +2515,12 @@
</choice>
</attribute>
</optional>
+ <zeroOrMore>
+ <element name='important'>
+ <attribute name="path"/>
+ <empty/>
+ </element>
+ </zeroOrMore>
<optional>
<element name='readonly'>
<empty/>
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index b4fb6cf981..cc75c6a7dd 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -2294,6 +2294,8 @@ virDomainFSDefNew(void)
void virDomainFSDefFree(virDomainFSDefPtr def)
{
+ size_t i;
+
if (!def)
return;
@@ -2302,6 +2304,13 @@ void virDomainFSDefFree(virDomainFSDefPtr def)
virDomainDeviceInfoClear(&def->info);
VIR_FREE(def->virtio);
+ if (def->important) {
+ for (i = 0; i < def->nimportant; i++)
+ if (def->important[i])
+ VIR_FREE(def->important[i]);
+ }
+ VIR_FREE(def->important);
+
VIR_FREE(def);
}
@@ -10953,6 +10962,7 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
VIR_AUTOFREE(char *) usage = NULL;
VIR_AUTOFREE(char *) units = NULL;
VIR_AUTOFREE(char *) model = NULL;
+ long n;
ctxt->node = node;
@@ -11001,6 +11011,12 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
1, ULLONG_MAX, false) < 0)
goto error;
+ n = virXMLChildElementCount(node);
+ if (n > 0) {
+ if (VIR_ALLOC_N(def->important, n) < 0)
+ goto error;
+ }
+
cur = node->children;
while (cur != NULL) {
if (cur->type == XML_ELEMENT_NODE) {
@@ -11039,6 +11055,8 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
if (virDomainVirtioOptionsParseXML(cur, &def->virtio) < 0)
goto error;
+ } else if (virXMLNodeNameEqual(cur, "important")) {
+ def->important[def->nimportant++] = virXMLPropString(cur, "path");
}
}
cur = cur->next;
@@ -11107,6 +11125,8 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
goto error;
cleanup:
+ if (def && def->important && !def->nimportant)
+ VIR_FREE(def->important);
return def;
error:
@@ -24601,6 +24621,7 @@ virDomainFSDefFormat(virBufferPtr buf,
const char *src = def->src->path;
VIR_AUTOCLEAN(virBuffer) driverBuf = VIR_BUFFER_INITIALIZER;
int ret = -1;
+ size_t i;
if (!type) {
virReportError(VIR_ERR_INTERNAL_ERROR,
@@ -24689,6 +24710,15 @@ virDomainFSDefFormat(virBufferPtr buf,
if (def->readonly)
virBufferAddLit(buf, "<readonly/>\n");
+ if (def->important) {
+ for (i = 0; i < def->nimportant; ++i) {
+ if (!def->important[i]) continue;
+ virBufferAddLit(buf, "<important");
+ virBufferEscapeString(buf, " path='%s'", def->important[i]);
+ virBufferAddLit(buf, "/>\n");
+ }
+ }
+
if (virDomainDeviceInfoFormat(buf, &def->info, flags) < 0)
goto cleanup;
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 01c22d8cc3..9bbd66d932 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -818,6 +818,9 @@ struct _virDomainFSDef {
unsigned long long space_soft_limit; /* in bytes */
bool symlinksResolved;
virDomainVirtioOptionsPtr virtio;
+
+ size_t nimportant;
+ char **important;
};
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 50b4205267..2005ccadf8 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -2732,6 +2732,7 @@ qemuBuildFSStr(virDomainFSDefPtr fs)
virBuffer opt = VIR_BUFFER_INITIALIZER;
const char *driver = qemuDomainFSDriverTypeToString(fs->fsdriver);
const char *wrpolicy = virDomainFSWrpolicyTypeToString(fs->wrpolicy);
+ size_t i;
if (fs->type != VIR_DOMAIN_FS_TYPE_MOUNT) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
@@ -2775,6 +2776,15 @@ qemuBuildFSStr(virDomainFSDefPtr fs)
if (fs->readonly)
virBufferAddLit(&opt, ",readonly");
+ if (fs->important) {
+ for (i = 0; i < fs->nimportant; ++i) {
+ if (i == 0)
+ virBufferAsprintf(&opt, ",vii=%s", fs->important[i]);
+ else
+ virBufferAsprintf(&opt, ":%s", fs->important[i]);
+ }
+ }
+
if (virBufferCheckError(&opt) < 0)
goto error;
--
2.11.0
On Mon, May 06, 2019 at 07:58:28PM +0200, Christian Schoenebeck via Qemu-devel wrote:
> This is the counter part patch against latest libvirt git master head to
> support the 'vii' feature of patch 5, which introduces the XML config
> XML tag "important" on libvirt side.
>
> To stick with the previous example mentioned with patch 5, likewise
> libvirt XML configuration might then look like this:
>
> <domain type='kvm'>
> ...
> <devices>
> ...
> <filesystem type='mount' accessmode='mapped'>
> <source dir='/vm/fs'/>
> <target dir='root'/>
> <important path='/var/shares'/>
> <important path='/tmp'/>
> </filesystem>
> </devices>
> </domain>
>
> Like with the vii qemu virtfs command line argument, the order of the
> "important" tag defines which one gets the highest inode namespace
> (smallest generated suffix) on guest side.
Do we think anyone is likely to use this feature in the real world ?
I'm not really a fan of the representation, because this is affecting
guest ABI via a side effect of the ordering which is the kind of thing
that has got us in trouble before. If we need control over the IDs
used for each mount point, then I tend to think we need to represent
it explicitly such that the mgmt app sets the exact ID used.
<pathid dir="/var/shares" id="0x1"/>
<pathid dir="/tmp" id="0x3"/>
this ensures that the IDs are still stable when adding or removing
paths
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Dienstag, 7. Mai 2019 13:57:56 CEST Daniel P. Berrangé wrote: > > ... > > <filesystem type='mount' accessmode='mapped'> > > > > <source dir='/vm/fs'/> > > <target dir='root'/> > > <important path='/var/shares'/> > > <important path='/tmp'/> > > > > </filesystem> > > > > </devices> > > > > </domain> > > > > Like with the vii qemu virtfs command line argument, the order of the > > "important" tag defines which one gets the highest inode namespace > > (smallest generated suffix) on guest side. > > Do we think anyone is likely to use this feature in the real world ? I don't know if other people need it, that's one of the reasons why I am asking for a coarse high level feedback of the current v3 patch set before getting into the details. The only thing I can say right now is that I must use this feature when running Samba to avoid all kinds of serious problems. And I could imagine inode namespace control to become more of an issue once nested virtualization becomes more popular. > I'm not really a fan of the representation, because this is affecting > guest ABI via a side effect of the ordering which is the kind of thing > that has got us in trouble before. If we need control over the IDs > used for each mount point, then I tend to think we need to represent > it explicitly such that the mgmt app sets the exact ID used. > > <pathid dir="/var/shares" id="0x1"/> > <pathid dir="/tmp" id="0x3"/> > > this ensures that the IDs are still stable when adding or removing > paths Well, that would lead to the exact opposite of what you asked for. 
Because allowing admins to configure an exact ID (which I think you mean should be used as exact inode suffix by 9p then) would expose implementation details inside 9pfs to config space, which are subject to change, might collide with implementation details, and requires implementation knowledge and extreme care by admins so they would pick appropriate IDs with "suffix-free" property which are guaranteed to create unique numbers in all cases: https://en.wikipedia.org/wiki/Prefix_code Also keep in mind that one fs device might end up having multiple suffixes. Hence my suggestion was to only expose the bare minimum to config space regarding this issue: Asking (if required at all) admins which ones are the most critical paths regarding inode namespace for their use cases, and 9p would then automatically generate appropriate suffixes for those mentioned by admin to achieve the highest inode namespace appropriately and in a safe way. Plus for the "important path=" semantics I suggested you don't have to use mount points BTW. You can use subdirs and even individual files and 9pfs would then automatically resolve the appropriate fs device of the given path. So e.g. when using nested virtualization, an admin inside a lower level guest does not even need to know the exact mount points on a higher level / host. Best regards, Christian Schoenebeck
On Mon, 06 May 2019 19:58:28 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> This is the counter part patch against latest libvirt git master head to
Hmm... shouldn't this be Cc'd to libvir-list@redhat.com as well then ?
> support the 'vii' feature of patch 5, which introduces the XML config
What is patch 5 ?!? What is 'vii' ? I am a bit lost here...
> XML tag "important" on libvirt side.
>
> To stick with the previous example mentioned with patch 5, likewise
> libvirt XML configuration might then look like this:
>
> <domain type='kvm'>
> ...
> <devices>
> ...
> <filesystem type='mount' accessmode='mapped'>
> <source dir='/vm/fs'/>
> <target dir='root'/>
> <important path='/var/shares'/>
> <important path='/tmp'/>
> </filesystem>
> </devices>
> </domain>
>
> Like with the vii qemu virtfs command line argument, the order of the
> "important" tag defines which one gets the highest inode namespace
> (smallest generated suffix) on guest side.
>
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
> docs/schemas/domaincommon.rng | 6 ++++++
> src/conf/domain_conf.c | 30 ++++++++++++++++++++++++++++++
> src/conf/domain_conf.h | 3 +++
> src/qemu/qemu_command.c | 10 ++++++++++
> 4 files changed, 49 insertions(+)
>
> diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
> index 111b85c36f..c75edfc4d3 100644
> --- a/docs/schemas/domaincommon.rng
> +++ b/docs/schemas/domaincommon.rng
> @@ -2515,6 +2515,12 @@
> </choice>
> </attribute>
> </optional>
> + <zeroOrMore>
> + <element name='important'>
> + <attribute name="path"/>
> + <empty/>
> + </element>
> + </zeroOrMore>
> <optional>
> <element name='readonly'>
> <empty/>
> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
> index b4fb6cf981..cc75c6a7dd 100644
> --- a/src/conf/domain_conf.c
> +++ b/src/conf/domain_conf.c
> @@ -2294,6 +2294,8 @@ virDomainFSDefNew(void)
>
> void virDomainFSDefFree(virDomainFSDefPtr def)
> {
> + size_t i;
> +
> if (!def)
> return;
>
> @@ -2302,6 +2304,13 @@ void virDomainFSDefFree(virDomainFSDefPtr def)
> virDomainDeviceInfoClear(&def->info);
> VIR_FREE(def->virtio);
>
> + if (def->important) {
> + for (i = 0; i < def->nimportant; i++)
> + if (def->important[i])
> + VIR_FREE(def->important[i]);
> + }
> + VIR_FREE(def->important);
> +
> VIR_FREE(def);
> }
>
> @@ -10953,6 +10962,7 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
> VIR_AUTOFREE(char *) usage = NULL;
> VIR_AUTOFREE(char *) units = NULL;
> VIR_AUTOFREE(char *) model = NULL;
> + long n;
>
> ctxt->node = node;
>
> @@ -11001,6 +11011,12 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
> 1, ULLONG_MAX, false) < 0)
> goto error;
>
> + n = virXMLChildElementCount(node);
> + if (n > 0) {
> + if (VIR_ALLOC_N(def->important, n) < 0)
> + goto error;
> + }
> +
> cur = node->children;
> while (cur != NULL) {
> if (cur->type == XML_ELEMENT_NODE) {
> @@ -11039,6 +11055,8 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
>
> if (virDomainVirtioOptionsParseXML(cur, &def->virtio) < 0)
> goto error;
> + } else if (virXMLNodeNameEqual(cur, "important")) {
> + def->important[def->nimportant++] = virXMLPropString(cur, "path");
> }
> }
> cur = cur->next;
> @@ -11107,6 +11125,8 @@ virDomainFSDefParseXML(virDomainXMLOptionPtr xmlopt,
> goto error;
>
> cleanup:
> + if (def && def->important && !def->nimportant)
> + VIR_FREE(def->important);
> return def;
>
> error:
> @@ -24601,6 +24621,7 @@ virDomainFSDefFormat(virBufferPtr buf,
> const char *src = def->src->path;
> VIR_AUTOCLEAN(virBuffer) driverBuf = VIR_BUFFER_INITIALIZER;
> int ret = -1;
> + size_t i;
>
> if (!type) {
> virReportError(VIR_ERR_INTERNAL_ERROR,
> @@ -24689,6 +24710,15 @@ virDomainFSDefFormat(virBufferPtr buf,
> if (def->readonly)
> virBufferAddLit(buf, "<readonly/>\n");
>
> + if (def->important) {
> + for (i = 0; i < def->nimportant; ++i) {
> + if (!def->important[i]) continue;
> + virBufferAddLit(buf, "<important");
> + virBufferEscapeString(buf, " path='%s'", def->important[i]);
> + virBufferAddLit(buf, "/>\n");
> + }
> + }
> +
> if (virDomainDeviceInfoFormat(buf, &def->info, flags) < 0)
> goto cleanup;
>
> diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
> index 01c22d8cc3..9bbd66d932 100644
> --- a/src/conf/domain_conf.h
> +++ b/src/conf/domain_conf.h
> @@ -818,6 +818,9 @@ struct _virDomainFSDef {
> unsigned long long space_soft_limit; /* in bytes */
> bool symlinksResolved;
> virDomainVirtioOptionsPtr virtio;
> +
> + size_t nimportant;
> + char **important;
> };
>
>
> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
> index 50b4205267..2005ccadf8 100644
> --- a/src/qemu/qemu_command.c
> +++ b/src/qemu/qemu_command.c
> @@ -2732,6 +2732,7 @@ qemuBuildFSStr(virDomainFSDefPtr fs)
> virBuffer opt = VIR_BUFFER_INITIALIZER;
> const char *driver = qemuDomainFSDriverTypeToString(fs->fsdriver);
> const char *wrpolicy = virDomainFSWrpolicyTypeToString(fs->wrpolicy);
> + size_t i;
>
> if (fs->type != VIR_DOMAIN_FS_TYPE_MOUNT) {
> virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> @@ -2775,6 +2776,15 @@ qemuBuildFSStr(virDomainFSDefPtr fs)
> if (fs->readonly)
> virBufferAddLit(&opt, ",readonly");
>
> + if (fs->important) {
> + for (i = 0; i < fs->nimportant; ++i) {
> + if (i == 0)
> + virBufferAsprintf(&opt, ",vii=%s", fs->important[i]);
> + else
> + virBufferAsprintf(&opt, ":%s", fs->important[i]);
> + }
> + }
> +
> if (virBufferCheckError(&opt) < 0)
> goto error;
>
On Dienstag, 7. Mai 2019 11:55:56 CEST Greg Kurz wrote: > > support the 'vii' feature of patch 5, which introduces the XML config > > What is patch 5 ?!? What is 'vii' ? I am a bit lost here... Hi Greg, Sorry that I caused a bit of confusion, You were actually commenting mostly on v2 of the patch set, where my email client replaced the message IDs and hence screwed threading. This is v3 that I sent yesterday and which has correct threading: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01143.html Please just have a glimpse on that v3 thread, and before I address the details that you requested (I have reviewed them all already and will address them), I would like you to ask you for a coarse feedback on design/features first. Because there are some things where I am unresolved on design level yet: 1. Should I drop the "persistency" feature of patch 3 (same inode numbers after reboots/suspends) completely from the patch set? It is disabled at compile time by default for now after entire v3 patch set is applied. Or should that persistency feature probably become a qemu command line option instead? 2. If persistency feature should be preserved, shall I probably move out all the inode remapping code into a separate C unit to avoid 9p.c getting bloated too much (the amount of code for saving/loading the qp*_table hash tables is quite large). If yes, any suggestion for an appropriate unit name? 3. Are you fine with the suggested variable length suffixes (patch 4) becoming the default behaviour (instead of the fixed length 16 bit prefix solution by Antonios)? 4. Do you have a better idea for a name instead of the suggested "vii" (patch 5) virtfs qemu command line option? And are you fine with the idea of that "vii" feature anyway? > > This is the counter part patch against latest libvirt git master head to > > Hmm... shouldn't this be Cc'd to libvir-list@redhat.com as well then ? 
Well, for now I just provided that libvirt patch to give you an idea about how I imagined this "vii" feature to be used. Does it make sense to CC them already even though this suggested "vii" command line option does not exist on qemu side yet? I know I piled up quite a bit of code on this patch set, so to speed up things simply raise questions instead of spending too much time in reviewing everything in detail already. Thanks Greg! Best regards, Christian Schoenebeck
On Tue, 07 May 2019 14:23:11 +0200 Christian Schoenebeck <qemu_oss@crudebyte.com> wrote: > On Dienstag, 7. Mai 2019 11:55:56 CEST Greg Kurz wrote: > > > support the 'vii' feature of patch 5, which introduces the XML config > > > > What is patch 5 ?!? What is 'vii' ? I am a bit lost here... > > Hi Greg, > > Sorry that I caused a bit of confusion, You were actually commenting mostly on > v2 of the patch set, where my email client replaced the message IDs and hence > screwed threading. > > This is v3 that I sent yesterday and which has correct threading: > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01143.html > For a reason yet to be investigated, I haven't received it yet... > Please just have a glimpse on that v3 thread, and before I address the details > that you requested (I have reviewed them all already and will address them), I > would like you to ask you for a coarse feedback on design/features first. > Because there are some things where I am unresolved on design level yet: > I'll try but probably not before next week. > 1. Should I drop the "persistency" feature of patch 3 (same inode numbers > after reboots/suspends) completely from the patch set? It is disabled at > compile time by default for now after entire v3 patch set is applied. Or > should that persistency feature probably become a qemu command line option > instead? > > 2. If persistency feature should be preserved, shall I probably move out all > the inode remapping code into a separate C unit to avoid 9p.c getting bloated > too much (the amount of code for saving/loading the qp*_table hash tables is > quite large). If yes, any suggestion for an appropriate unit name? > > 3. Are you fine with the suggested variable length suffixes (patch 4) becoming > the default behaviour (instead of the fixed length 16 bit prefix solution by > Antonios)? > > 4. Do you have a better idea for a name instead of the suggested "vii" (patch > 5) virtfs qemu command line option? 
And are you fine with the idea of that > "vii" feature anyway? > > > > This is the counter part patch against latest libvirt git master head to > > > > Hmm... shouldn't this be Cc'd to libvir-list@redhat.com as well then ? > > Well, for now I just provided that libvirt patch to give you an idea about how > imagined this "vii" feature to be used. Does it make sense to CC them already > even though this suggested "vii" command line option does not exist on qemu > side yet? > > I know I piled up quite a bit of code on this patch set, so to speed up things > simply raise questions instead of spending too much time in reviewing > everything in detail already. > > Thanks Greg! > > Best regards, > Christian Schoenebeck
On Dienstag, 7. Mai 2019 17:42:39 CEST Greg Kurz wrote: > > Sorry that I caused a bit of confusion, You were actually commenting > > mostly on v2 of the patch set, where my email client replaced the message > > IDs and hence screwed threading. > > > > This is v3 that I sent yesterday and which has correct threading: > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01143.html > > For a reason yet to be investigated, I haven't received it yet... Here are the archive links for latest v3 patch set [5(+1) patches total]: [PATCH v3 0/5] 9p: Fix file ID collisions: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01143.html [PATCH v3 1/5] 9p: mitigates most QID path collisions: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01142.html [PATCH v3 2/5] 9P: trivial cleanup of QID path collision mitigation: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01140.html [PATCH v3 3/5] 9p: persistency of QID path beyond reboots / suspensions: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01144.html [PATCH v3 4/5] 9p: use variable length suffixes for inode mapping: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01141.html [PATCH v3 5/5] 9p: adds virtfs 'vii' device parameter https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01138.html And the optional libvirt patch: [libvirt patch] qemu: adds support for virtfs 9p argument 'vii': https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01223.html > > Please just have a glimpse on that v3 thread, and before I address the > > details that you requested (I have reviewed them all already and will > > address them), I would like you to ask you for a coarse feedback on > > design/features first. > > Because there are some things where I am unresolved on design level yet: > I'll try but probably not before next week. No problem, take your time! Best regards, Christian Schoenebeck
On Dienstag, 7. Mai 2019 18:16:08 CEST Christian Schoenebeck wrote: > Here are the archive links for latest v3 patch set [5(+1) patches total]: > > [PATCH v3 0/5] 9p: Fix file ID collisions: > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01143.html > > [PATCH v3 1/5] 9p: mitigates most QID path collisions: > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01142.html > > [PATCH v3 2/5] 9P: trivial cleanup of QID path collision mitigation: > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01140.html > > [PATCH v3 3/5] 9p: persistency of QID path beyond reboots / suspensions: > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01144.html > > [PATCH v3 4/5] 9p: use variable length suffixes for inode mapping: > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01141.html > > [PATCH v3 5/5] 9p: adds virtfs 'vii' device parameter > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01138.html > > And the optional libvirt patch: > > [libvirt patch] qemu: adds support for virtfs 9p argument 'vii': > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01223.html > > > > Please just have a glimpse on that v3 thread, and before I address the > > > details that you requested (I have reviewed them all already and will > > > address them), I would like you to ask you for a coarse feedback on > > > design/features first. > > > > > Because there are some things where I am unresolved on design level yet: > > > > I'll try but probably not before next week. Hi Greg, you have not forgotten about me, did you? ;-) Or should I go ahead and provide a v4 next week addressing the issues discussed so far? Best regards, Christian Schoenebeck
On Fri, 17 May 2019 10:40:48 +0200 Christian Schoenebeck <qemu_oss@crudebyte.com> wrote: > On Dienstag, 7. Mai 2019 18:16:08 CEST Christian Schoenebeck wrote: > > Here are the archive links for latest v3 patch set [5(+1) patches total]: > > > > [PATCH v3 0/5] 9p: Fix file ID collisions: > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01143.html > > > > [PATCH v3 1/5] 9p: mitigates most QID path collisions: > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01142.html > > > > [PATCH v3 2/5] 9P: trivial cleanup of QID path collision mitigation: > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01140.html > > > > [PATCH v3 3/5] 9p: persistency of QID path beyond reboots / suspensions: > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01144.html > > > > [PATCH v3 4/5] 9p: use variable length suffixes for inode mapping: > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01141.html > > > > [PATCH v3 5/5] 9p: adds virtfs 'vii' device parameter > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01138.html > > > > And the optional libvirt patch: > > > > [libvirt patch] qemu: adds support for virtfs 9p argument 'vii': > > https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg01223.html > > > > > > Please just have a glimpse on that v3 thread, and before I address the > > > > details that you requested (I have reviewed them all already and will > > > > address them), I would like you to ask you for a coarse feedback on > > > > design/features first. > > > > > > > Because there are some things where I am unresolved on design level yet: > > > > > > I'll try but probably not before next week. > > Hi Greg, you have not forgotten about me, did you? ;-) > Hi Christian, I have certainly not forgotten but I had (and still have) some more urgent work to do and I couldn't find time for this... Sorry :) > Or should I go ahead and provide a v4 next week addressing the issues > discussed so far? 
> Thinking again about the initial issue raised by Antonios, I agree we should at least detect collisions in the existing 9pfs code and emit an error rather than silently returning duplicate QIDs to the client. This would be patch 2 from Antonios's series: only allow a single host device for a given fs device. Then, we come to the bulk problem: how to handle the case where we have multiple devices involved in a directory we want to share ? Antonios's proposal is a clever way to address the collisions, but your work proves it isn't enough... Before going forward, I'd like to consider another approach. What about: 1) de-compose the shared directory on a per-device basis, ie. identify all mount points under the shared directory 2) expose found mount points separately, each with its onw 9p device 3) re-compose the directory tree within the guest using the same topology as the host ie. if you want to share /vm/fs and /vm/fs on device A /vm/fs/shares on device B /vm/fs/tmp on device C you would start QEMU with -fsdev local,path=/vm/fs,id=fsdev0... \ -device virtio-9p,fsdev=fsdev0,mount_tag=tag0 \ -fsdev local,path=/vm/fs,id=fsdev1... \ -device virtio-9p,fsdev=fsdev1,mount_tag=tag1 \ -fsdev local,path=/vm/fs,id=fsdev2... \ -device virtio-9p,fsdev=fsdev2,mount_tag=tag2 and /etc/fstab in the guest: tag0 / 9p nofail,trans=virtio,version=9p2000.L 0 0 tag1 /shares 9p nofail,trans=virtio,version=9p2000.L 0 0 tag2 /tmp 9p nofail,trans=virtio,version=9p2000.L 0 0 This involves some more work for the user but it doesn't require any changes in QEMU. Would this approach solve the issues you've been hitting with Samba ? Cheers, -- Greg
On Freitag, 17. Mai 2019 14:30:29 CEST Greg Kurz wrote: > Then, we come to the bulk problem: how to handle the case where we > have multiple devices involved in a directory we want to share ? > Antonios's proposal is a clever way to address the collisions, but > your work proves it isn't enough... With the patch set I have right now, things finally bahave smooth. > Before going forward, I'd like > to consider another approach. > > What about: > > 1) de-compose the shared directory on a per-device basis, > ie. identify all mount points under the shared directory > > 2) expose found mount points separately, each with its onw 9p device > > 3) re-compose the directory tree within the guest using the same topology > as the host > > ie. if you want to share /vm/fs and > > /vm/fs on device A > /vm/fs/shares on device B > /vm/fs/tmp on device C > > you would start QEMU with > > -fsdev local,path=/vm/fs,id=fsdev0... \ > -device virtio-9p,fsdev=fsdev0,mount_tag=tag0 \ > -fsdev local,path=/vm/fs,id=fsdev1... \ > -device virtio-9p,fsdev=fsdev1,mount_tag=tag1 \ > -fsdev local,path=/vm/fs,id=fsdev2... \ > -device virtio-9p,fsdev=fsdev2,mount_tag=tag2 > > and /etc/fstab in the guest: > > tag0 / 9p nofail,trans=virtio,version=9p2000.L 0 0 > tag1 /shares 9p nofail,trans=virtio,version=9p2000.L 0 0 > tag2 /tmp 9p nofail,trans=virtio,version=9p2000.L 0 0 > > This involves some more work for the user but it doesn't require > any changes in QEMU. So your suggestion is actually: don't fix it. "Some" more work for the user is a quantity of how many guests you are running, multiplied by the nested virtualization levels you might have = potentially a lot of work for admins. > Would this approach solve the issues you've been hitting with Samba ? No, because that completely neglects runtime changes on a higher level (host), plus it completely destroys the fundamental idea about 9p, which is about transparency of the higher level(s). 
May I ask, do you have concrete reasons why you want to abandon the entire patch set? Because that's what it sounds like to me. Best regards, Christian Schoenebeck
On Fri, 17 May 2019 15:23:01 +0200 Christian Schoenebeck <qemu_oss@crudebyte.com> wrote: > On Freitag, 17. Mai 2019 14:30:29 CEST Greg Kurz wrote: > > Then, we come to the bulk problem: how to handle the case where we > > have multiple devices involved in a directory we want to share ? > > Antonios's proposal is a clever way to address the collisions, but > > your work proves it isn't enough... > > With the patch set I have right now, things finally bahave smooth. > > > Before going forward, I'd like > > to consider another approach. > > > > What about: > > > > 1) de-compose the shared directory on a per-device basis, > > ie. identify all mount points under the shared directory > > > > 2) expose found mount points separately, each with its onw 9p device > > > > 3) re-compose the directory tree within the guest using the same topology > > as the host > > > > ie. if you want to share /vm/fs and > > > > /vm/fs on device A > > /vm/fs/shares on device B > > /vm/fs/tmp on device C > > > > you would start QEMU with > > > > -fsdev local,path=/vm/fs,id=fsdev0... \ > > -device virtio-9p,fsdev=fsdev0,mount_tag=tag0 \ > > -fsdev local,path=/vm/fs,id=fsdev1... \ > > -device virtio-9p,fsdev=fsdev1,mount_tag=tag1 \ > > -fsdev local,path=/vm/fs,id=fsdev2... \ > > -device virtio-9p,fsdev=fsdev2,mount_tag=tag2 > > > > and /etc/fstab in the guest: > > > > tag0 / 9p nofail,trans=virtio,version=9p2000.L 0 0 > > tag1 /shares 9p nofail,trans=virtio,version=9p2000.L 0 0 > > tag2 /tmp 9p nofail,trans=virtio,version=9p2000.L 0 0 > > > > This involves some more work for the user but it doesn't require > > any changes in QEMU. > > So your suggestion is actually: don't fix it. > Potentially yes if another approach is satisfying enough, as I wouldn't want to over-engineer too much around this 9p imposed limitation. The right thing to do would be to come up with a new version of the protocol with variable sized QIDs and call it a day. 
> "Some" more work for the user is a quantity of how many guests you are > running, multiplied by the nested virtualization levels you might have = > potentially a lot of work for admins. > Maybe, I don't have enough feedback on 9p use cases to have a clear picture... > > Would this approach solve the issues you've been hitting with Samba ? > > No, because that completely neglects runtime changes on a higher level (host), Unless I'm missing something, runtime changes would be neglected as well with the "vii" property since it is static for the device lifetime. > plus it completely destroys the fundamental idea about 9p, which is about > transparency of the higher level(s). > That's a point indeed, even if again I'm not sure if this is a frequent case to share a directory tree spanning over multiple devices. > May I ask, do you have concrete reasons why you want to abondon the entire > patch set? Because that's what it sounds to me. > I don't have that much time to spend on 9p maintainership, for both reviewing and fixing bugs (CVEs most of the time). So, yes it may sound like I want to drop the patchset, but it's just I need to be convinced I won't regret having merged a huge change like this... when I'll have to support it alone later ;-) For the moment, I'm not convinced by the "vii" solution. It even motivated my suggestion of having several devices actually, since the paths you would put in there are known before starting QEMU. It might take me some more rounds of discussion to decide. I understand it is frustrating but bear with me :) > Best regards, > Christian Schoenebeck Cheers, -- Greg
On Freitag, 17. Mai 2019 16:47:46 CEST Greg Kurz wrote: > Potentially yes if another approach is satisfying enough, as I wouldn't > want to over-engineer too much around this 9p imposed limitation. The > right thing to do would be to come up with a new version of the protocol > with variable sized QIDs and call it a day. Yes, but that's the long-term solution which will still take a very long while. This patch set is already a functional solution until that happens, and this 9p issue is IMO quite severe (as minor as data corruption, data loss and exists for several years already). > > plus it completely destroys the fundamental idea about 9p, which is about > > transparency of the higher level(s). > > That's a point indeed, even if again I'm not sure if this is a frequent > case to share a directory tree spanning over multiple devices. When the use case is mass storage then it is very likely that you will have several devices. Times have changed. With modern file systems like ZFS it is very common to create a large amount of "data sets" for all kinds of individual purposes and mount points which is cheap to get. Each fs data set is a separate "device" from OS (i.e. Linux) point of view, even if all those FS data sets are in the same FS pool and even on the same physical IO device. Background: The concept of FS "data sets" combines the benefits of classical partitions (e.g. logical file space separation, independent fs configurations like compression on/off/algorithm, data deduplication on/off, snapshot isolation, snapshots on/off) without the disadvantages of classical real partitions (physical space is dynamically shared, no space wasted on fixed boundaries; physical device pool management is transparent for all data sets, configuration options can be inherited from parent data sets). > I don't have that much time to spend on 9p maintainership, for both > reviewing and fixing bugs (CVEs most of the time). 
So, yes it may > sound like I want to drop the patchset, but it's just I need to be > convinced I won't regret having merged a huge change like this... > when I'll have to support it alone later ;-) Actually I already assumed this to be the actual root cause. I see that you are currently the only maintainer, and my plan was not to just provide a one time shot but eventually hang along helping with maintaining it due my use of 9p and its huge potential I see (as universal and efficient root file system for all guests, not just for exotically sharing a small tree portion between guests). I also have plans for supporting native file forks BTW. But if you are seriously suggesting to simply omit a fundamental issue in 9p, then my plans would obviously no longer make sense. :) I mean I am completely tolerant to all kinds of religious views on bit level, languages, code style, libs, precise implementation details, parameters, source structure, etc.; but saying to simply leave a fundamental bug open for years to come, that's where I have to drop out. > For the moment, I'm not convinced by the "vii" solution. It even > motivated my suggestion of having several devices actually, since > the paths you would put in there are known before starting QEMU. Well, that "vii" configuration and the QID persistency are 2 optional patches on top of the core fixes. It is a huge difference to suggest dropping these 2 patches than saying to completely drop the entire patch set and to leave this bug open. The mandatory core fixes that I see (for the short/mid term) are at least Antonios' patches plus my variable length prefix patch, the latter significantly reduces the inode numbers on guest and significantly increases the inode name space, and hence also significanlty reduces the propability that 9p might ever need to kick in with any kind of expensive remapping actions (or even something worse like stat fatally returning -ENFILE to the user). 
About the "vii" configuration, even though you don't like the idea: there is also a big difference giving the user the _optional_ possibility to define e.g. one path (not device) on guest said to be sensitive regarding high inode numbers on guest; and something completely different telling the user that he _must_ configure every single device from host that is ever supposed to pop up with 9p on guest and forcing the user to update that configuration whenever a new device is added or removed on host. The "vii" configuration feature does not require any knowledge of how many and which kinds of devices are actually ever used on host (nor on any higher level host in case of nested virtualization), nor does that "vii" config require any changes ever when host device setup changes. So 9p's core transparency feature would not be touched at all. > It might take me some more rounds of discussion to decide. I understand > it is frustrating but bear with me :) Let me make a different suggestion: how about putting these fixes into a separate C unit for now and making the default behaviour (if you really want) to not use any of that code by default at all. So the user would just get an error message in the qemu log files by default if he tries to export several devices with one 9p device, suggesting him either to map them as separate 9p devices (like you suggested) and informing the user about the alternative of enabling support for the automatic inode remapping code (from that separate C unit) instead by adding one convenient config option if he/she really wants. That way we would have a fix now, not in years, people can decide to use the automatic and hardware transparent solution, instead of being forced to write dozens of config lines for each single guest, or they might decide to stick with your "solution" ;-). 
And once the long term solution for this issue with variable length QIDs is in place, the inode remapping code can very simply be dropped from the code base completley by just deleting the C unit and about a hand full of lines in 9p.c or so, and hence this fix would not bloat the existing 9p units nor cause maintenance nightmares of any kind. Best regards, Christian Schoenebeck
Hi Christian,
On Fri, 17 May 2019 22:53:41 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> On Freitag, 17. Mai 2019 16:47:46 CEST Greg Kurz wrote:
> > Potentially yes if another approach is satisfying enough, as I wouldn't
> > want to over-engineer too much around this 9p imposed limitation. The
> > right thing to do would be to come up with a new version of the protocol
> > with variable sized QIDs and call it a day.
>
> Yes, but that's the long-term solution which will still take a very long
> while. This patch set is already a functional solution until that happens, and
> this 9p issue is IMO quite severe (as minor as data corruption, data loss and
> exists for several years already).
>
On the other hand, I'm afraid that having a functional solution won't
motivate people to come up with a new spec... Anyway, I agree that the
data corruption/loss issues must be prevented, ie, the 9p server should
at least return an error to the client rather than returning a colliding
QID. A simple way to avoid that is to enforce a single device, ie. patch
2 in Antonios's series. Of course this may break some setups where
multiple devices are involved, but it is pure luck if they aren't already
broken with the current code base. I assume that this covers most 9p users,
based on the fact that Antonios and now yourself are the only people who
ever reported this issue. Then, I'm okay if we try to cover some more
scenarios, but it must take the maintenance burden into account.
> > > plus it completely destroys the fundamental idea about 9p, which is about
> > > transparency of the higher level(s).
> >
> > That's a point indeed, even if again I'm not sure if this is a frequent
> > case to share a directory tree spanning over multiple devices.
>
> When the use case is mass storage then it is very likely that you will have
> several devices. Times have changed. With modern file systems like ZFS it is
> very common to create a large amount of "data sets" for all kinds of
> individual purposes and mount points which is cheap to get. Each fs data set
> is a separate "device" from OS (i.e. Linux) point of view, even if all those
> FS data sets are in the same FS pool and even on the same physical IO device.
>
> Background: The concept of FS "data sets" combines the benefits of classical
> partitions (e.g. logical file space separation, independent fs configurations
> like compression on/off/algorithm, data deduplication on/off, snapshot
> isolation, snapshots on/off) without the disadvantages of classical real
> partitions (physical space is dynamically shared, no space wasted on fixed
> boundaries; physical device pool management is transparent for all data sets,
> configuration options can be inherited from parent data sets).
>
Ok. I must admit my ignorance around ZFS and "data sets"... so IIUC, even with
a single underlying physical device you might end up with lstat() returning
different st_dev values on the associated files, correct ?
I take your word for the likelihood of the issue popping up in such setups. :)
> > I don't have that much time to spend on 9p maintainership, for both
> > reviewing and fixing bugs (CVEs most of the time). So, yes it may
> > sound like I want to drop the patchset, but it's just I need to be
> > convinced I won't regret having merged a huge change like this...
> > when I'll have to support it alone later ;-)
>
> Actually I already assumed this to be the actual root cause.
>
> I see that you are currently the only maintainer, and my plan was not to just
> provide a one time shot but eventually hang along helping with maintaining it
> due my use of 9p and its huge potential I see (as universal and efficient root
> file system for all guests, not just for exotically sharing a small tree
> portion between guests). I also have plans for supporting native file forks
What are "native file forks" ?
> BTW. But if you are seriously suggesting to simply omit a fundamental issue in
> 9p, then my plans would obviously no longer make sense. :)
>
It would be very much appreciated to have someone else investing time in 9p
of course. :)
> I mean I am completely tolerant to all kinds of religious views on bit level,
> languages, code style, libs, precise implementation details, parameters,
> source structure, etc.; but saying to simply leave a fundamental bug open for
> years to come, that's where I have to drop out.
>
I guess I was unclear... As said above, I do want to avoid QID collisions,
at least by enforcing a single device per shared path. Then, supporting
multiple devices will depend on the benefit/expense ratio.
> > For the moment, I'm not convinced by the "vii" solution. It even
> > motivated my suggestion of having several devices actually, since
> > the paths you would put in there are known before starting QEMU.
>
> Well, that "vii" configuration and the QID persistency are 2 optional patches
> on top of the core fixes. It is a huge difference to suggest dropping these 2
> patches than saying to completely drop the entire patch set and to leave this
> bug open.
>
> The mandatory core fixes that I see (for the short/mid term) are at least
> Antonios' patches plus my variable length prefix patch, the latter significantly
> reduces the inode numbers on guest and significantly increases the inode name
> space, and hence also significanlty reduces the propability that 9p might ever
> need to kick in with any kind of expensive remapping actions (or even
> something worse like stat fatally returning -ENFILE to the user).
>
> About the "vii" configuration, even though you don't like the idea: there is
Hmm... I was objecting on the fact that "vii" would cope with runtime
changes on the host while the guest is running.
> also a big difference giving the user the _optional_ possibility to define e.g.
> one path (not device) on guest said to be sensitive regarding high inode
> numbers on guest; and something completely different telling the user that he
> _must_ configure every single device from host that is ever supposed to pop up
> with 9p on guest and forcing the user to update that configuration whenever a
> new device is added or removed on host. The "vii" configuration feature does
> not require any knowledge of how many and which kinds of devices are actually
> ever used on host (nor on any higher level host in case of nested
> virtualization), nor does that "vii" config require any changes ever when host
> device setup changes. So 9p's core transparency feature would not be touched
> at all.
>
I guess this all boils down to me finding some time to read/understand more :)
As usual, a series with smaller and simpler patches will be easier to
review, and more likely to be merged.
> > It might take me some more rounds of discussion to decide. I understand
> > it is frustrating but bear with me :)
>
> Let me make a different suggestion: how about putting these fixes into a
> separate C unit for now and making the default behaviour (if you really want)
> to not use any of that code by default at all. So the user would just get an
> error message in the qemu log files by default if he tries to export several
> devices with one 9p device, suggesting him either to map them as separate 9p
> devices (like you suggested) and informing the user about the alternative of
> enabling support for the automatic inode remapping code (from that separate C
> unit) instead by adding one convenient config option if he/she really wants.
>
It seems that we may be reaching some consensus here :)
I like the approach, provided this is configurable from the command line, off
by default and doesn't duplicate core 9p code. Not sure this needs to sit in
its own C unit though.
> That way we would have a fix now, not in years, people can decide to use the
> automatic and hardware transparent solution, instead of being forced to write
> dozens of config lines for each single guest, or they might decide to stick
> with your "solution" ;-).
>
> And once the long term solution for this issue with variable length QIDs is in
> place, the inode remapping code can very simply be dropped from the code base
> completely by just deleting the C unit and about a handful of lines in 9p.c
> or so, and hence this fix would not bloat the existing 9p units nor cause
> maintenance nightmares of any kind.
>
The 9p code has a long history of CVEs and limitations that prevented it
to reach full production grade quality. Combined with the poor quality of
the code, this has scared off many experienced QEMU developers, who
prefer to work on finding an alternative solution. Such alternative is
virtio-fs, which is being actively worked on:
https://lists.gnu.org/archive/html/qemu-devel/2019-04/msg02746.html
Note: I'm not telling "stay away from 9p" but maybe worth taking a look,
because if virtio-fs gets merged, it is likely to become the official
and better supported way to share files between host and guest.
> Best regards,
> Christian Schoenebeck
Please repost a series, possibly based on some of Antonios's patches that
allows to avoid the QID collision, returns an error to the client instead
and possibly printing out some useful messages in the QEMU log. Then, on
top of that, you can start introducing hashing and variable prefix length.
Cheers,
--
Greg
On Montag, 20. Mai 2019 16:05:09 CEST Greg Kurz wrote: > Hi Christian, Hi Greg, > On the other hand, I'm afraid that having a functional solution won't > motivate people to come up with a new spec... Anyway, I agree that the > data corruption/loss issues must be prevented, ie, the 9p server should > at least return an error to the client rather than returning a colliding > QID. Ok, I will extend Antonios' patch to log that error on host. I thought about limiting that error message to once per session (for not flooding the logs), but it is probably not worth it, so if you don't veto then I will just log that error simply on every file access. > A simple way to avoid that is to enforce a single device, ie. patch > 2 in Antonios's series. Of course this may break some setups where > multiple devices are involved, but it is pure luck if they aren't already > broken with the current code base. Yes, the worst thing you can have is this collision silently being ignored, like it is actually right now. Because you end up getting all sorts of different misbehaviours on guests, and these are just symptoms that take a while to debug and realise that the actual root cause is an inode collision. So enforcing a single device is still better than undefined behaviour. > > Background: The concept of FS "data sets" combines the benefits of > > classical partitions (e.g. logical file space separation, independent fs > > configurations like compression on/off/algorithm, data deduplication > > on/off, snapshot isolation, snapshots on/off) without the disadvantages > > of classical real partitions (physical space is dynamically shared, no > > space wasted on fixed boundaries; physical device pool management is > > transparent for all data sets, configuration options can be inherited > > from parent data sets). > > Ok. I must admit my ignorance around ZFS and "data sets"... 
so IIUC, even > with a single underlying physical device you might end up with lstat() > returning different st_dev values on the associated files, correct ? > > I take your word for the likeliness of the issue to popup in such setups. :) Yes, that is correct, you _always_ get a different stat::st_dev value for each ZFS data set. Furthermore, each ZFS data set has its own inode number sequence generator starting from one. So consider you create two new ZFS data sets, then you create one file on each data set, then both files will have inode number 1. That probably makes it clear why you hit this ID collision bug very easily when using the combination ZFS & 9p. > > also a big difference giving the user the _optional_ possibility to define > > e.g. one path (not device) on guest said to be sensitive regarding high > > inode numbers on guest; and something completely different telling the > > user that he _must_ configure every single device from host that is ever > > supposed to pop up with 9p on guest and forcing the user to update that > > configuration whenever a new device is added or removed on host. The > > "vii" configuration feature does not require any knowledge of how many > > and which kinds of devices are actually ever used on host (nor on any > > higher level host in case of nested > > virtualization), nor does that "vii" config require any changes ever when > > host device setup changes. So 9p's core transparency feature would not be > > touched at all. > > I guess this all boils down to I finding some time to read/understand more > :) Yes, that helps sometimes. :) > As usual, a series with smaller and simpler patches will be easier to > review, and more likely to be merged. Of course. In the next patch series I will completely drop a) the entire QID persistency feature code and b) that disputed "vii" feature. But I will still suggest the variable inode suffix length patch as last patch in that new patch series. 
That should make the amount of changes manageable small. > > Let me make a different suggestion: how about putting these fixes into a > > separate C unit for now and making the default behaviour (if you really > > want) to not use any of that code by default at all. So the user would > > just get an error message in the qemu log files by default if he tries to > > export several devices with one 9p device, suggesting him either to map > > them as separate 9p devices (like you suggested) and informing the user > > about the alternative of enabling support for the automatic inode > > remapping code (from that separate C unit) instead by adding one > > convenient config option if he/she really wants. > It seems that we may be reaching some consensus here :) > > I like the approach, provided this is configurable from the command line, > off by default and doesn't duplicate core 9p code. Not sure this needs to > sit in its own C unit though. Any preference for a command line argument name and/or libvirt XML config tag/ attribute for switching the inode remapping code on? About that separate C unit: I leave that up to you to decide, it does not matter to me. I just suggested it since you consider these patches as temporary workaround until there are appropriate protocol changes. So clear code separation for them might help to get rid of the entire code later on. Plus for distribution maintainers it might be easiert to cherry pick them as backports. However since I will drop the persistency and "vii" feature in the next patch series, it probably does not make a huge difference anyway. As you prefer. > The 9p code has a long history of CVEs and limitations that prevented it > to reach full production grade quality. Combined with the poor quality of > the code, this has scared off many experienced QEMU developpers, which > prefer to work on finding an alternative solution. And I already wondered about the obvious low activity on this particular qemu feature. 
I mean I don't find it contemporary still running guests to use their own file system being emulated on a file ontop of yet another file system and loosing essentially all benefits of the host's actual backend file system features. > Such alternative is virtio-fs, which is being actively worked on: > > https://lists.gnu.org/archive/html/qemu-devel/2019-04/msg02746.html > > Note: I'm not telling "stay away from 9p" but maybe worth taking a look, > because if virtio-fs gets merged, it is likely to become the official > and better supported way to share files between host and guest. Ah, good to know! That's new to me, thanks! Makes sense to me, especially its performance will certainly be much better. > Please repost a series, possibly based on some of Antonios's patches that > allows to avoid the QID collision, returns an error to the client instead > and possibly printing out some useful messages in the QEMU log. Then, on > top of that, you can start introducing hashing and variable prefix length. So you want that as its own patch series first, or can I continue with my suggestion to deliver the hash patch and variable suffix length patch as last patches within the same series? Best regards, Christian Schoenebeck
On Wed, 22 May 2019 18:03:13 +0200 Christian Schoenebeck <qemu_oss@crudebyte.com> wrote: > On Montag, 20. Mai 2019 16:05:09 CEST Greg Kurz wrote: > > Hi Christian, > > Hi Greg, > > > On the other hand, I'm afraid that having a functional solution won't > > motivate people to come up with a new spec... Anyway, I agree that the > > data corruption/loss issues must be prevented, ie, the 9p server should > > at least return an error to the client rather than returning a colliding > > QID. > > Ok, I will extend Antonios' patch to log that error on host. I thought about > limiting that error message to once per session (for not flooding the logs), > but it is probably not worth it, so if you don't veto then I will just log > that error simply on every file access. > Please use error_report_once(). > > A simple way to avoid that is to enforce a single device, ie. patch > > 2 in Antonios's series. Of course this may break some setups where > > multiple devices are involved, but it is pure luck if they aren't already > > broken with the current code base. > > Yes, the worst thing you can have is this collision silently being ignored, > like it is actually right now. Because you end up getting all sorts of > different misbehaviours on guests, and these are just symptoms that take a > while to debug and realise that the actual root cause is an inode collision. > So enforcing a single device is still better than undefined behaviour. > > > > Background: The concept of FS "data sets" combines the benefits of > > > classical partitions (e.g. 
logical file space separation, independent fs > > > configurations like compression on/off/algorithm, data deduplication > > > on/off, snapshot isolation, snapshots on/off) without the disadvantages > > > of classical real partitions (physical space is dynamically shared, no > > > space wasted on fixed boundaries; physical device pool management is > > > transparent for all data sets, configuration options can be inherited > > > from parent data sets). > > > > Ok. I must admit my ignorance around ZFS and "data sets"... so IIUC, even > > with a single underlying physical device you might end up with lstat() > > returning different st_dev values on the associated files, correct ? > > > > I take your word for the likeliness of the issue to popup in such setups. :) > > Yes, that is correct, you _always_ get a different stat::st_dev value for each > ZFS data set. Furthermore, each ZFS data set has its own inode number sequence > generator starting from one. So consider you create two new ZFS data sets, > then you create one file on each data set, then both files will have inode > number 1. > > That probably makes it clear why you hit this ID collision bug very easily > when using the combination ZFS & 9p. > > > > also a big difference giving the user the _optional_ possibility to define > > > e.g. one path (not device) on guest said to be sensitive regarding high > > > inode numbers on guest; and something completely different telling the > > > user that he _must_ configure every single device from host that is ever > > > supposed to pop up with 9p on guest and forcing the user to update that > > > configuration whenever a new device is added or removed on host. The > > > "vii" configuration feature does not require any knowledge of how many > > > and which kinds of devices are actually ever used on host (nor on any > > > higher level host in case of nested > > > virtualization), nor does that "vii" config require any changes ever when > > > host device setup changes. 
So 9p's core transparency feature would not be > > > touched at all. > > > > I guess this all boils down to I finding some time to read/understand more > > :) > > Yes, that helps sometimes. :) > > > As usual, a series with smaller and simpler patches will be easier to > > review, and more likely to be merged. > > Of course. > > In the next patch series I will completely drop a) the entire QID persistency > feature code and b) that disputed "vii" feature. But I will still suggest the > variable inode suffix length patch as last patch in that new patch series. > > That should make the amount of changes manageable small. > > > > Let me make a different suggestion: how about putting these fixes into a > > > separate C unit for now and making the default behaviour (if you really > > > want) to not use any of that code by default at all. So the user would > > > just get an error message in the qemu log files by default if he tries to > > > export several devices with one 9p device, suggesting him either to map > > > them as separate 9p devices (like you suggested) and informing the user > > > about the alternative of enabling support for the automatic inode > > > remapping code (from that separate C unit) instead by adding one > > > convenient config option if he/she really wants. > > It seems that we may be reaching some consensus here :) > > > > I like the approach, provided this is configurable from the command line, > > off by default and doesn't duplicate core 9p code. Not sure this needs to > > sit in its own C unit though. > > Any preference for a command line argument name and/or libvirt XML config tag/ > attribute for switching the inode remapping code on? > > About that separate C unit: I leave that up to you to decide, it does not > matter to me. I just suggested it since you consider these patches as > temporary workaround until there are appropriate protocol changes. So clear > code separation for them might help to get rid of the entire code later on. 
> Plus for distribution maintainers it might be easiert to cherry pick them as > backports. > > However since I will drop the persistency and "vii" feature in the next patch > series, it probably does not make a huge difference anyway. As you prefer. > > > The 9p code has a long history of CVEs and limitations that prevented it > > to reach full production grade quality. Combined with the poor quality of > > the code, this has scared off many experienced QEMU developpers, which > > prefer to work on finding an alternative solution. > > And I already wondered about the obvious low activity on this particular qemu > feature. I mean I don't find it contemporary still running guests to use their > own file system being emulated on a file ontop of yet another file system and > loosing essentially all benefits of the host's actual backend file system > features. > > > Such alternative is virtio-fs, which is being actively worked on: > > > > https://lists.gnu.org/archive/html/qemu-devel/2019-04/msg02746.html > > > > Note: I'm not telling "stay away from 9p" but maybe worth taking a look, > > because if virtio-fs gets merged, it is likely to become the official > > and better supported way to share files between host and guest. > > Ah, good to know! That's new to me, thanks! > > Makes sense to me, especially its performance will certainly be much better. > > > Please repost a series, possibly based on some of Antonios's patches that > > allows to avoid the QID collision, returns an error to the client instead > > and possibly printing out some useful messages in the QEMU log. Then, on > > top of that, you can start introducing hashing and variable prefix length. > > So you want that as its own patch series first, or can I continue with my > suggestion to deliver the hash patch and variable suffix length patch as last > patches within the same series? > Same series is okay. > Best regards, > Christian Schoenebeck Cheers, -- Greg
On Montag, 3. Juni 2019 08:57:15 CEST Greg Kurz wrote: > > Ok, I will extend Antonios' patch to log that error on host. I thought > > about limiting that error message to once per session (for not flooding > > the logs), but it is probably not worth it, so if you don't veto then I > > will just log that error simply on every file access. > > Please use error_report_once(). I will. > > > Please repost a series, possibly based on some of Antonios's patches > > > that > > > allows to avoid the QID collision, returns an error to the client > > > instead > > > and possibly printing out some useful messages in the QEMU log. Then, on > > > top of that, you can start introducing hashing and variable prefix > > > length. > > > > So you want that as its own patch series first, or can I continue with my > > suggestion to deliver the hash patch and variable suffix length patch as > > last patches within the same series? > > Same series is okay. Ok. I'm currently busy with other work; I will probably send a new patch set approximately next week. Best regards, Christian Schoenebeck
© 2016 - 2026 Red Hat, Inc.