From: Antonios Motakis <antonios.motakis@huawei.com>
To support multiple devices on the 9p share, and avoid
qid path collisions we take the device id as input
to generate a unique QID path. The lowest 48 bits of
the path will be set equal to the file inode, and the
top bits will be uniquely assigned based on the top
16 bits of the inode and the device id.
Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
---
hw/9pfs/9p.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
hw/9pfs/9p.h | 13 ++++++++-
2 files changed, 90 insertions(+), 11 deletions(-)
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 4da858f..f434f05 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -25,6 +25,8 @@
#include "trace.h"
#include "migration/blocker.h"
#include "sysemu/qtest.h"
+#include "exec/tb-hash-xx.h"
+#include "qemu/qht.h"
int open_fd_hw;
int total_open_fd;
@@ -572,20 +574,82 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
P9_STAT_MODE_NAMED_PIPE | \
P9_STAT_MODE_SOCKET)
-/* This is the algorithm from ufs in spfs */
+
+/* creative abuse of tb_hash_func7, which is based on xxhash */
+static uint32_t qpp_hash(QppEntry e)
+{
+ return tb_hash_func7(e.ino_prefix, e.dev, 0, 0, 0);
+}
+
+static bool qpp_lookup_func(const void *obj, const void *userp)
+{
+ const QppEntry *e1 = obj, *e2 = userp;
+ return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
+}
+
+static void qpp_table_remove(struct qht *ht, void *p, uint32_t h, void *up)
+{
+ g_free(p);
+}
+
+static void qpp_table_destroy(struct qht *ht)
+{
+ qht_iter(ht, qpp_table_remove, NULL);
+ qht_destroy(ht);
+}
+
+/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+ * to a unique QID path (64 bits). To avoid having to map and keep track
+ * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
+ * the device id to the 16 highest bits of the QID path. The 48 lowest bits
+ * of the QID path equal to the lowest bits of the inode number.
+ *
+ * This takes advantage of the fact that inode number are usually not
+ * random but allocated sequentially, so we have fewer items to keep
+ * track of.
+ */
+static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
+ uint64_t *path)
+{
+ QppEntry lookup = {
+ .dev = stbuf->st_dev,
+ .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+ }, *val;
+ uint32_t hash = qpp_hash(lookup);
+
+ val = qht_lookup(&pdu->s->qpp_table, qpp_lookup_func, &lookup, hash);
+
+ if (!val) {
+ if (pdu->s->qp_prefix_next == 0) {
+ /* we ran out of prefixes */
+ return -ENFILE;
+ }
+
+ val = g_malloc0(sizeof(QppEntry));
+ if (!val) {
+ return -ENOMEM;
+ }
+ *val = lookup;
+
+ /* new unique inode prefix and device combo */
+ val->qp_prefix = pdu->s->qp_prefix_next++;
+ qht_insert(&pdu->s->qpp_table, val, hash);
+ }
+
+ *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+ return 0;
+}
+
static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
{
- size_t size;
+ int err;
- if (pdu->s->dev_id == 0) {
- pdu->s->dev_id = stbuf->st_dev;
- } else if (pdu->s->dev_id != stbuf->st_dev) {
- return -ENOSYS;
+ /* map inode+device to qid path (fast path) */
+ err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+ if (err) {
+ return err;
}
- memset(&qidp->path, 0, sizeof(qidp->path));
- size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
- memcpy(&qidp->path, &stbuf->st_ino, size);
qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
qidp->type = 0;
if (S_ISDIR(stbuf->st_mode)) {
@@ -3626,7 +3690,9 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
goto out;
}
- s->dev_id = 0;
+ /* QID path hash table. 1 entry ought to be enough for anybody ;) */
+ qht_init(&s->qpp_table, 1, QHT_MODE_AUTO_RESIZE);
+ s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
s->ctx.fst = &fse->fst;
fsdev_throttle_init(s->ctx.fst);
@@ -3641,6 +3707,7 @@ out:
}
g_free(s->tag);
g_free(s->ctx.fs_root);
+ qpp_table_destroy(&s->qpp_table);
v9fs_path_free(&path);
}
return rc;
@@ -3653,6 +3720,7 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
}
fsdev_throttle_cleanup(s->ctx.fst);
g_free(s->tag);
+ qpp_table_destroy(&s->qpp_table);
g_free(s->ctx.fs_root);
}
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index afb4ebd..80428f7 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -8,6 +8,7 @@
#include "fsdev/9p-iov-marshal.h"
#include "qemu/thread.h"
#include "qemu/coroutine.h"
+#include "qemu/qht.h"
enum {
P9_TLERROR = 6,
@@ -231,6 +232,15 @@ struct V9fsFidState
V9fsFidState *rclm_lst;
};
+#define QPATH_INO_MASK (((unsigned long)1 << 48) - 1)
+
+/* QID path prefix entry, see stat_to_qid */
+typedef struct {
+ dev_t dev;
+ uint16_t ino_prefix;
+ uint16_t qp_prefix;
+} QppEntry;
+
struct V9fsState
{
QLIST_HEAD(, V9fsPDU) free_list;
@@ -252,7 +262,8 @@ struct V9fsState
Error *migration_blocker;
V9fsConf fsconf;
V9fsQID root_qid;
- dev_t dev_id;
+ struct qht qpp_table;
+ uint16_t qp_prefix_next;
};
/* 9p2000.L open flags */
--
1.8.3.1
On Thu, 8 Feb 2018 19:00:18 +0100
<antonios.motakis@huawei.com> wrote:
> From: Antonios Motakis <antonios.motakis@huawei.com>
>
> To support multiple devices on the 9p share, and avoid
> qid path collisions we take the device id as input
> to generate a unique QID path. The lowest 48 bits of
> the path will be set equal to the file inode, and the
> top bits will be uniquely assigned based on the top
> 16 bits of the inode and the device id.
>
> Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> ---
> hw/9pfs/9p.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
> hw/9pfs/9p.h | 13 ++++++++-
> 2 files changed, 90 insertions(+), 11 deletions(-)
>
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 4da858f..f434f05 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -25,6 +25,8 @@
> #include "trace.h"
> #include "migration/blocker.h"
> #include "sysemu/qtest.h"
> +#include "exec/tb-hash-xx.h"
> +#include "qemu/qht.h"
>
> int open_fd_hw;
> int total_open_fd;
> @@ -572,20 +574,82 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
> P9_STAT_MODE_NAMED_PIPE | \
> P9_STAT_MODE_SOCKET)
>
> -/* This is the algorithm from ufs in spfs */
> +
> +/* creative abuse of tb_hash_func7, which is based on xxhash */
> +static uint32_t qpp_hash(QppEntry e)
> +{
> + return tb_hash_func7(e.ino_prefix, e.dev, 0, 0, 0);
Hmm... Looking at git log include/exec/tb-hash-xx.h, I see that this hash
function signature evolved according to TCG needs. It started with 3
arguments, then 4 and we have 5 today.
So I don't think we should add another unrelated user. Probably best to
have our own version. Also it seems it could be simpler since you always
pass 0 for the third and later arguments.
> +}
> +
> +static bool qpp_lookup_func(const void *obj, const void *userp)
> +{
> + const QppEntry *e1 = obj, *e2 = userp;
> + return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
I guess this expression is simple enough so that you can drop the
parenthesis, since they're not needed because of '==' having
precedence over '&&'.
See: http://en.cppreference.com/w/c/language/operator_precedence
> +}
> +
> +static void qpp_table_remove(struct qht *ht, void *p, uint32_t h, void *up)
> +{
> + g_free(p);
> +}
> +
> +static void qpp_table_destroy(struct qht *ht)
> +{
> + qht_iter(ht, qpp_table_remove, NULL);
> + qht_destroy(ht);
> +}
> +
> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
> + * to a unique QID path (64 bits). To avoid having to map and keep track
> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
> + * of the QID path equal to the lowest bits of the inode number.
> + *
> + * This takes advantage of the fact that inode number are usually not
> + * random but allocated sequentially, so we have fewer items to keep
> + * track of.
> + */
> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> + uint64_t *path)
> +{
> + QppEntry lookup = {
> + .dev = stbuf->st_dev,
> + .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> + }, *val;
> + uint32_t hash = qpp_hash(lookup);
> +
> + val = qht_lookup(&pdu->s->qpp_table, qpp_lookup_func, &lookup, hash);
> +
> + if (!val) {
> + if (pdu->s->qp_prefix_next == 0) {
> + /* we ran out of prefixes */
> + return -ENFILE;
Not sure this errno would make sense for guest syscalls that don't open
file descriptors... Maybe ENOENT ?
Cc'ing Eric for insights.
> + }
> +
> + val = g_malloc0(sizeof(QppEntry));
> + if (!val) {
> + return -ENOMEM;
> + }
> + *val = lookup;
> +
> + /* new unique inode prefix and device combo */
> + val->qp_prefix = pdu->s->qp_prefix_next++;
> + qht_insert(&pdu->s->qpp_table, val, hash);
> + }
> +
> + *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> + return 0;
> +}
> +
> static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
> {
> - size_t size;
> + int err;
>
> - if (pdu->s->dev_id == 0) {
> - pdu->s->dev_id = stbuf->st_dev;
> - } else if (pdu->s->dev_id != stbuf->st_dev) {
> - return -ENOSYS;
> + /* map inode+device to qid path (fast path) */
> + err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> + if (err) {
> + return err;
> }
>
> - memset(&qidp->path, 0, sizeof(qidp->path));
> - size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
> - memcpy(&qidp->path, &stbuf->st_ino, size);
> qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
> qidp->type = 0;
> if (S_ISDIR(stbuf->st_mode)) {
> @@ -3626,7 +3690,9 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
> goto out;
> }
>
> - s->dev_id = 0;
> + /* QID path hash table. 1 entry ought to be enough for anybody ;) */
> + qht_init(&s->qpp_table, 1, QHT_MODE_AUTO_RESIZE);
> + s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
>
> s->ctx.fst = &fse->fst;
> fsdev_throttle_init(s->ctx.fst);
> @@ -3641,6 +3707,7 @@ out:
> }
> g_free(s->tag);
> g_free(s->ctx.fs_root);
> + qpp_table_destroy(&s->qpp_table);
> v9fs_path_free(&path);
> }
> return rc;
> @@ -3653,6 +3720,7 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
> }
> fsdev_throttle_cleanup(s->ctx.fst);
> g_free(s->tag);
> + qpp_table_destroy(&s->qpp_table);
> g_free(s->ctx.fs_root);
> }
>
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index afb4ebd..80428f7 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -8,6 +8,7 @@
> #include "fsdev/9p-iov-marshal.h"
> #include "qemu/thread.h"
> #include "qemu/coroutine.h"
> +#include "qemu/qht.h"
>
> enum {
> P9_TLERROR = 6,
> @@ -231,6 +232,15 @@ struct V9fsFidState
> V9fsFidState *rclm_lst;
> };
>
> +#define QPATH_INO_MASK (((unsigned long)1 << 48) - 1)
> +
> +/* QID path prefix entry, see stat_to_qid */
> +typedef struct {
> + dev_t dev;
> + uint16_t ino_prefix;
> + uint16_t qp_prefix;
> +} QppEntry;
> +
> struct V9fsState
> {
> QLIST_HEAD(, V9fsPDU) free_list;
> @@ -252,7 +262,8 @@ struct V9fsState
> Error *migration_blocker;
> V9fsConf fsconf;
> V9fsQID root_qid;
> - dev_t dev_id;
> + struct qht qpp_table;
> + uint16_t qp_prefix_next;
> };
>
> /* 9p2000.L open flags */
On 02/09/2018 09:13 AM, Greg Kurz wrote:
> On Thu, 8 Feb 2018 19:00:18 +0100
> <antonios.motakis@huawei.com> wrote:
>
>> From: Antonios Motakis <antonios.motakis@huawei.com>
>>
>> To support multiple devices on the 9p share, and avoid
>> qid path collisions we take the device id as input
>> to generate a unique QID path. The lowest 48 bits of
>> the path will be set equal to the file inode, and the
>> top bits will be uniquely assigned based on the top
>> 16 bits of the inode and the device id.
>> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
>> + * to a unique QID path (64 bits). To avoid having to map and keep track
>> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
>> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
>> + * of the QID path equal to the lowest bits of the inode number.
>> + *
>> + * This takes advantage of the fact that inode number are usually not
>> + * random but allocated sequentially, so we have fewer items to keep
>> + * track of.
>> + */
>> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
>> + uint64_t *path)
>> +{
>> + QppEntry lookup = {
>> + .dev = stbuf->st_dev,
>> + .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
>> + }, *val;
>> + uint32_t hash = qpp_hash(lookup);
>> +
>> + val = qht_lookup(&pdu->s->qpp_table, qpp_lookup_func, &lookup, hash);
>> +
>> + if (!val) {
>> + if (pdu->s->qp_prefix_next == 0) {
>> + /* we ran out of prefixes */
>> + return -ENFILE;
>
> Not sure this errno would make sense for guest syscalls that don't open
> file descriptors... Maybe ENOENT ?
>
> Cc'ing Eric for insights.
Actually, it makes sense to me:
ENFILE 23 Too many open files in system
You only get here if the hash table filled up, which means there are
indeed too many open files (even if this syscall wasn't opening a file).
In fact, ENFILE is usually associated with running into ulimit
restrictions, and come to think of it, you are more likely to hit your
ulimit than you are to run into a hash collision (so this code may be
very hard to reach in practice, but if you do reach it, it behaves
similarly to what you were more likely to hit in the first place).
ENOENT implies the file doesn't exist - but here, the file exists but we
can't open it because we're out of resources for tracking it.
Also, POSIX permits returning specific errno codes that aren't otherwise
listed for a syscall if the usual semantics of that errno code are
indeed the reason for the failure (you should prefer to fail with errno
codes documented by POSIX where possible, but POSIX does not limit you
to just that set).
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
On Fri, 9 Feb 2018 10:06:05 -0600
Eric Blake <eblake@redhat.com> wrote:
> On 02/09/2018 09:13 AM, Greg Kurz wrote:
> > On Thu, 8 Feb 2018 19:00:18 +0100
> > <antonios.motakis@huawei.com> wrote:
> >
> >> From: Antonios Motakis <antonios.motakis@huawei.com>
> >>
> >> To support multiple devices on the 9p share, and avoid
> >> qid path collisions we take the device id as input
> >> to generate a unique QID path. The lowest 48 bits of
> >> the path will be set equal to the file inode, and the
> >> top bits will be uniquely assigned based on the top
> >> 16 bits of the inode and the device id.
>
> >> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
> >> + * to a unique QID path (64 bits). To avoid having to map and keep track
> >> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> >> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
> >> + * of the QID path equal to the lowest bits of the inode number.
> >> + *
> >> + * This takes advantage of the fact that inode number are usually not
> >> + * random but allocated sequentially, so we have fewer items to keep
> >> + * track of.
> >> + */
> >> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> >> + uint64_t *path)
> >> +{
> >> + QppEntry lookup = {
> >> + .dev = stbuf->st_dev,
> >> + .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> >> + }, *val;
> >> + uint32_t hash = qpp_hash(lookup);
> >> +
> >> + val = qht_lookup(&pdu->s->qpp_table, qpp_lookup_func, &lookup, hash);
> >> +
> >> + if (!val) {
> >> + if (pdu->s->qp_prefix_next == 0) {
> >> + /* we ran out of prefixes */
> >> + return -ENFILE;
> >
> > Not sure this errno would make sense for guest syscalls that don't open
> > file descriptors... Maybe ENOENT ?
> >
> > Cc'ing Eric for insights.
>
> Actually, it makes sense to me:
>
> ENFILE 23 Too many open files in system
>
> You only get here if the hash table filled up, which means there are
> indeed too many open files (even if this syscall wasn't opening a file).
> In fact, ENFILE is usually associated with running into ulimit
> restrictions, and come to think of it, you are more likely to hit your
> ulimit than you are to run into a hash collision (so this code may be
> very hard to reach in practice, but if you do reach it, it behaves
> similarly to what you were more likely to hit in the first place).
>
> ENOENT implies the file doesn't exist - but here, the file exists but we
> can't open it because we're out of resources for tracking it.
>
Only the host knows that the file exists actually. If stat_to_qid() for
this path returns ENOENT, then the file shouldn't be visible in the
guest.
I say "shouldn't" instead of "isn't" because I now realize that
v9fs_do_readdir(), which is used to implement getdents() in the
guest, sends a degraded QID, hand-crafted without calling
stat_to_qid() at all... :-\
/*
* Fill up just the path field of qid because the client uses
* only that. To fill the entire qid structure we will have
* to stat each dirent found, which is expensive
*/
size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
memcpy(&qid.path, &dent->d_ino, size);
/* Fill the other fields with dummy values */
qid.type = 0;
qid.version = 0;
Antonios, your patchset should probably also address this.
The problem is that the dirent we get from v9fs_co_readdir()
only has the inode number, so I guess we must build a dummy
struct stat with:
stbuf.st_ino = dent->d_ino
stbuf.st_dev = st_dev of the parent directory
The st_dev of the parent directory could be obtained in
v9fs_readdir() and passed to v9fs_do_readdir(). Another
solution could be to cache the QID in the V9fsFidState
of the parent directory when it is open.
Also, if we hit a collision while reading the directory, I'm
afraid the remaining entries won't be read at all. I'm not
sure this is really what we want.
> Also, POSIX permits returning specific errno codes that aren't otherwise
> listed for a syscall if the usual semantics of that errno code are
> indeed the reason for the failure (you should prefer to fail with errno
> codes documented by POSIX where possible, but POSIX does not limit you
> to just that set).
>
Ok, then ENFILE wouldn't be that bad in the end.
Thanks for your POSIX expertise :)
Cheers,
--
Greg
On 02/09/2018 06:57 PM, Greg Kurz wrote:
> On Fri, 9 Feb 2018 10:06:05 -0600
> Eric Blake <eblake@redhat.com> wrote:
>
>> On 02/09/2018 09:13 AM, Greg Kurz wrote:
>>> On Thu, 8 Feb 2018 19:00:18 +0100
>>> <antonios.motakis@huawei.com> wrote:
>>>
>>>> From: Antonios Motakis <antonios.motakis@huawei.com>
>>>>
>>>> To support multiple devices on the 9p share, and avoid
>>>> qid path collisions we take the device id as input
>>>> to generate a unique QID path. The lowest 48 bits of
>>>> the path will be set equal to the file inode, and the
>>>> top bits will be uniquely assigned based on the top
>>>> 16 bits of the inode and the device id.
>>>> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
>>>> + * to a unique QID path (64 bits). To avoid having to map and keep track
>>>> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
>>>> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
>>>> + * of the QID path equal to the lowest bits of the inode number.
>>>> + *
>>>> + * This takes advantage of the fact that inode number are usually not
>>>> + * random but allocated sequentially, so we have fewer items to keep
>>>> + * track of.
>>>> + */
>>>> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
>>>> + uint64_t *path)
>>>> +{
>>>> + QppEntry lookup = {
>>>> + .dev = stbuf->st_dev,
>>>> + .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
>>>> + }, *val;
>>>> + uint32_t hash = qpp_hash(lookup);
>>>> +
>>>> + val = qht_lookup(&pdu->s->qpp_table, qpp_lookup_func, &lookup, hash);
>>>> +
>>>> + if (!val) {
>>>> + if (pdu->s->qp_prefix_next == 0) {
>>>> + /* we ran out of prefixes */
>>>> + return -ENFILE;
>>> Not sure this errno would make sense for guest syscalls that don't open
>>> file descriptors... Maybe ENOENT ?
>>>
>>> Cc'ing Eric for insights.
>> Actually, it makes sense to me:
>>
>> ENFILE 23 Too many open files in system
>>
>> You only get here if the hash table filled up, which means there are
>> indeed too many open files (even if this syscall wasn't opening a file).
>> In fact, ENFILE is usually associated with running into ulimit
>> restrictions, and come to think of it, you are more likely to hit your
>> ulimit than you are to run into a hash collision (so this code may be
>> very hard to reach in practice, but if you do reach it, it behaves
>> similarly to what you were more likely to hit in the first place).
>>
>> ENOENT implies the file doesn't exist - but here, the file exists but we
>> can't open it because we're out of resources for tracking it.
>>
> Only the host knows that the file exists actually. If stat_to_qid() for
> this path returns ENOENT, then the file shouldn't be visible in the
> guest.
>
> I say "shouldn't" instead of "isn't" because I now realize that
> v9fs_do_readdir(), which is used to implement getdents() in the
> guest, sends a degraded QID, hand-crafted without calling
> stat_to_qid() at all... :-\
>
> /*
> * Fill up just the path field of qid because the client uses
> * only that. To fill the entire qid structure we will have
> * to stat each dirent found, which is expensive
> */
> size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
> memcpy(&qid.path, &dent->d_ino, size);
> /* Fill the other fields with dummy values */
> qid.type = 0;
> qid.version = 0;
>
>
> Antonios, your patchset should probably also address this.
>
> The problem is that the dirent we get from v9fs_co_readdir()
> only has the inode number, so I guess we must build a dummy
> struct stat with:
>
> stbuf.st_ino = dent->d_ino
> stbuf.st_dev = st_dev of the parent directory
>
> The st_dev of the parent directory could be obtained in
> v9fs_readdir() and passed to v9fs_do_readdir(). Another
> solution could be to cache the QID in the V9fsFidState
> of the parent directory when it is open.
>
> Also, if we hit a collision while reading the directory, I'm
> afraid the remaining entries won't be read at all. I'm not
> sure this is really what we want.
Good catch!
I'm not that sure that we can make the assumption that all entries in a
dir will share the st_dev,
I think we have to check it for each entry...
>
>> Also, POSIX permits returning specific errno codes that aren't otherwise
>> listed for a syscall if the usual semantics of that errno code are
>> indeed the reason for the failure (you should prefer to fail with errno
>> codes documented by POSIX where possible, but POSIX does not limit you
>> to just that set).
>>
> Ok, then ENFILE wouldn't be that bad in the end.
>
> Thanks for your POSIX expertise :)
Will keep that one then!
>
> Cheers,
>
> --
> Greg
On Fri, 16 Feb 2018 11:20:33 +0100 Antonios Motakis <antonios.motakis@huawei.com> wrote: <snip> > > I'm not that sure that we can make the assumption that all entries in a > dir will share the st_dev, The assumption stands for any entry that is not a directory actually. But indeed a directory could be a mount point, and have a different st_dev. > I think we have to check it for each entry... > Only if the entry might be a directory, ie, dent->d_type == DT_DIR || dent->d_type == DT_UNKNOWN > > > >> Also, POSIX permits returning specific errno codes that aren't otherwise > >> listed for a syscall if the usual semantics of that errno code are > >> indeed the reason for the failure (you should prefer to fail with errno > >> codes documented by POSIX where possible, but POSIX does not limit you > >> to just that set). > >> > > Ok, then ENFILE wouldn't be that bad in the end. > > > > Thanks for your POSIX expertise :) > Will keep that one then! > > > > > Cheers, > > > > -- > > Greg >
On Fri, Feb 09, 2018 at 16:13:26 +0100, Greg Kurz wrote:
> On Thu, 8 Feb 2018 19:00:18 +0100
> <antonios.motakis@huawei.com> wrote:
(snip)
> > +/* creative abuse of tb_hash_func7, which is based on xxhash */
> > +static uint32_t qpp_hash(QppEntry e)
> > +{
> > + return tb_hash_func7(e.ino_prefix, e.dev, 0, 0, 0);
>
> Hmm... Looking at git log include/exec/tb-hash-xx.h, I see that this hash
> function signature evolved according to TCG needs. It started with 3
> arguments, then 4 and we have 5 today.
>
> So I don't think we should add another unrelated user. Probably best to
> have our own version. Also it seems it could be simpler since you always
> pass 0 for the third and later arguments.
It's fine to use it; the compiler will do the right thing
with those '0' arguments, because the function is always inlined.
That said, I have some cleanup patches to export this as xxhash and make
tb_hash a user of it. The patches also remove most of the ugly 0's from
callers. Those patches will have to wait for a bit though, so for now
just use tb_hash_func7.
(snip)
> > + val = g_malloc0(sizeof(QppEntry));
> > + if (!val) {
> > + return -ENOMEM;
> > + }
g_malloc0 and all other glib memory allocation functions only
return NULL if the size argument is 0.
If out of memory, the application is terminated, which is why
this check is not needed.
Reference:
https://developer.gnome.org/glib/stable/glib-Memory-Allocation.html#glib-Memory-Allocation.description
Thanks,
Emilio
On 02/09/2018 10:58 PM, Emilio G. Cota wrote:
> On Fri, Feb 09, 2018 at 16:13:26 +0100, Greg Kurz wrote:
>> On Thu, 8 Feb 2018 19:00:18 +0100
>> <antonios.motakis@huawei.com> wrote:
> (snip)
>>> +/* creative abuse of tb_hash_func7, which is based on xxhash */
>>> +static uint32_t qpp_hash(QppEntry e)
>>> +{
>>> + return tb_hash_func7(e.ino_prefix, e.dev, 0, 0, 0);
>> Hmm... Looking at git log include/exec/tb-hash-xx.h, I see that this hash
>> function signature evolved according to TCG needs. It started with 3
>> arguments, then 4 and we have 5 today.
>>
>> So I don't think we should add another unrelated user. Probably best to
>> have our own version. Also it seems it could be simpler since you always
>> pass 0 for the third and later arguments.
> It's fine to use it; the compiler will do the right thing
> with those '0' arguments, because the function is always inlined.
>
> That said, I have some cleanup patches to export this as xxhash and make
> tb_hash a user of it. The patches also remove most of the ugly 0's from
> callers. Those patches will have to wait for a bit though, so for now
> just use tb_hash_func7.
I guess this is why it's a good idea to share patches even if still rough ;)
So I will not worry about this part for now.
>
> (snip)
>>> + val = g_malloc0(sizeof(QppEntry));
>>> + if (!val) {
>>> + return -ENOMEM;
>>> + }
> g_malloc0 and all other glib memory allocation functions only
> return NULL if the size argument is 0.
> If out of memory, the application is terminated, which is why
> this check is not needed.
> Reference:
> https://developer.gnome.org/glib/stable/glib-Memory-Allocation.html#glib-Memory-Allocation.description
Thanks for the clarification.
>
> Thanks,
>
> Emilio
© 2016 - 2026 Red Hat, Inc.