Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH
FD in lo_inode.fd. Therefore, when the respective inode is unlinked,
its inode ID will remain in use until we drop our lo_inode (and
lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use
the inode ID as an lo_inode key, because any inode with an inode ID we
find in lo_data.inodes (on the same filesystem) must be the exact same
file.
This will change when we start setting lo_inode.fhandle so we do not
have to keep an O_PATH FD open. Then, unlinking such an inode will
immediately remove it, so its ID can then be reused by newly created
files, even while the lo_inode object is still there[1].
So creating a new file can then reuse the old file's inode ID, and
looking up the new file would lead to us finding the old file's
lo_inode, which is not ideal.
Luckily, just as file handles cause this problem, they also solve it: A
file handle contains a generation ID, which changes when an inode ID is
reused, so the new file can be distinguished from the old one. So all
we need to do is to add a second map besides lo_data.inodes that maps
file handles to lo_inodes, namely lo_data.inodes_by_handle. For
clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids.
Unfortunately, we cannot rely on being able to generate file handles
every time. Therefore, we still enter every lo_inode object into
inodes_by_ids, but having an entry in inodes_by_handle is optional. A
potential inodes_by_handle entry then has precedence, the inodes_by_ids
entry is just a fallback.
Note that we do not generate lo_fhandle objects yet, and so we also do
not enter anything into the inodes_by_handle map yet. Also, all lookups
skip that map. We might manually create file handles with some code
that is immediately removed by the next patch again, but that would
break the assumption in lo_find() that every lo_inode with a non-NULL
.fhandle must have an entry in inodes_by_handle and vice versa. So we
leave actually using the inodes_by_handle map for the next patch.
[1] If some application in the guest still has the file open, there is
going to be a corresponding FD mapping in lo_data.fd_map. In such a
case, the inode will only go away once every application in the guest
has closed it. The problem described only applies to cases where the
guest does not have the file open, and it is just in the dentry cache,
basically.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Connor Kuehl <ckuehl@redhat.com>
---
tools/virtiofsd/passthrough_ll.c | 80 +++++++++++++++++++++++++-------
1 file changed, 64 insertions(+), 16 deletions(-)
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index e665575401..793d2c333e 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -179,7 +179,8 @@ struct lo_data {
int announce_submounts;
bool use_statx;
struct lo_inode root;
- GHashTable *inodes; /* protected by lo->mutex */
+ GHashTable *inodes_by_ids; /* protected by lo->mutex */
+ GHashTable *inodes_by_handle; /* protected by lo->mutex */
struct lo_map ino_map; /* protected by lo->mutex */
struct lo_map dirp_map; /* protected by lo->mutex */
struct lo_map fd_map; /* protected by lo->mutex */
@@ -257,8 +258,9 @@ static struct {
/* That we loaded cap-ng in the current thread from the saved */
static __thread bool cap_loaded = 0;
-static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
- uint64_t mnt_id);
+static struct lo_inode *lo_find(struct lo_data *lo,
+ const struct lo_fhandle *fhandle,
+ struct stat *st, uint64_t mnt_id);
static int xattr_map_client(const struct lo_data *lo, const char *client_name,
char **out_name);
@@ -1032,18 +1034,39 @@ out_err:
fuse_reply_err(req, saverr);
}
-static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
- uint64_t mnt_id)
+static struct lo_inode *lo_find(struct lo_data *lo,
+ const struct lo_fhandle *fhandle,
+ struct stat *st, uint64_t mnt_id)
{
- struct lo_inode *p;
- struct lo_key key = {
+ struct lo_inode *p = NULL;
+ struct lo_key ids_key = {
.ino = st->st_ino,
.dev = st->st_dev,
.mnt_id = mnt_id,
};
pthread_mutex_lock(&lo->mutex);
- p = g_hash_table_lookup(lo->inodes, &key);
+ if (fhandle) {
+ p = g_hash_table_lookup(lo->inodes_by_handle, fhandle);
+ }
+ if (!p) {
+ p = g_hash_table_lookup(lo->inodes_by_ids, &ids_key);
+ /*
+ * When we had to fall back to looking up an inode by its IDs,
+ * ensure that we hit an entry that does not have a file
+ * handle. Entries with file handles must also have a handle
+ * alt key, so if we have not found it by that handle alt key,
+ * we must have found an entry with a mismatching handle; i.e.
+ * an entry for a different file, even though it has the same
+ * inode ID.
+ * (This can happen when we look up a new file that has reused
+ * the inode ID of some previously unlinked inode for which we
+ * still have an lo_inode object.)
+ */
+ if (p && fhandle != NULL && p->fhandle != NULL) {
+ p = NULL;
+ }
+ }
if (p) {
assert(p->nlookup > 0);
p->nlookup++;
@@ -1183,7 +1206,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
e->attr_flags |= FUSE_ATTR_SUBMOUNT;
}
- inode = lo_find(lo, &e->attr, mnt_id);
+ inode = lo_find(lo, NULL, &e->attr, mnt_id);
if (inode) {
close(newfd);
} else {
@@ -1213,7 +1236,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
}
pthread_mutex_lock(&lo->mutex);
inode->fuse_ino = lo_add_inode_mapping(req, inode);
- g_hash_table_insert(lo->inodes, &inode->key, inode);
+ g_hash_table_insert(lo->inodes_by_ids, &inode->key, inode);
pthread_mutex_unlock(&lo->mutex);
}
e->ino = inode->fuse_ino;
@@ -1525,7 +1548,7 @@ static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
return NULL;
}
- return lo_find(lo, &attr, mnt_id);
+ return lo_find(lo, NULL, &attr, mnt_id);
}
static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
@@ -1688,7 +1711,7 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
inode->nlookup -= n;
if (!inode->nlookup) {
lo_map_remove(&lo->ino_map, inode->fuse_ino);
- g_hash_table_remove(lo->inodes, &inode->key);
+ g_hash_table_remove(lo->inodes_by_ids, &inode->key);
if (lo->posix_lock) {
if (g_hash_table_size(inode->posix_locks)) {
fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
@@ -3388,7 +3411,7 @@ static void lo_destroy(void *userdata)
GHashTableIter iter;
gpointer key, value;
- g_hash_table_iter_init(&iter, lo->inodes);
+ g_hash_table_iter_init(&iter, lo->inodes_by_ids);
if (!g_hash_table_iter_next(&iter, &key, &value)) {
break;
}
@@ -3931,10 +3954,34 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
}
+static guint lo_fhandle_hash(gconstpointer key)
+{
+ const struct lo_fhandle *fh = key;
+ guint hash;
+ size_t i;
+
+ /* Basically g_str_hash() */
+ hash = 5381;
+ for (i = 0; i < sizeof(fh->padding); i++) {
+ hash += hash * 33 + (unsigned char)fh->padding[i];
+ }
+ hash += hash * 33 + fh->mount_id;
+
+ return hash;
+}
+
+static gboolean lo_fhandle_equal(gconstpointer a, gconstpointer b)
+{
+ return !memcmp(a, b, sizeof(struct lo_fhandle));
+}
+
static void fuse_lo_data_cleanup(struct lo_data *lo)
{
- if (lo->inodes) {
- g_hash_table_destroy(lo->inodes);
+ if (lo->inodes_by_ids) {
+ g_hash_table_destroy(lo->inodes_by_ids);
+ }
+ if (lo->inodes_by_ids) {
+ g_hash_table_destroy(lo->inodes_by_handle);
}
if (lo->root.posix_locks) {
@@ -3990,7 +4037,8 @@ int main(int argc, char *argv[])
qemu_init_exec_dir(argv[0]);
pthread_mutex_init(&lo.mutex, NULL);
- lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
+ lo.inodes_by_ids = g_hash_table_new(lo_key_hash, lo_key_equal);
+ lo.inodes_by_handle = g_hash_table_new(lo_fhandle_hash, lo_fhandle_equal);
lo.root.fd = -1;
lo.root.fuse_ino = FUSE_ROOT_ID;
lo.cache = CACHE_AUTO;
--
2.31.1
On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote:
> Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH
> FD in lo_inode.fd. Therefore, when the respective inode is unlinked,
> its inode ID will remain in use until we drop our lo_inode (and
> lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use
> the inode ID as an lo_inode key, because any inode with an inode ID we
> find in lo_data.inodes (on the same filesystem) must be the exact same
> file.
>
> This will change when we start setting lo_inode.fhandle so we do not
> have to keep an O_PATH FD open. Then, unlinking such an inode will
> immediately remove it, so its ID can then be reused by newly created
> files, even while the lo_inode object is still there[1].
>
> So creating a new file can then reuse the old file's inode ID, and
> looking up the new file would lead to us finding the old file's
> lo_inode, which is not ideal.
>
> Luckily, just as file handles cause this problem, they also solve it: A
> file handle contains a generation ID, which changes when an inode ID is
> reused, so the new file can be distinguished from the old one. So all
> we need to do is to add a second map besides lo_data.inodes that maps
> file handles to lo_inodes, namely lo_data.inodes_by_handle. For
> clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids.
>
> Unfortunately, we cannot rely on being able to generate file handles
> every time.
Hi Max,
What are the cases where we can not rely being able to generate file
handles?
> Therefore, we still enter every lo_inode object into
> inodes_by_ids, but having an entry in inodes_by_handle is optional. A
> potential inodes_by_handle entry then has precedence, the inodes_by_ids
> entry is just a fallback.
If we have to keep inodes_by_ids around, then can we just add fhandle
to the lo_key. That way we can manage with single hash table and still
be able to detect if inode ID has been reused.
Thanks
Vivek
>
> Note that we do not generate lo_fhandle objects yet, and so we also do
> not enter anything into the inodes_by_handle map yet. Also, all lookups
> skip that map. We might manually create file handles with some code
> that is immediately removed by the next patch again, but that would
> break the assumption in lo_find() that every lo_inode with a non-NULL
> .fhandle must have an entry in inodes_by_handle and vice versa. So we
> leave actually using the inodes_by_handle map for the next patch.
>
> [1] If some application in the guest still has the file open, there is
> going to be a corresponding FD mapping in lo_data.fd_map. In such a
> case, the inode will only go away once every application in the guest
> has closed it. The problem described only applies to cases where the
> guest does not have the file open, and it is just in the dentry cache,
> basically.
>
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> Reviewed-by: Connor Kuehl <ckuehl@redhat.com>
> ---
> tools/virtiofsd/passthrough_ll.c | 80 +++++++++++++++++++++++++-------
> 1 file changed, 64 insertions(+), 16 deletions(-)
>
> diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
> index e665575401..793d2c333e 100644
> --- a/tools/virtiofsd/passthrough_ll.c
> +++ b/tools/virtiofsd/passthrough_ll.c
> @@ -179,7 +179,8 @@ struct lo_data {
> int announce_submounts;
> bool use_statx;
> struct lo_inode root;
> - GHashTable *inodes; /* protected by lo->mutex */
> + GHashTable *inodes_by_ids; /* protected by lo->mutex */
> + GHashTable *inodes_by_handle; /* protected by lo->mutex */
> struct lo_map ino_map; /* protected by lo->mutex */
> struct lo_map dirp_map; /* protected by lo->mutex */
> struct lo_map fd_map; /* protected by lo->mutex */
> @@ -257,8 +258,9 @@ static struct {
> /* That we loaded cap-ng in the current thread from the saved */
> static __thread bool cap_loaded = 0;
>
> -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
> - uint64_t mnt_id);
> +static struct lo_inode *lo_find(struct lo_data *lo,
> + const struct lo_fhandle *fhandle,
> + struct stat *st, uint64_t mnt_id);
> static int xattr_map_client(const struct lo_data *lo, const char *client_name,
> char **out_name);
>
> @@ -1032,18 +1034,39 @@ out_err:
> fuse_reply_err(req, saverr);
> }
>
> -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
> - uint64_t mnt_id)
> +static struct lo_inode *lo_find(struct lo_data *lo,
> + const struct lo_fhandle *fhandle,
> + struct stat *st, uint64_t mnt_id)
> {
> - struct lo_inode *p;
> - struct lo_key key = {
> + struct lo_inode *p = NULL;
> + struct lo_key ids_key = {
> .ino = st->st_ino,
> .dev = st->st_dev,
> .mnt_id = mnt_id,
> };
>
> pthread_mutex_lock(&lo->mutex);
> - p = g_hash_table_lookup(lo->inodes, &key);
> + if (fhandle) {
> + p = g_hash_table_lookup(lo->inodes_by_handle, fhandle);
> + }
> + if (!p) {
> + p = g_hash_table_lookup(lo->inodes_by_ids, &ids_key);
> + /*
> + * When we had to fall back to looking up an inode by its IDs,
> + * ensure that we hit an entry that does not have a file
> + * handle. Entries with file handles must also have a handle
> + * alt key, so if we have not found it by that handle alt key,
> + * we must have found an entry with a mismatching handle; i.e.
> + * an entry for a different file, even though it has the same
> + * inode ID.
> + * (This can happen when we look up a new file that has reused
> + * the inode ID of some previously unlinked inode for which we
> + * still have an lo_inode object.)
> + */
> + if (p && fhandle != NULL && p->fhandle != NULL) {
> + p = NULL;
> + }
> + }
> if (p) {
> assert(p->nlookup > 0);
> p->nlookup++;
> @@ -1183,7 +1206,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
> e->attr_flags |= FUSE_ATTR_SUBMOUNT;
> }
>
> - inode = lo_find(lo, &e->attr, mnt_id);
> + inode = lo_find(lo, NULL, &e->attr, mnt_id);
> if (inode) {
> close(newfd);
> } else {
> @@ -1213,7 +1236,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
> }
> pthread_mutex_lock(&lo->mutex);
> inode->fuse_ino = lo_add_inode_mapping(req, inode);
> - g_hash_table_insert(lo->inodes, &inode->key, inode);
> + g_hash_table_insert(lo->inodes_by_ids, &inode->key, inode);
> pthread_mutex_unlock(&lo->mutex);
> }
> e->ino = inode->fuse_ino;
> @@ -1525,7 +1548,7 @@ static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
> return NULL;
> }
>
> - return lo_find(lo, &attr, mnt_id);
> + return lo_find(lo, NULL, &attr, mnt_id);
> }
>
> static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
> @@ -1688,7 +1711,7 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
> inode->nlookup -= n;
> if (!inode->nlookup) {
> lo_map_remove(&lo->ino_map, inode->fuse_ino);
> - g_hash_table_remove(lo->inodes, &inode->key);
> + g_hash_table_remove(lo->inodes_by_ids, &inode->key);
> if (lo->posix_lock) {
> if (g_hash_table_size(inode->posix_locks)) {
> fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
> @@ -3388,7 +3411,7 @@ static void lo_destroy(void *userdata)
> GHashTableIter iter;
> gpointer key, value;
>
> - g_hash_table_iter_init(&iter, lo->inodes);
> + g_hash_table_iter_init(&iter, lo->inodes_by_ids);
> if (!g_hash_table_iter_next(&iter, &key, &value)) {
> break;
> }
> @@ -3931,10 +3954,34 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
> return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
> }
>
> +static guint lo_fhandle_hash(gconstpointer key)
> +{
> + const struct lo_fhandle *fh = key;
> + guint hash;
> + size_t i;
> +
> + /* Basically g_str_hash() */
> + hash = 5381;
> + for (i = 0; i < sizeof(fh->padding); i++) {
> + hash += hash * 33 + (unsigned char)fh->padding[i];
> + }
> + hash += hash * 33 + fh->mount_id;
> +
> + return hash;
> +}
> +
> +static gboolean lo_fhandle_equal(gconstpointer a, gconstpointer b)
> +{
> + return !memcmp(a, b, sizeof(struct lo_fhandle));
> +}
> +
> static void fuse_lo_data_cleanup(struct lo_data *lo)
> {
> - if (lo->inodes) {
> - g_hash_table_destroy(lo->inodes);
> + if (lo->inodes_by_ids) {
> + g_hash_table_destroy(lo->inodes_by_ids);
> + }
> + if (lo->inodes_by_ids) {
> + g_hash_table_destroy(lo->inodes_by_handle);
> }
>
> if (lo->root.posix_locks) {
> @@ -3990,7 +4037,8 @@ int main(int argc, char *argv[])
> qemu_init_exec_dir(argv[0]);
>
> pthread_mutex_init(&lo.mutex, NULL);
> - lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
> + lo.inodes_by_ids = g_hash_table_new(lo_key_hash, lo_key_equal);
> + lo.inodes_by_handle = g_hash_table_new(lo_fhandle_hash, lo_fhandle_equal);
> lo.root.fd = -1;
> lo.root.fuse_ino = FUSE_ROOT_ID;
> lo.cache = CACHE_AUTO;
> --
> 2.31.1
>
> _______________________________________________
> Virtio-fs mailing list
> Virtio-fs@redhat.com
> https://listman.redhat.com/mailman/listinfo/virtio-fs
>
On 11.06.21 22:04, Vivek Goyal wrote:
> On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote:
>> Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH
>> FD in lo_inode.fd. Therefore, when the respective inode is unlinked,
>> its inode ID will remain in use until we drop our lo_inode (and
>> lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use
>> the inode ID as an lo_inode key, because any inode with an inode ID we
>> find in lo_data.inodes (on the same filesystem) must be the exact same
>> file.
>>
>> This will change when we start setting lo_inode.fhandle so we do not
>> have to keep an O_PATH FD open. Then, unlinking such an inode will
>> immediately remove it, so its ID can then be reused by newly created
>> files, even while the lo_inode object is still there[1].
>>
>> So creating a new file can then reuse the old file's inode ID, and
>> looking up the new file would lead to us finding the old file's
>> lo_inode, which is not ideal.
>>
>> Luckily, just as file handles cause this problem, they also solve it: A
>> file handle contains a generation ID, which changes when an inode ID is
>> reused, so the new file can be distinguished from the old one. So all
>> we need to do is to add a second map besides lo_data.inodes that maps
>> file handles to lo_inodes, namely lo_data.inodes_by_handle. For
>> clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids.
>>
>> Unfortunately, we cannot rely on being able to generate file handles
>> every time.
> Hi Max,
>
> What are the cases where we can not rely being able to generate file
> handles?
I have no idea, but it’s much easier to claim we can’t than to prove
that we can. I’d rather be resilient.
>> Therefore, we still enter every lo_inode object into
>> inodes_by_ids, but having an entry in inodes_by_handle is optional. A
>> potential inodes_by_handle entry then has precedence, the inodes_by_ids
>> entry is just a fallback.
> If we have to keep inodes_by_ids around, then can we just add fhandle
> to the lo_key. That way we can manage with single hash table and still
> be able to detect if inode ID has been reused.
We cannot, because I assume we cannot rely on name_to_handle_at()
working every time. Therefore, maybe at one point we can generate a file
handle, and at another, we cannot – we should still be able to look up
the inode regardless.
If the file handle were part of inodes_by_ids, then we can look up
inodes only if we can generate a file handle either every time (for a
given inode) or never. Or, well, I suppose we could always create two
entries, one with the file handles zeroed out, and one with the file
handle specified, but I wouldn’t find that very beautiful.
Max
>> Note that we do not generate lo_fhandle objects yet, and so we also do
>> not enter anything into the inodes_by_handle map yet. Also, all lookups
>> skip that map. We might manually create file handles with some code
>> that is immediately removed by the next patch again, but that would
>> break the assumption in lo_find() that every lo_inode with a non-NULL
>> .fhandle must have an entry in inodes_by_handle and vice versa. So we
>> leave actually using the inodes_by_handle map for the next patch.
>>
>> [1] If some application in the guest still has the file open, there is
>> going to be a corresponding FD mapping in lo_data.fd_map. In such a
>> case, the inode will only go away once every application in the guest
>> has closed it. The problem described only applies to cases where the
>> guest does not have the file open, and it is just in the dentry cache,
>> basically.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> Reviewed-by: Connor Kuehl <ckuehl@redhat.com>
>> ---
>> tools/virtiofsd/passthrough_ll.c | 80 +++++++++++++++++++++++++-------
>> 1 file changed, 64 insertions(+), 16 deletions(-)
>>
>> diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
>> index e665575401..793d2c333e 100644
>> --- a/tools/virtiofsd/passthrough_ll.c
>> +++ b/tools/virtiofsd/passthrough_ll.c
>> @@ -179,7 +179,8 @@ struct lo_data {
>> int announce_submounts;
>> bool use_statx;
>> struct lo_inode root;
>> - GHashTable *inodes; /* protected by lo->mutex */
>> + GHashTable *inodes_by_ids; /* protected by lo->mutex */
>> + GHashTable *inodes_by_handle; /* protected by lo->mutex */
>> struct lo_map ino_map; /* protected by lo->mutex */
>> struct lo_map dirp_map; /* protected by lo->mutex */
>> struct lo_map fd_map; /* protected by lo->mutex */
>> @@ -257,8 +258,9 @@ static struct {
>> /* That we loaded cap-ng in the current thread from the saved */
>> static __thread bool cap_loaded = 0;
>>
>> -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
>> - uint64_t mnt_id);
>> +static struct lo_inode *lo_find(struct lo_data *lo,
>> + const struct lo_fhandle *fhandle,
>> + struct stat *st, uint64_t mnt_id);
>> static int xattr_map_client(const struct lo_data *lo, const char *client_name,
>> char **out_name);
>>
>> @@ -1032,18 +1034,39 @@ out_err:
>> fuse_reply_err(req, saverr);
>> }
>>
>> -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
>> - uint64_t mnt_id)
>> +static struct lo_inode *lo_find(struct lo_data *lo,
>> + const struct lo_fhandle *fhandle,
>> + struct stat *st, uint64_t mnt_id)
>> {
>> - struct lo_inode *p;
>> - struct lo_key key = {
>> + struct lo_inode *p = NULL;
>> + struct lo_key ids_key = {
>> .ino = st->st_ino,
>> .dev = st->st_dev,
>> .mnt_id = mnt_id,
>> };
>>
>> pthread_mutex_lock(&lo->mutex);
>> - p = g_hash_table_lookup(lo->inodes, &key);
>> + if (fhandle) {
>> + p = g_hash_table_lookup(lo->inodes_by_handle, fhandle);
>> + }
>> + if (!p) {
>> + p = g_hash_table_lookup(lo->inodes_by_ids, &ids_key);
>> + /*
>> + * When we had to fall back to looking up an inode by its IDs,
>> + * ensure that we hit an entry that does not have a file
>> + * handle. Entries with file handles must also have a handle
>> + * alt key, so if we have not found it by that handle alt key,
>> + * we must have found an entry with a mismatching handle; i.e.
>> + * an entry for a different file, even though it has the same
>> + * inode ID.
>> + * (This can happen when we look up a new file that has reused
>> + * the inode ID of some previously unlinked inode for which we
>> + * still have an lo_inode object.)
>> + */
>> + if (p && fhandle != NULL && p->fhandle != NULL) {
>> + p = NULL;
>> + }
>> + }
>> if (p) {
>> assert(p->nlookup > 0);
>> p->nlookup++;
>> @@ -1183,7 +1206,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
>> e->attr_flags |= FUSE_ATTR_SUBMOUNT;
>> }
>>
>> - inode = lo_find(lo, &e->attr, mnt_id);
>> + inode = lo_find(lo, NULL, &e->attr, mnt_id);
>> if (inode) {
>> close(newfd);
>> } else {
>> @@ -1213,7 +1236,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
>> }
>> pthread_mutex_lock(&lo->mutex);
>> inode->fuse_ino = lo_add_inode_mapping(req, inode);
>> - g_hash_table_insert(lo->inodes, &inode->key, inode);
>> + g_hash_table_insert(lo->inodes_by_ids, &inode->key, inode);
>> pthread_mutex_unlock(&lo->mutex);
>> }
>> e->ino = inode->fuse_ino;
>> @@ -1525,7 +1548,7 @@ static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
>> return NULL;
>> }
>>
>> - return lo_find(lo, &attr, mnt_id);
>> + return lo_find(lo, NULL, &attr, mnt_id);
>> }
>>
>> static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
>> @@ -1688,7 +1711,7 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
>> inode->nlookup -= n;
>> if (!inode->nlookup) {
>> lo_map_remove(&lo->ino_map, inode->fuse_ino);
>> - g_hash_table_remove(lo->inodes, &inode->key);
>> + g_hash_table_remove(lo->inodes_by_ids, &inode->key);
>> if (lo->posix_lock) {
>> if (g_hash_table_size(inode->posix_locks)) {
>> fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
>> @@ -3388,7 +3411,7 @@ static void lo_destroy(void *userdata)
>> GHashTableIter iter;
>> gpointer key, value;
>>
>> - g_hash_table_iter_init(&iter, lo->inodes);
>> + g_hash_table_iter_init(&iter, lo->inodes_by_ids);
>> if (!g_hash_table_iter_next(&iter, &key, &value)) {
>> break;
>> }
>> @@ -3931,10 +3954,34 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
>> return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
>> }
>>
>> +static guint lo_fhandle_hash(gconstpointer key)
>> +{
>> + const struct lo_fhandle *fh = key;
>> + guint hash;
>> + size_t i;
>> +
>> + /* Basically g_str_hash() */
>> + hash = 5381;
>> + for (i = 0; i < sizeof(fh->padding); i++) {
>> + hash += hash * 33 + (unsigned char)fh->padding[i];
>> + }
>> + hash += hash * 33 + fh->mount_id;
>> +
>> + return hash;
>> +}
>> +
>> +static gboolean lo_fhandle_equal(gconstpointer a, gconstpointer b)
>> +{
>> + return !memcmp(a, b, sizeof(struct lo_fhandle));
>> +}
>> +
>> static void fuse_lo_data_cleanup(struct lo_data *lo)
>> {
>> - if (lo->inodes) {
>> - g_hash_table_destroy(lo->inodes);
>> + if (lo->inodes_by_ids) {
>> + g_hash_table_destroy(lo->inodes_by_ids);
>> + }
>> + if (lo->inodes_by_ids) {
>> + g_hash_table_destroy(lo->inodes_by_handle);
>> }
>>
>> if (lo->root.posix_locks) {
>> @@ -3990,7 +4037,8 @@ int main(int argc, char *argv[])
>> qemu_init_exec_dir(argv[0]);
>>
>> pthread_mutex_init(&lo.mutex, NULL);
>> - lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
>> + lo.inodes_by_ids = g_hash_table_new(lo_key_hash, lo_key_equal);
>> + lo.inodes_by_handle = g_hash_table_new(lo_fhandle_hash, lo_fhandle_equal);
>> lo.root.fd = -1;
>> lo.root.fuse_ino = FUSE_ROOT_ID;
>> lo.cache = CACHE_AUTO;
>> --
>> 2.31.1
>>
>> _______________________________________________
>> Virtio-fs mailing list
>> Virtio-fs@redhat.com
>> https://listman.redhat.com/mailman/listinfo/virtio-fs
>>
On Wed, Jun 16, 2021 at 03:38:13PM +0200, Max Reitz wrote:
> On 11.06.21 22:04, Vivek Goyal wrote:
> > On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote:
> > > Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH
> > > FD in lo_inode.fd. Therefore, when the respective inode is unlinked,
> > > its inode ID will remain in use until we drop our lo_inode (and
> > > lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use
> > > the inode ID as an lo_inode key, because any inode with an inode ID we
> > > find in lo_data.inodes (on the same filesystem) must be the exact same
> > > file.
> > >
> > > This will change when we start setting lo_inode.fhandle so we do not
> > > have to keep an O_PATH FD open. Then, unlinking such an inode will
> > > immediately remove it, so its ID can then be reused by newly created
> > > files, even while the lo_inode object is still there[1].
> > >
> > > So creating a new file can then reuse the old file's inode ID, and
> > > looking up the new file would lead to us finding the old file's
> > > lo_inode, which is not ideal.
> > >
> > > Luckily, just as file handles cause this problem, they also solve it: A
> > > file handle contains a generation ID, which changes when an inode ID is
> > > reused, so the new file can be distinguished from the old one. So all
> > > we need to do is to add a second map besides lo_data.inodes that maps
> > > file handles to lo_inodes, namely lo_data.inodes_by_handle. For
> > > clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids.
> > >
> > > Unfortunately, we cannot rely on being able to generate file handles
> > > every time.
> > Hi Max,
> >
> > What are the cases where we can not rely being able to generate file
> > handles?
>
> I have no idea, but it’s much easier to claim we can’t than to prove that we
> can. I’d rather be resilient.
Assuming that we can not genererate file handles all the time and hence
mainitaing another inode cache seems little problematic to me.
I would rather start with that we can generate file handles and have
a single inode cache.
>
> > > Therefore, we still enter every lo_inode object into
> > > inodes_by_ids, but having an entry in inodes_by_handle is optional. A
> > > potential inodes_by_handle entry then has precedence, the inodes_by_ids
> > > entry is just a fallback.
> > If we have to keep inodes_by_ids around, then can we just add fhandle
> > to the lo_key. That way we can manage with single hash table and still
> > be able to detect if inode ID has been reused.
>
> We cannot, because I assume we cannot rely on name_to_handle_at() working
> every time.
I guess either we need concrete information that we can't generate
file handle every time or we should assume we can until we are proven
wrong. And then fix it accordingly, IMHO.
> Therefore, maybe at one point we can generate a file handle, and
> at another, we cannot – we should still be able to look up the inode
> regardless.
>
> If the file handle were part of inodes_by_ids, then we can look up inodes
> only if we can generate a file handle either every time (for a given inode)
> or never.
Right. And is there a reason to belive that for the same file we can
sometimes generate file handles and other times not.
Thanks
Vivek
> Or, well, I suppose we could always create two entries, one with
> the file handles zeroed out, and one with the file handle specified, but I
> wouldn’t find that very beautiful.
>
> Max
>
> > > Note that we do not generate lo_fhandle objects yet, and so we also do
> > > not enter anything into the inodes_by_handle map yet. Also, all lookups
> > > skip that map. We might manually create file handles with some code
> > > that is immediately removed by the next patch again, but that would
> > > break the assumption in lo_find() that every lo_inode with a non-NULL
> > > .fhandle must have an entry in inodes_by_handle and vice versa. So we
> > > leave actually using the inodes_by_handle map for the next patch.
> > >
> > > [1] If some application in the guest still has the file open, there is
> > > going to be a corresponding FD mapping in lo_data.fd_map. In such a
> > > case, the inode will only go away once every application in the guest
> > > has closed it. The problem described only applies to cases where the
> > > guest does not have the file open, and it is just in the dentry cache,
> > > basically.
> > >
> > > Signed-off-by: Max Reitz <mreitz@redhat.com>
> > > Reviewed-by: Connor Kuehl <ckuehl@redhat.com>
> > > ---
> > > tools/virtiofsd/passthrough_ll.c | 80 +++++++++++++++++++++++++-------
> > > 1 file changed, 64 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
> > > index e665575401..793d2c333e 100644
> > > --- a/tools/virtiofsd/passthrough_ll.c
> > > +++ b/tools/virtiofsd/passthrough_ll.c
> > > @@ -179,7 +179,8 @@ struct lo_data {
> > > int announce_submounts;
> > > bool use_statx;
> > > struct lo_inode root;
> > > - GHashTable *inodes; /* protected by lo->mutex */
> > > + GHashTable *inodes_by_ids; /* protected by lo->mutex */
> > > + GHashTable *inodes_by_handle; /* protected by lo->mutex */
> > > struct lo_map ino_map; /* protected by lo->mutex */
> > > struct lo_map dirp_map; /* protected by lo->mutex */
> > > struct lo_map fd_map; /* protected by lo->mutex */
> > > @@ -257,8 +258,9 @@ static struct {
> > > /* That we loaded cap-ng in the current thread from the saved */
> > > static __thread bool cap_loaded = 0;
> > > -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
> > > - uint64_t mnt_id);
> > > +static struct lo_inode *lo_find(struct lo_data *lo,
> > > + const struct lo_fhandle *fhandle,
> > > + struct stat *st, uint64_t mnt_id);
> > > static int xattr_map_client(const struct lo_data *lo, const char *client_name,
> > > char **out_name);
> > > @@ -1032,18 +1034,39 @@ out_err:
> > > fuse_reply_err(req, saverr);
> > > }
> > > -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
> > > - uint64_t mnt_id)
> > > +static struct lo_inode *lo_find(struct lo_data *lo,
> > > + const struct lo_fhandle *fhandle,
> > > + struct stat *st, uint64_t mnt_id)
> > > {
> > > - struct lo_inode *p;
> > > - struct lo_key key = {
> > > + struct lo_inode *p = NULL;
> > > + struct lo_key ids_key = {
> > > .ino = st->st_ino,
> > > .dev = st->st_dev,
> > > .mnt_id = mnt_id,
> > > };
> > > pthread_mutex_lock(&lo->mutex);
> > > - p = g_hash_table_lookup(lo->inodes, &key);
> > > + if (fhandle) {
> > > + p = g_hash_table_lookup(lo->inodes_by_handle, fhandle);
> > > + }
> > > + if (!p) {
> > > + p = g_hash_table_lookup(lo->inodes_by_ids, &ids_key);
> > > + /*
> > > + * When we had to fall back to looking up an inode by its IDs,
> > > + * ensure that we hit an entry that does not have a file
> > > + * handle. Entries with file handles must also have a handle
> > > + * alt key, so if we have not found it by that handle alt key,
> > > + * we must have found an entry with a mismatching handle; i.e.
> > > + * an entry for a different file, even though it has the same
> > > + * inode ID.
> > > + * (This can happen when we look up a new file that has reused
> > > + * the inode ID of some previously unlinked inode for which we
> > > + * still have an lo_inode object.)
> > > + */
> > > + if (p && fhandle != NULL && p->fhandle != NULL) {
> > > + p = NULL;
> > > + }
> > > + }
> > > if (p) {
> > > assert(p->nlookup > 0);
> > > p->nlookup++;
> > > @@ -1183,7 +1206,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
> > > e->attr_flags |= FUSE_ATTR_SUBMOUNT;
> > > }
> > > - inode = lo_find(lo, &e->attr, mnt_id);
> > > + inode = lo_find(lo, NULL, &e->attr, mnt_id);
> > > if (inode) {
> > > close(newfd);
> > > } else {
> > > @@ -1213,7 +1236,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
> > > }
> > > pthread_mutex_lock(&lo->mutex);
> > > inode->fuse_ino = lo_add_inode_mapping(req, inode);
> > > - g_hash_table_insert(lo->inodes, &inode->key, inode);
> > > + g_hash_table_insert(lo->inodes_by_ids, &inode->key, inode);
> > > pthread_mutex_unlock(&lo->mutex);
> > > }
> > > e->ino = inode->fuse_ino;
> > > @@ -1525,7 +1548,7 @@ static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
> > > return NULL;
> > > }
> > > - return lo_find(lo, &attr, mnt_id);
> > > + return lo_find(lo, NULL, &attr, mnt_id);
> > > }
> > > static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
> > > @@ -1688,7 +1711,7 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
> > > inode->nlookup -= n;
> > > if (!inode->nlookup) {
> > > lo_map_remove(&lo->ino_map, inode->fuse_ino);
> > > - g_hash_table_remove(lo->inodes, &inode->key);
> > > + g_hash_table_remove(lo->inodes_by_ids, &inode->key);
> > > if (lo->posix_lock) {
> > > if (g_hash_table_size(inode->posix_locks)) {
> > > fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
> > > @@ -3388,7 +3411,7 @@ static void lo_destroy(void *userdata)
> > > GHashTableIter iter;
> > > gpointer key, value;
> > > - g_hash_table_iter_init(&iter, lo->inodes);
> > > + g_hash_table_iter_init(&iter, lo->inodes_by_ids);
> > > if (!g_hash_table_iter_next(&iter, &key, &value)) {
> > > break;
> > > }
> > > @@ -3931,10 +3954,34 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
> > > return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
> > > }
> > > +static guint lo_fhandle_hash(gconstpointer key)
> > > +{
> > > + const struct lo_fhandle *fh = key;
> > > + guint hash;
> > > + size_t i;
> > > +
> > > + /* Basically g_str_hash() */
> > > + hash = 5381;
> > > + for (i = 0; i < sizeof(fh->padding); i++) {
> > > + hash += hash * 33 + (unsigned char)fh->padding[i];
> > > + }
> > > + hash += hash * 33 + fh->mount_id;
> > > +
> > > + return hash;
> > > +}
> > > +
> > > +static gboolean lo_fhandle_equal(gconstpointer a, gconstpointer b)
> > > +{
> > > + return !memcmp(a, b, sizeof(struct lo_fhandle));
> > > +}
> > > +
> > > static void fuse_lo_data_cleanup(struct lo_data *lo)
> > > {
> > > - if (lo->inodes) {
> > > - g_hash_table_destroy(lo->inodes);
> > > + if (lo->inodes_by_ids) {
> > > + g_hash_table_destroy(lo->inodes_by_ids);
> > > + }
> > > + if (lo->inodes_by_ids) {
> > > + g_hash_table_destroy(lo->inodes_by_handle);
> > > }
> > > if (lo->root.posix_locks) {
> > > @@ -3990,7 +4037,8 @@ int main(int argc, char *argv[])
> > > qemu_init_exec_dir(argv[0]);
> > > pthread_mutex_init(&lo.mutex, NULL);
> > > - lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
> > > + lo.inodes_by_ids = g_hash_table_new(lo_key_hash, lo_key_equal);
> > > + lo.inodes_by_handle = g_hash_table_new(lo_fhandle_hash, lo_fhandle_equal);
> > > lo.root.fd = -1;
> > > lo.root.fuse_ino = FUSE_ROOT_ID;
> > > lo.cache = CACHE_AUTO;
> > > --
> > > 2.31.1
> > >
> > > _______________________________________________
> > > Virtio-fs mailing list
> > > Virtio-fs@redhat.com
> > > https://listman.redhat.com/mailman/listinfo/virtio-fs
> > >
>
On 17.06.21 23:21, Vivek Goyal wrote: > On Wed, Jun 16, 2021 at 03:38:13PM +0200, Max Reitz wrote: >> On 11.06.21 22:04, Vivek Goyal wrote: >>> On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote: >>>> Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH >>>> FD in lo_inode.fd. Therefore, when the respective inode is unlinked, >>>> its inode ID will remain in use until we drop our lo_inode (and >>>> lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use >>>> the inode ID as an lo_inode key, because any inode with an inode ID we >>>> find in lo_data.inodes (on the same filesystem) must be the exact same >>>> file. >>>> >>>> This will change when we start setting lo_inode.fhandle so we do not >>>> have to keep an O_PATH FD open. Then, unlinking such an inode will >>>> immediately remove it, so its ID can then be reused by newly created >>>> files, even while the lo_inode object is still there[1]. >>>> >>>> So creating a new file can then reuse the old file's inode ID, and >>>> looking up the new file would lead to us finding the old file's >>>> lo_inode, which is not ideal. >>>> >>>> Luckily, just as file handles cause this problem, they also solve it: A >>>> file handle contains a generation ID, which changes when an inode ID is >>>> reused, so the new file can be distinguished from the old one. So all >>>> we need to do is to add a second map besides lo_data.inodes that maps >>>> file handles to lo_inodes, namely lo_data.inodes_by_handle. For >>>> clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids. >>>> >>>> Unfortunately, we cannot rely on being able to generate file handles >>>> every time. >>> Hi Max, >>> >>> What are the cases where we can not rely being able to generate file >>> handles? >> I have no idea, but it’s much easier to claim we can’t than to prove that we >> can. I’d rather be resilient. > Assuming that we can not genererate file handles all the time and hence > mainitaing another inode cache seems little problematic to me. How so? > I would rather start with that we can generate file handles and have > a single inode cache. The assumption that we can generate file handles all the time is a much stronger one (and one which needs to be proven) than assuming that failure is possible. Also, it is still a single inode cache. I'm just adding a third key. (The two existing keys are lo_key (through lo->inodes) and fuse_ino_t (through lo->ino_map).) >>>> Therefore, we still enter every lo_inode object into >>>> inodes_by_ids, but having an entry in inodes_by_handle is optional. A >>>> potential inodes_by_handle entry then has precedence, the inodes_by_ids >>>> entry is just a fallback. >>> If we have to keep inodes_by_ids around, then can we just add fhandle >>> to the lo_key. That way we can manage with single hash table and still >>> be able to detect if inode ID has been reused. >> We cannot, because I assume we cannot rely on name_to_handle_at() working >> every time. > I guess either we need concrete information that we can't generate > file handle every time or we should assume we can until we are proven > wrong. And then fix it accordingly, IMHO. I don’t know why we need this other than because it would simplify the code. Under the assumption that for a specific file we can either generate file handles all the time or never, the code as it is will behave correct. It’s just a bit more complicated than it would need to be, but I don’t find the diffstat of +64/-16 to be indicative of something that’s really bad. >> Therefore, maybe at one point we can generate a file handle, and >> at another, we cannot – we should still be able to look up the inode >> regardless. >> >> If the file handle were part of inodes_by_ids, then we can look up inodes >> only if we can generate a file handle either every time (for a given inode) >> or never. > Right. And is there a reason to belive that for the same file we can > sometimes generate file handles and other times not. Looking into name_to_handle_at()’s man page, there is no error listed that I could imagine happening only sometimes. But it doesn’t give an explicit guarantee that it will either always succeed or fail for a given inode. Looking into the kernel, I can see that most filesystems only fail .encode_fh() if the buffer is too small. Overlayfs can also fail with ENOMEM (which will be translated to EOVERFLOW along the way, so so much for name_to_handle_at()’s list of error conditions), and EIO on conditions I don’t understand well enough (again, will become EOVERFLOW for the user). You’re probably right that at least in practice we don’t need to accommodate for name_to_handle_at() sometimes working for some inode and sometimes not. But I feel quite uneasy relying on this being the case, and being the case forever, when I find it quite simple to just have some added complexity to deal with it. It’s just a third key for our inode cache. If you want to, I can write a 10/9 patch that simplifies the code under the assumption that name_to_handle_at() will either fail or not, but frankly I wouldn’t want to have my name under it. (Which is why it would be a 10/9 so I can have some explicit note that my S-o-b would be there only for legal reasons, not because this is really my patch.) (And now I tentatively wrote such a patch (which requires patch 9 to be reverted, of course), and that gives me a diffstat of +37/-66. Basically, the difference is just having two comments removed.) Max
On Fri, Jun 18, 2021 at 10:28:38AM +0200, Max Reitz wrote: > On 17.06.21 23:21, Vivek Goyal wrote: > > On Wed, Jun 16, 2021 at 03:38:13PM +0200, Max Reitz wrote: > > > On 11.06.21 22:04, Vivek Goyal wrote: > > > > On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote: > > > > > Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH > > > > > FD in lo_inode.fd. Therefore, when the respective inode is unlinked, > > > > > its inode ID will remain in use until we drop our lo_inode (and > > > > > lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use > > > > > the inode ID as an lo_inode key, because any inode with an inode ID we > > > > > find in lo_data.inodes (on the same filesystem) must be the exact same > > > > > file. > > > > > > > > > > This will change when we start setting lo_inode.fhandle so we do not > > > > > have to keep an O_PATH FD open. Then, unlinking such an inode will > > > > > immediately remove it, so its ID can then be reused by newly created > > > > > files, even while the lo_inode object is still there[1]. > > > > > > > > > > So creating a new file can then reuse the old file's inode ID, and > > > > > looking up the new file would lead to us finding the old file's > > > > > lo_inode, which is not ideal. > > > > > > > > > > Luckily, just as file handles cause this problem, they also solve it: A > > > > > file handle contains a generation ID, which changes when an inode ID is > > > > > reused, so the new file can be distinguished from the old one. So all > > > > > we need to do is to add a second map besides lo_data.inodes that maps > > > > > file handles to lo_inodes, namely lo_data.inodes_by_handle. For > > > > > clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids. > > > > > > > > > > Unfortunately, we cannot rely on being able to generate file handles > > > > > every time. > > > > Hi Max, > > > > > > > > What are the cases where we can not rely being able to generate file > > > > handles? > > > I have no idea, but it’s much easier to claim we can’t than to prove that we > > > can. I’d rather be resilient. > > Assuming that we can not genererate file handles all the time and hence > > mainitaing another inode cache seems little problematic to me. > > How so? It is extra complexity. Need to worry about one more hash table. Sure, I give it to you that we are not creating two copies of inodes. Same inode object is being added to two different hash tables and is being looked up using two different keys. > > > I would rather start with that we can generate file handles and have > > a single inode cache. > > The assumption that we can generate file handles all the time is a much > stronger one (and one which needs to be proven) than assuming that failure > is possible. So if temporary failures can happen (like -ENOMEM, as you mentioned), these can happen with openat() too. And in that case we return error to caller. So why to try to hide temporary failures from name_to_handle_at(). I am still reading your code and trying to understand it. But one question came to mind. What happens if we can generate file handle during lookup. But can't generate when same file is looked up again. - A file foo.txt is looked. We can create file handle and we add it to lo->inodes_by_handle as well as lo->inodes_by_ds. - Say somebody deleted file and created again and inode number got reused. - Now during ->revalidation path, lookup happens again. This time say we can't generate file handle. If am reading lo_do_find() code correctly, it will find the old inode using ids and return same inode as result of lookup. And we did not recognize that inode number has been reused. And issues might arise if we could not generate file handle in first lookup but could generate in second. - A file foo.txt is lookedup. We can not create file handle and we add it to lo->inodes_by_ids. - Say somebody deleted file and created again and inode number got reused. - Now during ->revalidation path, lookup happens again. This time say we can generate file handle. If am reading lo_do_find() code correctly, it will find the old inode using ids and return same inode as result of lookup. And we did not recognize that inode number has been reused. IOW, because we could not generate the file handle, we lost the ability to recognize that inode number has been reused. That means either we don't switch to using file handles or if we do switch, it is important that we can generate file handle to differentiate whether inode number has been used or not. If not, return error to caller. Caller probably will mark inode bad and let and do a lookup again. > > Also, it is still a single inode cache. I'm just adding a third key. (The > two existing keys are lo_key (through lo->inodes) and fuse_ino_t (through > lo->ino_map).) > > > > > > Therefore, we still enter every lo_inode object into > > > > > inodes_by_ids, but having an entry in inodes_by_handle is optional. A > > > > > potential inodes_by_handle entry then has precedence, the inodes_by_ids > > > > > entry is just a fallback. > > > > If we have to keep inodes_by_ids around, then can we just add fhandle > > > > to the lo_key. That way we can manage with single hash table and still > > > > be able to detect if inode ID has been reused. > > > We cannot, because I assume we cannot rely on name_to_handle_at() working > > > every time. > > I guess either we need concrete information that we can't generate > > file handle every time or we should assume we can until we are proven > > wrong. And then fix it accordingly, IMHO. > > I don’t know why we need this other than because it would simplify the code. > > Under the assumption that for a specific file we can either generate file > handles all the time or never, the code as it is will behave correct. It’s > just a bit more complicated than it would need to be, but I don’t find the > diffstat of +64/-16 to be indicative of something that’s really bad. > > > > Therefore, maybe at one point we can generate a file handle, and > > > at another, we cannot – we should still be able to look up the inode > > > regardless. > > > > > > If the file handle were part of inodes_by_ids, then we can look up inodes > > > only if we can generate a file handle either every time (for a given inode) > > > or never. > > Right. And is there a reason to belive that for the same file we can > > sometimes generate file handles and other times not. > > Looking into name_to_handle_at()’s man page, there is no error listed that I > could imagine happening only sometimes. But it doesn’t give an explicit > guarantee that it will either always succeed or fail for a given inode. > > Looking into the kernel, I can see that most filesystems only fail > .encode_fh() if the buffer is too small. Overlayfs can also fail with ENOMEM > (which will be translated to EOVERFLOW along the way, so so much for > name_to_handle_at()’s list of error conditions), and EIO on conditions I > don’t understand well enough (again, will become EOVERFLOW for the user). > > You’re probably right that at least in practice we don’t need to accommodate > for name_to_handle_at() sometimes working for some inode and sometimes not. What am I not able to understand is that why we can't return error if we run into a temporary issue and can't generate file handle. What's that requirement that we need to hide the error and try to cover it up by some other means. Thanks Vivek > > But I feel quite uneasy relying on this being the case, and being the case > forever, when I find it quite simple to just have some added complexity to > deal with it. It’s just a third key for our inode cache. > > If you want to, I can write a 10/9 patch that simplifies the code under the > assumption that name_to_handle_at() will either fail or not, but frankly I > wouldn’t want to have my name under it. (Which is why it would be a 10/9 so > I can have some explicit note that my S-o-b would be there only for legal > reasons, not because this is really my patch.) > > (And now I tentatively wrote such a patch (which requires patch 9 to be > reverted, of course), and that gives me a diffstat of +37/-66. Basically, > the difference is just having two comments removed.) > > Max >
On 18.06.21 20:29, Vivek Goyal wrote: > On Fri, Jun 18, 2021 at 10:28:38AM +0200, Max Reitz wrote: >> On 17.06.21 23:21, Vivek Goyal wrote: >>> On Wed, Jun 16, 2021 at 03:38:13PM +0200, Max Reitz wrote: >>>> On 11.06.21 22:04, Vivek Goyal wrote: >>>>> On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote: >>>>>> Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH >>>>>> FD in lo_inode.fd. Therefore, when the respective inode is unlinked, >>>>>> its inode ID will remain in use until we drop our lo_inode (and >>>>>> lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use >>>>>> the inode ID as an lo_inode key, because any inode with an inode ID we >>>>>> find in lo_data.inodes (on the same filesystem) must be the exact same >>>>>> file. >>>>>> >>>>>> This will change when we start setting lo_inode.fhandle so we do not >>>>>> have to keep an O_PATH FD open. Then, unlinking such an inode will >>>>>> immediately remove it, so its ID can then be reused by newly created >>>>>> files, even while the lo_inode object is still there[1]. >>>>>> >>>>>> So creating a new file can then reuse the old file's inode ID, and >>>>>> looking up the new file would lead to us finding the old file's >>>>>> lo_inode, which is not ideal. >>>>>> >>>>>> Luckily, just as file handles cause this problem, they also solve it: A >>>>>> file handle contains a generation ID, which changes when an inode ID is >>>>>> reused, so the new file can be distinguished from the old one. So all >>>>>> we need to do is to add a second map besides lo_data.inodes that maps >>>>>> file handles to lo_inodes, namely lo_data.inodes_by_handle. For >>>>>> clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids. >>>>>> >>>>>> Unfortunately, we cannot rely on being able to generate file handles >>>>>> every time. >>>>> Hi Max, >>>>> >>>>> What are the cases where we can not rely being able to generate file >>>>> handles? >>>> I have no idea, but it’s much easier to claim we can’t than to prove that we >>>> can. I’d rather be resilient. >>> Assuming that we can not genererate file handles all the time and hence >>> mainitaing another inode cache seems little problematic to me. >> How so? > It is extra complexity. Need to worry about one more hash table. Sure, > I give it to you that we are not creating two copies of inodes. Same > inode object is being added to two different hash tables and is being > looked up using two different keys. > >>> I would rather start with that we can generate file handles and have >>> a single inode cache. >> The assumption that we can generate file handles all the time is a much >> stronger one (and one which needs to be proven) than assuming that failure >> is possible. > So if temporary failures can happen (like -ENOMEM, as you mentioned), > these can happen with openat() too. And in that case we return error > to caller. So why to try to hide temporary failures from > name_to_handle_at(). Well, for openat() we don’t have a choice, if that fails, there is no fallback, so we must return an error. For name_to_handle_at(), there is a fallback. > I am still reading your code and trying to understand it. But one > question came to mind. What happens if we can generate file handle > during lookup. But can't generate when same file is looked up again. > > - A file foo.txt is looked. We can create file handle and we add it > to lo->inodes_by_handle as well as lo->inodes_by_ds. > > - Say somebody deleted file and created again and inode number got > reused. > > - Now during ->revalidation path, lookup happens again. This time say > we can't generate file handle. If am reading lo_do_find() code > correctly, it will find the old inode using ids and return same > inode as result of lookup. And we did not recognize that inode > number has been reused. Oh, that’s a good point. If an lo_inode has no O_PATH fd but is only addressed by handle, we must always look it up by handle. > And issues might arise if we could not generate file handle in first > lookup but could generate in second. > > - A file foo.txt is lookedup. We can not create file handle and we add it > to lo->inodes_by_ids. > > - Say somebody deleted file and created again and inode number got > reused. This is not possible, because if we could not generate a file handle on the first lookup, the lo_inode will have an O_PATH fd attached to it, so the inode number cannot be reused while the lo_inode still exists. > - Now during ->revalidation path, lookup happens again. This time say > we can generate file handle. If am reading lo_do_find() code > correctly, it will find the old inode using ids and return same > inode as result of lookup. And we did not recognize that inode > number has been reused. > > IOW, because we could not generate the file handle, we lost the > ability to recognize that inode number has been reused. That means > either we don't switch to using file handles or if we do switch, > it is important that we can generate file handle to differentiate > whether inode number has been used or not. If not, return error to > caller. Caller probably will mark inode bad and let and do a lookup > again. > >> Also, it is still a single inode cache. I'm just adding a third key. (The >> two existing keys are lo_key (through lo->inodes) and fuse_ino_t (through >> lo->ino_map).) >> >>>>>> Therefore, we still enter every lo_inode object into >>>>>> inodes_by_ids, but having an entry in inodes_by_handle is optional. A >>>>>> potential inodes_by_handle entry then has precedence, the inodes_by_ids >>>>>> entry is just a fallback. >>>>> If we have to keep inodes_by_ids around, then can we just add fhandle >>>>> to the lo_key. That way we can manage with single hash table and still >>>>> be able to detect if inode ID has been reused. >>>> We cannot, because I assume we cannot rely on name_to_handle_at() working >>>> every time. >>> I guess either we need concrete information that we can't generate >>> file handle every time or we should assume we can until we are proven >>> wrong. And then fix it accordingly, IMHO. >> I don’t know why we need this other than because it would simplify the code. >> >> Under the assumption that for a specific file we can either generate file >> handles all the time or never, the code as it is will behave correct. It’s >> just a bit more complicated than it would need to be, but I don’t find the >> diffstat of +64/-16 to be indicative of something that’s really bad. >> >>>> Therefore, maybe at one point we can generate a file handle, and >>>> at another, we cannot – we should still be able to look up the inode >>>> regardless. >>>> >>>> If the file handle were part of inodes_by_ids, then we can look up inodes >>>> only if we can generate a file handle either every time (for a given inode) >>>> or never. >>> Right. And is there a reason to belive that for the same file we can >>> sometimes generate file handles and other times not. >> Looking into name_to_handle_at()’s man page, there is no error listed that I >> could imagine happening only sometimes. But it doesn’t give an explicit >> guarantee that it will either always succeed or fail for a given inode. >> >> Looking into the kernel, I can see that most filesystems only fail >> .encode_fh() if the buffer is too small. Overlayfs can also fail with ENOMEM >> (which will be translated to EOVERFLOW along the way, so so much for >> name_to_handle_at()’s list of error conditions), and EIO on conditions I >> don’t understand well enough (again, will become EOVERFLOW for the user). >> >> You’re probably right that at least in practice we don’t need to accommodate >> for name_to_handle_at() sometimes working for some inode and sometimes not. > What am I not able to understand is that why we can't return error if > we run into a temporary issue and can't generate file handle. What's > that requirement that we need to hide the error and try to cover it > up by some other means. There is no requirement, it’s just that we need to hide ENOTSUPP errors anyway (because e.g. some submounted filesystem may not support file handles), so I considered hiding temporary errors not to really add complexity. (Which is true, as can be seen from the diff stat I posted below: Dropping the second hash table and making the handle part of lo_key, so that temporary errors are now fatal, generates a diff of +37/-66, where -20 are just two comments (which realistically I’d need to replace by different comments), so in terms of code, it’s +37/-46.) I’d just rather handle errors gracefully when I find it doesn’t really cost complexity. However, you made a good point in that we must require name_to_handle_at() to work if it worked before for some inode, not because it would be simpler, but because it would be wrong otherwise. As for the other way around... Well, now I don’t have a strong opinion on it. Handling temporary name_to_handle_at() failure after it worked the first time should not add extra complexity, but it wouldn’t be symmetric. Like, allowing temporary failure sometimes but not at other times. The next question is, how do we detect temporary failure, because if we look up some new inode, name_to_handle_at() fails, we ignore it, and then it starts to work and we fail all further lookups, that’s not good. We should have the first lookup fail. I suppose ENOTSUPP means “OK to ignore”, and for everything else we should let lookup fail? (And that pretty much answers my "what if name_to_handle_at() works the first time, but then fails" question. If we let anything but ENOTSUPP let the lookup fail, then we should do so every time.) Max >> But I feel quite uneasy relying on this being the case, and being the case >> forever, when I find it quite simple to just have some added complexity to >> deal with it. It’s just a third key for our inode cache. >> >> If you want to, I can write a 10/9 patch that simplifies the code under the >> assumption that name_to_handle_at() will either fail or not, but frankly I >> wouldn’t want to have my name under it. (Which is why it would be a 10/9 so >> I can have some explicit note that my S-o-b would be there only for legal >> reasons, not because this is really my patch.) >> >> (And now I tentatively wrote such a patch (which requires patch 9 to be >> reverted, of course), and that gives me a diffstat of +37/-66. Basically, >> the difference is just having two comments removed.) >> >> Max >>
On Mon, Jun 21, 2021 at 11:02:16AM +0200, Max Reitz wrote: > On 18.06.21 20:29, Vivek Goyal wrote: > > On Fri, Jun 18, 2021 at 10:28:38AM +0200, Max Reitz wrote: > > > On 17.06.21 23:21, Vivek Goyal wrote: > > > > On Wed, Jun 16, 2021 at 03:38:13PM +0200, Max Reitz wrote: > > > > > On 11.06.21 22:04, Vivek Goyal wrote: > > > > > > On Wed, Jun 09, 2021 at 05:55:49PM +0200, Max Reitz wrote: > > > > > > > Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH > > > > > > > FD in lo_inode.fd. Therefore, when the respective inode is unlinked, > > > > > > > its inode ID will remain in use until we drop our lo_inode (and > > > > > > > lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use > > > > > > > the inode ID as an lo_inode key, because any inode with an inode ID we > > > > > > > find in lo_data.inodes (on the same filesystem) must be the exact same > > > > > > > file. > > > > > > > > > > > > > > This will change when we start setting lo_inode.fhandle so we do not > > > > > > > have to keep an O_PATH FD open. Then, unlinking such an inode will > > > > > > > immediately remove it, so its ID can then be reused by newly created > > > > > > > files, even while the lo_inode object is still there[1]. > > > > > > > > > > > > > > So creating a new file can then reuse the old file's inode ID, and > > > > > > > looking up the new file would lead to us finding the old file's > > > > > > > lo_inode, which is not ideal. > > > > > > > > > > > > > > Luckily, just as file handles cause this problem, they also solve it: A > > > > > > > file handle contains a generation ID, which changes when an inode ID is > > > > > > > reused, so the new file can be distinguished from the old one. So all > > > > > > > we need to do is to add a second map besides lo_data.inodes that maps > > > > > > > file handles to lo_inodes, namely lo_data.inodes_by_handle. For > > > > > > > clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids. > > > > > > > > > > > > > > Unfortunately, we cannot rely on being able to generate file handles > > > > > > > every time. > > > > > > Hi Max, > > > > > > > > > > > > What are the cases where we can not rely being able to generate file > > > > > > handles? > > > > > I have no idea, but it’s much easier to claim we can’t than to prove that we > > > > > can. I’d rather be resilient. > > > > Assuming that we can not genererate file handles all the time and hence > > > > mainitaing another inode cache seems little problematic to me. > > > How so? > > It is extra complexity. Need to worry about one more hash table. Sure, > > I give it to you that we are not creating two copies of inodes. Same > > inode object is being added to two different hash tables and is being > > looked up using two different keys. > > > > > > I would rather start with that we can generate file handles and have > > > > a single inode cache. > > > The assumption that we can generate file handles all the time is a much > > > stronger one (and one which needs to be proven) than assuming that failure > > > is possible. > > So if temporary failures can happen (like -ENOMEM, as you mentioned), > > these can happen with openat() too. And in that case we return error > > to caller. So why to try to hide temporary failures from > > name_to_handle_at(). > > Well, for openat() we don’t have a choice, if that fails, there is no > fallback, so we must return an error. For name_to_handle_at(), there is a > fallback. > > > I am still reading your code and trying to understand it. But one > > question came to mind. What happens if we can generate file handle > > during lookup. But can't generate when same file is looked up again. > > > > - A file foo.txt is looked. We can create file handle and we add it > > to lo->inodes_by_handle as well as lo->inodes_by_ds. > > > > - Say somebody deleted file and created again and inode number got > > reused. > > > > - Now during ->revalidation path, lookup happens again. This time say > > we can't generate file handle. If am reading lo_do_find() code > > correctly, it will find the old inode using ids and return same > > inode as result of lookup. And we did not recognize that inode > > number has been reused. > > Oh, that’s a good point. If an lo_inode has no O_PATH fd but is only > addressed by handle, we must always look it up by handle. > > > And issues might arise if we could not generate file handle in first > > lookup but could generate in second. > > > > - A file foo.txt is lookedup. We can not create file handle and we add it > > to lo->inodes_by_ids. > > > > - Say somebody deleted file and created again and inode number got > > reused. > > This is not possible, because if we could not generate a file handle on the > first lookup, the lo_inode will have an O_PATH fd attached to it, so the > inode number cannot be reused while the lo_inode still exists. > > > - Now during ->revalidation path, lookup happens again. This time say > > we can generate file handle. If am reading lo_do_find() code > > correctly, it will find the old inode using ids and return same > > inode as result of lookup. And we did not recognize that inode > > number has been reused. > > > > IOW, because we could not generate the file handle, we lost the > > ability to recognize that inode number has been reused. That means > > either we don't switch to using file handles or if we do switch, > > it is important that we can generate file handle to differentiate > > whether inode number has been used or not. If not, return error to > > caller. Caller probably will mark inode bad and let and do a lookup > > again. > > > > > Also, it is still a single inode cache. I'm just adding a third key. (The > > > two existing keys are lo_key (through lo->inodes) and fuse_ino_t (through > > > lo->ino_map).) > > > > > > > > > > Therefore, we still enter every lo_inode object into > > > > > > > inodes_by_ids, but having an entry in inodes_by_handle is optional. A > > > > > > > potential inodes_by_handle entry then has precedence, the inodes_by_ids > > > > > > > entry is just a fallback. > > > > > > If we have to keep inodes_by_ids around, then can we just add fhandle > > > > > > to the lo_key. That way we can manage with single hash table and still > > > > > > be able to detect if inode ID has been reused. > > > > > We cannot, because I assume we cannot rely on name_to_handle_at() working > > > > > every time. > > > > I guess either we need concrete information that we can't generate > > > > file handle every time or we should assume we can until we are proven > > > > wrong. And then fix it accordingly, IMHO. > > > I don’t know why we need this other than because it would simplify the code. > > > > > > Under the assumption that for a specific file we can either generate file > > > handles all the time or never, the code as it is will behave correct. It’s > > > just a bit more complicated than it would need to be, but I don’t find the > > > diffstat of +64/-16 to be indicative of something that’s really bad. > > > > > > > > Therefore, maybe at one point we can generate a file handle, and > > > > > at another, we cannot – we should still be able to look up the inode > > > > > regardless. > > > > > > > > > > If the file handle were part of inodes_by_ids, then we can look up inodes > > > > > only if we can generate a file handle either every time (for a given inode) > > > > > or never. > > > > Right. And is there a reason to belive that for the same file we can > > > > sometimes generate file handles and other times not. > > > Looking into name_to_handle_at()’s man page, there is no error listed that I > > > could imagine happening only sometimes. But it doesn’t give an explicit > > > guarantee that it will either always succeed or fail for a given inode. > > > > > > Looking into the kernel, I can see that most filesystems only fail > > > .encode_fh() if the buffer is too small. Overlayfs can also fail with ENOMEM > > > (which will be translated to EOVERFLOW along the way, so so much for > > > name_to_handle_at()’s list of error conditions), and EIO on conditions I > > > don’t understand well enough (again, will become EOVERFLOW for the user). > > > > > > You’re probably right that at least in practice we don’t need to accommodate > > > for name_to_handle_at() sometimes working for some inode and sometimes not. > > What am I not able to understand is that why we can't return error if > > we run into a temporary issue and can't generate file handle. What's > > that requirement that we need to hide the error and try to cover it > > up by some other means. > > There is no requirement, it’s just that we need to hide ENOTSUPP errors > anyway (because e.g. some submounted filesystem may not support file > handles), so I considered hiding temporary errors ENOTSUPP is not a temporary error I guess. But this is a good point that if host filesystem does not support file handles, should we return error so that user is forced to remove "-o inode_file_handles" option. I can see multiple modes and they all seem to be useful in different scenarios. A. Do not use file handles at all. B. Use file handles if filesystem supports it. Also this could do some kind of mix and matching so that some inodes use file handles while others use fd. We could think of implementing some threshold and if open fds go above this threshold, switch to file handle stuff. C. Strictly use file handles otherwise return error to caller. One possibility is that we implement two options inode_file_handles and no_inode_file_handles. - User specified -o no_inode_file_handles, implement A. - User specified -o inode_file_handles, implement C. - User did not specify anything, then use file handles opportunistically as seen fit by daemon. This provides us the maximum flexibility and this practically implements option B. IOW, if user did not specify anything, then we can think of implementing file handles by default and fallback to using O_PATH fds if filesystem does not support file handles (ENOTSUPP). But if user did specify "-o no_inode_file_handles" or "-o inode_file_handles", this kind of points to strictly implementing respective policy, IMHO. That's how I have implemented some other options. Alternatively, we could think of adding one more option say "inode_file_handles_only. - "-o no_inode_files_handles" implements A. - "-o inode_files_handles" implements B. - "-o inode_files_handles_only" implements C. - If user did not specify anything on command line, then its up to the daemon whatever default policy it wants and default can change over time. > not to really add > complexity. (Which is true, as can be seen from the diff stat I posted > below: Dropping the second hash table and making the handle part of lo_key, > so that temporary errors are now fatal, generates a diff of +37/-66, where > -20 are just two comments (which realistically I’d need to replace by > different comments), so in terms of code, it’s +37/-46.) > > I’d just rather handle errors gracefully when I find it doesn’t really cost > complexity. > > > However, you made a good point in that we must require name_to_handle_at() > to work if it worked before for some inode, not because it would be simpler, > but because it would be wrong otherwise. > > As for the other way around... Well, now I don’t have a strong opinion on > it. Handling temporary name_to_handle_at() failure after it worked the > first time should not add extra complexity, but it wouldn’t be symmetric. > Like, allowing temporary failure sometimes but not at other times. Right. If we decided that we need to generate file handle for an inode and underlying filesystem supports file handles, then handling temporary failures only sometimes will make it assymetric. At this point of time I am more inclined to return error to caller on temporary failures. But if this does not work well in practice, I am open to change the policy. > > The next question is, how do we detect temporary failure, because if we look > up some new inode, name_to_handle_at() fails, we ignore it, and then it > starts to work and we fail all further lookups, that’s not good. We should > have the first lookup fail. I suppose ENOTSUPP means “OK to ignore”, and > for everything else we should let lookup fail? ENOTSUPP will be a permanent failure right? And not a temporary one. It seems reasonable that we gracefully fall back to O_PATH fd in case of ENOTSUPP (Assuming -o inode_file_handles means try to use file handles and fallback to O_PATH fds if host filesystem does not support it). But for temporary failures we probably will have to return errors to callers. Do you have a specific temporary failure in mind which you are concerned about. > (And that pretty much > answers my "what if name_to_handle_at() works the first time, but then > fails" question. If we let anything but ENOTSUPP let the lookup fail, then > we should do so every time.) Agreed. ENOTSUPP seems to be the only error when falling back to O_PATH fd makes most sense to me. Rest of them probably should be propagated to caller. And given ENOTSUPP is a permanent failure, you probably will be able to add fhandle to lo_key and not require a separate mapping which looks up inode by fhandle. Thanks Vivek > > Max > > > > But I feel quite uneasy relying on this being the case, and being the case > > > forever, when I find it quite simple to just have some added complexity to > > > deal with it. It’s just a third key for our inode cache. > > > > > > If you want to, I can write a 10/9 patch that simplifies the code under the > > > assumption that name_to_handle_at() will either fail or not, but frankly I > > > wouldn’t want to have my name under it. (Which is why it would be a 10/9 so > > > I can have some explicit note that my S-o-b would be there only for legal > > > reasons, not because this is really my patch.) > > > > > > (And now I tentatively wrote such a patch (which requires patch 9 to be > > > reverted, of course), and that gives me a diffstat of +37/-66. Basically, > > > the difference is just having two comments removed.) > > > > > > Max > > > >
On 21.06.21 17:51, Vivek Goyal wrote: > On Mon, Jun 21, 2021 at 11:02:16AM +0200, Max Reitz wrote: >> On 18.06.21 20:29, Vivek Goyal wrote: [snip] >>> What am I not able to understand is that why we can't return error if >>> we run into a temporary issue and can't generate file handle. What's >>> that requirement that we need to hide the error and try to cover it >>> up by some other means. >> There is no requirement, it’s just that we need to hide ENOTSUPP errors >> anyway (because e.g. some submounted filesystem may not support file >> handles), so I considered hiding temporary errors > ENOTSUPP is not a temporary error I guess. But this is a good point that > if host filesystem does not support file handles, should we return > error so that user is forced to remove "-o inode_file_handles" option. > > I can see multiple modes and they all seem to be useful in different > scenarios. > > A. Do not use file handles at all. > > B. Use file handles if filesystem supports it. Also this could do some > kind of mix and matching so that some inodes use file handles while > others use fd. We could think of implementing some threshold and > if open fds go above this threshold, switch to file handle stuff. > > C. Strictly use file handles otherwise return error to caller. > > One possibility is that we implement two options inode_file_handles > and no_inode_file_handles. > > - User specified -o no_inode_file_handles, implement A. > - User specified -o inode_file_handles, implement C. > - User did not specify anything, then use file handles opportunistically > as seen fit by daemon. This provides us the maximum flexibility and > this practically implements option B. > > IOW, if user did not specify anything, then we can think of implementing > file handles by default and fallback to using O_PATH fds if filesystem > does not support file handles (ENOTSUPP). But if user did specify > "-o no_inode_file_handles" or "-o inode_file_handles", this kind > of points to strictly implementing respective policy, IMHO. That's how > I have implemented some other options. > > Alternatively, we could think of adding one more option say > "inode_file_handles_only. > > - "-o no_inode_files_handles" implements A. > - "-o inode_files_handles" implements B. > - "-o inode_files_handles_only" implements C. > - If user did not specify anything on command line, then its up to the > daemon whatever default policy it wants and default can change > over time. I think it makes sense not to punish the user for wanting to use file handles as much as possible and still gracefully handling submounts that don’t support them. So I’d want to implement B first, and have that be -o inode_files_handles. I think A as -o no_inode_file_handles is also there, automatically...? I don’t think there’s much of a problem with implementing C, except we probably want to log ENOTSUPP errors then, so the user can figure out what’s going on. I feel like we can still wait with such an option until there’s a demand for it. (Except that perhaps the demand would come in the form of “please help I use -o inode_file_handles but virtiofsd’s FD count is still too high I don’t know what to do”, and that probably wouldn’t be so great. But then again, inodes_files_handles_only wouldn’t help a user in that case either, because it changes “works badly” to “doesn’t work at all”. Now I’m wondering what the actual use case of inodes_files_handles_only would be.) >> not to really add >> complexity. (Which is true, as can be seen from the diff stat I posted >> below: Dropping the second hash table and making the handle part of lo_key, >> so that temporary errors are now fatal, generates a diff of +37/-66, where >> -20 are just two comments (which realistically I’d need to replace by >> different comments), so in terms of code, it’s +37/-46.) >> >> I’d just rather handle errors gracefully when I find it doesn’t really cost >> complexity. >> >> >> However, you made a good point in that we must require name_to_handle_at() >> to work if it worked before for some inode, not because it would be simpler, >> but because it would be wrong otherwise. >> >> As for the other way around... Well, now I don’t have a strong opinion on >> it. Handling temporary name_to_handle_at() failure after it worked the >> first time should not add extra complexity, but it wouldn’t be symmetric. >> Like, allowing temporary failure sometimes but not at other times. > Right. If we decided that we need to generate file handle for an inode > and underlying filesystem supports file handles, then handling temporary > failures only sometimes will make it assymetric. At this point of time > I am more inclined to return error to caller on temporary failures. But > if this does not work well in practice, I am open to change the policy. > >> The next question is, how do we detect temporary failure, because if we look >> up some new inode, name_to_handle_at() fails, we ignore it, and then it >> starts to work and we fail all further lookups, that’s not good. We should >> have the first lookup fail. I suppose ENOTSUPP means “OK to ignore”, and >> for everything else we should let lookup fail? > ENOTSUPP will be a permanent failure right? And not a temporary one. I sure hope so. The man page says “The filesystem does not support decoding of a pathname to a file handle.”, which sounds pretty permanent, unless perhaps the filesystem driver is updated in-place. > It seems reasonable that we gracefully fall back to O_PATH fd in case > of ENOTSUPP (Assuming -o inode_file_handles means try to use file > handles and fallback to O_PATH fds if host filesystem does not > support it). > > But for temporary failures we probably will have to return errors to callers. > Do you have a specific temporary failure in mind which you are concerned > about. Oh, no. It’s just, there can be errors other than EOPNOTSUPP (now that I looked into the man page I know that’s what it actually is), and we have to decide what to do then. Nothing more. >> (And that pretty much >> answers my "what if name_to_handle_at() works the first time, but then >> fails" question. If we let anything but ENOTSUPP let the lookup fail, then >> we should do so every time.) > Agreed. ENOTSUPP seems to be the only error when falling back to O_PATH > fd makes most sense to me. Rest of them probably should be propagated > to caller. Perhaps also EOVERFLOW, which indicates that our file handle storage is too small. This shouldn’t happen, given that we use MAX_HANDLE_SZ to size it, but I imagine it’s possible that MAX_HANDLE_SZ is increased in the future, and when you then compile virtiofsd on one system and run it on another, it’s possible that some filesystem driver wants to store larger handles. Given that we expect a file handle to always be the same for a given inode, such an EOVERFLOW error should be permanent, too (for a given inode). > And given ENOTSUPP is a permanent failure, you probably will be able > to add fhandle to lo_key and not require a separate mapping which > looks up inode by fhandle. Sure, but again, this doesn’t make anything simpler. What I particularly don’t like about this is that for doing so, we have a choice between (A) adding fhandle to lo_key inline (i.e. of type lo_fhandle), or (B) adding it as a pointer (i.e. of type lo_fhandle *). (A) seems the more obvious solution, but then lo_inode.fhandle would either not exist (one would have to go through lo_inode.key.fhandle, which I don’t really like), or be a pointer to &lo_inode.key.fhandle, which is also kind of weird. Also, if there is no file handle, it would need to be explicitly 0, which wastes memory when not using -o inode_file_handles (and cycles, because hashing and comparing will take longer). So (B) would do it the other way around, lo_inode.key.fhandle would be a copy of lo_inode.fhandle. But having a pointer be part of a key is, while perfectly feasible, again strange. I find two hash maps to just be cleaner. Max
On Mon, Jun 21, 2021 at 07:07:19PM +0200, Max Reitz wrote:
> On 21.06.21 17:51, Vivek Goyal wrote:
> > On Mon, Jun 21, 2021 at 11:02:16AM +0200, Max Reitz wrote:
> > > On 18.06.21 20:29, Vivek Goyal wrote:
>
> [snip]
>
> > > > What am I not able to understand is that why we can't return error if
> > > > we run into a temporary issue and can't generate file handle. What's
> > > > that requirement that we need to hide the error and try to cover it
> > > > up by some other means.
> > > There is no requirement, it’s just that we need to hide ENOTSUPP errors
> > > anyway (because e.g. some submounted filesystem may not support file
> > > handles), so I considered hiding temporary errors
> > ENOTSUPP is not a temporary error I guess. But this is a good point that
> > if host filesystem does not support file handles, should we return
> > error so that user is forced to remove "-o inode_file_handles" option.
> >
> > I can see multiple modes and they all seem to be useful in different
> > scenarios.
> >
> > A. Do not use file handles at all.
> >
> > B. Use file handles if filesystem supports it. Also this could do some
> > kind of mix and matching so that some inodes use file handles while
> > others use fd. We could think of implementing some threshold and
> > if open fds go above this threshold, switch to file handle stuff.
> >
> > C. Strictly use file handles otherwise return error to caller.
> >
> > One possibility is that we implement two options inode_file_handles
> > and no_inode_file_handles.
> >
> > - User specified -o no_inode_file_handles, implement A.
> > - User specified -o inode_file_handles, implement C.
> > - User did not specify anything, then use file handles opportunistically
> > as seen fit by daemon. This provides us the maximum flexibility and
> > this practically implements option B.
> >
> > IOW, if user did not specify anything, then we can think of implementing
> > file handles by default and fallback to using O_PATH fds if filesystem
> > does not support file handles (ENOTSUPP). But if user did specify
> > "-o no_inode_file_handles" or "-o inode_file_handles", this kind
> > of points to strictly implementing respective policy, IMHO. That's how
> > I have implemented some other options.
> >
> > Alternatively, we could think of adding one more option say
> > "inode_file_handles_only.
> >
> > - "-o no_inode_files_handles" implements A.
> > - "-o inode_files_handles" implements B.
> > - "-o inode_files_handles_only" implements C.
> > - If user did not specify anything on command line, then its up to the
> > daemon whatever default policy it wants and default can change
> > over time.
>
> I think it makes sense not to punish the user for wanting to use file
> handles as much as possible and still gracefully handling submounts that
> don’t support them. So I’d want to implement B first, and have that be -o
> inode_files_handles.
Agreed. B probably will be most common.
> I think A as -o no_inode_file_handles is also there,
> automatically...?
I do see you have added it.
{ "inode_file_handles", offsetof(struct lo_data, inode_file_handles), 1 },
{ "no_inode_file_handles",
offsetof(struct lo_data, inode_file_handles),
0 },
>
> I don’t think there’s much of a problem with implementing C, except we
> probably want to log ENOTSUPP errors then, so the user can figure out what’s
> going on. I feel like we can still wait with such an option until there’s a
> demand for it.
Agreed.
>
> (Except that perhaps the demand would come in the form of “please help I use
> -o inode_file_handles but virtiofsd’s FD count is still too high I don’t
> know what to do”, and that probably wouldn’t be so great. But then again,
> inodes_files_handles_only wouldn’t help a user in that case either, because
> it changes “works badly” to “doesn’t work at all”.
May be we should log it. Will be good if some of the Info logs go into
syslogs and users can look at it.
> Now I’m wondering what
> the actual use case of inodes_files_handles_only would be.)
I was thinking more of debugging use cases. (Assuming, "-o
inode_file_handles" can take liberties and use O_PATH fd for some inodes
and file handle for other inodes). For example, determining what's the
overhead of using file handles. In that case I will like to make sure
O_PATH fds are not being used.
But we don't need it now. We can add it when we need it later.
>
> > > not to really add
> > > complexity. (Which is true, as can be seen from the diff stat I posted
> > > below: Dropping the second hash table and making the handle part of lo_key,
> > > so that temporary errors are now fatal, generates a diff of +37/-66, where
> > > -20 are just two comments (which realistically I’d need to replace by
> > > different comments), so in terms of code, it’s +37/-46.)
> > >
> > > I’d just rather handle errors gracefully when I find it doesn’t really cost
> > > complexity.
> > >
> > >
> > > However, you made a good point in that we must require name_to_handle_at()
> > > to work if it worked before for some inode, not because it would be simpler,
> > > but because it would be wrong otherwise.
> > >
> > > As for the other way around... Well, now I don’t have a strong opinion on
> > > it. Handling temporary name_to_handle_at() failure after it worked the
> > > first time should not add extra complexity, but it wouldn’t be symmetric.
> > > Like, allowing temporary failure sometimes but not at other times.
> > Right. If we decided that we need to generate file handle for an inode
> > and underlying filesystem supports file handles, then handling temporary
> > failures only sometimes will make it assymetric. At this point of time
> > I am more inclined to return error to caller on temporary failures. But
> > if this does not work well in practice, I am open to change the policy.
> >
> > > The next question is, how do we detect temporary failure, because if we look
> > > up some new inode, name_to_handle_at() fails, we ignore it, and then it
> > > starts to work and we fail all further lookups, that’s not good. We should
> > > have the first lookup fail. I suppose ENOTSUPP means “OK to ignore”, and
> > > for everything else we should let lookup fail?
> > ENOTSUPP will be a permanent failure right? And not a temporary one.
>
> I sure hope so. The man page says “The filesystem does not support decoding
> of a pathname to a file handle.”, which sounds pretty permanent, unless
> perhaps the filesystem driver is updated in-place.
>
> > It seems reasonable that we gracefully fall back to O_PATH fd in case
> > of ENOTSUPP (Assuming -o inode_file_handles means try to use file
> > handles and fallback to O_PATH fds if host filesystem does not
> > support it).
> >
> > But for temporary failures we probably will have to return errors to callers.
> > Do you have a specific temporary failure in mind which you are concerned
> > about.
>
> Oh, no. It’s just, there can be errors other than EOPNOTSUPP (now that I
> looked into the man page I know that’s what it actually is), and we have to
> decide what to do then. Nothing more.
>
> > > (And that pretty much
> > > answers my "what if name_to_handle_at() works the first time, but then
> > > fails" question. If we let anything but ENOTSUPP let the lookup fail, then
> > > we should do so every time.)
> > Agreed. ENOTSUPP seems to be the only error when falling back to O_PATH
> > fd makes most sense to me. Rest of them probably should be propagated
> > to caller.
>
> Perhaps also EOVERFLOW, which indicates that our file handle storage is too
> small. This shouldn’t happen, given that we use MAX_HANDLE_SZ to size it,
> but I imagine it’s possible that MAX_HANDLE_SZ is increased in the future,
> and when you then compile virtiofsd on one system and run it on another,
> it’s possible that some filesystem driver wants to store larger handles.
> Given that we expect a file handle to always be the same for a given inode,
> such an EOVERFLOW error should be permanent, too (for a given inode).
man page says that EOVERFLOW can also indicate that no file handle is
available.
*******
Some care is needed here as EOVERFLOW can also indicate that no file
handle is available for this particular name in a filesystem which
does normally support file-handle lookup. This case can be detected
when the EOVERFLOW error is returned without handle_bytes being increased.
*****
>
> > And given ENOTSUPP is a permanent failure, you probably will be able
> > to add fhandle to lo_key and not require a separate mapping which
> > looks up inode by fhandle.
>
> Sure, but again, this doesn’t make anything simpler.
>
> What I particularly don’t like about this is that for doing so, we have a
> choice between (A) adding fhandle to lo_key inline (i.e. of type
> lo_fhandle), or (B) adding it as a pointer (i.e. of type lo_fhandle *).
>
> (A) seems the more obvious solution, but then lo_inode.fhandle would either
> not exist (one would have to go through lo_inode.key.fhandle, which I don’t
> really like), or be a pointer to &lo_inode.key.fhandle, which is also kind
> of weird. Also, if there is no file handle, it would need to be explicitly
> 0, which wastes memory when not using -o inode_file_handles (and cycles,
> because hashing and comparing will take longer).
>
> So (B) would do it the other way around, lo_inode.key.fhandle would be a
> copy of lo_inode.fhandle. But having a pointer be part of a key is, while
> perfectly feasible, again strange.
Thinking more about it. So an inode is uniquely identified either by
lo_key1 (dev, ino, mnt_id) or lo_key2 (fhandle, mnt_id).
Ideally we will add inodes in hash table using lo_key2 if file handles
are enabled and using lo_key1 if file handles are not enabled.
Given we support submounts and its possible that we have mix if inodes
where for some inodes we use lo_key1 while for other we use lo_key2.
But it will never happen that for same inode we keep flipping the
mode between lo_key1 and lo_key2. (We will return temporary errors
from file system).
>
> I find two hash maps to just be cleaner.
Given an inode is supposed to be looked up either by lo_key1 or
lo_key2 (depending on whether file system support file handles)
, I guess it makes less sense to merge two keys and come
up with single lo_key which contains everything. So I am fine
with adding another key type for fhandle and lookup in separate
hash map.
Thanks
Vivek
So I’m coming back to this after three weeks (well, PTO), and this again
turns into a bit of a pain, actually.
I don’t think it’s anything serious, but I had thought we had found
something that would make us both happy because it wouldn’t be too ugly,
and now it’s turning ugly again... So I’m sending this mail as a heads
up before I send v3 in the next days, to explain my thought process.
On 21.06.21 11:02, Max Reitz wrote:
> On 18.06.21 20:29, Vivek Goyal wrote:
>
[...]
>> I am still reading your code and trying to understand it. But one
>> question came to mind. What happens if we can generate file handle
>> during lookup. But can't generate when same file is looked up again.
>>
>> - A file foo.txt is looked. We can create file handle and we add it
>> to lo->inodes_by_handle as well as lo->inodes_by_ds.
>>
>> - Say somebody deleted file and created again and inode number got
>> reused.
>>
>> - Now during ->revalidation path, lookup happens again. This time say
>> we can't generate file handle. If am reading lo_do_find() code
>> correctly, it will find the old inode using ids and return same
>> inode as result of lookup. And we did not recognize that inode
>> number has been reused.
>
> Oh, that’s a good point. If an lo_inode has no O_PATH fd but is only
> addressed by handle, we must always look it up by handle.
Also, just wanted to throw in this remark:
Now that I read the code again, lo_do_find() already has a condition to
prevent this. It’s this:
if (p && fhandle != NULL && p->fhandle != NULL) {
p = NULL;
}
There’s just one thing wrong with it, and that is the `fhandle != NULL`
part. It has no place there. But this piece of code does exactly what
we’d need it do if it were just:
if (p && p->fhandle != NULL) {
p = NULL;
}
[...]
> However, you made a good point in that we must require
> name_to_handle_at() to work if it worked before for some inode, not
> because it would be simpler, but because it would be wrong otherwise.
>
> As for the other way around... Well, now I don’t have a strong
> opinion on it. Handling temporary name_to_handle_at() failure after
> it worked the first time should not add extra complexity, but it
> wouldn’t be symmetric. Like, allowing temporary failure sometimes but
> not at other times.
(I think I mistyped here, it should be “Handling name_to_handle_at()
randomly working after it failed the first time”.)
> The next question is, how do we detect temporary failure, because if
> we look up some new inode, name_to_handle_at() fails, we ignore it,
> and then it starts to work and we fail all further lookups, that’s not
> good. We should have the first lookup fail. I suppose ENOTSUPP means
> “OK to ignore”, and for everything else we should let lookup fail?
> (And that pretty much answers my "what if name_to_handle_at() works
> the first time, but then fails" question. If we let anything but
> ENOTSUPP let the lookup fail, then we should do so every time.)
I don’t think this will work as cleanly as I’d hoped.
The problem I’m facing is that get_file_handle() doesn’t only call
name_to_handle_at(), but also contains a lot of code managing
mount_fds. There are a lot of places that can fail, too, and I think we
should have them fall back to using an O_PATH FD:
Say mount_fds doesn’t contain an FD for the new handle’s mount ID yet,
so we want to add one. However, it turns out that the file is not a
regular file or directory, so we cannot open it as a regular FD and add
it to mount_fds; or that it is a regular file, but without permission to
open it O_RDONLY. So we cannot return a file handle, because it will
not be usable until a mount FD is added.
I think in such a case we should fall back to an O_PATH FD, because this
is not some unexpected error, but just an unfortunate (but reproducible
and valid) circumstance where using `-o inode_file_handles` fails to do
something that works without it.
Now, however, this means that the next time we try to generate a handle
for this file (to look it up), it will absolutely work if some other FD
was added to mount_fds for this mount ID in the meantime.
We could get around this by not trying to open the file for which we are
to generate a handle to add its FD to mount_fds, but instead doing what
the open_by_handle_at() man page suggests:
> The mount_id argument returns an identifier for the filesystem mount
> that corresponds to pathname. This corresponds to the first field in
> one of the records in /proc/self/mountinfo. Opening the pathname in
> the fifth field of that record yields a file descriptor for the mount
> point; that file descriptor can be used in a subsequent call to
> open_by_handle_at().
However, I’d rather avoid parsing mountinfo. And as far as I
understand, the only problem here is that we’ll have to cope with the
fact that sometimes on lookups, we can generate a file handle, but the
lo_inode we want to find has no file handle attached to it (because
get_file_handle() failed the first time), and so we won’t find it by
that handle but have to look it up by its inode ID. (Which is safe,
because that lo_inode must have an O_PATH FD attached to it, so the
inode ID cannot be reused.) And that’s something that this series
already does, so I tend to favor that over parsing mountinfo.
Max
On 09.06.21 17:55, Max Reitz wrote:
> Currently, lo_inode.fhandle is always NULL and so always keep an O_PATH
> FD in lo_inode.fd. Therefore, when the respective inode is unlinked,
> its inode ID will remain in use until we drop our lo_inode (and
> lo_inode_put() thus closes the FD). Therefore, lo_find() can safely use
> the inode ID as an lo_inode key, because any inode with an inode ID we
> find in lo_data.inodes (on the same filesystem) must be the exact same
> file.
>
> This will change when we start setting lo_inode.fhandle so we do not
> have to keep an O_PATH FD open. Then, unlinking such an inode will
> immediately remove it, so its ID can then be reused by newly created
> files, even while the lo_inode object is still there[1].
>
> So creating a new file can then reuse the old file's inode ID, and
> looking up the new file would lead to us finding the old file's
> lo_inode, which is not ideal.
>
> Luckily, just as file handles cause this problem, they also solve it: A
> file handle contains a generation ID, which changes when an inode ID is
> reused, so the new file can be distinguished from the old one. So all
> we need to do is to add a second map besides lo_data.inodes that maps
> file handles to lo_inodes, namely lo_data.inodes_by_handle. For
> clarity, lo_data.inodes is renamed to lo_data.inodes_by_ids.
>
> Unfortunately, we cannot rely on being able to generate file handles
> every time. Therefore, we still enter every lo_inode object into
> inodes_by_ids, but having an entry in inodes_by_handle is optional. A
> potential inodes_by_handle entry then has precedence, the inodes_by_ids
> entry is just a fallback.
>
> Note that we do not generate lo_fhandle objects yet, and so we also do
> not enter anything into the inodes_by_handle map yet. Also, all lookups
> skip that map. We might manually create file handles with some code
> that is immediately removed by the next patch again, but that would
> break the assumption in lo_find() that every lo_inode with a non-NULL
> .fhandle must have an entry in inodes_by_handle and vice versa. So we
> leave actually using the inodes_by_handle map for the next patch.
>
> [1] If some application in the guest still has the file open, there is
> going to be a corresponding FD mapping in lo_data.fd_map. In such a
> case, the inode will only go away once every application in the guest
> has closed it. The problem described only applies to cases where the
> guest does not have the file open, and it is just in the dentry cache,
> basically.
>
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> Reviewed-by: Connor Kuehl <ckuehl@redhat.com>
> ---
> tools/virtiofsd/passthrough_ll.c | 80 +++++++++++++++++++++++++-------
> 1 file changed, 64 insertions(+), 16 deletions(-)
[...]
> @@ -3931,10 +3954,34 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
> return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
> }
>
> +static guint lo_fhandle_hash(gconstpointer key)
> +{
> + const struct lo_fhandle *fh = key;
> + guint hash;
> + size_t i;
> +
> + /* Basically g_str_hash() */
> + hash = 5381;
> + for (i = 0; i < sizeof(fh->padding); i++) {
> + hash += hash * 33 + (unsigned char)fh->padding[i];
> + }
> + hash += hash * 33 + fh->mount_id;
Just spotted: These `+=` should be `=`.
Max
> +
> + return hash;
> +}
© 2016 - 2026 Red Hat, Inc.