fs/fuse/dev.c | 110 +++++++++++++++++++++++++++++++++++++- fs/fuse/dev_uring.c | 3 ++ fs/fuse/dev_uring_i.h | 13 +++++ fs/fuse/fuse_dev_i.h | 5 ++ fs/fuse/fuse_i.h | 1 + fs/fuse/req.c | 10 ++++ include/uapi/linux/fuse.h | 23 +++++++- 7 files changed, 162 insertions(+), 3 deletions(-)
Deliver FUSE_FORGET through fuse_uring_queue_fuse_req() when the io_uring
is ready and userspace has opted in by setting
FUSE_IO_URING_REGISTER_FORGET_COMMIT in fuse_uring_cmd_req.flags on
FUSE_IO_URING_CMD_REGISTER. Until any REGISTER
carries that bit, FORGET continues to use the legacy
fuse_dev_queue_forget() path even while io_uring is active, so unmodified
userspace (e.g. libfuse that does not issue a completion SQE for FORGET)
does not wedge ring entries.
Benefits:
- FORGET can share the same commit/fetch loop as other opcodes.
- Reduces split transport for high-volume forgets when the ring is primary.
- Reuses existing per-queue io-uring machinery and noreply/force
request setup.
Signed-off-by: Li Wang <liwang@kylinos.cn>
---
Tested with passthrough_ll, based on the latest fuse
git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#for-next,
and the latest libfuse patched with
https://github.com/libfuse/libfuse/pull/1487.
Changes since v2:
- Introduce a flag that allows libfuse to inform the kernel
during registration that it supports receiving and processing
FORGET requests via io_uring.
- Keep FORGET requests in the processing queue until the kernel
receives the completion SQEs for them.
Changes since v1:
- Single forget enqueue entry: fuse_io_uring_ops.send_forget stays
fuse_dev_queue_forget(); when fuse_uring_ready() is true it calls
fuse_io_uring_send_forget(), otherwise it uses the legacy list. v1 wired
send_forget to fuse_io_uring_send_forget() directly.
- Move fuse_io_uring_send_forget() and fuse_forget_uring_data from dev.c
to dev_uring.c; declare fuse_request_alloc, fuse_adjust_compat,
fuse_force_creds, fuse_args_to_req, fuse_drop_waiting in fuse_dev_i.h.
- Split list-only enqueue into fuse_dev_queue_forget_list(); use it on
fallback paths inside fuse_io_uring_send_forget() to avoid recursion.
fs/fuse/dev.c | 110 +++++++++++++++++++++++++++++++++++++-
fs/fuse/dev_uring.c | 3 ++
fs/fuse/dev_uring_i.h | 13 +++++
fs/fuse/fuse_dev_i.h | 5 ++
fs/fuse/fuse_i.h | 1 +
fs/fuse/req.c | 10 ++++
include/uapi/linux/fuse.h | 23 +++++++-
7 files changed, 162 insertions(+), 3 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6fe0d8c263df..0006951e3954 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -26,6 +26,7 @@
#include <linux/seq_file.h>
#include "fuse_trace.h"
+#include "fuse_i.h"
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -224,8 +225,9 @@ struct fuse_forget_link *fuse_alloc_forget(void)
return kzalloc_obj(struct fuse_forget_link, GFP_KERNEL_ACCOUNT);
}
-void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
- struct fuse_forget_link *forget)
+
+static inline void fuse_dev_queue_forget_list(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget)
{
spin_lock(&fiq->lock);
if (fiq->connected) {
@@ -238,6 +240,21 @@ void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
}
}
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget)
+{
+#ifdef CONFIG_FUSE_IO_URING
+ struct fuse_chan *fch = container_of(fiq, struct fuse_chan, iq);
+
+ if (fuse_uring_ready(fch) && fuse_uring_forget_via_ring(fch)) {
+ fuse_io_uring_send_forget(fiq, forget);
+ return;
+ }
+#endif
+ fuse_dev_queue_forget_list(fiq, forget);
+}
+
+
void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->lock);
@@ -800,6 +817,95 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
__set_bit(FR_ASYNC, &req->flags);
}
+#ifdef CONFIG_FUSE_IO_URING
+/* Self-contained args for one FORGET; freed by fuse_forget_uring_free(). */
+struct fuse_forget_uring_data {
+	struct fuse_args args;
+	struct fuse_forget_in inarg;
+};
+
+/* fuse_args.end callback: free the fuse_forget_uring_data embedding @args. */
+static void fuse_forget_uring_free(struct fuse_args *args, int error)
+{
+	kfree(container_of(args, struct fuse_forget_uring_data, args));
+}
+
+/*
+ * Send one FORGET as a forced, noreply request via fuse_uring_queue_fuse_req().
+ * If the ring is not ready, no mount is listed, or setup fails, @forget is
+ * handed to fuse_dev_queue_forget_list() instead.
+ */
+void fuse_io_uring_send_forget(struct fuse_iqueue *fiq,
+			       struct fuse_forget_link *forget)
+{
+	struct fuse_chan *fch = container_of(fiq, struct fuse_chan, iq);
+	struct fuse_conn *fc = fch->conn;
+	struct fuse_mount *fm;
+	struct fuse_req *req;
+	struct fuse_forget_uring_data *d;
+	int err;
+
+	if (!fuse_uring_ready(fch))
+		goto fallback;
+
+	down_read(&fc->killsb);
+	if (list_empty(&fc->mounts)) {
+		up_read(&fc->killsb);
+		goto fallback;
+	}
+	fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
+	up_read(&fc->killsb);
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		goto fallback;
+
+	/* Undone via fuse_drop_waiting() on the error path below. */
+	atomic_inc(&fch->num_waiting);
+	req = fuse_request_alloc(fm->fc->chan, GFP_KERNEL);
+	if (!req)
+		goto free_data;
+
+	memset(&d->args, 0, sizeof(d->args));
+	d->inarg.nlookup = forget->forget_one.nlookup;
+	d->args.opcode = FUSE_FORGET;
+	d->args.nodeid = forget->forget_one.nodeid;
+	d->args.in_numargs = 1;
+	d->args.in_args[0].size = sizeof(d->inarg);
+	d->args.in_args[0].value = &d->inarg;
+	d->args.force = true;
+	d->args.noreply = true;
+	d->args.end = fuse_forget_uring_free;
+
+	err = fuse_prepare_force_args(fm, &d->args);
+	if (err) {
+		fuse_put_request(req);
+		goto free_data;
+	}
+
+	__set_bit(FR_WAITING, &req->flags);
+	if (!d->args.abort_on_kill)
+		__set_bit(FR_FORCE, &req->flags);
+	fuse_adjust_compat(fch, &d->args);
+	fuse_args_to_req(req, &d->args);
+	req->in.h.len = sizeof(struct fuse_in_header) +
+		fuse_len_args(req->args->in_numargs,
+			      (struct fuse_arg *)req->args->in_args);
+
+	/* nodeid and nlookup were copied into @d above; the link can go. */
+	kfree(forget);
+	fuse_uring_queue_fuse_req(fiq, req);
+	return;
+
+free_data:
+	kfree(d);
+	fuse_drop_waiting(fch);
+fallback:
+	fuse_dev_queue_forget_list(fiq, forget);
+}
+#endif
+
+
ssize_t fuse_chan_send(struct fuse_chan *fch, struct fuse_args *args)
{
struct fuse_req *req;
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index e467b23e6895..6f55f0ad59f2 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -1114,6 +1114,9 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
if (IS_ERR(ent))
return PTR_ERR(ent);
+ if (READ_ONCE(cmd_req->flags) & FUSE_IO_URING_REGISTER_FORGET_COMMIT)
+ ring->forget_ring_commit = true;
+
fuse_uring_do_register(ent, cmd, issue_flags);
return 0;
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 368f4d0790eb..258486422586 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -133,6 +133,12 @@ struct fuse_ring {
atomic_t queue_refs;
bool ready;
+
+ /*
+ * Set when any REGISTER SQE sets FUSE_IO_URING_REGISTER_FORGET_COMMIT.
+ * Until then, FORGET stays on the legacy forget list.
+ */
+ bool forget_ring_commit;
};
void fuse_uring_stop_queues(struct fuse_ring *ring);
@@ -170,6 +176,13 @@ static inline bool fuse_uring_ready(struct fuse_chan *fch)
return fch->ring && fch->ring->ready;
}
+static inline bool fuse_uring_forget_via_ring(struct fuse_chan *fch)
+{
+ struct fuse_ring *ring = READ_ONCE(fch->ring);
+
+ return ring && ring->forget_ring_commit;
+}
+
#else /* CONFIG_FUSE_IO_URING */
static inline void fuse_uring_abort(struct fuse_chan *fch)
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 9ce987826ded..f410e124be6b 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -383,8 +383,13 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
int zeroing);
int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
unsigned int nbytes);
+struct fuse_mount;
void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
struct fuse_forget_link *forget);
+#ifdef CONFIG_FUSE_IO_URING
+void fuse_io_uring_send_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget);
+#endif
void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3a7ac74a23ed..0f41e70c06b6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1017,6 +1017,7 @@ static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap,
int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
+int fuse_prepare_force_args(struct fuse_mount *fm, struct fuse_args *args);
int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64 unique);
void fuse_dentry_tree_init(void);
diff --git a/fs/fuse/req.c b/fs/fuse/req.c
index a01ee743d31e..bfb26a71cc5c 100644
--- a/fs/fuse/req.c
+++ b/fs/fuse/req.c
@@ -97,3 +97,12 @@ int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64
return fuse_chan_send_notify_reply(fc->chan, args, unique);
}
+
+int fuse_prepare_force_args(struct fuse_mount *fm, struct fuse_args *args)
+{
+ WARN_ON(!args->force);
+ WARN_ON(args->nocreds);
+
+ return fuse_req_prep(fm, args, &invalid_mnt_idmap);
+}
+EXPORT_SYMBOL_GPL(fuse_prepare_force_args);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index c13e1f9a2f12..737eb06f00fa 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -240,6 +240,12 @@
* - add FUSE_COPY_FILE_RANGE_64
* - add struct fuse_copy_file_range_out
* - add FUSE_NOTIFY_PRUNE
+ *
+ * 7.46
+ * - add FUSE_IO_URING_REGISTER_FORGET_COMMIT (fuse_uring_cmd_req.flags on
+ * FUSE_IO_URING_CMD_REGISTER): optional delivery of FUSE_FORGET via
+ * io-uring with FUSE_IO_URING_CMD_COMMIT_AND_FETCH completion; default
+ * keeps FORGET on the classic /dev/fuse queue.
*/
#ifndef _LINUX_FUSE_H
@@ -275,7 +281,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 45
+#define FUSE_KERNEL_MINOR_VERSION 46
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -1298,6 +1304,10 @@ enum fuse_uring_cmd {
* In the 80B command area of the SQE.
*/
struct fuse_uring_cmd_req {
+ /*
+ * Bit FUSE_IO_URING_REGISTER_FORGET_COMMIT is interpreted for
+ * FUSE_IO_URING_CMD_REGISTER; other commands ignore it.
+ */
uint64_t flags;
/* entry identifier for commits */
@@ -1308,4 +1318,15 @@ struct fuse_uring_cmd_req {
uint8_t padding[6];
};
+/*
+ * fuse_uring_cmd_req.flags (FUSE_IO_URING_CMD_REGISTER)
+ *
+ * When FUSE_IO_URING_REGISTER_FORGET_COMMIT is set, the kernel may deliver
+ * FUSE_FORGET through the io-uring ring; userspace must complete each
+ * request with FUSE_IO_URING_CMD_COMMIT_AND_FETCH. When unset (default),
+ * FORGET uses the legacy forget list even if io-uring is active, so
+ * unmodified userspace (e.g. libfuse without FORGET completion) stays safe.
+ */
+#define FUSE_IO_URING_REGISTER_FORGET_COMMIT (1ULL << 0)
+
#endif /* _LINUX_FUSE_H */
--
2.34.1
Hi Li,
kernel test robot noticed the following build warnings:
[auto build test WARNING on mszeredi-fuse/for-next]
[also build test WARNING on next-20260429]
[cannot apply to linus/master v7.1-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Li-Wang/fuse-optional-FORGET-delivery-over-io_uring/20260424-173453
base: https://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git for-next
patch link: https://lore.kernel.org/r/20260423110954.2676-1-liwang%40kylinos.cn
patch subject: [PATCH v3] fuse: optional FORGET delivery over io_uring
config: x86_64-randconfig-076-20260427 (https://download.01.org/0day-ci/archive/20260430/202604301155.Gw5TQF7K-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260430/202604301155.Gw5TQF7K-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202604301155.Gw5TQF7K-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> fs/fuse/dev.c:825:13: warning: 'fuse_forget_uring_free' defined but not used [-Wunused-function]
825 | static void fuse_forget_uring_free(struct fuse_args *args, int error)
| ^~~~~~~~~~~~~~~~~~~~~~
vim +/fuse_forget_uring_free +825 fs/fuse/dev.c
824
> 825 static void fuse_forget_uring_free(struct fuse_args *args, int error)
826 {
827 struct fuse_forget_uring_data *d =
828 container_of(args, struct fuse_forget_uring_data, args);
829
830 kfree(d);
831 }
832
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Thu, Apr 23, 2026 at 4:11 AM Li Wang <liwang@kylinos.cn> wrote: > > Deliver FUSE_FORGET through fuse_uring_queue_fuse_req() when the io_uring > is ready and userspace has opted in by setting > FUSE_IO_URING_REGISTER_FORGET_COMMIT in fuse_uring_cmd_req.flags on > FUSE_IO_URING_CMD_REGISTER. Until any REGISTER > carries that bit, FORGET continues to use the legacy > fuse_dev_queue_forget() path even while io_uring is active, so unmodified > userspace (e.g. libfuse that does not issue a completion SQE for FORGET) > does not wedge ring entries. > > Benefits: > - FORGET can share the same commit/fetch loop as other opcodes. > - Reduces split transport for high-volume forgets when the ring is primary. > - Reuses existing per-queue io-uring machinery and noreply/force > request setup. > > Signed-off-by: Li Wang <liwang@kylinos.cn> Hi Li, Thanks for sending this. To be completely honest, I'm not convinced delivering forget over io-uring is worth the added complexity/cost. In the /dev/fuse path we rely on forget batching/amortizing and explicit scheduling/fairness logic so forget processing makes progress and doesn't get drowned out by regular requests; I think we'd likely need something comparable for the io-uring path as well. Additionally, routing it through io-uring makes forget behave more like a "real" request on the ring (it needs per-request state to live until userspace completes and the entry can be recycled) which introduces extra allocation/lifetime management on this path and it requires a uapi change and corresponding libfuse changes. Forgets would consume ring entries but they're tiny one-way notifications and imo I don't think they benefit much from io-uring's main advantages (eg data-path/zero-copy). I worry they could contend with read-write heavy traffic where ring capacity is more valuable. Given that, my preference would be to keep forget/interrupts on the legacy /dev/fuse path even when io-uring is enabled. 
Sorry for not bringing this up earlier in v2. Bernd, I’d also be curious to hear your perspective on this. Thanks, Joanne
On 4/24/26 22:38, Joanne Koong wrote: > On Thu, Apr 23, 2026 at 4:11 AM Li Wang <liwang@kylinos.cn> wrote: >> >> Deliver FUSE_FORGET through fuse_uring_queue_fuse_req() when the io_uring >> is ready and userspace has opted in by setting >> FUSE_IO_URING_REGISTER_FORGET_COMMIT in fuse_uring_cmd_req.flags on >> FUSE_IO_URING_CMD_REGISTER. Until any REGISTER >> carries that bit, FORGET continues to use the legacy >> fuse_dev_queue_forget() path even while io_uring is active, so unmodified >> userspace (e.g. libfuse that does not issue a completion SQE for FORGET) >> does not wedge ring entries. >> >> Benefits: >> - FORGET can share the same commit/fetch loop as other opcodes. >> - Reduces split transport for high-volume forgets when the ring is primary. >> - Reuses existing per-queue io-uring machinery and noreply/force >> request setup. >> >> Signed-off-by: Li Wang <liwang@kylinos.cn> > > Hi Li, > > Thanks for sending this. To be completely honest, I'm not convinced > delivering forget over io-uring is worth the added complexity/cost. In > the /dev/fuse path we rely on forget batching/amoritizing and explicit > scheduling/fairness logic so forget processing makes progress and > doesn't get drowned out by regular requests; I think we'd likely need > something comparable for the io-uring path as well. Additionally, > routing it through io-uring makes forget behave more like a "real" > request on the ring (it needs per-request state to live until > userspace completes and the entry can be recyycled) which introduces > extra allocation/lifetime management on this path and it requires a > uapi change and corresponding libfuse changes. > > Forgets would consume ring entries but they're tiny one-way > notifications and imo I don't think they benefit much from io-uring's > main advantages (eg data-path/zero-copy). I worry theyy could contend > with read-write heavy traffic where ring capacity is more valuable. 
I think when FORGET starts to disturb writes or reads, there also must be some metadata load that causes many of these requests. Going via /dev/fuse also includes another two syscalls and a cpu task switch to the libfuse thread handling them - that is not for free either. However, this currently might disturb reads/writes, if the queue depth is limited and memory optimized to carry reads/writes. The missing feature here is to have multiple request sizes on the same queue. Btw, syscall overhead is basically the reason why I wouldn't like to have multiple rings per request size, but one ring with entries of different sizes. Will try to respond to the other mail later today. > > Given that, my preference would be to keep forget/interrupts on the > legacy /dev/fuse path even when io-uring is enabled. Sorry for not > bringing this up earlier in v2. Bernd, I’d also be curious to hear > your perspective on this. It is certainly not something that I would count as a priority. Though I don't think it would hurt either. In the end we might be able to run entirely without /dev/fuse IO and threads and would save a bit of resources. @Li do you see performance improvements with FORGETs through the ring or could you post your underlying motivation? Thanks, Bernd
On Sun, Apr 26, 2026 at 4:48 PM Bernd Schubert <bernd@bsbernd.com> wrote: > > On 4/24/26 22:38, Joanne Koong wrote: > > On Thu, Apr 23, 2026 at 4:11 AM Li Wang <liwang@kylinos.cn> wrote: > >> > >> Deliver FUSE_FORGET through fuse_uring_queue_fuse_req() when the io_uring > >> is ready and userspace has opted in by setting > >> FUSE_IO_URING_REGISTER_FORGET_COMMIT in fuse_uring_cmd_req.flags on > >> FUSE_IO_URING_CMD_REGISTER. Until any REGISTER > >> carries that bit, FORGET continues to use the legacy > >> fuse_dev_queue_forget() path even while io_uring is active, so unmodified > >> userspace (e.g. libfuse that does not issue a completion SQE for FORGET) > >> does not wedge ring entries. > >> > >> Benefits: > >> - FORGET can share the same commit/fetch loop as other opcodes. > >> - Reduces split transport for high-volume forgets when the ring is primary. > >> - Reuses existing per-queue io-uring machinery and noreply/force > >> request setup. > >> > >> Signed-off-by: Li Wang <liwang@kylinos.cn> > > > > Hi Li, > > > > Thanks for sending this. To be completely honest, I'm not convinced > > delivering forget over io-uring is worth the added complexity/cost. In > > the /dev/fuse path we rely on forget batching/amoritizing and explicit > > scheduling/fairness logic so forget processing makes progress and > > doesn't get drowned out by regular requests; I think we'd likely need > > something comparable for the io-uring path as well. Additionally, > > routing it through io-uring makes forget behave more like a "real" > > request on the ring (it needs per-request state to live until > > userspace completes and the entry can be recyycled) which introduces > > extra allocation/lifetime management on this path and it requires a > > uapi change and corresponding libfuse changes. > > > > Forgets would consume ring entries but they're tiny one-way > > notifications and imo I don't think they benefit much from io-uring's > > main advantages (eg data-path/zero-copy). 
I worry theyy could contend > > with read-write heavy traffic where ring capacity is more valuable. > > I think when FORGET starts to disturb writes or reads, there also must > be some metadata load that causes many of these requests. Going via > /dev/fuse also includes another two syscalls and cpu task switch to the > libfuse thread handling them - that is not for free either. I think the /dev/fuse cost is already amortized by batching forgets and the io-uring path would have its own per-forget overhead (eg needing to reply back to the forget request which would be a syscall, needing fuse_req and fuse_forget_uring_data allocations). I agree benchmarks would be useful to make a conclusion about performance. > > However, this currently might disturb reads/writes, if the queue depth > is limited and memory optimized to carry reads/writes. The missing > feature here is to have a multiple request sizes on the same queue. > > Btw, syscall overhead is basically the reason why I wouldn't like to > have multiple rings per request size, but one ring with entries of > different sizes. Will try to respond to the other mail later today. I'll keep an eye out for your response to that email. I think we disagree on this but maybe we can discuss this at the fuse BoF next week . Thanks, Joanne >
Hi Bernd and Joanne, On 26/04/2026 23:48, Bernd Schubert wrote: > > > On 4/24/26 22:38, Joanne Koong wrote: >> On Thu, Apr 23, 2026 at 4:11 AM Li Wang <liwang@kylinos.cn> wrote: >>> >>> Deliver FUSE_FORGET through fuse_uring_queue_fuse_req() when the io_uring >>> is ready and userspace has opted in by setting >>> FUSE_IO_URING_REGISTER_FORGET_COMMIT in fuse_uring_cmd_req.flags on >>> FUSE_IO_URING_CMD_REGISTER. Until any REGISTER >>> carries that bit, FORGET continues to use the legacy >>> fuse_dev_queue_forget() path even while io_uring is active, so unmodified >>> userspace (e.g. libfuse that does not issue a completion SQE for FORGET) >>> does not wedge ring entries. >>> >>> Benefits: >>> - FORGET can share the same commit/fetch loop as other opcodes. >>> - Reduces split transport for high-volume forgets when the ring is primary. >>> - Reuses existing per-queue io-uring machinery and noreply/force >>> request setup. >>> >>> Signed-off-by: Li Wang <liwang@kylinos.cn> >> >> Hi Li, >> >> Thanks for sending this. To be completely honest, I'm not convinced >> delivering forget over io-uring is worth the added complexity/cost. In >> the /dev/fuse path we rely on forget batching/amoritizing and explicit >> scheduling/fairness logic so forget processing makes progress and >> doesn't get drowned out by regular requests; I think we'd likely need >> something comparable for the io-uring path as well. Additionally, >> routing it through io-uring makes forget behave more like a "real" >> request on the ring (it needs per-request state to live until >> userspace completes and the entry can be recyycled) which introduces >> extra allocation/lifetime management on this path and it requires a >> uapi change and corresponding libfuse changes. >> >> Forgets would consume ring entries but they're tiny one-way >> notifications and imo I don't think they benefit much from io-uring's >> main advantages (eg data-path/zero-copy). 
I worry theyy could contend >> with read-write heavy traffic where ring capacity is more valuable. > > I think when FORGET starts to disturb writes or reads, there also must > be some metadata load that causes many of these requests. Going via > /dev/fuse also includes another two syscalls and cpu task switch to the > libfuse thread handling them - that is not for free either. > > However, this currently might disturb reads/writes, if the queue depth > is limited and memory optimized to carry reads/writes. The missing > feature here is to have a multiple request sizes on the same queue. > > Btw, syscall overhead is basically the reason why I wouldn't like to > have multiple rings per request size, but one ring with entries of > different sizes. Will try to respond to the other mail later today. > >> >> Given that, my preference would be to keep forget/interrupts on the >> legacy /dev/fuse path even when io-uring is enabled. Sorry for not >> bringing this up earlier in v2. Bernd, I’d also be curious to hear >> your perspective on this. > > It is certainly not something that I would count as priority. Though I > don't think it would hurt either. In the end we might be able to run > entirely without /dev/fuse IO and threads and would save a bit resources. > > @Li do you see performance improvements with FORGETs through the ring or > could you post your underlying motivation? > To be honest, my initial motivation for this wasn't necessarily performance, but rather architectural simplicity. Maintaining two separate paths and thread pools for forget/interrupt and I/O requests adds a bit of complexity to the code and introduces relatively higher overhead, as well as some resource redundancy. Furthermore, in the current implementation, forget and I/O requests are dispatched through different paths and handled by different sets of threads. 
This means we cannot guarantee their original priority order upon reception, and it potentially introduces extra context-switching overhead, which impacts the processing efficiency of both. Theoretically, optimization measures such as batching forget requests can also be implemented via the io_uring path. This would allow us to amortize the state maintenance and completion overhead of forget requests, while also addressing the issue of uniform request sizes within the same ring. I'll get around to running some performance benchmarks. Thanks, Li
On Mon, Apr 27, 2026 at 10:56 AM Li Wang <liwang@kylinos.cn> wrote: > > Hi Bernd and Joanne, > > On 26/04/2026 23:48, Bernd Schubert wrote: > > > Furthermore, in the current implementation, forget and I/O requests are > dispatched through different paths and handled by different sets of threads. > This means we cannot guarantee their original priority order upon reception, I don't think ordering matters. forgets are inherently asynchronous and there's no correctness dependency on a forget arriving before or after a read/write request. In the /dev/fuse path ordering isn't preserved either (eg the order they're queued by the kernel is not the order they're delivered to userspace). Thanks, Joanne
Hi Joanne, On 27/04/2026 19:31, Joanne Koong wrote: > On Mon, Apr 27, 2026 at 10:56 AM Li Wang <liwang@kylinos.cn> wrote: >> >> Hi Bernd and Joanne, >> >> On 26/04/2026 23:48, Bernd Schubert wrote: >>> >> Furthermore, in the current implementation, forget and I/O requests are >> dispatched through different paths and handled by different sets of threads. >> This means we cannot guarantee their original priority order upon reception, > > I don't think ordering matters. forgets are inherently asynchronous > and there's no correctness dependency on a forget arriving before or > after a read/write request. In the /dev/fuse path ordering isn't > preserved either (eg the order they're queued by the kernel is not the > order they're delivered to userspace). > Yeah, I agree, and I apologize for the confusion. By 'original', I meant the logic used before the io_uring path was introduced — where the system prioritized reading FORGET requests into the userspace buffer while still maintaining fairness. I wasn't referring to the order in which the system receives those requests. That said, I’m curious: since FORGET is asynchronous and doesn't require a reply, it doesn't seem particularly urgent. Why prioritize it? To prevent a large memory footprint for metadata-heavy workloads? Could we be more aggressive with delays and only send them in batches once a certain threshold is met? Also, if a FORGET request is lost for whatever reason, what are the consequences? Is it just a matter of causing a memory leak? Thanks, Li
On Tue, Apr 28, 2026 at 10:06 AM Li Wang <liwang@kylinos.cn> wrote: > > Hi Joanne, > > On 27/04/2026 19:31, Joanne Koong wrote: > > On Mon, Apr 27, 2026 at 10:56 AM Li Wang <liwang@kylinos.cn> wrote: > >> > >> Hi Bernd and Joanne, > >> > >> On 26/04/2026 23:48, Bernd Schubert wrote: > >>> > >> Furthermore, in the current implementation, forget and I/O requests are > >> dispatched through different paths and handled by different sets of threads. > >> This means we cannot guarantee their original priority order upon reception, > > > > I don't think ordering matters. forgets are inherently asynchronous > > and there's no correctness dependency on a forget arriving before or > > after a read/write request. In the /dev/fuse path ordering isn't > > preserved either (eg the order they're queued by the kernel is not the > > order they're delivered to userspace). > > > > Yeah, I agree, and I apologize for the confusion. By 'original', > I meant the logic used before the io_uring path was introduced — where the > system prioritized reading FORGET requests into the userspace buffer > while still maintaining fairness. I wasn't referring to the order in which > the system receives those requests. Gotcha, thanks for clarifying. > > That said, I’m curious: since FORGET is asynchronous and doesn't require > a reply, it doesn't seem particularly urgent. Why prioritize it? To prevent > a large memory footprint for metadata-heavy workloads? Could we be more As I understand it, yes, forgets are the only way the server has of knowing it is safe to release resources / clean up state. Luis is working on lookup handles [1] which will help mitigate how much state the server needs to keep track of, but without that the server needs to keep locally stashed info for that inode alive so if future requests for that nodeid come in, it can serve them. 
[1] https://lore.kernel.org/linux-fsdevel/20260225112439.27276-1-luis@igalia.com/ > aggressive with delays and only send them in batches once a certain > threshold is met? Also, if a FORGET request is lost for whatever reason, I think we could but the current approach seems like a reasonable balance already. I'm not sure I see the benefit, as it seems like the right threshold would be different for different workloads. IMO this would add complexity for unclear benefit, and I think it'd be ideal to keep things as simple as possible. > what are the consequences? Is it just a matter of causing a memory leak? If a forget request is lost, the server has to keep maintaining local state for that inode. On server teardown everything gets cleaned up normally but while the server is alive it's effectively a memory leak. Thanks, Joanne > > Thanks, > Li >
© 2016 - 2026 Red Hat, Inc.