Introduce fs_revocable_replace() to simplify the use of the revocable
API with file_operations.
The function, should be called from a driver's ->open(), replaces the
fops with a wrapper that automatically handles the `try_access` and
`withdraw_access`.
When the file is closed, the wrapper's ->release() restores the original
fops and cleanups. This centralizes the revocable logic, making drivers
cleaner and easier to maintain.
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
---
PoC patch.
Known issues:
- All file operations call revocable_try_access() for guaranteeing the
resource even if the resource may be unused in the fops.
v5:
- Rename to "fs_revocable".
- Move the replacement context to struct file.
- Support multiple revocable providers.
v4: https://lore.kernel.org/chrome-platform/20250923075302.591026-6-tzungbi@kernel.org
- New in the series.
fs/Makefile | 2 +-
fs/fs_revocable.c | 154 +++++++++++++++++++++++++++++++++++
include/linux/fs.h | 2 +
include/linux/fs_revocable.h | 21 +++++
4 files changed, 178 insertions(+), 1 deletion(-)
create mode 100644 fs/fs_revocable.c
create mode 100644 include/linux/fs_revocable.h
diff --git a/fs/Makefile b/fs/Makefile
index e3523ab2e587..ec2b21385deb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
- file_attr.o
+ file_attr.o fs_revocable.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/fs_revocable.c b/fs/fs_revocable.c
new file mode 100644
index 000000000000..4b6d755abfed
--- /dev/null
+++ b/fs/fs_revocable.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2025 Google LLC
+ *
+ * File operation replacement with Revocable
+ */
+
+#include <linux/cleanup.h>
+#include <linux/fs_revocable.h>
+#include <linux/poll.h>
+#include <linux/revocable.h>
+
+struct fs_revocable_replacement {
+ const struct fs_revocable_operations *frops;
+ const struct file_operations *orig_fops;
+ struct file_operations fops;
+ struct revocable **revs;
+ size_t num_revs;
+};
+
+static int fs_revocable_try_access(struct file *filp)
+{
+ struct fs_revocable_replacement *rr = filp->f_rr;
+
+ return rr->frops->try_access(rr->revs, rr->num_revs,
+ filp->private_data);
+}
+
+static void fs_revocable_withdraw_access(struct fs_revocable_replacement *rr)
+{
+ for (size_t i = 0; i < rr->num_revs; ++i)
+ revocable_withdraw_access(rr->revs[i]);
+}
+
+DEFINE_FREE(fs_revocable_replacement, struct fs_revocable_replacement *,
+ if (_T) fs_revocable_withdraw_access(_T))
+
+static ssize_t fs_revocable_read(struct file *filp, char __user *buffer,
+ size_t length, loff_t *offset)
+{
+ struct fs_revocable_replacement *rr
+ __free(fs_revocable_replacement) = filp->f_rr;
+ int ret;
+
+ ret = fs_revocable_try_access(filp);
+ if (ret)
+ return ret;
+ return rr->orig_fops->read(filp, buffer, length, offset);
+}
+
+static __poll_t fs_revocable_poll(struct file *filp, poll_table *wait)
+{
+ struct fs_revocable_replacement *rr
+ __free(fs_revocable_replacement) = filp->f_rr;
+ int ret;
+
+ ret = fs_revocable_try_access(filp);
+ if (ret)
+ return ret;
+ return rr->orig_fops->poll(filp, wait);
+}
+
+static long fs_revocable_unlocked_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fs_revocable_replacement *rr
+ __free(fs_revocable_replacement) = filp->f_rr;
+ int ret;
+
+ ret = fs_revocable_try_access(filp);
+ if (ret)
+ return ret;
+ return rr->orig_fops->unlocked_ioctl(filp, cmd, arg);
+}
+
+static int fs_revocable_release(struct inode *inode, struct file *filp)
+{
+ int ret = 0;
+ struct fs_revocable_replacement *rr = filp->f_rr;
+
+ if (!rr->orig_fops->release)
+ goto recover;
+
+ ret = fs_revocable_try_access(filp);
+ if (ret)
+ goto recover;
+
+ ret = rr->orig_fops->release(inode, filp);
+
+ fs_revocable_withdraw_access(rr);
+recover:
+ filp->f_op = rr->orig_fops;
+ filp->f_rr = NULL;
+
+ for (size_t i = 0; i < rr->num_revs; ++i)
+ revocable_free(rr->revs[i]);
+ kfree(rr->revs);
+ kfree(rr);
+
+ return ret;
+}
+
+/**
+ * fs_revocable_replace() - Replace the file operations to be revocable-aware.
+ *
+ * Should be used only from ->open() instances.
+ */
+int fs_revocable_replace(struct file *filp,
+ const struct fs_revocable_operations *frops,
+ struct revocable_provider **rps, size_t num_rps)
+{
+ struct fs_revocable_replacement *rr;
+ size_t i;
+
+ rr = kzalloc(sizeof(*rr), GFP_KERNEL);
+ if (!rr)
+ return -ENOMEM;
+ filp->f_rr = rr;
+
+ rr->frops = frops;
+ rr->revs = kcalloc(num_rps, sizeof(*rr->revs), GFP_KERNEL);
+ if (!rr->revs)
+ goto free_rr;
+ for (i = 0; i < num_rps; ++i) {
+ rr->revs[i] = revocable_alloc(rps[i]);
+ if (!rr->revs[i])
+ goto free_revs;
+ }
+ rr->num_revs = num_rps;
+ rr->orig_fops = filp->f_op;
+
+ memcpy(&rr->fops, filp->f_op, sizeof(rr->fops));
+ rr->fops.release = fs_revocable_release;
+
+ if (rr->fops.read)
+ rr->fops.read = fs_revocable_read;
+ if (rr->fops.poll)
+ rr->fops.poll = fs_revocable_poll;
+ if (rr->fops.unlocked_ioctl)
+ rr->fops.unlocked_ioctl = fs_revocable_unlocked_ioctl;
+
+ filp->f_op = &rr->fops;
+ return 0;
+free_revs:
+ if (i) {
+ while (--i)
+ revocable_free(rr->revs[i]);
+ }
+ kfree(rr->revs);
+free_rr:
+ kfree(rr);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(fs_revocable_replace);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68c4a59ec8fb..a03dd7f343a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -80,6 +80,7 @@ struct fs_context;
struct fs_parameter_spec;
struct file_kattr;
struct iomap_ops;
+struct fs_revocable_replacement;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -1248,6 +1249,7 @@ struct file {
};
file_ref_t f_ref;
/* --- cacheline 3 boundary (192 bytes) --- */
+ struct fs_revocable_replacement *f_rr;
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
diff --git a/include/linux/fs_revocable.h b/include/linux/fs_revocable.h
new file mode 100644
index 000000000000..bb066392d232
--- /dev/null
+++ b/include/linux/fs_revocable.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2025 Google LLC
+ */
+
+#ifndef __LINUX_FS_REVOCABLE_H
+#define __LINUX_FS_REVOCABLE_H
+
+#include <linux/fs.h>
+#include <linux/revocable.h>
+
+struct fs_revocable_operations {
+ int (*try_access)(struct revocable **revs, size_t num_revs, void *data);
+};
+
+int fs_revocable_replace(struct file *filp,
+ const struct fs_revocable_operations *frops,
+ struct revocable_provider **rps, size_t num_rps);
+
+#endif /* __LINUX_FS_REVOCABLE_H */
--
2.51.0.788.g6d19910ace-goog
Hi,
On 10/15/25 10:42 PM, Tzung-Bi Shih wrote:
> +/**
> + * fs_revocable_replace() - Replace the file operations to be revocable-aware.
> + *
> + * Should be used only from ->open() instances.
> + */
> +int fs_revocable_replace(struct file *filp,
> + const struct fs_revocable_operations *frops,
> + struct revocable_provider **rps, size_t num_rps)
> +{
Please add the function parameters to the kernel-doc comment to avoid
kernel-doc warnings. E.g.:
* @filp: foo description
* @frops: bar description
* @rps: baz description
* @num_rps: number of @rps entries
--
~Randy
On Thu, Oct 16, 2025 at 11:38:27AM -0700, Randy Dunlap wrote:
> On 10/15/25 10:42 PM, Tzung-Bi Shih wrote:
> > +/**
> > + * fs_revocable_replace() - Replace the file operations to be revocable-aware.
> > + *
> > + * Should be used only from ->open() instances.
> > + */
> > +int fs_revocable_replace(struct file *filp,
> > + const struct fs_revocable_operations *frops,
> > + struct revocable_provider **rps, size_t num_rps)
> > +{
>
> Please add the function parameters to the kernel-doc comment to avoid
> kernel-doc warnings. E.g.:
Ah, thanks. I didn't intend to make it a kernel-doc (as PoC code) but just
wrongly copy-n-paste around.
On Thu, Oct 16, 2025 at 05:42:02AM +0000, Tzung-Bi Shih wrote:
> Introduce fs_revocable_replace() to simplify the use of the revocable
> API with file_operations.
>
> The function, should be called from a driver's ->open(), replaces the
> fops with a wrapper that automatically handles the `try_access` and
> `withdraw_access`.
>
> When the file is closed, the wrapper's ->release() restores the original
> fops and cleanups. This centralizes the revocable logic, making drivers
> cleaner and easier to maintain.
>
> Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
> ---
> PoC patch.
>
> Known issues:
> - All file operations call revocable_try_access() for guaranteeing the
> resource even if the resource may be unused in the fops.
Why is this so complicated??
You already added a per-flip struct:
> +struct fs_revocable_replacement {
> + const struct fs_revocable_operations *frops;
> + const struct file_operations *orig_fops;
> + struct file_operations fops;
> + struct revocable **revs;
> + size_t num_revs;
> +};
Why does it need so much junk in it?
struct fs_revocable_replacement {
struct srcu_struct srcu;
bool *alive;
};
That's it. When the caller sets this up it provides a bool * pointer
from it's own private struct that is kept krefcounted to life cycle of
the struct file.
Then the ops wrapers are a simple thing - generate them with a macro:
srcu_read_lock(&f_rr->srcu);
if (*f_rr_>alive)
ret = f_rr->orig_fops->XX(...)
else
ret = -ENODEV;
srcu_read_unlock(&f_rr->srcu);
return ret;
No need for all this revokable maze to do somethinig so simple.
Also, I don't think srcu is a good idea for this use case, maybe as an
option, but the default should be to use rwsem.
Jason
On Thu, Oct 16, 2025 at 09:31:49AM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 16, 2025 at 05:42:02AM +0000, Tzung-Bi Shih wrote:
> > Introduce fs_revocable_replace() to simplify the use of the revocable
> > API with file_operations.
> >
> > The function, should be called from a driver's ->open(), replaces the
> > fops with a wrapper that automatically handles the `try_access` and
> > `withdraw_access`.
> >
> > When the file is closed, the wrapper's ->release() restores the original
> > fops and cleanups. This centralizes the revocable logic, making drivers
> > cleaner and easier to maintain.
> >
> > Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
> > ---
> > PoC patch.
> >
> > Known issues:
> > - All file operations call revocable_try_access() for guaranteeing the
> > resource even if the resource may be unused in the fops.
>
> Why is this so complicated??
>
> You already added a per-flip struct:
>
> > +struct fs_revocable_replacement {
> > + const struct fs_revocable_operations *frops;
> > + const struct file_operations *orig_fops;
> > + struct file_operations fops;
> > + struct revocable **revs;
> > + size_t num_revs;
> > +};
>
> Why does it need so much junk in it?
>
> struct fs_revocable_replacement {
> struct srcu_struct srcu;
> bool *alive;
> };
>
> That's it. When the caller sets this up it provides a bool * pointer
> from it's own private struct that is kept krefcounted to life cycle of
> the struct file.
>
> Then the ops wrapers are a simple thing - generate them with a macro:
>
> srcu_read_lock(&f_rr->srcu);
> if (*f_rr_>alive)
> ret = f_rr->orig_fops->XX(...)
> else
> ret = -ENODEV;
> srcu_read_unlock(&f_rr->srcu);
> return ret;
>
> No need for all this revokable maze to do somethinig so simple.
Imagining the following example:
/* res1 and res2 are provided by hot-pluggable devices. */
struct filp_priv {
void *res1;
void *res2;
};
/* In .open() fops */
priv = kzalloc(sizeof(struct filp_priv), ...);
priv->res1 = ...;
priv->res2 = ...;
filp->private_data = priv;
/* In .read() fops */
priv = filp->private_data;
priv->res1 // could result UAF if the device has gone
priv->res2 // could result UAF if the device has gone
How does the bool * work for the example?
On Fri, Oct 17, 2025 at 02:36:58AM +0000, Tzung-Bi Shih wrote:
> Imagining the following example:
>
> /* res1 and res2 are provided by hot-pluggable devices. */
> struct filp_priv {
> void *res1;
> void *res2;
> };
>
> /* In .open() fops */
> priv = kzalloc(sizeof(struct filp_priv), ...);
> priv->res1 = ...;
> priv->res2 = ...;
> filp->private_data = priv;
>
> /* In .read() fops */
> priv = filp->private_data;
> priv->res1 // could result UAF if the device has gone
> priv->res2 // could result UAF if the device has gone
>
>
> How does the bool * work for the example?
You are thinking about it completely wrong, you are trying to keep the
driver running conccurrently after it's remove returns - but that
isn't how Linux drivers are designed.
We have a whole family of synchronous fencing APIs that drivers call
in their remove() callback to shut down their concurrency. Think of
things like free_irq(), cancel_work_sync(), timer_shutdown_sync(),
sysfs_remove_files(). All of these guarentee the concurrent callbacks
are fenced before returning.
The only issue with cros_ec is this:
static void cros_ec_chardev_remove(struct platform_device *pdev)
{
struct miscdevice *misc = dev_get_drvdata(&pdev->dev);
misc_deregister(misc);
}
It doesn't fence the cdevs! Misc is a hard API to use because it
doesn't have a misc_deregister_sync() variation!
Dan/Laurent's point and proposal was that mis_deregister() does not
work like this! It is an anomaly that driver authors typically over
look.
So the proposal was to add some way to get a:
misc_deregister_sync()
What gives the fence. Under your proposal it would lock the SRCU and
change the bool. After it returns no cdev related threads are running
in fops touching res1/res2. I think your proposal to replace the fops
and that related machinery is smart and has a chance to succeed.
From this perspective your example is malformed. Resources should not
become revoked concurrently *while a driver is bound*. The driver
should be unbound, call misc_deregister_sync()/etc, and return from
remove() guaranteeing it no longer touches any resources.
For this specific cros_ec driver it's "res" is this:
struct cros_ec_dev *ec = dev_get_drvdata(pdev->dev.parent);
struct cros_ec_platform *ec_platform = dev_get_platdata(ec->dev);
This is already properly lifetime controlled!
It *HAS* to be, and even your patches are assuming it by blindly
reaching into the parent's memory!
+ misc->rps[0] = ec->ec_dev->revocable_provider;
If the parent driver has been racily unbound at this point the
ec->ec_dev is already a UAF!
For cros it is safe because the cros_ec driver is a child of a MFD and
the MFD logic ensures that the children are unbound as part of
destroying the parent. So 'ec' is guarenteed valid from probe() to
remove() return.
IHMO auto-revoke is a terrible idea, if you go down that path then why
is misc special? You need to auto-revoke irqs, timers, work queues,
etc too? That's a mess.
I think your previous idea for revoke was properly formed, the issue
and objection was that the bug you are fixing is a miscdev complexity
caused by the lack of misc_deregister_sync(). If you fix that directly
then you don't need recovable at all, and it is a much more useful fix
that is an easy and natural API for drivers to use.
Jason
On Fri, Oct 17, 2025 at 10:49:16AM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 17, 2025 at 02:36:58AM +0000, Tzung-Bi Shih wrote:
> > Imagining the following example:
> >
> > /* res1 and res2 are provided by hot-pluggable devices. */
> > struct filp_priv {
> > void *res1;
> > void *res2;
> > };
> >
> > /* In .open() fops */
> > priv = kzalloc(sizeof(struct filp_priv), ...);
> > priv->res1 = ...;
> > priv->res2 = ...;
> > filp->private_data = priv;
> >
> > /* In .read() fops */
> > priv = filp->private_data;
> > priv->res1 // could result UAF if the device has gone
> > priv->res2 // could result UAF if the device has gone
> >
> >
> > How does the bool * work for the example?
>
> You are thinking about it completely wrong, you are trying to keep the
> driver running conccurrently after it's remove returns - but that
> isn't how Linux drivers are designed.
>
> We have a whole family of synchronous fencing APIs that drivers call
> in their remove() callback to shut down their concurrency. Think of
> things like free_irq(), cancel_work_sync(), timer_shutdown_sync(),
> sysfs_remove_files(). All of these guarentee the concurrent callbacks
> are fenced before returning.
>
> The only issue with cros_ec is this:
>
> static void cros_ec_chardev_remove(struct platform_device *pdev)
> {
> struct miscdevice *misc = dev_get_drvdata(&pdev->dev);
>
> misc_deregister(misc);
> }
>
> It doesn't fence the cdevs! Misc is a hard API to use because it
> doesn't have a misc_deregister_sync() variation!
>
> Dan/Laurent's point and proposal was that mis_deregister() does not
> work like this! It is an anomaly that driver authors typically over
> look.
>
> So the proposal was to add some way to get a:
> misc_deregister_sync()
>
> What gives the fence. Under your proposal it would lock the SRCU and
> change the bool. After it returns no cdev related threads are running
> in fops touching res1/res2. I think your proposal to replace the fops
> and that related machinery is smart and has a chance to succeed.
>
> From this perspective your example is malformed. Resources should not
> become revoked concurrently *while a driver is bound*. The driver
> should be unbound, call misc_deregister_sync()/etc, and return from
> remove() guaranteeing it no longer touches any resources.
Imagining:
- Driver X provides the res1.
- Driver Y provides the res2.
- Driver Z provides the chardev /dev/zzz. The file operations use res1
and res2.
- A userspace program opened /dev/zzz.
In the approach, what is .remove() of driver X supposed to do when driver X
is unbinding (e.g. due to device unplug)?
If it ends up call misc_deregister_sync(), should the synchronous function
wait for the userspace program to close the FD?
The design behind revocable: the driver X waits via synchronize_srcu(), and
then it is free to go. The subsequent accesses to res1 will get NULL, and
should fail gracefully.
> For this specific cros_ec driver it's "res" is this:
>
> struct cros_ec_dev *ec = dev_get_drvdata(pdev->dev.parent);
> struct cros_ec_platform *ec_platform = dev_get_platdata(ec->dev);
In fact, no, the "res" we are concerning is struct cros_ec_device, e.g. [1].
(I knew the naming cros_ec_device vs. cros_ec_dev is somehow easy to confuse.)
[1] https://elixir.bootlin.com/linux/v6.17/source/drivers/platform/chrome/cros_ec_spi.c#L752
> This is already properly lifetime controlled!
>
> It *HAS* to be, and even your patches are assuming it by blindly
> reaching into the parent's memory!
>
> + misc->rps[0] = ec->ec_dev->revocable_provider;
>
> If the parent driver has been racily unbound at this point the
> ec->ec_dev is already a UAF!
Not really, it uses the fact that the caller is from probe(). I think the
driver can't be unbound when it is still in probe().
(Probe protocol device -> register the MFD device ->
add cros-ec-chardev device and probe.)
> For cros it is safe because the cros_ec driver is a child of a MFD and
> the MFD logic ensures that the children are unbound as part of
> destroying the parent. So 'ec' is guarenteed valid from probe() to
> remove() return.
>
> IHMO auto-revoke is a terrible idea, if you go down that path then why
> is misc special? You need to auto-revoke irqs, timers, work queues,
> etc too? That's a mess.
To be clear, I'm using misc as an example which is also the real crash we
observed. If the file operations use other resources provided by a
hot-pluggable device, it'd need to use revocable APIs to prevent the UAFs.
On 10/17/25 6:07 PM, Tzung-Bi Shih wrote: > Imagining: > - Driver X provides the res1. > - Driver Y provides the res2. > - Driver Z provides the chardev /dev/zzz. The file operations use res1 > and res2. > - A userspace program opened /dev/zzz. > > In the approach, what is .remove() of driver X supposed to do when driver X > is unbinding (e.g. due to device unplug)? There are use-cases for revocable, but this example is a bit too generic: Drivers don't just share device resources with other random drivers, they usually have a defined relationship through a bus. For instance, if you have a driver on the platform bus and another driver connected through the auxiliary bus, there is a defined lifetime: The auxiliary device has to be unbound before its parent device. This means that as long as you are in a scope where your auxiliary device is bound, it is safe to use a device resource from that parent without further checks. The goal should always be to proof to be in such a scope when accessing device resources (in Rust we can let the compiler ensure this at compile time :). However, there are rare (yet valid) cases where such a scope can't be guaranteed. DRM has such cases, and, unfortunately, MISC device seems to be another one. I know the reasons why DRM has to have this design, I'm not sure about MISC device though. Unless there's a good reason, I think MISC device should be "fenced" instead.
On Fri, Oct 17, 2025 at 06:29:10PM +0200, Danilo Krummrich wrote: > I'm not sure about MISC device though. Unless there's a good reason, > I think MISC device should be "fenced" instead. misc is a very small wrapper around raw fops, and raw fops are optimized for performance. Adding locking that many important things like normal files don't need to all fops would not be agreed. The sketch in this series where we have a core helper to provide a shim fops that adds on the lock is smart and I think could be an agreeable way to make a synchronous misc and cdev unregister for everyone to trivially use. Jason
On 10/17/25 6:37 PM, Jason Gunthorpe wrote: > On Fri, Oct 17, 2025 at 06:29:10PM +0200, Danilo Krummrich wrote: > >> I'm not sure about MISC device though. Unless there's a good reason, >> I think MISC device should be "fenced" instead. > > misc is a very small wrapper around raw fops, and raw fops are > optimized for performance. Adding locking that many important things > like normal files don't need to all fops would not be agreed. > > The sketch in this series where we have a core helper to provide a > shim fops that adds on the lock is smart and I think could be an > agreeable way to make a synchronous misc and cdev unregister for > everyone to trivially use. Sure, for MISC devices without a parent for instance there are no device resources to access anyways.
On Fri, Oct 17, 2025 at 08:19:06PM +0200, Danilo Krummrich wrote: > On 10/17/25 6:37 PM, Jason Gunthorpe wrote: > > On Fri, Oct 17, 2025 at 06:29:10PM +0200, Danilo Krummrich wrote: > > > >> I'm not sure about MISC device though. Unless there's a good reason, > >> I think MISC device should be "fenced" instead. > > > > misc is a very small wrapper around raw fops, and raw fops are > > optimized for performance. Adding locking that many important things > > like normal files don't need to all fops would not be agreed. > > > > The sketch in this series where we have a core helper to provide a > > shim fops that adds on the lock is smart and I think could be an > > agreeable way to make a synchronous misc and cdev unregister for > > everyone to trivially use. > > Sure, for MISC devices without a parent for instance there are no device > resources to access anyways. There are many situations with misc that can get people into trouble without parent: misc_deregister(x); timer_shutdown_sync(y); kfree(z); For example. It is is buggy if the fops touch y or z. This is why a _sync version is such a nice clean idea because with 5 letters the above can just be fixed. Wrapping everything in a revocable would be a huge PITA. Jason
On Fri Oct 17, 2025 at 8:44 PM CEST, Jason Gunthorpe wrote: > On Fri, Oct 17, 2025 at 08:19:06PM +0200, Danilo Krummrich wrote: >> On 10/17/25 6:37 PM, Jason Gunthorpe wrote: >> > On Fri, Oct 17, 2025 at 06:29:10PM +0200, Danilo Krummrich wrote: >> > >> >> I'm not sure about MISC device though. Unless there's a good reason, >> >> I think MISC device should be "fenced" instead. >> > >> > misc is a very small wrapper around raw fops, and raw fops are >> > optimized for performance. Adding locking that many important things >> > like normal files don't need to all fops would not be agreed. >> > >> > The sketch in this series where we have a core helper to provide a >> > shim fops that adds on the lock is smart and I think could be an >> > agreeable way to make a synchronous misc and cdev unregister for >> > everyone to trivially use. >> >> Sure, for MISC devices without a parent for instance there are no device >> resources to access anyways. > > There are many situations with misc that can get people into trouble without > parent: > > misc_deregister(x); > timer_shutdown_sync(y); > kfree(z); > > For example. It is is buggy if the fops touch y or z. > > This is why a _sync version is such a nice clean idea because with 5 > letters the above can just be fixed. > > Wrapping everything in a revocable would be a huge PITA. That's a bit of a different problem though. Revocable clearly isn't the solution. _sync() works, but doesn't account for the actual problem, which is that the file private has at least shared ownership of y and z. So, it's more of an ownership / lifetime problem. The file private data should either own y and z entirely or a corresponding reference count that is dropped in fops release(). Device resources are different though, since we can't just hold on to them with a reference count etc.; they're strictly gone once the bus device is unbound, hence revocable when there is no _sync().
On Fri, Oct 17, 2025 at 11:41:56PM +0200, Danilo Krummrich wrote: > On Fri Oct 17, 2025 at 8:44 PM CEST, Jason Gunthorpe wrote: > > On Fri, Oct 17, 2025 at 08:19:06PM +0200, Danilo Krummrich wrote: > >> On 10/17/25 6:37 PM, Jason Gunthorpe wrote: > >> > On Fri, Oct 17, 2025 at 06:29:10PM +0200, Danilo Krummrich wrote: > >> > > >> >> I'm not sure about MISC device though. Unless there's a good reason, > >> >> I think MISC device should be "fenced" instead. > >> > > >> > misc is a very small wrapper around raw fops, and raw fops are > >> > optimized for performance. Adding locking that many important things > >> > like normal files don't need to all fops would not be agreed. > >> > > >> > The sketch in this series where we have a core helper to provide a > >> > shim fops that adds on the lock is smart and I think could be an > >> > agreeable way to make a synchronous misc and cdev unregister for > >> > everyone to trivially use. > >> > >> Sure, for MISC devices without a parent for instance there are no device > >> resources to access anyways. > > > > There are many situations with misc that can get people into trouble without > > parent: > > > > misc_deregister(x); > > timer_shutdown_sync(y); > > kfree(z); > > > > For example. It is is buggy if the fops touch y or z. > > > > This is why a _sync version is such a nice clean idea because with 5 > > letters the above can just be fixed. > > > > Wrapping everything in a revocable would be a huge PITA. > > That's a bit of a different problem though. Revocable clearly isn't the > solution. _sync() works, but doesn't account for the actual problem, which is > that the file private has at least shared ownership of y and z. > > So, it's more of an ownership / lifetime problem. The file private data should > either own y and z entirely or a corresponding reference count that is dropped > in fops release(). I think both versions are popular in the kernel.. You can legimately treat y and z the same as device resources without creating a correctness problem and it is less code. You can also do refcounts. For instance at least in C you'd never argue that people should use refcount private data when they use a timer or irq subsystem. You'd use a simple sync cleanup and be done with it. These bugs are coming because of mixing models, we have a bunch of sync APIs in the kernel that are easy to use and then you get these weird non-sync ones without any clear documentation :) Jason
On Sat Oct 18, 2025 at 12:56 AM CEST, Jason Gunthorpe wrote: > On Fri, Oct 17, 2025 at 11:41:56PM +0200, Danilo Krummrich wrote: >> On Fri Oct 17, 2025 at 8:44 PM CEST, Jason Gunthorpe wrote: >> > On Fri, Oct 17, 2025 at 08:19:06PM +0200, Danilo Krummrich wrote: >> >> On 10/17/25 6:37 PM, Jason Gunthorpe wrote: >> >> > On Fri, Oct 17, 2025 at 06:29:10PM +0200, Danilo Krummrich wrote: >> >> > >> >> >> I'm not sure about MISC device though. Unless there's a good reason, >> >> >> I think MISC device should be "fenced" instead. >> >> > >> >> > misc is a very small wrapper around raw fops, and raw fops are >> >> > optimized for performance. Adding locking that many important things >> >> > like normal files don't need to all fops would not be agreed. >> >> > >> >> > The sketch in this series where we have a core helper to provide a >> >> > shim fops that adds on the lock is smart and I think could be an >> >> > agreeable way to make a synchronous misc and cdev unregister for >> >> > everyone to trivially use. >> >> >> >> Sure, for MISC devices without a parent for instance there are no device >> >> resources to access anyways. >> > >> > There are many situations with misc that can get people into trouble without >> > parent: >> > >> > misc_deregister(x); >> > timer_shutdown_sync(y); >> > kfree(z); >> > >> > For example. It is is buggy if the fops touch y or z. >> > >> > This is why a _sync version is such a nice clean idea because with 5 >> > letters the above can just be fixed. >> > >> > Wrapping everything in a revocable would be a huge PITA. >> >> That's a bit of a different problem though. Revocable clearly isn't the >> solution. _sync() works, but doesn't account for the actual problem, which is >> that the file private has at least shared ownership of y and z. >> >> So, it's more of an ownership / lifetime problem. The file private data should >> either own y and z entirely or a corresponding reference count that is dropped >> in fops release(). > > I think both versions are popular in the kernel.. You can legimately > treat y and z the same as device resources without creating a > correctness problem and it is less code. > > You can also do refcounts. > > For instance at least in C you'd never argue that people should use > refcount private data when they use a timer or irq subsystem. You'd > use a simple sync cleanup and be done with it. Don't get me wrong, I'm well aware that there are multiple ways to achieve correctness, and I know what's common, not common, etc. And I also don't mean to say "just reference count your private data", which in a lot of (maybe even most) cases clearly is the wrong thing to do (compared to specific objects contained within the private data). What I'm saying is that it's better to think about the lifetime of an object and what it is owned by. Let's say we have a memory allocation of some object X, then the question is who owns this object. Is it the module, the file private data, device private data, interrupt handler, etc. If it is only owned by a single thing (e.g. file private), then it's simple: allocate and initialize in open(), release in release(), and only ever access in other fops. However, if we have shared ownership of an object, and it is contained in multiple different private data objects, then reference count is much more robust, compared to having your driver manually keeping track of how it can synchronize against all the owners of that object.
On Sat, Oct 18, 2025 at 12:07:58AM +0800, Tzung-Bi Shih wrote:
> > From this perspective your example is malformed. Resources should not
> > become revoked concurrently *while a driver is bound*. The driver
> > should be unbound, call misc_deregister_sync()/etc, and return from
> > remove() guaranteeing it no longer touches any resources.
>
> Imagining:
> - Driver X provides the res1.
> - Driver Y provides the res2.
> - Driver Z provides the chardev /dev/zzz. The file operations use res1
> and res2.
> - A userspace program opened /dev/zzz.
Yes, then you have a mess and it is pretty hard to deal with.
We don't usually build things like that, and I'm not aware of any
cases where killing the whole char dev is the right answer. Usually
it's something like 'res1' is critical and 'res2' is discovered
optionally.
Making up elaborate fictional use cases is not a way to justify an
over complex design.
> If it ends up call misc_deregister_sync(), should the synchronous function
> wait for the userspace program to close the FD?
Some subsystems do this, it isn't great.
> The design behind revocable: the driver X waits via synchronize_srcu(), and
> then it is free to go. The subsequent accesses to res1 will get NULL, and
> should fail gracefully.
Yes, I understand what it is for, I am saying it is not required here.
> > For this specific cros_ec driver it's "res" is this:
> >
> > struct cros_ec_dev *ec = dev_get_drvdata(pdev->dev.parent);
> > struct cros_ec_platform *ec_platform = dev_get_platdata(ec->dev);
>
> In fact, no, the "res" we are concerning is struct cros_ec_device, e.g. [1].
> (I knew the naming cros_ec_device vs. cros_ec_dev is somehowg easy to confuse.)
struct cros_ec_dev *ec = dev_get_drvdata(mdev->parent);
struct cros_ec_device *ec_dev = ec->ec_dev;
It's all the same stuff. The parent needs to ensure it remains bound
only while it's ec->ec_dev is valid. That is its job.
> > This is already properly lifetime controlled!
> >
> > It *HAS* to be, and even your patches are assuming it by blindly
> > reaching into the parent's memory!
> >
> > + misc->rps[0] = ec->ec_dev->revocable_provider;
> >
> > If the parent driver has been racily unbound at this point the
> > ec->ec_dev is already a UAF!
>
> Not really, it uses the fact that the caller is from probe(). I think the
> driver can't be unbound when it is still in probe().
Right, but that's my point you are already relying on driver binding
lifetime rules to make your access valid. You should continue to rely
on that and fix the lack of synchronous remove to fix the bug.
> To be clear, I'm using misc as an example which is also the real crash we
> observed. If the file operations use other resources provided by a
> hot-pluggable device, it'd need to use revocable APIs to prevent the
> UAFs.
I understand, but it is a very poor use of this new construct. Come
with a driver that actually has two resources and needs something so
complicated first.
Improve miscdev as Laurent suggested to fix this specific driver bug.
Do not mess up char dev by trying to link it to lists of recovable and
build some wild auto-revoking thing nobody needs.
Jason
On Fri, Oct 17, 2025 at 01:21:16PM -0300, Jason Gunthorpe wrote:
> On Sat, Oct 18, 2025 at 12:07:58AM +0800, Tzung-Bi Shih wrote:
> > > This is already properly lifetime controlled!
> > >
> > > It *HAS* to be, and even your patches are assuming it by blindly
> > > reaching into the parent's memory!
> > >
> > > + misc->rps[0] = ec->ec_dev->revocable_provider;
> > >
> > > If the parent driver has been racily unbound at this point the
> > > ec->ec_dev is already a UAF!
> >
> > Not really, it uses the fact that the caller is from probe(). I think the
> > driver can't be unbound when it is still in probe().
>
> Right, but that's my point you are already relying on driver binding
> lifetime rules to make your access valid. You should continue to rely
> on that and fix the lack of synchronous remove to fix the bug.
I think what you're looking for is something similar to the following
patches.
- Instead of having a real resource to protect with revocable, use the
subsystem device itself as a virtual resource. Revoke the virtual
resource when unregistering the device from the subsystem.
- Exit earlier if the virtual resource is NULL (i.e. the subsystem device
has been unregistered) in the file operation wrappers.
By doing so, we don't need to provide a misc_deregister_sync() which could
probably maintain a list of opening files in miscdevice and handle with all
opening files when unregistering. The device unbound is free to go and
doesn't need to wait for closing or interrupting all opening files.
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 27078541489f..bc4d249c4d0f 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -173,8 +175,12 @@ static int misc_open(struct inode *inode, struct file *file)
*/
file->private_data = c;
- err = 0;
replace_fops(file, new_fops);
+
+ err = fs_revocable_replace(c->rp, file);
+ if (err)
+ goto fail;
+
if (file->f_op->open)
err = file->f_op->open(inode, file);
fail:
@@ -234,6 +240,10 @@ int misc_register(struct miscdevice *misc)
return -EINVAL;
}
+ misc->rp = revocable_provider_alloc(misc);
+ if (!misc->rp)
+ return -ENOMEM;
+
INIT_LIST_HEAD(&misc->list);
mutex_lock(&misc_mtx);
@@ -291,6 +291,8 @@ EXPORT_SYMBOL(misc_register);
void misc_deregister(struct miscdevice *misc)
{
+ revocable_provider_revoke(misc->rp);
+
mutex_lock(&misc_mtx);
list_del_init(&misc->list);
device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor));
diff --git a/fs/fs_revocable.c b/fs/fs_revocable.c
new file mode 100644
...
+struct fs_revocable_replacement {
+ struct revocable *rev;
+ const struct file_operations *orig_fops;
+ struct file_operations fops;
+};
+
+static ssize_t fs_revocable_read(struct file *filp, char __user *buffer,
+ size_t length, loff_t *offset)
+{
+ void *any;
+ struct fs_revocable_replacement *rr = filp->f_rr;
+
+ REVOCABLE_TRY_ACCESS_WITH(rr->rev, any) {
+ if (!any)
+ return -ENODEV;
+ return rr->orig_fops->read(filp, buffer, length, offset);
+ }
+}
...
+int fs_revocable_replace(struct revocable_provider *rp, struct file *filp)
+{
+ struct fs_revocable_replacement *rr;
+
+ rr = kzalloc(sizeof(*rr), GFP_KERNEL);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->rev = revocable_alloc(rp);
+ if (!rr->rev)
+ goto free_rr;
+
+ rr->orig_fops = filp->f_op;
+ memcpy(&rr->fops, filp->f_op, sizeof(rr->fops));
+ rr->fops.release = fs_revocable_release;
+
+ if (rr->fops.read)
+ rr->fops.read = fs_revocable_read;
+ if (rr->fops.poll)
+ rr->fops.poll = fs_revocable_poll;
+ if (rr->fops.unlocked_ioctl)
+ rr->fops.unlocked_ioctl = fs_revocable_unlocked_ioctl;
+
+ filp->f_rr = rr;
+ filp->f_op = &rr->fops;
+ return 0;
+free_rr:
+ kfree(rr);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(fs_revocable_replace);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a6de8d93838d..163496a5df6c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1066,6 +1066,7 @@ struct file {
freeptr_t f_freeptr;
};
/* --- cacheline 3 boundary (192 bytes) --- */
+ struct fs_revocable_replacement *f_rr;
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index aca911687bd5..c98b97f84c07 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -94,6 +94,7 @@ struct miscdevice {
const struct attribute_group **groups;
const char *nodename;
umode_t mode;
+ struct revocable_provider *rp;
};
On Sun, Oct 19, 2025 at 11:08:29PM +0800, Tzung-Bi Shih wrote:
> On Fri, Oct 17, 2025 at 01:21:16PM -0300, Jason Gunthorpe wrote:
> > On Sat, Oct 18, 2025 at 12:07:58AM +0800, Tzung-Bi Shih wrote:
> > > > This is already properly lifetime controlled!
> > > >
> > > > It *HAS* to be, and even your patches are assuming it by blindly
> > > > reaching into the parent's memory!
> > > >
> > > > + misc->rps[0] = ec->ec_dev->revocable_provider;
> > > >
> > > > If the parent driver has been racily unbound at this point the
> > > > ec->ec_dev is already a UAF!
> > >
> > > Not really, it uses the fact that the caller is from probe(). I think the
> > > driver can't be unbound when it is still in probe().
> >
> > Right, but that's my point you are already relying on driver binding
> > lifetime rules to make your access valid. You should continue to rely
> > on that and fix the lack of synchronous remove to fix the bug.
>
> I think what you're looking for is something similar to the following
> patches.
>
> - Instead of having a real resource to protect with revocable, use the
> subsystem device itself as a virtual resource. Revoke the virtual
> resource when unregistering the device from the subsystem.
>
> - Exit earlier if the virtual resource is NULL (i.e. the subsystem device
> has been unregistered) in the file operation wrappers.
Sure
> By doing so, we don't need to provide a misc_deregister_sync() which could
> probably maintain a list of opening files in miscdevice and handle with all
> opening files when unregistering.
I don't think we want to change the default behavior of
misc_deregister.. Maybe if it was a mutex not srcu it would be OK, but
srcu you are looking at delaying driver removal by seconds
potentially.
> @@ -234,6 +240,10 @@ int misc_register(struct miscdevice *misc)
> return -EINVAL;
> }
>
> + misc->rp = revocable_provider_alloc(misc);
> + if (!misc->rp)
> + return -ENOMEM;
Just get rid of all this revocable stuff, all this needs is a scru or
a mutex, none of this obfuscation around a simple lock is helpful in
core kernel code.
> @@ -1066,6 +1066,7 @@ struct file {
> freeptr_t f_freeptr;
> };
> /* --- cacheline 3 boundary (192 bytes) --- */
> + struct fs_revocable_replacement *f_rr;
> } __randomize_layout
The thing that will likely attract objections is this. It is probably
a good idea to try to remove it.
For simple misc users the inode->i_cdev will always be valid and you
can reach the struct misc_dev/cdev from there in all the calls.
More complex cdev users replace the inode so that wouldn't work
universally but it is good enough to get started at least.
Jason
On Mon, Oct 20, 2025 at 08:57:34AM -0300, Jason Gunthorpe wrote:
> On Sun, Oct 19, 2025 at 11:08:29PM +0800, Tzung-Bi Shih wrote:
> > On Fri, Oct 17, 2025 at 01:21:16PM -0300, Jason Gunthorpe wrote:
> > > On Sat, Oct 18, 2025 at 12:07:58AM +0800, Tzung-Bi Shih wrote:
> > > > > This is already properly lifetime controlled!
> > > > >
> > > > > It *HAS* to be, and even your patches are assuming it by blindly
> > > > > reaching into the parent's memory!
> > > > >
> > > > > + misc->rps[0] = ec->ec_dev->revocable_provider;
> > > > >
> > > > > If the parent driver has been racily unbound at this point the
> > > > > ec->ec_dev is already a UAF!
> > > >
> > > > Not really, it uses the fact that the caller is from probe(). I think the
> > > > driver can't be unbound when it is still in probe().
> > >
> > > Right, but that's my point you are already relying on driver binding
> > > lifetime rules to make your access valid. You should continue to rely
> > > on that and fix the lack of synchronous remove to fix the bug.
> >
> > I think what you're looking for is something similar to the following
> > patches.
> >
> > - Instead of having a real resource to protect with revocable, use the
> > subsystem device itself as a virtual resource. Revoke the virtual
> > resource when unregistering the device from the subsystem.
> >
> > - Exit earlier if the virtual resource is NULL (i.e. the subsystem device
> > has been unregistered) in the file operation wrappers.
>
> Sure
>
> > By doing so, we don't need to provide a misc_deregister_sync() which could
> > probably maintain a list of opening files in miscdevice and handle with all
> > opening files when unregistering.
>
> I don't think we want to change the default behavior of
> misc_deregister.. Maybe if it was a mutex not srcu it would be OK, but
> srcu you are looking at delaying driver removal by seconds
> potentially.
>
> > @@ -234,6 +240,10 @@ int misc_register(struct miscdevice *misc)
> > return -EINVAL;
> > }
> >
> > + misc->rp = revocable_provider_alloc(misc);
> > + if (!misc->rp)
> > + return -ENOMEM;
>
> Just get rid of all this revocable stuff, all this needs is a scru or
> a mutex, none of this obfuscation around a simple lock is helpful in
> core kernel code.
I didn't get the idea. With a mutex, how to handle the opening files?
Are they something like: (?)
- Maintain a list for opening files in both .open() and .release().
- In misc_deregister_sync(), traverse the list, do something (what?), and
wait for the userspace programs close the files.
> > @@ -1066,6 +1066,7 @@ struct file {
> > freeptr_t f_freeptr;
> > };
> > /* --- cacheline 3 boundary (192 bytes) --- */
> > + struct fs_revocable_replacement *f_rr;
> > } __randomize_layout
>
> The thing that will likely attract objections is this. It is probably
> a good idea to try to remove it.
>
> For simple misc users the inode->i_cdev will always be valid and you
> can reach the struct misc_dev/cdev from there in all the calls.
>
> More complex cdev users replace the inode so that wouldn't work
> universally but it is good enough to get started at least.
The context is meant to be the same lifecycle with file opens/releases but
not the miscdevice. I think the mutex vs. revocable stuff is the more
fundamental issue, we can focus on that first.
On Tue, Oct 21, 2025 at 04:49:59AM +0000, Tzung-Bi Shih wrote:
> I didn't get the idea. With a mutex, how to handle the opening files?
>
> Are they something like: (?)
> - Maintain a list for opening files in both .open() and .release().
> - In misc_deregister_sync(), traverse the list, do something (what?), and
> wait for the userspace programs close the files.
You don't need any list, we don't want to close files.
Something like this, it is very simple. You can replace the rwsem with
a srcu. srcu gives faster read locking but much slower sync.
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c2ddb998f3c943..69bbfe9de4f3bb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -5,6 +5,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
+#include <linux/cleanup.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
@@ -343,6 +344,74 @@ void __unregister_chrdev(unsigned int major, unsigned int baseminor,
kfree(cd);
}
+struct cdev_sync_data {
+ struct rw_semaphore sem;
+ const struct file_operations *orig_fops;
+ struct file_operations sync_fops;
+ bool revoked;
+};
+
+static int cdev_sync_open(struct inode *inode, struct file *filp)
+{
+ struct cdev *p = inode->i_cdev;
+ struct cdev_sync_data *sync = p->sync;
+ const struct file_operations *fops;
+ int ret;
+
+ scoped_cond_guard(rwsem_read_kill, return -ERESTARTSYS, &sync->sem) {
+ if (sync->revoked)
+ return -ENODEV;
+
+ fops = fops_get(sync->orig_fops);
+ if (fops->open) {
+ ret = filp->f_op->open(inode, filp);
+ if (ret) {
+ fops_put(fops);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+static void cdev_sync_release(struct inode *inode, struct file *filp)
+{
+ struct cdev *p = inode->i_cdev;
+ struct cdev_sync_data *sync = p->sync;
+
+ /*
+ * Release can continue to be called after unregister. The driver must
+ * only clean up memory.
+ */
+ if (sync->orig_fops->release)
+ sync->orig_fops->release(inode, filp);
+ fops_put(sync->orig_fops);
+}
+
+/* Must call before chrdev_open can happen */
+static int cdev_sync_init(struct cdev *p)
+{
+ struct cdev_sync_data *sync;
+
+ sync = kzalloc(sizeof(*sync), GFP_KERNEL);
+ if (!sync)
+ return -ENOMEM;
+ sync->sync_fops.owner = THIS_MODULE;
+ sync->sync_fops.open = cdev_sync_open;
+ sync->sync_fops.release = cdev_sync_release;
+ // ..
+ p->is_sync = true;
+ p->sync = sync;
+}
+
+static int cdev_sync_revoke(struct cdev *p)
+{
+ struct cdev_sync_data *sync = p->sync;
+
+ guard(rwsem_write)(&sync->sem);
+ sync->revoked = true;
+}
+
static DEFINE_SPINLOCK(cdev_lock);
static struct kobject *cdev_get(struct cdev *p)
@@ -405,7 +474,11 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return ret;
ret = -ENXIO;
- fops = fops_get(p->ops);
+ if (p->is_sync)
+ fops = fops_get(p->ops);
+ else
+ fops = fops_get(&p->sync->sync_fops);
+
if (!fops)
goto out_cdev_put;
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index 0e8cd6293debba..28f0445011df20 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -11,13 +11,19 @@ struct file_operations;
struct inode;
struct module;
+struct cdev_sync_data;
+
struct cdev {
struct kobject kobj;
struct module *owner;
- const struct file_operations *ops;
+ union {
+ const struct file_operations *ops;
+ struct cdev_sync_data *sync;
+ };
struct list_head list;
dev_t dev;
unsigned int count;
+ bool is_sync;
} __randomize_layout;
void cdev_init(struct cdev *, const struct file_operations *);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index f1aaf676a874a1..298c7d4d8abb5e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -253,6 +253,7 @@ extern void up_write(struct rw_semaphore *sem);
DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T), _RET == 0)
+DEFINE_GUARD_COND(rwsem_read, _kill, down_read_killable(_T), _RET == 0)
DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
On Tue, Oct 21, 2025 at 09:15:36AM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 21, 2025 at 04:49:59AM +0000, Tzung-Bi Shih wrote:
>
> > I didn't get the idea. With a mutex, how to handle the opening files?
> >
> > Are they something like: (?)
> > - Maintain a list for opening files in both .open() and .release().
> > - In misc_deregister_sync(), traverse the list, do something (what?), and
> > wait for the userspace programs close the files.
>
> You don't need any list, we don't want to close files.
>
> Something like this, it is very simple. You can replace the rwsem with
> a srcu. srcu gives faster read locking but much slower sync.
>
> diff --git a/fs/char_dev.c b/fs/char_dev.c
> index c2ddb998f3c943..69bbfe9de4f3bb 100644
> --- a/fs/char_dev.c
> +++ b/fs/char_dev.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 1991, 1992 Linus Torvalds
> */
>
> +#include <linux/cleanup.h>
> #include <linux/init.h>
> #include <linux/fs.h>
> #include <linux/kdev_t.h>
> @@ -343,6 +344,74 @@ void __unregister_chrdev(unsigned int major, unsigned int baseminor,
> kfree(cd);
> }
>
> +struct cdev_sync_data {
> + struct rw_semaphore sem;
> + const struct file_operations *orig_fops;
> + struct file_operations sync_fops;
> + bool revoked;
> +};
> +
> +static int cdev_sync_open(struct inode *inode, struct file *filp)
> +{
> + struct cdev *p = inode->i_cdev;
> + struct cdev_sync_data *sync = p->sync;
> + const struct file_operations *fops;
> + int ret;
> +
> + scoped_cond_guard(rwsem_read_kill, return -ERESTARTSYS, &sync->sem) {
> + if (sync->revoked)
> + return -ENODEV;
> +
> + fops = fops_get(sync->orig_fops);
> + if (fops->open) {
> + ret = filp->f_op->open(inode, filp);
> + if (ret) {
> + fops_put(fops);
> + return ret;
> + }
> + }
> + }
> + return 0;
> +}
> +
> +static void cdev_sync_release(struct inode *inode, struct file *filp)
> +{
> + struct cdev *p = inode->i_cdev;
> + struct cdev_sync_data *sync = p->sync;
> +
> + /*
> + * Release can continue to be called after unregister. The driver must
> + * only clean up memory.
> + */
> + if (sync->orig_fops->release)
> + sync->orig_fops->release(inode, filp);
> + fops_put(sync->orig_fops);
> +}
> +
> +/* Must call before chrdev_open can happen */
> +static int cdev_sync_init(struct cdev *p)
> +{
> + struct cdev_sync_data *sync;
> +
> + sync = kzalloc(sizeof(*sync), GFP_KERNEL);
> + if (!sync)
> + return -ENOMEM;
> + sync->sync_fops.owner = THIS_MODULE;
> + sync->sync_fops.open = cdev_sync_open;
> + sync->sync_fops.release = cdev_sync_release;
> + // ..
> + p->is_sync = true;
> + p->sync = sync;
> +}
> +
> +static int cdev_sync_revoke(struct cdev *p)
> +{
> + struct cdev_sync_data *sync = p->sync;
> +
> + guard(rwsem_write)(&sync->sem);
> + sync->revoked = true;
> +}
> +
> static DEFINE_SPINLOCK(cdev_lock);
>
> static struct kobject *cdev_get(struct cdev *p)
> @@ -405,7 +474,11 @@ static int chrdev_open(struct inode *inode, struct file *filp)
> return ret;
>
> ret = -ENXIO;
> - fops = fops_get(p->ops);
> + if (p->is_sync)
> + fops = fops_get(p->ops);
> + else
> + fops = fops_get(&p->sync->sync_fops);
> +
> if (!fops)
> goto out_cdev_put;
>
> diff --git a/include/linux/cdev.h b/include/linux/cdev.h
> index 0e8cd6293debba..28f0445011df20 100644
> --- a/include/linux/cdev.h
> +++ b/include/linux/cdev.h
> @@ -11,13 +11,19 @@ struct file_operations;
> struct inode;
> struct module;
>
> +struct cdev_sync_data;
> +
> struct cdev {
> struct kobject kobj;
> struct module *owner;
> - const struct file_operations *ops;
> + union {
> + const struct file_operations *ops;
> + struct cdev_sync_data *sync;
> + };
> struct list_head list;
> dev_t dev;
> unsigned int count;
> + bool is_sync;
> } __randomize_layout;
>
> void cdev_init(struct cdev *, const struct file_operations *);
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index f1aaf676a874a1..298c7d4d8abb5e 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -253,6 +253,7 @@ extern void up_write(struct rw_semaphore *sem);
> DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
> DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
> DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T), _RET == 0)
> +DEFINE_GUARD_COND(rwsem_read, _kill, down_read_killable(_T), _RET == 0)
>
> DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
> DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
Realized the approach doesn't work for the issue I'm looking into.
- All misc devices share the same cdev[1]. If misc_deregister() calls
cdev_sync_revoke(), the misc stop working due to one of the miscdevice
deregistered.
- The context (struct cdev_sync_data) should be the same lifecycle with
the opening file (e.g. struct file). Otherwise, when accessing the
context in the fops wrappers, it results an UAF. For example, the
sturct cdev is likely freed after cdev_sync_revoke().
[2] is a follow-up series of my original approach.
[1] https://elixir.bootlin.com/linux/v6.17/source/drivers/char/misc.c#L299
[2] https://lore.kernel.org/chrome-platform/20251106152712.11850-1-tzungbi@kernel.org
On Fri, Nov 07, 2025 at 04:11:40AM +0000, Tzung-Bi Shih wrote: > Realized the approach doesn't work for the issue I'm looking into. > > - All misc devices share the same cdev[1]. If misc_deregister() calls > cdev_sync_revoke(), the misc stop working due to one of the miscdevice > deregistered. > [1] https://elixir.bootlin.com/linux/v6.17/source/drivers/char/misc.c#L299 That's not a "cdev" in this context, but yes, misc doesn't use struct cdev at all.. Instead you have a struct miscdevice which has a similar lifecycle as cdev. Indeed you can't use what I showed above at the cdev layer exactly as is, but there is not a fundamental issue here. > - The context (struct cdev_sync_data) should be the same lifecycle with > the opening file (e.g. struct file). Otherwise, when accessing the > context in the fops wrappers, it results an UAF. For example, the > sturct cdev is likely freed after cdev_sync_revoke(). Yes, it should be tied to the memory lifecycle of the struct device under the cdev which would then by tied to the file lifecycle. It is not hard. Jason
On Tue, Oct 21, 2025 at 09:15:36AM -0300, Jason Gunthorpe wrote: > On Tue, Oct 21, 2025 at 04:49:59AM +0000, Tzung-Bi Shih wrote: > > > I didn't get the idea. With a mutex, how to handle the opening files? > > > > Are they something like: (?) > > - Maintain a list for opening files in both .open() and .release(). > > - In misc_deregister_sync(), traverse the list, do something (what?), and > > wait for the userspace programs close the files. > > You don't need any list, we don't want to close files. > > Something like this, it is very simple. You can replace the rwsem with > a srcu. srcu gives faster read locking but much slower sync. > > [...] I see. The idea is basically the same but don't use revocable at all. I was misunderstanding about the "sync" we were discussing for misc_deregister_sync(). The "sync", is analogous to synchronize_srcu() of revocable_provider_revoke() in the revocable version [1], doesn't wait for closing all opened files. [1] https://lore.kernel.org/chrome-platform/aPT-7TTgW_Xop99j@tzungbi-laptop/
On Thu, Oct 23, 2025 at 10:22:01PM +0800, Tzung-Bi Shih wrote: > I was misunderstanding about the "sync" we were discussing for > misc_deregister_sync(). The "sync", is analogous to synchronize_srcu() > of revocable_provider_revoke() in the revocable version [1], doesn't wait > for closing all opened files. Yes, and my remark is we don't need to obfuscate simple locks in core kernel code. Jason
On Thu, Oct 23, 2025 at 11:51:31AM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 23, 2025 at 10:22:01PM +0800, Tzung-Bi Shih wrote:
>
> > I was misunderstanding about the "sync" we were discussing for
> > misc_deregister_sync(). The "sync", is analogous to synchronize_srcu()
> > of revocable_provider_revoke() in the revocable version [1], doesn't wait
> > for closing all opened files.
>
> Yes, and my remark is we don't need to obfuscate simple locks in core
> kernel code.
{sigh}
Yes, that's not the goal here at all.
I've refrained from jumping in as I think we are thinking of different
stuff here, probably talking past each other in places.
The original goal of having "revocable" is still needed, despite you
feeling that cdev can live without it (I strongly disagree with that,
and the v4l, gpio, i2c, and other subsystem developers have feelings
along those lines as backed up by the many talks over the years about
this.)
The use of it in the Rust code already is kind of proof of this, it
enables driver authors to not have to worry about a ton of real-world
issues they would have to otherwise. Which is why I suggested copying
that pattern into C to help us out here.
Anyway...
I've been traveling a ton, and it's not going to let up soon, but I'll
try to dig into this more later next week, or on the next 12+ hour
flight that I'll be just after that, to give a more detailed review and
response, sorry I've not been able to do so yet.
thanks,
greg k-h
On Thu, Oct 23, 2025 at 05:04:57PM +0200, Greg KH wrote:
> On Thu, Oct 23, 2025 at 11:51:31AM -0300, Jason Gunthorpe wrote:
> > On Thu, Oct 23, 2025 at 10:22:01PM +0800, Tzung-Bi Shih wrote:
> >
> > > I was misunderstanding about the "sync" we were discussing for
> > > misc_deregister_sync(). The "sync", is analogous to synchronize_srcu()
> > > of revocable_provider_revoke() in the revocable version [1], doesn't wait
> > > for closing all opened files.
> >
> > Yes, and my remark is we don't need to obfuscate simple locks in core
> > kernel code.
>
> {sigh}
>
> Yes, that's not the goal here at all.
>
> I've refrained from jumping in as I think we are thinking of different
> stuff here, probably talking past each other in places.
>
> The original goal of having "revocable" is still needed, despite you
> feeling that cdev can live without it (I strongly disagree with that,
> and the v4l, gpio, i2c, and other subsystem developers have feelings
> along those lines as backed up by the many talks over the years about
> this.)
Replying late to this thread, I don't think this is right for V4L2. When
it comes to solving the .remove() vs. userspace race, I think the right
solution is at the cdev level. We could also implement it exactly the
same way in the V4L2 core, but that would be a shame as other subsystems
would need to replicate the same. There are existing implementations,
they're easy to use in drivers (if implemented in cdev with the little
amount of required driver-logic implemented in the V4L2 core, it would
be mostly transparent for drivers), easy to wrap your head around (no
obscure lifetime magic), and they work.
Where I believe a revocable infrastructure could be useful is for
solving the .remove() vs. *kernel usage* race, which is something GPIO
likely cares about strongly.
> The use of it in the Rust code already is kind of proof of this, it
> enables driver authors to not have to worry about a ton of real-world
> issues they would have to otherwise. Which is why I suggested copying
> that pattern into C to help us out here.
>
> Anyway...
>
> I've been traveling a ton, and it's not going to let up soon, but I'll
> try to dig into this more later next week, or on the next 12+ hour
> flight that I'll be just after that, to give a more detailed review and
> response, sorry I've not been able to do so yet.
--
Regards,
Laurent Pinchart
> Replying late to this thread, I don't think this is right for V4L2. When > it comes to solving the .remove() vs. userspace race, I think the right > solution is at the cdev level. Isn't there even prototype code from Dan Williams? "[PATCH 1/3] cdev: Finish the cdev api with queued mode support" https://lkml.org/lkml/2021/1/20/997
On Thu, Dec 11, 2025 at 12:47:16PM +0900, Wolfram Sang wrote: > > > Replying late to this thread, I don't think this is right for V4L2. When > > it comes to solving the .remove() vs. userspace race, I think the right > > solution is at the cdev level. > > Isn't there even prototype code from Dan Williams? > > "[PATCH 1/3] cdev: Finish the cdev api with queued mode support" > > https://lkml.org/lkml/2021/1/20/997 I mentioned that in my LPC talk in 2022 :-) I think we should merge that (or a rebased, possibly improved version of it). I've meant to try plumbing that series in V4L2 but couldn't find the time so far. -- Regards, Laurent Pinchart
> > Isn't there even prototype code from Dan Williams? > > > > "[PATCH 1/3] cdev: Finish the cdev api with queued mode support" > > > > https://lkml.org/lkml/2021/1/20/997 > > I mentioned that in my LPC talk in 2022 :-) I think we should merge that > (or a rebased, possibly improved version of it). I've meant to try > plumbing that series in V4L2 but couldn't find the time so far. Yes, you mentioned it in 2022 but maybe not everyone in this thread is right now aware of it ;) The patch above got changes requested. I talked to Dan very briefly about it at Maintainers Summit 2023 and he was also open (back then) to pick it up again.
On Thu, Dec 11, 2025 at 05:36:57PM +0900, Wolfram Sang wrote: > > > > Isn't there even prototype code from Dan Williams? > > > > > > "[PATCH 1/3] cdev: Finish the cdev api with queued mode support" > > > > > > https://lkml.org/lkml/2021/1/20/997 > > > > I mentioned that in my LPC talk in 2022 :-) I think we should merge that > > (or a rebased, possibly improved version of it). I've meant to try > > plumbing that series in V4L2 but couldn't find the time so far. > > Yes, you mentioned it in 2022 but maybe not everyone in this thread is > right now aware of it ;) The patch above got changes requested. I talked > to Dan very briefly about it at Maintainers Summit 2023 and he was also > open (back then) to pick it up again. After discussing with Tzung-Bi today after his presentation (thank you Tzung-Bi for your time, it helped me understand the problem you're facing better), I wonder if this series is fixing the issue in the right place. At the core of the problem is a devm_kzalloc() call to allocate driver-specific data. That data structure is then referenced from a cdev, which can dereference is after it gets freed. It seems that reference-counting the data structure instead of using devm_kzalloc() could be a better solution. -- Regards, Laurent Pinchart
On Thu, Dec 11, 2025 at 10:43:06PM +0900, Laurent Pinchart wrote: > On Thu, Dec 11, 2025 at 05:36:57PM +0900, Wolfram Sang wrote: > > > > > > Isn't there even prototype code from Dan Williams? > > > > > > > > "[PATCH 1/3] cdev: Finish the cdev api with queued mode support" > > > > > > > > https://lkml.org/lkml/2021/1/20/997 > > > > > > I mentioned that in my LPC talk in 2022 :-) I think we should merge that > > > (or a rebased, possibly improved version of it). I've meant to try > > > plumbing that series in V4L2 but couldn't find the time so far. > > > > Yes, you mentioned it in 2022 but maybe not everyone in this thread is > > right now aware of it ;) The patch above got changes requested. I talked > > to Dan very briefly about it at Maintainers Summit 2023 and he was also > > open (back then) to pick it up again. > > After discussing with Tzung-Bi today after his presentation (thank you > Tzung-Bi for your time, it helped me understand the problem you're > facing better), I wonder if this series is fixing the issue in the right > place. Thank you for your time too for providing me some more context. > At the core of the problem is a devm_kzalloc() call to allocate > driver-specific data. That data structure is then referenced from a > cdev, which can dereference is after it gets freed. It seems that > reference-counting the data structure instead of using devm_kzalloc() > could be a better solution. After discussing with you, I recalled this was one of my previous attempts. See the series [1] and Greg's feedback [2]. I want to provide some more context about the cdev level solution. I failed to do so for misc device [3] mainly because all misc devices share a same cdev [4]. If one of the misc device drivers "revoke" the cdev, all other drivers stop working. I'm not saying we shouldn't seek for cdev level solution. But at least it doesn't work for misc device. Still need some other ways for misc devices. [1] https://lore.kernel.org/chrome-platform/20250721044456.2736300-8-tzungbi@kernel.org/ [2] https://lore.kernel.org/chrome-platform/2025072114-unifier-screen-1594@gregkh/ [3] https://lore.kernel.org/chrome-platform/aQ1xfHuyg1y8eJQ_@google.com/ [4] https://elixir.bootlin.com/linux/v6.17/source/drivers/char/misc.c#L299
On Thu, Dec 11, 2025 at 02:46:08PM +0000, Tzung-Bi Shih wrote:
> On Thu, Dec 11, 2025 at 10:43:06PM +0900, Laurent Pinchart wrote:
> > On Thu, Dec 11, 2025 at 05:36:57PM +0900, Wolfram Sang wrote:
> > >
> > > > > Isn't there even prototype code from Dan Williams?
> > > > >
> > > > > "[PATCH 1/3] cdev: Finish the cdev api with queued mode support"
> > > > >
> > > > > https://lkml.org/lkml/2021/1/20/997
> > > >
> > > > I mentioned that in my LPC talk in 2022 :-) I think we should merge that
> > > > (or a rebased, possibly improved version of it). I've meant to try
> > > > plumbing that series in V4L2 but couldn't find the time so far.
> > >
> > > Yes, you mentioned it in 2022 but maybe not everyone in this thread is
> > > right now aware of it ;) The patch above got changes requested. I talked
> > > to Dan very briefly about it at Maintainers Summit 2023 and he was also
> > > open (back then) to pick it up again.
> >
> > After discussing with Tzung-Bi today after his presentation (thank you
> > Tzung-Bi for your time, it helped me understand the problem you're
> > facing better), I wonder if this series is fixing the issue in the right
> > place.
>
> Thank you for your time too for providing me some more context.
>
> > At the core of the problem is a devm_kzalloc() call to allocate
> > driver-specific data. That data structure is then referenced from a
> > cdev, which can dereference is after it gets freed. It seems that
> > reference-counting the data structure instead of using devm_kzalloc()
> > could be a better solution.
>
> After discussing with you, I recalled this was one of my previous attempts.
> See the series [1] and Greg's feedback [2].
>
> I want to provide some more context about the cdev level solution. I failed
> to do so for misc device [3] mainly because all misc devices share a same
> cdev [4]. If one of the misc device drivers "revoke" the cdev, all other
> drivers stop working.
>
> I'm not saying we shouldn't seek for cdev level solution. But at least it
> doesn't work for misc device. Still need some other ways for misc devices.
>
> [1] https://lore.kernel.org/chrome-platform/20250721044456.2736300-8-tzungbi@kernel.org/
> [2] https://lore.kernel.org/chrome-platform/2025072114-unifier-screen-1594@gregkh/
> [3] https://lore.kernel.org/chrome-platform/aQ1xfHuyg1y8eJQ_@google.com/
> [4] https://elixir.bootlin.com/linux/v6.17/source/drivers/char/misc.c#L299
Continuing the context, the subsystem level solution for misc device without
revocable could be more or less like the following patch. Observed 2 main
issues of it:
1. Because it tries to synchronize the misc device and open files, it has a
big lock between them. misc_deregister() needs to wait for all open files.
I think this is a common issue shared by "replacing file operations"
approaches. All file operations are considered as critical sections.
2. It doesn't stop existing open files. UAF still happens when the dangling
FD tries to access the miscdevice (which should have been freed).
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 726516fb0a3b..0ce415da10c2 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -115,6 +116,89 @@ static const struct seq_operations misc_seq_ops = {
};
#endif
+static struct miscdevice *find_miscdevice(int minor)
+{
+ struct miscdevice *c;
+
+ list_for_each_entry(c, &misc_list, list)
+ if (c->minor == minor)
+ return c;
+ return NULL;
+}
+
+static __poll_t misc_some_poll(struct file *filp, poll_table *wait)
+{
+ struct miscdevice *c;
+
+ c = find_miscdevice(iminor(filp->f_inode));
+ if (!c)
+ return -ENODEV;
+ if (!c->fops->poll)
+ return 0;
+
+ guard(mutex)(&c->some_lock);
+ if (!c->registered)
+ return -ENODEV;
+ return c->fops->poll(filp, wait);
+}
+
+static const struct file_operations misc_some_fops = {
+ .poll = misc_some_poll,
+ .read = misc_some_read,
+ .unlocked_ioctl = misc_some_ioctl,
+ .release = misc_some_release,
+};
@@ -161,6 +245,7 @@ static int misc_open(struct inode *inode, struct file *file)
replace_fops(file, new_fops);
if (file->f_op->open)
err = file->f_op->open(inode, file);
+ file->f_op = &misc_some_fops;
fail:
mutex_unlock(&misc_mtx);
return err;
@@ -262,6 +347,8 @@ int misc_register(struct miscdevice *misc)
goto out;
}
+ mutex_init(&misc->some_lock);
+ misc->registered = true;
/*
* Add it to the front, so that later devices can "override"
* earlier defaults
@@ -283,6 +370,9 @@ EXPORT_SYMBOL(misc_register);
void misc_deregister(struct miscdevice *misc)
{
+ scoped_guard(mutex, &misc->some_lock)
+ misc->registered = false;
+
mutex_lock(&misc_mtx);
list_del_init(&misc->list);
device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor));
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 7d0aa718499c..3b42cf273f97 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -92,6 +92,8 @@ struct miscdevice {
const struct attribute_group **groups;
const char *nodename;
umode_t mode;
+ struct mutex some_lock;
+ bool registered;
};
On Thu, Oct 23, 2025 at 05:04:57PM +0200, Greg Kroah-Hartman wrote:
> On Thu, Oct 23, 2025 at 11:51:31AM -0300, Jason Gunthorpe wrote:
> > On Thu, Oct 23, 2025 at 10:22:01PM +0800, Tzung-Bi Shih wrote:
> >
> > > I was misunderstanding about the "sync" we were discussing for
> > > misc_deregister_sync(). The "sync", is analogous to synchronize_srcu()
> > > of revocable_provider_revoke() in the revocable version [1], doesn't wait
> > > for closing all opened files.
> >
> > Yes, and my remark is we don't need to obfuscate simple locks in core
> > kernel code.
>
> {sigh}
>
> Yes, that's not the goal here at all.
It is what is being proposed by this series.
> I've refrained from jumping in as I think we are thinking of different
> stuff here, probably talking past each other in places.
>
> The original goal of having "revocable" is still needed, despite you
> feeling that cdev can live without it (I strongly disagree with that,
> and the v4l, gpio, i2c, and other subsystem developers have feelings
> along those lines as backed up by the many talks over the years about
> this.)
Yes, I undertand that, but this example is just not a good
justification or vehicle for it.
cdev wants a sync unregister, this is a common pattern open coded in
many subsystems. It solves problems for alot of places, including the
bug identified in this series.
Dan and Laurent have brought this up in the past, it is something that
can be fixed and Tzung-Bi's fops shimming approach is a good idea.
Of course it is not *every* problem and there may still be a role for
revokable.
This series is fixing a simple driver bug that is due to not having a
misc_unregister_sync(). So let's fix that in the natural way and find
some other, hopefully better, reason to introduce revocable.
> The use of it in the Rust code already is kind of proof of this, it
> enables driver authors to not have to worry about a ton of real-world
> issues they would have to otherwise. Which is why I suggested copying
> that pattern into C to help us out here.
IMHO the rust code does it principally because the sync unregister
life cycle model does not fit naturally into rust.
Jason
On Thu Oct 23, 2025 at 5:57 PM CEST, Jason Gunthorpe wrote: > IMHO the rust code does it principally because the sync unregister > life cycle model does not fit naturally into rust. That's not the case. In fact, we try to give as much "sync" guarantees as possible. For instance, when a driver registers an IRQ the irq::Registration API enforces that the IRQ is unregistered before the registering device is unbound. As a consequence, the IRQ callback can provide a &Device<Bound>, which acts as a "cookie" that proves that for this scope (IRQ callback) the device is guaranteed to be bound. With this "cookie" we can then directly access device resources (such as I/O memory) that is within a Devres (and hence a Revocable) container directly, *without* any locking. I.e. we can safely bypass the Revocable and hence its overhead. The idea is to utilize this pattern for every applicable scope, e.g. workqueues / work items, timers, IRQs, substems callbacks, IOCTLs, etc. Only for scopes where no such guarantee can be given upheld, the caller actually has to go through the Revocable. And this is good, because it means the caller is indeed in a scope where there is no guarantee that the device is not unbound concurrently. So, what the Rust code aims at, is to guarantee correctness in either case. But in order to achieve that without unnecessary overhead, all the other APIs (e.g. IRQ, workqueue, etc.) have to provide specific "sync" APIs playing along the driver model. The difference between C and Rust here is mostly that the "safely bypass Revocable" trick is only possible due to Rust's type system (and hence the compiler) stopping people from doing it in an unsafe way. In C that's not possible unfortunately.
On Thu, Oct 23, 2025 at 06:20:02PM +0200, Danilo Krummrich wrote: > On Thu Oct 23, 2025 at 5:57 PM CEST, Jason Gunthorpe wrote: > > IMHO the rust code does it principally because the sync unregister > > life cycle model does not fit naturally into rust. > > That's not the case. > > In fact, we try to give as much "sync" guarantees as possible. For instance, > when a driver registers an IRQ the irq::Registration API enforces that the IRQ > is unregistered before the registering device is unbound. > > As a consequence, the IRQ callback can provide a &Device<Bound>, which acts as a > "cookie" that proves that for this scope (IRQ callback) the device is guaranteed > to be bound. > > With this "cookie" we can then directly access device resources (such as I/O > memory) that is within a Devres (and hence a Revocable) container directly, > *without* any locking. I.e. we can safely bypass the Revocable and hence its > overhead. It is good news to hear it, but I think you are making the point I was trying to make. In rust if you have a Device<bound> and you skip the revocable locking, I'd argue that you don't need "revocable" at all, just enforcement of a Device<bound>. IOW the presence of revocable in rust, with all the locking, is because the sync life cycle model is not available. Sounds like the idea is that the sync model will be widely available and the revocable lock will rarely be used? Jason
On Thu Oct 23, 2025 at 6:48 PM CEST, Jason Gunthorpe wrote:
> On Thu, Oct 23, 2025 at 06:20:02PM +0200, Danilo Krummrich wrote:
>> On Thu Oct 23, 2025 at 5:57 PM CEST, Jason Gunthorpe wrote:
>> > IMHO the rust code does it principally because the sync unregister
>> > life cycle model does not fit naturally into rust.
>>
>> That's not the case.
>>
>> In fact, we try to give as much "sync" guarantees as possible. For instance,
>> when a driver registers an IRQ the irq::Registration API enforces that the IRQ
>> is unregistered before the registering device is unbound.
>>
>> As a consequence, the IRQ callback can provide a &Device<Bound>, which acts as a
>> "cookie" that proves that for this scope (IRQ callback) the device is guaranteed
>> to be bound.
>>
>> With this "cookie" we can then directly access device resources (such as I/O
>> memory) that is within a Devres (and hence a Revocable) container directly,
>> *without* any locking. I.e. we can safely bypass the Revocable and hence its
>> overhead.
>
> It is good news to hear it, but I think you are making the point I was
> trying to make.
>
> In rust if you have a Device<bound> and you skip the revocable
> locking, I'd argue that you don't need "revocable" at all, just
> enforcement of a Device<bound>.
>
> IOW the presence of revocable in rust, with all the locking, is
> because the sync life cycle model is not available.
That's not the reason, it *is* available.
Requiring a &Device<Bound> "cookie" to be able to access a device resource
directly is one part of it. The other one is to ensure that the device resource
is actually released once the device is unbound.
When a device is unbound the Revocable within a Devres container automatically
drops the device resource (i.e. calls the destructor, which, for instance,
unmaps and releases an MMIO memory region).
Subsequently, it also ensures that the device resources can't be accessed
anymore, even if a driver would hold on to the corresponding object instance:
Obviously, it can't be accessed with a &Device<Bound> anymore, because it is
impossible that the caller is within a scope where a &Device<Bound> is present.
And an access with Revocable::try_access() will fail as well, because Revocable
knows internally that the destructor of the wrapped object was called already.
So, what we achieve is that as long as the driver uses safe code (i.e. no unsafe
{}), there is no way for a driver to mess this up and produce a bug that affects
the rest of the kernel.
While at the same time there is zero overhead in "sync" scopes, and non-"sync"
scopes, which we unfortunately need in some rare cases, are still supported in a
safe way.
> Sounds like the idea is that the sync model will be widely available
> and the revocable lock will rarely be used?
That is correct.
© 2016 - 2025 Red Hat, Inc.