When an IOMMU hardware detects an error due to a faulty device (e.g. an ATS
invalidation timeout), IOMMU drivers may quarantine the device by disabling
specific hardware features or dropping translation capabilities.
However, the core-level states of the faulty device are out of sync, as the
device can still be attached to a translation domain, or even potentially be
moved to a new domain that might overwrite the driver-level quarantine.
Given that such an error is likely reported from an ISR context, introduce a
broken_work per iommu_group, and add a helper function to allow drivers to
report the broken device, so as to completely quarantine it in the core.
Use the existing pci_dev_reset_iommu_prepare() function to shift the device
to its resetting_domain/blocking_domain. A later pci_dev_reset_iommu_done()
call will clear it and move it out of the quarantine.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
include/linux/iommu.h | 2 ++
drivers/iommu/iommu.c | 59 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9ba12b2164724..9b5f94e566ff9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -891,6 +891,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev)
#define iommu_get_iommu_dev(dev, type, member) \
container_of(__iommu_get_iommu_dev(dev), type, member)
+void iommu_report_device_broken(struct device *dev);
+
static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
{
*gather = (struct iommu_iotlb_gather) {
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index fcd2902d9e8db..2f297f689a3a3 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -55,6 +55,8 @@ struct iommu_group {
struct list_head devices;
struct xarray pasid_array;
struct mutex mutex;
+ struct work_struct broken_work;
+ bool requires_reset;
void *iommu_data;
void (*iommu_data_release)(void *iommu_data);
char *name;
@@ -146,6 +148,7 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
struct device *dev);
static void __iommu_group_free_device(struct iommu_group *group,
struct group_device *grp_dev);
+static void iommu_group_broken_worker(struct work_struct *work);
static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
const struct iommu_ops *ops);
@@ -1057,6 +1060,7 @@ struct iommu_group *iommu_group_alloc(void)
if (!group)
return ERR_PTR(-ENOMEM);
+ INIT_WORK(&group->broken_work, iommu_group_broken_worker);
group->kobj.kset = iommu_group_kset;
mutex_init(&group->mutex);
INIT_LIST_HEAD(&group->devices);
@@ -4031,6 +4035,7 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev)
if (WARN_ON(!group->blocking_domain))
return;
+ WRITE_ONCE(group->requires_reset, false);
/*
* A PCI device might have been in an error state, so the IOMMU driver
* had to quarantine the device by disabling specific hardware feature
@@ -4062,6 +4067,60 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev)
}
EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
+static void iommu_group_broken_worker(struct work_struct *work)
+{
+ struct iommu_group *group =
+ container_of(work, struct iommu_group, broken_work);
+ struct pci_dev *pdev = NULL;
+ struct device *dev;
+
+ scoped_guard(mutex, &group->mutex) {
+ /* Do not block the device again if it has been recovered */
+ if (!READ_ONCE(group->requires_reset))
+ goto out_put;
+ if (list_is_singular(&group->devices)) {
+ /* Note: only support group with a single device */
+ dev = iommu_group_first_dev(group);
+ if (dev_is_pci(dev)) {
+ pdev = to_pci_dev(dev);
+ pci_dev_get(pdev);
+ }
+ }
+ }
+
+ if (pdev) {
+ /*
+ * Quarantine the device completely. This will be cleared upon
+ * a pci_dev_reset_iommu_done() call indicating the recovery.
+ */
+ pci_dev_lock(pdev);
+ pci_dev_reset_iommu_prepare(pdev);
+ pci_dev_unlock(pdev);
+ pci_dev_put(pdev);
+ }
+out_put:
+ iommu_group_put(group);
+}
+
+void iommu_report_device_broken(struct device *dev)
+{
+ struct iommu_group *group = iommu_group_get(dev);
+
+ if (!group)
+ return;
+
+ if (READ_ONCE(group->requires_reset)) {
+ iommu_group_put(group);
+ return;
+ }
+ WRITE_ONCE(group->requires_reset, true);
+
+ /* Put the group now or later in iommu_group_broken_worker() */
+ if (!schedule_work(&group->broken_work))
+ iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_broken);
+
#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
/**
* iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
--
2.43.0
On 3/18/26 3:15 AM, Nicolin Chen wrote: > When an IOMMU hardware detects an error due to a faulty device (e.g. an ATS > invalidation timeout), IOMMU drivers may quarantine the device by disabling > specific hardware features or dropping translation capabilities. > > However, the core-level states of the faulty device are out of sync, as the > device can be still attached to a translation domain or even potentially be > moved to a new domain that might overwrite the driver-level quarantine. > > Given that such an error can be likely an ISR, introduce a broken_work per > iommu_group, and add a helper function to allow driver to report the broken > device, so as to completely quarantine it in the core. > > Use the existing pci_dev_reset_iommu_prepare() function to shift the device > to its resetting_domain/blocking_domain. A later pci_dev_reset_iommu_done() > call will clear it and move it out of the quarantine. > > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> > --- > include/linux/iommu.h | 2 ++ > drivers/iommu/iommu.c | 59 +++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 61 insertions(+) > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > index 9ba12b2164724..9b5f94e566ff9 100644 > --- a/include/linux/iommu.h > +++ b/include/linux/iommu.h > @@ -891,6 +891,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev) > #define iommu_get_iommu_dev(dev, type, member) \ > container_of(__iommu_get_iommu_dev(dev), type, member) > > +void iommu_report_device_broken(struct device *dev); > + This declaration is inside the #ifdef CONFIG_IOMMU_API section, but there's no corresponding stub in the #else block. While current callers (arm-smmu-v3) always have CONFIG_IOMMU_API, for API completeness, please add a stub. Thanks. Shuai
On Wed, Mar 18, 2026 at 07:45:19PM +0800, Shuai Xue wrote: > On 3/18/26 3:15 AM, Nicolin Chen wrote: > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > > index 9ba12b2164724..9b5f94e566ff9 100644 > > --- a/include/linux/iommu.h > > +++ b/include/linux/iommu.h > > @@ -891,6 +891,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev) > > #define iommu_get_iommu_dev(dev, type, member) \ > > container_of(__iommu_get_iommu_dev(dev), type, member) > > +void iommu_report_device_broken(struct device *dev); > > + > > This declaration is inside the #ifdef CONFIG_IOMMU_API section, but > there's no corresponding stub in the #else block. While current > callers (arm-smmu-v3) always have CONFIG_IOMMU_API, for API > completeness, please add a stub. Yes. That's a miss. Thanks Nicolin
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, March 18, 2026 3:16 AM
>
> +static void iommu_group_broken_worker(struct work_struct *work)
> +{
> + struct iommu_group *group =
> + container_of(work, struct iommu_group, broken_work);
> + struct pci_dev *pdev = NULL;
> + struct device *dev;
> +
> + scoped_guard(mutex, &group->mutex) {
> + /* Do not block the device again if it has been recovered */
> + if (!READ_ONCE(group->requires_reset))
> + goto out_put;
> + if (list_is_singular(&group->devices)) {
> + /* Note: only support group with a single device */
this series is about fixing a vulnerability. Then it sounds incomplete to
leave certain configuration still under risk. Probably we should first
ensure ATS can be enabled only in singleton group, just like how we
did for pci_enable_pasid()?
> + dev = iommu_group_first_dev(group);
> + if (dev_is_pci(dev)) {
> + pdev = to_pci_dev(dev);
> + pci_dev_get(pdev);
> + }
> + }
> + }
> +
> + if (pdev) {
> + /*
> + * Quarantine the device completely. This will be cleared
> upon
> + * a pci_dev_reset_iommu_done() call indicating the recovery.
> + */
> + pci_dev_lock(pdev);
> + pci_dev_reset_iommu_prepare(pdev);
let's rename it to iommu_quarantine_device() to be called here. then
have another wrapper pci_dev_reset_iommu_prepare() to call it too.
this path has nothing to do with reset.
> + pci_dev_unlock(pdev);
> + pci_dev_put(pdev);
> + }
> +out_put:
> + iommu_group_put(group);
> +}
> +
On Wed, Mar 18, 2026 at 07:31:18AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, March 18, 2026 3:16 AM
> >
> > +static void iommu_group_broken_worker(struct work_struct *work)
> > +{
> > + struct iommu_group *group =
> > + container_of(work, struct iommu_group, broken_work);
> > + struct pci_dev *pdev = NULL;
> > + struct device *dev;
> > +
> > + scoped_guard(mutex, &group->mutex) {
> > + /* Do not block the device again if it has been recovered */
> > + if (!READ_ONCE(group->requires_reset))
> > + goto out_put;
> > + if (list_is_singular(&group->devices)) {
> > + /* Note: only support group with a single device */
>
> this series is about fixing a vulnerability. Then it sounds incomplete to
> leave certain configuration still under risk. Probably we should first
> ensure ATS can be enabled only in singleton group, just like how we
> did for pci_enable_pasid()?
I understand your concern. But I am not very sure about applying
limitation to ATS support. Would this block some existing cases?
> > + dev = iommu_group_first_dev(group);
> > + if (dev_is_pci(dev)) {
> > + pdev = to_pci_dev(dev);
> > + pci_dev_get(pdev);
> > + }
> > + }
> > + }
> > +
> > + if (pdev) {
> > + /*
> > + * Quarantine the device completely. This will be cleared
> > upon
> > + * a pci_dev_reset_iommu_done() call indicating the recovery.
> > + */
> > + pci_dev_lock(pdev);
> > + pci_dev_reset_iommu_prepare(pdev);
>
> let's rename it to iommu_quarantine_device() to be called here. then
> have another wrapper pci_dev_reset_iommu_prepare() to call it too.
>
> this path has nothing to do with reset.
But the implementation of that iommu_quarantine_device would be
still to shift to resetting_domain. Perhaps, we can rename that
to quarantine_domain or so.
Thanks
Nicolin
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Thursday, March 19, 2026 9:31 AM
>
> On Wed, Mar 18, 2026 at 07:31:18AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Wednesday, March 18, 2026 3:16 AM
> > >
> > > +static void iommu_group_broken_worker(struct work_struct *work)
> > > +{
> > > + struct iommu_group *group =
> > > + container_of(work, struct iommu_group, broken_work);
> > > + struct pci_dev *pdev = NULL;
> > > + struct device *dev;
> > > +
> > > + scoped_guard(mutex, &group->mutex) {
> > > + /* Do not block the device again if it has been recovered */
> > > + if (!READ_ONCE(group->requires_reset))
> > > + goto out_put;
> > > + if (list_is_singular(&group->devices)) {
> > > + /* Note: only support group with a single device */
> >
> > this series is about fixing a vulnerability. Then it sounds incomplete to
> > leave certain configuration still under risk. Probably we should first
> > ensure ATS can be enabled only in singleton group, just like how we
> > did for pci_enable_pasid()?
>
> I understand your concern. But I am not very sure about applying
> limitation to ATS support. Would this block some existing cases?
what about just throwing out a message to warn that enabling ATS
in a non-singleton iommu group may suffer from unquarantined
situation if any device in the group triggers a ATC invalidation timeout?
>
> > > + dev = iommu_group_first_dev(group);
> > > + if (dev_is_pci(dev)) {
> > > + pdev = to_pci_dev(dev);
> > > + pci_dev_get(pdev);
> > > + }
> > > + }
> > > + }
> > > +
> > > + if (pdev) {
> > > + /*
> > > + * Quarantine the device completely. This will be cleared
> > > upon
> > > + * a pci_dev_reset_iommu_done() call indicating the recovery.
> > > + */
> > > + pci_dev_lock(pdev);
> > > + pci_dev_reset_iommu_prepare(pdev);
> >
> > let's rename it to iommu_quarantine_device() to be called here. then
> > have another wrapper pci_dev_reset_iommu_prepare() to call it too.
> >
> > this path has nothing to do with reset.
>
> But the implementation of that iommu_quarantine_device would be
> still to shift to resetting_domain. Perhaps, we can rename that
> to quarantine_domain or so.
>
yes let's rename that too. the purpose of this function is clearly about
quarantine. reset is just one user of it.
On Thu, Mar 19, 2026 at 02:35:33AM +0000, Tian, Kevin wrote:
> > > > + scoped_guard(mutex, &group->mutex) {
> > > > + /* Do not block the device again if it has been recovered */
> > > > + if (!READ_ONCE(group->requires_reset))
> > > > + goto out_put;
> > > > + if (list_is_singular(&group->devices)) {
> > > > + /* Note: only support group with a single device */
> > >
> > > this series is about fixing a vulnerability. Then it sounds incomplete to
> > > leave certain configuration still under risk. Probably we should first
> > > ensure ATS can be enabled only in singleton group, just like how we
> > > did for pci_enable_pasid()?
> >
> > I understand your concern. But I am not very sure about applying
> > limitation to ATS support. Would this block some existing cases?
>
> what about just throwing out a message to warn that enabling ATS
> in a non-singleton iommu group may suffer from unquarantined
> situation if any device in the group triggers a ATC invalidation timeout?
Yes. Baolu suggested the same, we could move this condition into
iommu_report_device_broken().
> > > > + /*
> > > > + * Quarantine the device completely. This will be cleared
> > > > upon
> > > > + * a pci_dev_reset_iommu_done() call indicating the recovery.
> > > > + */
> > > > + pci_dev_lock(pdev);
> > > > + pci_dev_reset_iommu_prepare(pdev);
> > >
> > > let's rename it to iommu_quarantine_device() to be called here. then
> > > have another wrapper pci_dev_reset_iommu_prepare() to call it too.
> > >
> > > this path has nothing to do with reset.
> >
> > But the implementation of that iommu_quarantine_device would be
> > still to shift to resetting_domain. Perhaps, we can rename that
> > to quarantine_domain or so.
>
> yes let's rename that too. the purpose of this function is clearly about
> quarantine. reset is just one user of it.
OK. I will make it happen.
Nicolin
On 3/18/26 03:15, Nicolin Chen wrote:
> When an IOMMU hardware detects an error due to a faulty device (e.g. an ATS
> invalidation timeout), IOMMU drivers may quarantine the device by disabling
> specific hardware features or dropping translation capabilities.
>
> However, the core-level states of the faulty device are out of sync, as the
> device can be still attached to a translation domain or even potentially be
> moved to a new domain that might overwrite the driver-level quarantine.
>
> Given that such an error can be likely an ISR, introduce a broken_work per
> iommu_group, and add a helper function to allow driver to report the broken
> device, so as to completely quarantine it in the core.
>
> Use the existing pci_dev_reset_iommu_prepare() function to shift the device
> to its resetting_domain/blocking_domain. A later pci_dev_reset_iommu_done()
> call will clear it and move it out of the quarantine.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
> include/linux/iommu.h | 2 ++
> drivers/iommu/iommu.c | 59 +++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 61 insertions(+)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 9ba12b2164724..9b5f94e566ff9 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -891,6 +891,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev)
> #define iommu_get_iommu_dev(dev, type, member) \
> container_of(__iommu_get_iommu_dev(dev), type, member)
>
> +void iommu_report_device_broken(struct device *dev);
> +
> static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
> {
> *gather = (struct iommu_iotlb_gather) {
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index fcd2902d9e8db..2f297f689a3a3 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -55,6 +55,8 @@ struct iommu_group {
> struct list_head devices;
> struct xarray pasid_array;
> struct mutex mutex;
> + struct work_struct broken_work;
> + bool requires_reset;
> void *iommu_data;
> void (*iommu_data_release)(void *iommu_data);
> char *name;
> @@ -146,6 +148,7 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
> struct device *dev);
> static void __iommu_group_free_device(struct iommu_group *group,
> struct group_device *grp_dev);
> +static void iommu_group_broken_worker(struct work_struct *work);
> static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
> const struct iommu_ops *ops);
>
> @@ -1057,6 +1060,7 @@ struct iommu_group *iommu_group_alloc(void)
> if (!group)
> return ERR_PTR(-ENOMEM);
>
> + INIT_WORK(&group->broken_work, iommu_group_broken_worker);
> group->kobj.kset = iommu_group_kset;
> mutex_init(&group->mutex);
> INIT_LIST_HEAD(&group->devices);
> @@ -4031,6 +4035,7 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev)
> if (WARN_ON(!group->blocking_domain))
> return;
>
> + WRITE_ONCE(group->requires_reset, false);
> /*
> * A PCI device might have been in an error state, so the IOMMU driver
> * had to quarantine the device by disabling specific hardware feature
> @@ -4062,6 +4067,60 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev)
> }
> EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
>
> +static void iommu_group_broken_worker(struct work_struct *work)
> +{
> + struct iommu_group *group =
> + container_of(work, struct iommu_group, broken_work);
> + struct pci_dev *pdev = NULL;
> + struct device *dev;
> +
> + scoped_guard(mutex, &group->mutex) {
> + /* Do not block the device again if it has been recovered */
> + if (!READ_ONCE(group->requires_reset))
> + goto out_put;
> + if (list_is_singular(&group->devices)) {
> + /* Note: only support group with a single device */
> + dev = iommu_group_first_dev(group);
> + if (dev_is_pci(dev)) {
> + pdev = to_pci_dev(dev);
> + pci_dev_get(pdev);
> + }
> + }
The current mechanism is designed only for the PCIe devices within
singleton iommu groups. How about moving the above check to
iommu_report_device_broken() and emitting a message if it is called for
unsupported devices?
> + }
> +
> + if (pdev) {
> + /*
> + * Quarantine the device completely. This will be cleared upon
> + * a pci_dev_reset_iommu_done() call indicating the recovery.
> + */
> + pci_dev_lock(pdev);
> + pci_dev_reset_iommu_prepare(pdev);
> + pci_dev_unlock(pdev);
> + pci_dev_put(pdev);
> + }
> +out_put:
> + iommu_group_put(group);
> +}
> +
> +void iommu_report_device_broken(struct device *dev)
> +{
> + struct iommu_group *group = iommu_group_get(dev);
> +
> + if (!group)
> + return;
> +
> + if (READ_ONCE(group->requires_reset)) {
> + iommu_group_put(group);
> + return;
> + }
> + WRITE_ONCE(group->requires_reset, true);
> +
> + /* Put the group now or later in iommu_group_broken_worker() */
> + if (!schedule_work(&group->broken_work))
> + iommu_group_put(group);
> +}
> +EXPORT_SYMBOL_GPL(iommu_report_device_broken);
> +
> #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
> /**
> * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
Thanks,
baolu
On Wed, Mar 18, 2026 at 02:13:09PM +0800, Baolu Lu wrote:
> On 3/18/26 03:15, Nicolin Chen wrote:
> > + scoped_guard(mutex, &group->mutex) {
> > + /* Do not block the device again if it has been recovered */
> > + if (!READ_ONCE(group->requires_reset))
> > + goto out_put;
> > + if (list_is_singular(&group->devices)) {
> > + /* Note: only support group with a single device */
> > + dev = iommu_group_first_dev(group);
> > + if (dev_is_pci(dev)) {
> > + pdev = to_pci_dev(dev);
> > + pci_dev_get(pdev);
> > + }
> > + }
>
> The current mechanism is designed only for the PCIe devices within
> singleton iommu groups. How about moving the above check to
> iommu_report_device_broken() and emitting a message if it is called for
> unsupported devices?
Yes. That would be cleaner.
Thanks
Nicolin
© 2016 - 2026 Red Hat, Inc.