drivers/iommu/amd/iommu.c | 61 ++++++++++++++++++++++++++++++++------- drivers/pci/ats.c | 6 ++-- 2 files changed, 55 insertions(+), 12 deletions(-)
This series addresses some issues identified by Sashiko in the AMD IOMMU driver during the review of the subsystem-wide ATS update work [1]. Patches 1-2 address the pre-existing bugs in iommu_ignore_device() regarding the order of alias clearing and cleaning up the DTEs. Patches 3-4 refine the probe error paths. As noted[2] by Ankit & Sashiko, unconditionally calling iommu_ignore_device() on probe failure breaks IRQ remapping for devices in PD_MODE_NONE. We now split the error paths in probe_device to preserve interrupt mapping for non-fatal failures while ensuring that dangling device pointers are cleared to prevent potential UAFs. Finally, patch 5 implements the Fail Hard pattern being standardized for ATS, ensuring configuration errors are caught during probe_device and ATS enablement failures are reported with a WARN_ON(). Patch 6 is carried forward as is from the original ATS work [3] to maintain bisectibility. [1] https://lore.kernel.org/all/20260529111208.387412-1-praan@google.com/ [2] https://lore.kernel.org/all/20260529153216.2AD1E1F00899@smtp.kernel.org/ [3] https://lore.kernel.org/all/20260529111208.387412-4-praan@google.com/ Thanks, Praan Pranjal Shrivastava (6): iommu/amd: Clear aliases before setting the rlookup_table to NULL iommu/amd: Clear DTE with update_dte256 in iommu_ignore_device() iommu/amd: Split probe error paths to preserve IRQ remapping iommu/amd: Fix Use-After-Free in non-fatal probe error path iommu/amd: Fail probe on ATS configuration failure PCI/ATS: Mandate checking pci_ats_supported() before pci_prepare_ats() drivers/iommu/amd/iommu.c | 61 ++++++++++++++++++++++++++++++++------- drivers/pci/ats.c | 6 ++-- 2 files changed, 55 insertions(+), 12 deletions(-) base-commit: 283d245468a2b61c41aa8b582f25ed5615d1c304 -- 2.54.0.823.g6e5bcc1fc9-goog
Pranjal,
On 6/1/2026 7:11 PM, Pranjal Shrivastava wrote:
> This series addresses some issues identified by Sashiko in the AMD IOMMU
> driver during the review of the subsystem-wide ATS update work [1].
>
> Patches 1-2 address the pre-existing bugs in iommu_ignore_device()
> regarding the order of alias clearing and cleaning up the DTEs.
>
Thanks for the series. While this series fixes few important bugs, at the end
certain checks are still scatters. I really want to move around the code bit so
that it makes easy to read.
ex: even after this series, in probe path its calling dev_is_pci() check 3
times! Device capability is still scattered between probe() and
iommu_init_device() function.
Does something like below makes sense (untested code)? This doesn't address
iommu_ignore_device() issues. May be we should rename iommu_ignore_device() to
something liek disable_device_dma that clears TV bit and other things.
-Vasant
---<---
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 57dc8fabc7d9..c628a2e7e3a8 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -672,11 +672,12 @@ static void pdev_disable_caps(struct pci_dev *pdev)
* This function checks if the driver got a valid device from the caller to
* avoid dereferencing invalid pointers.
*/
-static bool check_device(struct device *dev)
+static bool iommu_lookup_device(struct device *dev,
+ struct amd_iommu **iommu_out, u16 *devid_out)
{
struct amd_iommu_pci_seg *pci_seg;
- struct amd_iommu *iommu;
- int devid, sbdf;
+ int sbdf;
+ u16 devid;
if (!dev)
return false;
@@ -687,7 +688,8 @@ static bool check_device(struct device *dev)
devid = PCI_SBDF_TO_DEVID(sbdf);
iommu = rlookup_amd_iommu(dev);
- if (!iommu)
+ /* Not registered yet? */
+ if (!iommu || !iommu->iommu.ops)
return false;
/* Out of our scope? */
@@ -695,25 +697,19 @@ static bool check_device(struct device *dev)
if (devid > pci_seg->last_bdf)
return false;
+ *iommu_out = iommu;
+ *devid_out = devid;
return true;
}
-static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
+static struct iommu_dev_data *iommu_init_device(struct amd_iommu *iommu,
+ struct device *dev, u16 devid)
{
struct iommu_dev_data *dev_data;
- int devid, sbdf;
-
- if (dev_iommu_priv_get(dev))
- return 0;
- sbdf = get_device_sbdf_id(dev);
- if (sbdf < 0)
- return sbdf;
-
- devid = PCI_SBDF_TO_DEVID(sbdf);
dev_data = find_dev_data(iommu, devid);
if (!dev_data)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
dev_data->dev = dev;
@@ -724,6 +720,25 @@ static int iommu_init_device(struct amd_iommu *iommu,
struct device *dev)
dev_iommu_priv_set(dev, dev_data);
setup_aliases(iommu, dev);
+ iommu_completion_wait(iommu);
+
+ return dev_data;
+}
+
+static void iommu_init_device_caps(struct iommu_dev_data *dev_data,
+ struct device *dev,
+ struct amd_iommu *iommu)
+{
+ if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2))
+ dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K;
+ else
+ dev_data->max_irqs = MAX_IRQS_PER_TABLE_512;
+
+ amd_iommu_set_pci_msi_domain(dev, iommu);
+
+ if (!dev_is_pci(dev))
+ return;
+
/*
* By default we use passthrough mode for IOMMUv2 capable device.
* But if amd_iommu=force_isolation is set (e.g. to debug DMA to
@@ -731,11 +746,21 @@ static int iommu_init_device(struct amd_iommu *iommu,
struct device *dev)
* it'll be forced to go into translation mode.
*/
if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
- dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
+ amd_iommu_gt_ppr_supported()) {
dev_data->flags = pdev_get_caps(to_pci_dev(dev));
}
- return 0;
+ /*
+ * If IOMMU and device supports PASID then it will contain max
+ * supported PASIDs, else it will be zero.
+ */
+ if (amd_iommu_pasid_supported() &&
+ pdev_pasid_supported(dev_data)) {
+ dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids,
+ pci_max_pasids(to_pci_dev(dev)));
+ }
+
+ pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT);
}
static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
@@ -2451,43 +2476,29 @@ static struct iommu_device
*amd_iommu_probe_device(struct device *dev)
struct amd_iommu *iommu;
struct iommu_dev_data *dev_data;
int ret;
+ u16 devid;
- if (!check_device(dev))
- return ERR_PTR(-ENODEV);
-
- iommu = rlookup_amd_iommu(dev);
- if (!iommu)
- return ERR_PTR(-ENODEV);
-
- /* Not registered yet? */
- if (!iommu->iommu.ops)
+ if (!iommu_lookup_device(dev, &iommu, &devid))
return ERR_PTR(-ENODEV);
if (dev_iommu_priv_get(dev))
return &iommu->iommu;
- ret = iommu_init_device(iommu, dev);
- if (ret) {
- dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
- iommu_dev = ERR_PTR(ret);
+ dev_data = iommu_init_device(iommu, dev, devid);
+ if (IS_ERR(dev_data)) {
+ dev_err(dev, "Failed to initialize \n", devid);
iommu_ignore_device(iommu, dev);
- goto out_err;
+ return ERR_CAST(dev_data);
}
- amd_iommu_set_pci_msi_domain(dev, iommu);
+ iommu_init_device_caps(dev_data, dev, iommu);
iommu_dev = &iommu->iommu;
/*
- * If IOMMU and device supports PASID then it will contain max
- * supported PASIDs, else it will be zero.
+ * When DMA translation is unavailable return error so the iommu core
+ * won't attempt domain attach for this device. But interrupt-remap
+ * is still supported. Hence do not ignore the device.
*/
- dev_data = dev_iommu_priv_get(dev);
- if (amd_iommu_pasid_supported() && dev_is_pci(dev) &&
- pdev_pasid_supported(dev_data)) {
- dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids,
- pci_max_pasids(to_pci_dev(dev)));
- }
-
if (amd_iommu_pgtable == PD_MODE_NONE) {
pr_warn_once("%s: DMA translation not supported by iommu.\n",
__func__);
@@ -2495,16 +2506,6 @@ static struct iommu_device *amd_iommu_probe_device(struct
device *dev)
goto out_err;
}
- iommu_completion_wait(iommu);
-
- if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2))
- dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K;
- else
- dev_data->max_irqs = MAX_IRQS_PER_TABLE_512;
-
- if (dev_is_pci(dev))
- pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT);
-
out_err:
return iommu_dev;
}
> Patches 3-4 refine the probe error paths. As noted[2] by Ankit & Sashiko,
> unconditionally calling iommu_ignore_device() on probe failure breaks
> IRQ remapping for devices in PD_MODE_NONE. We now split the error paths
> in probe_device to preserve interrupt mapping for non-fatal failures
> while ensuring that dangling device pointers are cleared to prevent
> potential UAFs.
>
> Finally, patch 5 implements the Fail Hard pattern being standardized
> for ATS, ensuring configuration errors are caught during probe_device and
> ATS enablement failures are reported with a WARN_ON().
>
> Patch 6 is carried forward as is from the original ATS work [3] to
> maintain bisectibility.
>
> [1] https://lore.kernel.org/all/20260529111208.387412-1-praan@google.com/
> [2] https://lore.kernel.org/all/20260529153216.2AD1E1F00899@smtp.kernel.org/
> [3] https://lore.kernel.org/all/20260529111208.387412-4-praan@google.com/
>
> Thanks,
> Praan
>
> Pranjal Shrivastava (6):
> iommu/amd: Clear aliases before setting the rlookup_table to NULL
> iommu/amd: Clear DTE with update_dte256 in iommu_ignore_device()
> iommu/amd: Split probe error paths to preserve IRQ remapping
> iommu/amd: Fix Use-After-Free in non-fatal probe error path
> iommu/amd: Fail probe on ATS configuration failure
> PCI/ATS: Mandate checking pci_ats_supported() before pci_prepare_ats()
>
> drivers/iommu/amd/iommu.c | 61 ++++++++++++++++++++++++++++++++-------
> drivers/pci/ats.c | 6 ++--
> 2 files changed, 55 insertions(+), 12 deletions(-)
>
> base-commit: 283d245468a2b61c41aa8b582f25ed5615d1c304
On Tue, Jun 02, 2026 at 02:19:54PM +0530, Vasant Hegde wrote:
> -static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
> +static struct iommu_dev_data *iommu_init_device(struct amd_iommu *iommu,
> + struct device *dev, u16 devid)
> {
> struct iommu_dev_data *dev_data;
> - int devid, sbdf;
> -
> - if (dev_iommu_priv_get(dev))
> - return 0;
>
> - sbdf = get_device_sbdf_id(dev);
> - if (sbdf < 0)
> - return sbdf;
> -
> - devid = PCI_SBDF_TO_DEVID(sbdf);
> dev_data = find_dev_data(iommu, devid);
> if (!dev_data)
> - return -ENOMEM;
> + return ERR_PTR(-ENOMEM);
Why does the AMD driver do this? The dev data is supposed to be
allocated during probe and freed during remove, it doesn't make sense
to search for it in probe - it should not exist if the kernel is
working right.
Jason
Jason,
On 6/2/2026 5:38 PM, Jason Gunthorpe wrote:
> On Tue, Jun 02, 2026 at 02:19:54PM +0530, Vasant Hegde wrote:
>> -static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
>> +static struct iommu_dev_data *iommu_init_device(struct amd_iommu *iommu,
>> + struct device *dev, u16 devid)
>> {
>> struct iommu_dev_data *dev_data;
>> - int devid, sbdf;
>> -
>> - if (dev_iommu_priv_get(dev))
>> - return 0;
>>
>> - sbdf = get_device_sbdf_id(dev);
>> - if (sbdf < 0)
>> - return sbdf;
>> -
>> - devid = PCI_SBDF_TO_DEVID(sbdf);
>> dev_data = find_dev_data(iommu, devid);
>> if (!dev_data)
>> - return -ENOMEM;
>> + return ERR_PTR(-ENOMEM);
>
> Why does the AMD driver do this? The dev data is supposed to be
> allocated during probe and freed during remove, it doesn't make sense
> to search for it in probe - it should not exist if the kernel is
> working right.
Digging into git history and how its all evolved, I agree. We should fix up
find_dev_data() stuff. we should separate out search from allocation. I will add
separate fix once this series settles.
-Vasant
© 2016 - 2026 Red Hat, Inc.