PCIe permits a device to ignore ATS invalidation TLPs, while processing a
reset. This creates a problem visible to the OS where an ATS invalidation
command will time out: e.g. an SVA domain will have no coordination with a
reset event and can racily issue ATS invalidations to a resetting device.
The PCIe spec in sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and
block ATS before initiating a Function Level Reset. It also mentions that
other reset methods could have the same vulnerability as well.
Now iommu_dev_reset_prepare/done() helpers are introduced for this matter.
Use them in all the existing reset functions, which will attach the device
to an IOMMU_DOMAIN_BLOCKED during a reset, so as to allow IOMMU driver to:
- invoke pci_disable_ats() and pci_enable_ats(), if necessary
- wait for all ATS invalidations to complete
- stop issuing new ATS invalidations
- fence any incoming ATS queries
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/pci/pci-acpi.c | 17 +++++++++--
drivers/pci/pci.c | 68 ++++++++++++++++++++++++++++++++++++++----
drivers/pci/quirks.c | 23 +++++++++++++-
3 files changed, 100 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index ddb25960ea47d..adaf46422c05d 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -9,6 +9,7 @@
#include <linux/delay.h>
#include <linux/init.h>
+#include <linux/iommu.h>
#include <linux/irqdomain.h>
#include <linux/pci.h>
#include <linux/msi.h>
@@ -969,6 +970,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev)
int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
{
acpi_handle handle = ACPI_HANDLE(&dev->dev);
+ int ret = 0;
if (!handle || !acpi_has_method(handle, "_RST"))
return -ENOTTY;
@@ -976,12 +978,23 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
if (probe)
return 0;
+ /*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
+ * before initiating a reset. Notify the iommu driver that enabled ATS.
+ */
+ ret = iommu_dev_reset_prepare(&dev->dev);
+ if (ret) {
+ pci_err(dev, "failed to stop IOMMU\n");
+ return ret;
+ }
+
if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) {
pci_warn(dev, "ACPI _RST failed\n");
- return -ENOTTY;
+ ret = -ENOTTY;
}
- return 0;
+ iommu_dev_reset_done(&dev->dev);
+ return ret;
}
bool acpi_pci_power_manageable(struct pci_dev *dev)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b0f4d98036cdd..d6d87e22d81b3 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -13,6 +13,7 @@
#include <linux/delay.h>
#include <linux/dmi.h>
#include <linux/init.h>
+#include <linux/iommu.h>
#include <linux/msi.h>
#include <linux/of.h>
#include <linux/pci.h>
@@ -4529,13 +4530,26 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction);
*/
int pcie_flr(struct pci_dev *dev)
{
+ int ret = 0;
+
if (!pci_wait_for_pending_transaction(dev))
pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n");
+ /*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
+ * before initiating a reset. Notify the iommu driver that enabled ATS.
+ * Have to call it after waiting for pending DMA transaction.
+ */
+ ret = iommu_dev_reset_prepare(&dev->dev);
+ if (ret) {
+ pci_err(dev, "failed to stop IOMMU\n");
+ return ret;
+ }
+
pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR);
if (dev->imm_ready)
- return 0;
+ goto done;
/*
* Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within
@@ -4544,7 +4558,11 @@ int pcie_flr(struct pci_dev *dev)
*/
msleep(100);
- return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
+ ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
+
+done:
+ iommu_dev_reset_done(&dev->dev);
+ return ret;
}
EXPORT_SYMBOL_GPL(pcie_flr);
@@ -4572,6 +4590,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr);
static int pci_af_flr(struct pci_dev *dev, bool probe)
{
+ int ret = 0;
int pos;
u8 cap;
@@ -4598,10 +4617,21 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
PCI_AF_STATUS_TP << 8))
pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n");
+ /*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
+ * before initiating a reset. Notify the iommu driver that enabled ATS.
+ * Have to call it after waiting for pending DMA transaction.
+ */
+ ret = iommu_dev_reset_prepare(&dev->dev);
+ if (ret) {
+ pci_err(dev, "failed to stop IOMMU\n");
+ return ret;
+ }
+
pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
if (dev->imm_ready)
- return 0;
+ goto done;
/*
* Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006,
@@ -4611,7 +4641,11 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
*/
msleep(100);
- return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
+ ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
+
+done:
+ iommu_dev_reset_done(&dev->dev);
+ return ret;
}
/**
@@ -4632,6 +4666,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
static int pci_pm_reset(struct pci_dev *dev, bool probe)
{
u16 csr;
+ int ret;
if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET)
return -ENOTTY;
@@ -4646,6 +4681,16 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
if (dev->current_state != PCI_D0)
return -EINVAL;
+ /*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
+ * before initiating a reset. Notify the iommu driver that enabled ATS.
+ */
+ ret = iommu_dev_reset_prepare(&dev->dev);
+ if (ret) {
+ pci_err(dev, "failed to stop IOMMU\n");
+ return ret;
+ }
+
csr &= ~PCI_PM_CTRL_STATE_MASK;
csr |= PCI_D3hot;
pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
@@ -4656,7 +4701,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
pci_dev_d3_sleep(dev);
- return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
+ ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
+ iommu_dev_reset_done(&dev->dev);
+ return ret;
}
/**
@@ -5111,6 +5158,16 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
if (rc)
return -ENOTTY;
+ /*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
+ * before initiating a reset. Notify the iommu driver that enabled ATS.
+ */
+ rc = iommu_dev_reset_prepare(&dev->dev);
+ if (rc) {
+ pci_err(dev, "failed to stop IOMMU\n");
+ return rc;
+ }
+
if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) {
val = reg;
} else {
@@ -5125,6 +5182,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
reg);
+ iommu_dev_reset_done(&dev->dev);
return rc;
}
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index d97335a401930..6157c6c02bdb0 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -21,6 +21,7 @@
#include <linux/pci.h>
#include <linux/isa-dma.h> /* isa_dma_bridge_buggy */
#include <linux/init.h>
+#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/acpi.h>
#include <linux/dmi.h>
@@ -4225,6 +4226,26 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
{ 0 }
};
+static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe,
+ const struct pci_dev_reset_methods *i)
+{
+ int ret;
+
+ /*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
+ * before initiating a reset. Notify the iommu driver that enabled ATS.
+ */
+ ret = iommu_dev_reset_prepare(&dev->dev);
+ if (ret) {
+ pci_err(dev, "failed to stop IOMMU\n");
+ return ret;
+ }
+
+ ret = i->reset(dev, probe);
+ iommu_dev_reset_done(&dev->dev);
+ return ret;
+}
+
/*
* These device-specific reset methods are here rather than in a driver
* because when a host assigns a device to a guest VM, the host may need
@@ -4239,7 +4260,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
i->vendor == (u16)PCI_ANY_ID) &&
(i->device == dev->device ||
i->device == (u16)PCI_ANY_ID))
- return i->reset(dev, probe);
+ return __pci_dev_specific_reset(dev, probe, i);
}
return -ENOTTY;
--
2.43.0
On 8/12/2025 6:59 AM, Nicolin Chen wrote: > PCIe permits a device to ignore ATS invalidation TLPs, while processing a > reset. This creates a problem visible to the OS where an ATS invalidation > command will time out: e.g. an SVA domain will have no coordination with a > reset event and can racily issue ATS invalidations to a resetting device. > > The PCIe spec in sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and > block ATS before initiating a Function Level Reset. It also mentions that > other reset methods could have the same vulnerability as well. > > Now iommu_dev_reset_prepare/done() helpers are introduced for this matter. > Use them in all the existing reset functions, which will attach the device > to an IOMMU_DOMAIN_BLOCKED during a reset, so as to allow IOMMU driver to: > - invoke pci_disable_ats() and pci_enable_ats(), if necessary > - wait for all ATS invalidations to complete > - stop issuing new ATS invalidations > - fence any incoming ATS queries > > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> > --- > drivers/pci/pci-acpi.c | 17 +++++++++-- > drivers/pci/pci.c | 68 ++++++++++++++++++++++++++++++++++++++---- > drivers/pci/quirks.c | 23 +++++++++++++- > 3 files changed, 100 insertions(+), 8 deletions(-) > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c > index ddb25960ea47d..adaf46422c05d 100644 > --- a/drivers/pci/pci-acpi.c > +++ b/drivers/pci/pci-acpi.c > @@ -9,6 +9,7 @@ > > #include <linux/delay.h> > #include <linux/init.h> > +#include <linux/iommu.h> > #include <linux/irqdomain.h> > #include <linux/pci.h> > #include <linux/msi.h> > @@ -969,6 +970,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev) > int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) > { > acpi_handle handle = ACPI_HANDLE(&dev->dev); > + int ret = 0; > > if (!handle || !acpi_has_method(handle, "_RST")) > return -ENOTTY; > @@ -976,12 +978,23 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) > if (probe) > return 0; > > + /* > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > + * before initiating a reset. Notify the iommu driver that enabled ATS. > + */ > + ret = iommu_dev_reset_prepare(&dev->dev); > + if (ret) { > + pci_err(dev, "failed to stop IOMMU\n"); > + return ret; > + } > + > if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) { > pci_warn(dev, "ACPI _RST failed\n"); > - return -ENOTTY; > + ret = -ENOTTY; > } > > - return 0; > + iommu_dev_reset_done(&dev->dev); > + return ret; > } > > bool acpi_pci_power_manageable(struct pci_dev *dev) > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c > index b0f4d98036cdd..d6d87e22d81b3 100644 > --- a/drivers/pci/pci.c > +++ b/drivers/pci/pci.c > @@ -13,6 +13,7 @@ > #include <linux/delay.h> > #include <linux/dmi.h> > #include <linux/init.h> > +#include <linux/iommu.h> > #include <linux/msi.h> > #include <linux/of.h> > #include <linux/pci.h> > @@ -4529,13 +4530,26 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction); > */ > int pcie_flr(struct pci_dev *dev) > { > + int ret = 0; > + > if (!pci_wait_for_pending_transaction(dev)) > pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); > > + /* > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > + * before initiating a reset. Notify the iommu driver that enabled ATS. > + * Have to call it after waiting for pending DMA transaction. > + */ > + ret = iommu_dev_reset_prepare(&dev->dev); If we dont' consider the association between IOMMU and devices in FLR(), it can be understood that more complex processing logic resides outside this function. However, if the FLR() function already synchironizes and handles the association with IOMMU like this (disabling ATS by attaching device to blocking domain), then how would the following scenarios behave ? 1. Reset one of PCIe alias devices. 2. Reset PF when its VFs are actvie. .... Thanks, Ethan> + if (ret) { > + pci_err(dev, "failed to stop IOMMU\n"); > + return ret; > + } > + > pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); > > if (dev->imm_ready) > - return 0; > + goto done; > > /* > * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within > @@ -4544,7 +4558,11 @@ int pcie_flr(struct pci_dev *dev) > */ > msleep(100); > > - return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); > + ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); > + > +done: > + iommu_dev_reset_done(&dev->dev); > + return ret; > } > EXPORT_SYMBOL_GPL(pcie_flr); > > @@ -4572,6 +4590,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr); > > static int pci_af_flr(struct pci_dev *dev, bool probe) > { > + int ret = 0; > int pos; > u8 cap; > > @@ -4598,10 +4617,21 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) > PCI_AF_STATUS_TP << 8)) > pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n"); > > + /* > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > + * before initiating a reset. Notify the iommu driver that enabled ATS. > + * Have to call it after waiting for pending DMA transaction. > + */ > + ret = iommu_dev_reset_prepare(&dev->dev); > + if (ret) { > + pci_err(dev, "failed to stop IOMMU\n"); > + return ret; > + } > + > pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); > > if (dev->imm_ready) > - return 0; > + goto done; > > /* > * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006, > @@ -4611,7 +4641,11 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) > */ > msleep(100); > > - return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); > + ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); > + > +done: > + iommu_dev_reset_done(&dev->dev); > + return ret; > } > > /** > @@ -4632,6 +4666,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) > static int pci_pm_reset(struct pci_dev *dev, bool probe) > { > u16 csr; > + int ret; > > if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) > return -ENOTTY; > @@ -4646,6 +4681,16 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe) > if (dev->current_state != PCI_D0) > return -EINVAL; > > + /* > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > + * before initiating a reset. Notify the iommu driver that enabled ATS. > + */ > + ret = iommu_dev_reset_prepare(&dev->dev); > + if (ret) { > + pci_err(dev, "failed to stop IOMMU\n"); > + return ret; > + } > + > csr &= ~PCI_PM_CTRL_STATE_MASK; > csr |= PCI_D3hot; > pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); > @@ -4656,7 +4701,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe) > pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); > pci_dev_d3_sleep(dev); > > - return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); > + ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); > + iommu_dev_reset_done(&dev->dev); > + return ret; > } > > /** > @@ -5111,6 +5158,16 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) > if (rc) > return -ENOTTY; > > + /* > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > + * before initiating a reset. Notify the iommu driver that enabled ATS. > + */ > + rc = iommu_dev_reset_prepare(&dev->dev); > + if (rc) { > + pci_err(dev, "failed to stop IOMMU\n"); > + return rc; > + } > + > if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) { > val = reg; > } else { > @@ -5125,6 +5182,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) > pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, > reg); > > + iommu_dev_reset_done(&dev->dev); > return rc; > } > > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c > index d97335a401930..6157c6c02bdb0 100644 > --- a/drivers/pci/quirks.c > +++ b/drivers/pci/quirks.c > @@ -21,6 +21,7 @@ > #include <linux/pci.h> > #include <linux/isa-dma.h> /* isa_dma_bridge_buggy */ > #include <linux/init.h> > +#include <linux/iommu.h> > #include <linux/delay.h> > #include <linux/acpi.h> > #include <linux/dmi.h> > @@ -4225,6 +4226,26 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { > { 0 } > }; > > +static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe, > + const struct pci_dev_reset_methods *i) > +{ > + int ret; > + > + /* > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > + * before initiating a reset. Notify the iommu driver that enabled ATS. > + */ > + ret = iommu_dev_reset_prepare(&dev->dev); > + if (ret) { > + pci_err(dev, "failed to stop IOMMU\n"); > + return ret; > + } > + > + ret = i->reset(dev, probe); > + iommu_dev_reset_done(&dev->dev); > + return ret; > +} > + > /* > * These device-specific reset methods are here rather than in a driver > * because when a host assigns a device to a guest VM, the host may need > @@ -4239,7 +4260,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe) > i->vendor == (u16)PCI_ANY_ID) && > (i->device == dev->device || > i->device == (u16)PCI_ANY_ID)) > - return i->reset(dev, probe); > + return __pci_dev_specific_reset(dev, probe, i); > } > > return -ENOTTY;
On Tue, Aug 19, 2025 at 10:12:41PM +0800, Ethan Zhao wrote: > On 8/12/2025 6:59 AM, Nicolin Chen wrote: > > @@ -4529,13 +4530,26 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction); > > */ > > int pcie_flr(struct pci_dev *dev) > > { > > + int ret = 0; > > + > > if (!pci_wait_for_pending_transaction(dev)) > > pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); > > + /* > > + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS > > + * before initiating a reset. Notify the iommu driver that enabled ATS. > > + * Have to call it after waiting for pending DMA transaction. > > + */ > > + ret = iommu_dev_reset_prepare(&dev->dev); > If we dont' consider the association between IOMMU and devices in FLR(), > it can be understood that more complex processing logic resides outside > this function. However, if the FLR() function already synchironizes and > handles the association with IOMMU like this (disabling ATS by attaching > device to blocking domain), then how would the following scenarios > behave ? That's a good point. The iommu-level reset is per struct device. So, basically it'll match with the FLR per pci_dev. Yet, the RID isolation between siblings might be a concern: > 1. Reset one of PCIe alias devices. IIRC, an alias device might have: a) one pci_dev; multiple RIDs In this case, neither FLR nor IOMMU isolates between RIDs. So, both FLR and IOMMU blocking will reset all RIDs. There should be no issue resulted from the IOMMU blocking. b) multiple pci_devs; single RID In this case, FLR only resets one device, while the IOMMU- level reset will block the entire RID (i.e. all devices), since they share the single translation tunnel. This could break the siblings, if they aren't also being reset along. > 2. Reset PF when its VFs are actvie. c) multiple pci_devs with their own RIDs In this case, either FLR or IOMMU only resets the PF. That being said, VFs might be affected since PF is resetting? If there is an issue, I don't see it coming from the IOMMU- level reset.. Thus, case b might be breaking. So, perhaps we should add a few conditions when calling iommu_dev_reset_prepare/done(): + Make sure that the pci_dev has ATS capability + Make sure no sibling pci_dev(s) sharing the same RID + Any others? Thanks Nicolin
On Tue, Aug 19, 2025 at 02:59:07PM -0700, Nicolin Chen wrote: > c) multiple pci_devs with their own RIDs > > In this case, either FLR or IOMMU only resets the PF. That > being said, VFs might be affected since PF is resetting? > If there is an issue, I don't see it coming from the IOMMU- > level reset.. It would still allow the ATS issue from the VF side. The VF could be pushing an invalidation during the PF reset that will get clobbered. I haven't fully checked but I think Linux doesn't really (easially?) allow resetting a PF while a VF is present... Arguably if the PF is reset the VFs should have their translations blocked too. Jason
On Thu, Aug 21, 2025 at 10:07:41AM -0300, Jason Gunthorpe wrote: > On Tue, Aug 19, 2025 at 02:59:07PM -0700, Nicolin Chen wrote: > > c) multiple pci_devs with their own RIDs > > > > In this case, either FLR or IOMMU only resets the PF. That > > being said, VFs might be affected since PF is resetting? > > If there is an issue, I don't see it coming from the IOMMU- > > level reset.. > > It would still allow the ATS issue from the VF side. The VF could be > pushing an invalidation during the PF reset that will get clobbered. > > I haven't fully checked but I think Linux doesn't really (easially?) > allow resetting a PF while a VF is present... Hmm, what if the PF encountered some fault? Does Linux have a choice not to reset PF? > Arguably if the PF is reset the VFs should have their translations > blocked too. Yea, that sounds plausible to me. But, prior to that (an IOMMU-level reset), should VFs be first reset at the PCI level? Thanks Nicolin
On Thu, Aug 21, 2025 at 11:35:27PM -0700, Nicolin Chen wrote: > On Thu, Aug 21, 2025 at 10:07:41AM -0300, Jason Gunthorpe wrote: > > On Tue, Aug 19, 2025 at 02:59:07PM -0700, Nicolin Chen wrote: > > > c) multiple pci_devs with their own RIDs > > > > > > In this case, either FLR or IOMMU only resets the PF. That > > > being said, VFs might be affected since PF is resetting? > > > If there is an issue, I don't see it coming from the IOMMU- > > > level reset.. > > > > It would still allow the ATS issue from the VF side. The VF could be > > pushing an invalidation during the PF reset that will get clobbered. > > > > I haven't fully checked but I think Linux doesn't really (easially?) > > allow resetting a PF while a VF is present... > > Hmm, what if the PF encountered some fault? Does Linux have a choice > not to reset PF? Upon more reflect I guess outside VFIO I seem to remember the SRIOV reset to the PFs will clobber the VFs too and then restore the SRIOV configuration in config space to bring them back. > > Arguably if the PF is reset the VFs should have their translations > > blocked too. > > Yea, that sounds plausible to me. But, prior to that (an IOMMU-level > reset), should VFs be first reset at the PCI level? PF reset of a SRIOV PF disables the VFs and effectively resets them already. But reaching out to mangle the translation of the VFs means you do have to take care not to disrupt anything else the VF owning driver is doing since it is fully unaware of this. Ie it could be reattaching to something else concurrently. Jason
On Fri, Aug 22, 2025 at 11:08:21AM -0300, Jason Gunthorpe wrote: > On Thu, Aug 21, 2025 at 11:35:27PM -0700, Nicolin Chen wrote: > > On Thu, Aug 21, 2025 at 10:07:41AM -0300, Jason Gunthorpe wrote: > > > On Tue, Aug 19, 2025 at 02:59:07PM -0700, Nicolin Chen wrote: > > > > c) multiple pci_devs with their own RIDs > > > > > > > > In this case, either FLR or IOMMU only resets the PF. That > > > > being said, VFs might be affected since PF is resetting? > > > > If there is an issue, I don't see it coming from the IOMMU- > > > > level reset.. > > > > > > It would still allow the ATS issue from the VF side. The VF could be > > > pushing an invalidation during the PF reset that will get clobbered. > > > > > > I haven't fully checked but I think Linux doesn't really (easially?) > > > allow resetting a PF while a VF is present... > > > > Hmm, what if the PF encountered some fault? Does Linux have a choice > > not to reset PF? > > Upon more reflect I guess outside VFIO I seem to remember the SRIOV > reset to the PFs will clobber the VFs too and then restore the SRIOV > configuration in config space to bring them back. Yea, I see ci_restore_iov_state() called in pci_restore_state(). > > > Arguably if the PF is reset the VFs should have their translations > > > blocked too. > > > > Yea, that sounds plausible to me. But, prior to that (an IOMMU-level > > reset), should VFs be first reset at the PCI level? > > PF reset of a SRIOV PF disables the VFs and effectively resets them > already. Yea, I was expecting something like a SW reset routine, for each VF, which would invoke iommu_dev_reset_prepare/_done() individually. Without that, iommu_dev_reset_prepare/_done() has to iterate all VFs internally and block them. > But reaching out to mangle the translation of the VFs means you do > have to take care not to disrupt anything else the VF owning driver is > doing since it is fully unaware of this. Ie it could be reattaching to > something else concurrently. Hmm, and this is tricky now.. The current version allows deferring the concurrent attach during a reset. But, as Kevin pointed out, we may have no choice but to fail any concurrent attach with -EBUSY, because a deferred attach might fail due to incompatibility triggering a WARN_ON only in done(). This isn't likely a problem for PF, as we can expect its driver not to do an insane concurrent attach during a reset. But it would be a very sane case for a VF. So if its driver doesn't retry or defer an EBUSY-ed attach properly, it would not be restored successfully.. It feels like we need a no-fail re-attach operation, or at least an unlikely-to-fail one. I recall years ago we tried a can_attach op to test the compatibility but it didn't get merged. Maybe we'd need it so that a concurrent attach can test compatibility, allowing the re-attach in iommu_dev_reset_done() to more likely succeed. Thanks Nicolin
On Fri, Aug 22, 2025 at 11:50:58AM -0700, Nicolin Chen wrote: > It feels like we need a no-fail re-attach operation, or at least an > unlikely-to-fail one. I recall years ago we tried a can_attach op > to test the compatibility but it didn't get merged. Maybe we'd need > it so that a concurrent attach can test compatibility, allowing the > re-attach in iommu_dev_reset_done() to more likely succeed. This is probably the cleanest option to split these things Jason
On Thu, Aug 28, 2025 at 09:51:49AM -0300, Jason Gunthorpe wrote: > On Fri, Aug 22, 2025 at 11:50:58AM -0700, Nicolin Chen wrote: > > > It feels like we need a no-fail re-attach operation, or at least an > > unlikely-to-fail one. I recall years ago we tried a can_attach op > > to test the compatibility but it didn't get merged. Maybe we'd need > > it so that a concurrent attach can test compatibility, allowing the > > re-attach in iommu_dev_reset_done() to more likely succeed. > > This is probably the cleanest option to split these things Yea, that could avoid failing a concurrent attach_dev during FLR unless the dryrun fails, helping non-SRIOV cases too. So, next version could have some new preparatory patches: - Pass in old domain to attach_dev - Add a can_attach_dev op Thanks Nicolin
On Thu, Aug 28, 2025 at 08:08:13AM -0700, Nicolin Chen wrote: > On Thu, Aug 28, 2025 at 09:51:49AM -0300, Jason Gunthorpe wrote: > > On Fri, Aug 22, 2025 at 11:50:58AM -0700, Nicolin Chen wrote: > > > > > It feels like we need a no-fail re-attach operation, or at least an > > > unlikely-to-fail one. I recall years ago we tried a can_attach op > > > to test the compatibility but it didn't get merged. Maybe we'd need > > > it so that a concurrent attach can test compatibility, allowing the > > > re-attach in iommu_dev_reset_done() to more likely succeed. > > > > This is probably the cleanest option to split these things > > Yea, that could avoid failing a concurrent attach_dev during FLR > unless the dryrun fails, helping non-SRIOV cases too. > > So, next version could have some new preparatory patches: > - Pass in old domain to attach_dev > - Add a can_attach_dev op I wouldn't make this more complicated, just focus on the signal device case here then we move on from there Just adding can_attach_dev is big series on its own Jason
On Thu, Aug 28, 2025 at 03:46:08PM -0300, Jason Gunthorpe wrote: > On Thu, Aug 28, 2025 at 08:08:13AM -0700, Nicolin Chen wrote: > > On Thu, Aug 28, 2025 at 09:51:49AM -0300, Jason Gunthorpe wrote: > > > On Fri, Aug 22, 2025 at 11:50:58AM -0700, Nicolin Chen wrote: > > > > > > > It feels like we need a no-fail re-attach operation, or at least an > > > > unlikely-to-fail one. I recall years ago we tried a can_attach op > > > > to test the compatibility but it didn't get merged. Maybe we'd need > > > > it so that a concurrent attach can test compatibility, allowing the > > > > re-attach in iommu_dev_reset_done() to more likely succeed. > > > > > > This is probably the cleanest option to split these things > > > > Yea, that could avoid failing a concurrent attach_dev during FLR > > unless the dryrun fails, helping non-SRIOV cases too. > > > > So, next version could have some new preparatory patches: > > - Pass in old domain to attach_dev > > - Add a can_attach_dev op > > I wouldn't make this more complicated, just focus on the signal device > case here then we move on from there > > Just adding can_attach_dev is big series on its own OK. I suppose a concurrent attach on a single device will be rare, so failing it won't impact that much and thus can be a Part-1. Then, for part-2, we will do can_attach_dev and support SRIOV. Thanks Nicolin
On 8/20/2025 5:59 AM, Nicolin Chen wrote: > On Tue, Aug 19, 2025 at 10:12:41PM +0800, Ethan Zhao wrote: >> On 8/12/2025 6:59 AM, Nicolin Chen wrote: >>> @@ -4529,13 +4530,26 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction); >>> */ >>> int pcie_flr(struct pci_dev *dev) >>> { >>> + int ret = 0; >>> + >>> if (!pci_wait_for_pending_transaction(dev)) >>> pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); >>> + /* >>> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS >>> + * before initiating a reset. Notify the iommu driver that enabled ATS. >>> + * Have to call it after waiting for pending DMA transaction. >>> + */ >>> + ret = iommu_dev_reset_prepare(&dev->dev); > >> If we dont' consider the association between IOMMU and devices in FLR(), >> it can be understood that more complex processing logic resides outside >> this function. However, if the FLR() function already synchironizes and >> handles the association with IOMMU like this (disabling ATS by attaching >> device to blocking domain), then how would the following scenarios >> behave ? > > That's a good point. The iommu-level reset is per struct device. > So, basically it'll match with the FLR per pci_dev. Yet, the RID > isolation between siblings might be a concern: > >> 1. Reset one of PCIe alias devices. > > IIRC, an alias device might have: > > a) one pci_dev; multiple RIDs > > In this case, neither FLR nor IOMMU isolates between RIDs. > So, both FLR and IOMMU blocking will reset all RIDs. There > should be no issue resulted from the IOMMU blocking. > > b) multiple pci_devs; single RID > > In this case, FLR only resets one device, while the IOMMU- > level reset will block the entire RID (i.e. all devices), > since they share the single translation tunnel. This could > break the siblings, if they aren't also being reset along. Yup, such alias devices might not have ATS cap. because of they are PCI devices or they share the RID(BDF), so checking ATS cap condition might be useful here to skip the prepare()/done() .> >> 2. Reset PF when its VFs are actvie. > > c) multiple pci_devs with their own RIDs > > In this case, either FLR or IOMMU only resets the PF. That > being said, VFs might be affected since PF is resetting? > If there is an issue, I don't see it coming from the IOMMU- > level reset.. Each of the PF and its VFs has it owns RID(BDF), but the VFs' life depends on the living of PF, resetting PF, means all its VFs are lost. There is no processing logic about PF and its VFs in FLR() yet. my understanding the upper layer callers should consider the complexity of such case. While we introducing the connection of IOMMU & device in FLR(), seems we brought some of the logic from the outside to the inside part. One method might we don't handle PF either by explicit checking its VF configuration existing to skip prepare()/done() ? till we have much clearer handling logic about it. Thanks, Ethan > d > Thus, case b might be breaking. So, perhaps we should add a few > conditions when calling iommu_dev_reset_prepare/done(): > + Make sure that the pci_dev has ATS capability > + Make sure no sibling pci_dev(s) sharing the same RID > + Any others? > > Thanks > Nicolin
On Wed, Aug 20, 2025 at 11:18:52AM +0800, Ethan Zhao wrote: > On 8/20/2025 5:59 AM, Nicolin Chen wrote: > > b) multiple pci_devs; single RID > > > > In this case, FLR only resets one device, while the IOMMU- > > level reset will block the entire RID (i.e. all devices), > > since they share the single translation tunnel. This could > > break the siblings, if they aren't also being reset along. > Yup, such alias devices might not have ATS cap. because of they > are PCI devices or they share the RID(BDF), so checking ATS cap > condition might be useful here to skip the prepare()/done() Yea, I agree, yet I think we need it to be "sure" than "might"? So perhaps we should check alias too. Given that all alias devices in this case share the same RID and reside in the same iommu_group, we could iterate the group devices for pci_devs_are_dma_aliases(). > > > 2. Reset PF when its VFs are actvie. > > > > c) multiple pci_devs with their own RIDs > > > > In this case, either FLR or IOMMU only resets the PF. That > > being said, VFs might be affected since PF is resetting? > > If there is an issue, I don't see it coming from the IOMMU- > > level reset.. > Each of the PF and its VFs has it owns RID(BDF), but the VFs' life > depends on the living of PF, resetting PF, means all its VFs are > lost. > > There is no processing logic about PF and its VFs in FLR() yet. > my understanding the upper layer callers should consider the > complexity of such case. > > While we introducing the connection of IOMMU & device in FLR(), > seems we brought some of the logic from the outside to the inside > part. > > One method might we don't handle PF either by explicit checking its > VF configuration existing to skip prepare()/done() ? till we have > much clearer handling logic about it. That sounds a good one to start with. The prepare()/done() functions can internally bypass for devices: if (!pci_ats_supported(pci_dev) || pci_sriov_get_totalvfs(pci_dev)) return 0; /* And check alias too */ Thanks Nicolin
© 2016 - 2025 Red Hat, Inc.