The current reset process saves the device's config space state before
reset and restores it afterward. However, when a device is in an error
state before reset, config space reads may return error values instead of
valid data. This results in saving corrupted values that get written back
to the device during state restoration.
Avoid saving the config space state when the device is in an error
state. When restoring without a valid saved state, restore only what
can be reconstructed from kernel data (such as the BARs) or what does
not depend on the saved state.
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
drivers/pci/pci.c | 25 ++++++++++++++++++++++---
drivers/pci/pcie/aer.c | 3 +++
drivers/pci/pcie/dpc.c | 3 +++
drivers/pci/pcie/ptm.c | 3 +++
drivers/pci/tph.c | 3 +++
drivers/pci/vc.c | 3 +++
6 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b0f4d98036cd..a3d93d1baee7 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1720,6 +1720,9 @@ static void pci_restore_pcie_state(struct pci_dev *dev)
struct pci_cap_saved_state *save_state;
u16 *cap;
+ if (!dev->state_saved)
+ return;
+
/*
* Restore max latencies (in the LTR capability) before enabling
* LTR itself in PCI_EXP_DEVCTL2.
@@ -1775,6 +1778,9 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
struct pci_cap_saved_state *save_state;
u16 *cap;
+ if (!dev->state_saved)
+ return;
+
save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX);
pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
if (!save_state || !pos)
@@ -1792,6 +1798,14 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
int pci_save_state(struct pci_dev *dev)
{
int i;
+ u32 val;
+
+ pci_read_config_dword(dev, PCI_COMMAND, &val);
+ if (PCI_POSSIBLE_ERROR(val)) {
+ pci_warn(dev, "Device config space inaccessible, will only be partially restored\n");
+ return -EIO;
+ }
+
/* XXX: 100% dword access ok here? */
for (i = 0; i < 16; i++) {
pci_read_config_dword(dev, i * 4, &dev->saved_config_space[i]);
@@ -1854,6 +1868,14 @@ static void pci_restore_config_space_range(struct pci_dev *pdev,
static void pci_restore_config_space(struct pci_dev *pdev)
{
+ if (!pdev->state_saved) {
+ pci_warn(pdev, "No saved config space, restoring BARs\n");
+ pci_restore_bars(pdev);
+ pci_write_config_word(pdev, PCI_COMMAND,
+ PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
+ return;
+ }
+
if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
pci_restore_config_space_range(pdev, 10, 15, 0, false);
/* Restore BARs before the command register. */
@@ -1906,9 +1928,6 @@ static void pci_restore_rebar_state(struct pci_dev *pdev)
*/
void pci_restore_state(struct pci_dev *dev)
{
- if (!dev->state_saved)
- return;
-
pci_restore_pcie_state(dev);
pci_restore_pasid_state(dev);
pci_restore_pri_state(dev);
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e286c197d716..e6023ffbf94d 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -361,6 +361,9 @@ void pci_restore_aer_state(struct pci_dev *dev)
if (!aer)
return;
+ if (!dev->state_saved)
+ return;
+
save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR);
if (!save_state)
return;
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index fc18349614d7..e0d7034c2cd8 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -67,6 +67,9 @@ void pci_restore_dpc_state(struct pci_dev *dev)
if (!pci_is_pcie(dev))
return;
+ if (!dev->state_saved)
+ return;
+
save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
if (!save_state)
return;
diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c
index 65e4b008be00..78613000acfb 100644
--- a/drivers/pci/pcie/ptm.c
+++ b/drivers/pci/pcie/ptm.c
@@ -112,6 +112,9 @@ void pci_restore_ptm_state(struct pci_dev *dev)
if (!ptm)
return;
+ if (!dev->state_saved)
+ return;
+
save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM);
if (!save_state)
return;
diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index cc64f93709a4..c0fa1b9a7a78 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -435,6 +435,9 @@ void pci_restore_tph_state(struct pci_dev *pdev)
if (!pdev->tph_enabled)
return;
+ if (!pdev->state_saved)
+ return;
+
save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_TPH);
if (!save_state)
return;
diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c
index a4ff7f5f66dd..00609c7e032a 100644
--- a/drivers/pci/vc.c
+++ b/drivers/pci/vc.c
@@ -391,6 +391,9 @@ void pci_restore_vc_state(struct pci_dev *dev)
{
int i;
+ if (!dev->state_saved)
+ return;
+
for (i = 0; i < ARRAY_SIZE(vc_caps); i++) {
int pos;
struct pci_cap_saved_state *save_state;
--
2.43.0
On Wed, Sep 24, 2025 at 10:16:19AM -0700, Farhan Ali wrote:
> @@ -1792,6 +1798,14 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
> int pci_save_state(struct pci_dev *dev)
> {
> int i;
> + u32 val;
> +
> + pci_read_config_dword(dev, PCI_COMMAND, &val);
> + if (PCI_POSSIBLE_ERROR(val)) {
> + pci_warn(dev, "Device config space inaccessible, will only be partially restored\n");
> + return -EIO;
Should it set `dev->state_saved` to `false`, to be on the safe side?
Not sure whether we run a risk of restoring an old, outdated state otherwise.
> + }
> +
> /* XXX: 100% dword access ok here? */
> for (i = 0; i < 16; i++) {
> pci_read_config_dword(dev, i * 4, &dev->saved_config_space[i]);
> @@ -1854,6 +1868,14 @@ static void pci_restore_config_space_range(struct pci_dev *pdev,
>
> static void pci_restore_config_space(struct pci_dev *pdev)
> {
> + if (!pdev->state_saved) {
> + pci_warn(pdev, "No saved config space, restoring BARs\n");
> + pci_restore_bars(pdev);
> + pci_write_config_word(pdev, PCI_COMMAND,
> + PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
Is this really something that ought to be universally enabled? I thought this
depends on whether attached resources are IO and/or MEM?
int pci_enable_resources(struct pci_dev *dev, int mask)
{
...
pci_dev_for_each_resource(dev, r, i) {
...
if (r->flags & IORESOURCE_IO)
cmd |= PCI_COMMAND_IO;
if (r->flags & IORESOURCE_MEM)
cmd |= PCI_COMMAND_MEMORY;
}
...
}
Also IIRC, especially on s390, we never have IO resources?
int zpci_setup_bus_resources(struct zpci_dev *zdev)
{
...
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
...
/* only MMIO is supported */
flags = IORESOURCE_MEM;
if (zdev->bars[i].val & 8)
flags |= IORESOURCE_PREFETCH;
if (zdev->bars[i].val & 4)
flags |= IORESOURCE_MEM_64;
...
}
...
}
So I guess this would have to have some form of the same logic as in
`pci_enable_resources()`, after restoring the BARs.
Or am I missing something?
> + return;
> + }
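For illustration, a rough and untested sketch of what I mean; the helper
name pci_restore_cmd_from_resources() is made up and nothing in the tree:
static void pci_restore_cmd_from_resources(struct pci_dev *pdev)
{
	struct resource *r;
	u16 cmd = 0;
	int i;

	pci_restore_bars(pdev);

	/* mirror pci_enable_resources(): only enable decodes that are
	 * actually backed by a resource */
	pci_dev_for_each_resource(pdev, r, i) {
		if (!r->flags)
			continue;
		if (r->flags & IORESOURCE_IO)
			cmd |= PCI_COMMAND_IO;
		if (r->flags & IORESOURCE_MEM)
			cmd |= PCI_COMMAND_MEMORY;
	}

	pci_write_config_word(pdev, PCI_COMMAND, cmd);
}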
--
Best Regards, Benjamin Block / Linux on IBM Z Kernel Development
IBM Deutschland Research & Development GmbH / https://www.ibm.com/privacy
Vors. Aufs.-R.: Wolfgang Wendt / Geschäftsführung: David Faller
Sitz der Ges.: Böblingen / Registergericht: AmtsG Stuttgart, HRB 243294
On 10/1/2025 8:15 AM, Benjamin Block wrote:
> On Wed, Sep 24, 2025 at 10:16:19AM -0700, Farhan Ali wrote:
>> @@ -1792,6 +1798,14 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
>> int pci_save_state(struct pci_dev *dev)
>> {
>> int i;
>> + u32 val;
>> +
>> + pci_read_config_dword(dev, PCI_COMMAND, &val);
>> + if (PCI_POSSIBLE_ERROR(val)) {
>> + pci_warn(dev, "Device config space inaccessible, will only be partially restored\n");
>> + return -EIO;
> Should it set `dev->state_saved` to `false`, to be on the save side?
> Not sure whether we run a risk of restoring an old, outdated state otherwise.
AFAIU if the state_saved flag was set to true then any state that we
have saved should be valid and should be okay to be restored from. We
just want to avoid saving any invalid data.
>
>> + }
>> +
>> /* XXX: 100% dword access ok here? */
>> for (i = 0; i < 16; i++) {
>> pci_read_config_dword(dev, i * 4, &dev->saved_config_space[i]);
>> @@ -1854,6 +1868,14 @@ static void pci_restore_config_space_range(struct pci_dev *pdev,
>>
>> static void pci_restore_config_space(struct pci_dev *pdev)
>> {
>> + if (!pdev->state_saved) {
>> + pci_warn(pdev, "No saved config space, restoring BARs\n");
>> + pci_restore_bars(pdev);
>> + pci_write_config_word(pdev, PCI_COMMAND,
>> + PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
> Is this really something that ought to be universally enabled? I thought this
> depends on whether attached resources are IO and/or MEM?
>
> int pci_enable_resources(struct pci_dev *dev, int mask)
> {
> ...
> pci_dev_for_each_resource(dev, r, i) {
> ...
> if (r->flags & IORESOURCE_IO)
> cmd |= PCI_COMMAND_IO;
> if (r->flags & IORESOURCE_MEM)
> cmd |= PCI_COMMAND_MEMORY;
> }
> ...
> }
>
> Also IIRC, especially on s390, we never have IO resources?
>
> int zpci_setup_bus_resources(struct zpci_dev *zdev)
> {
> ...
> for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> ...
> /* only MMIO is supported */
> flags = IORESOURCE_MEM;
> if (zdev->bars[i].val & 8)
> flags |= IORESOURCE_PREFETCH;
> if (zdev->bars[i].val & 4)
> flags |= IORESOURCE_MEM_64;
> ...
> }
> ...
> }
>
> So I guess this would have to have some form of the same logic as in
> `pci_enable_resources()`, after restoring the BARs.
>
> Or am I missing something?
As per my understanding of the spec, setting both I/O Space and Memory
Space should be safe. The spec also mentions if a function doesn't
support IO/Memory space access it could hardwire the bit to zero. We
could add the logic to iterate through all the resources and set the
bits accordingly, but since this is a best-effort restoration it should
be fine?
Also I didn't see any issues testing on s390x with the NVMe, RoCE and
NETD devices, but I could have missed something.
Thanks
Farhan
>
>> + return;
>> + }
On Wed, Oct 01, 2025 at 10:12:03AM -0700, Farhan Ali wrote:
> AFAIU if the state_saved flag was set to true then any state that we have
> saved should be valid and should be okay to be restored from. We just want
> to avoid saving any invalid data.
The state_saved flag is used by the PCI core to detect whether a driver
has called pci_save_state() in one of its suspend callbacks. If it did,
the PCI core assumes that the driver has taken on the responsibility to
put the device into a low power state. The PCI core will thus not put
the device into a low power state itself and it won't (again) call
pci_save_state().
Hence state_saved is cleared before the driver suspend callbacks are
invoked and it is checked afterwards.
Clearing the state_saved flag in pci_restore_state() merely serves the
purpose of ensuring that the flag is cleared ahead of the next suspend
and resume cycle.
It is a fallacy to think that state_saved indicates validity of the
saved state.
Unfortunately pci_restore_state() was amended by c82f63e411f1 to
bail out if state_saved is false. This has arguably caused more
problems than it solved, so I have prepared this development branch
which essentially reverts the commit and undoes most of the awful
workarounds that it necessitated:
https://github.com/l1k/linux/commits/aer_reset_v1
I intend to submit this after the merge window has closed.
The motivation of c82f63e411f1 was to prevent restoring state if
pci_save_state() hasn't been called before. I am solving that by
calling pci_save_state() on device addition, hence error
recoverability is ensured at all times.
I believe this also makes patch [01/10] in your series unnecessary.
A lot of drivers call pci_save_state() in their probe hook and
that continues to be correct if they modified Config Space
vis-a-vis what was saved on device addition.
Thanks,
Lukas
On 10/4/2025 7:54 AM, Lukas Wunner wrote:
> On Wed, Oct 01, 2025 at 10:12:03AM -0700, Farhan Ali wrote:
>> AFAIU if the state_saved flag was set to true then any state that we have
>> saved should be valid and should be okay to be restored from. We just want
>> to avoid saving any invalid data.
> The state_saved flag is used by the PCI core to detect whether a driver
> has called pci_save_state() in one of its suspend callbacks. If it did,
> the PCI core assumes that the driver has taken on the responsibility to
> put the device into a low power state. The PCI core will thus not put
> the device into a low power state itself and it won't (again) call
> pci_save_state().
>
> Hence state_saved is cleared before the driver suspend callbacks are
> invoked and it is checked afterwards.
>
> Clearing the state_saved flag in pci_restore_state() merely serves the
> purpose of ensuring that the flag is cleared ahead of the next suspend
> and resume cycle.
>
> It is a fallacy to think that state_saved indicates validity of the
> saved state.
Hi Lukas,
Thanks for the detailed explanation, this was very helpful for me.
> Unfortunately pci_restore_state() was amended by c82f63e411f1 to
> bail out if state_saved is false. This has arguably caused more
> problems than it solved, so I have prepared this development branch
> which essentially reverts the commit and undoes most of the awful
> workarounds that it necessitated:
>
> https://github.com/l1k/linux/commits/aer_reset_v1
>
> I intend to submit this after the merge window has closed.
>
> The motivation of c82f63e411f1 was to prevent restoring state if
> pci_save_state() hasn't been called before. I am solving that by
> calling pci_save_state() on device addition, hence error
> recoverability is ensured at all times.
>
> I believe this also makes patch [01/10] in your series unnecessary.
I tested your patches + patches 2-10 of this series. It unfortunately
didn't completely help with the s390x use case. We still need the check
in pci_save_state() from this patch to make sure we are not saving error
values, which can be written back to the device in pci_restore_state().
As part of the error recovery userspace can use the VFIO_DEVICE_RESET to
reset the device (pci_try_reset_function()). The function call for this
is:
pci_dev_save_and_disable <https://elixir.bootlin.com/linux/v6.17.1/C/ident/pci_dev_save_and_disable>();
__pci_reset_function_locked <https://elixir.bootlin.com/linux/v6.17.1/C/ident/__pci_reset_function_locked>();
pci_dev_restore <https://elixir.bootlin.com/linux/v6.17.1/C/ident/pci_dev_restore>();
So we can end up overwriting the initial saved state (added by you in
pci_bus_add_device()). Do we need to update the pci_dev_save_and_disable()
not to save the state?
Thanks
Farhan
>
> A lot of drivers call pci_save_state() in their probe hook and
> that continues to be correct if they modified Config Space
> vis-a-vis what was saved on device addition.
>
> Thanks,
>
> Lukas
On Mon, Oct 06, 2025 at 10:54:51AM -0700, Farhan Ali wrote:
> On 10/4/2025 7:54 AM, Lukas Wunner wrote:
> > I believe this also makes patch [01/10] in your series unnecessary.
>
> I tested your patches + patches 2-10 of this series. It unfortunately didn't
> completely help with the s390x use case. We still need the check to in
> pci_save_state() from this patch to make sure we are not saving error
> values, which can be written back to the device in pci_restore_state().
What's the caller of pci_save_state() that needs this?
Can you move the check for PCI_POSSIBLE_ERROR() to the caller?
I think plenty of other callers don't need this, so it adds
extra overhead for them and down the road it'll be difficult
to untangle which caller needs it and which doesn't.
> As part of the error recovery userspace can use the VFIO_DEVICE_RESET to
> reset the device (pci_try_reset_function()). The function call for this is:
>
> pci_dev_save_and_disable <https://elixir.bootlin.com/linux/v6.17.1/C/ident/pci_dev_save_and_disable>();
>
> __pci_reset_function_locked <https://elixir.bootlin.com/linux/v6.17.1/C/ident/__pci_reset_function_locked>();
>
> pci_dev_restore <https://elixir.bootlin.com/linux/v6.17.1/C/ident/pci_dev_restore>();
>
> So we can end up overwriting the initial saved state (added by you in
> pci_bus_add_device()). Do we need to update the pci_dev_save_and_disable()
> not to save the state?
The state saved on device addition is just the initial state and
it is fine if later on it gets updated (which is a nicer term than
"overwritten"). E.g. when portdrv.c instantiates port services
and drivers are bound to them, various registers in Config Space
are changed, hence pcie_portdrv_probe() calls pci_save_state()
again.
However we can discuss whether pci_save_state() is still needed
in pci_dev_save_and_disable().
Thanks,
Lukas
On 10/6/2025 12:26 PM, Lukas Wunner wrote:
> On Mon, Oct 06, 2025 at 10:54:51AM -0700, Farhan Ali wrote:
>> On 10/4/2025 7:54 AM, Lukas Wunner wrote:
>>> I believe this also makes patch [01/10] in your series unnecessary.
>> I tested your patches + patches 2-10 of this series. It unfortunately didn't
>> completely help with the s390x use case. We still need the check to in
>> pci_save_state() from this patch to make sure we are not saving error
>> values, which can be written back to the device in pci_restore_state().
> What's the caller of pci_save_state() that needs this?
>
> Can you move the check for PCI_POSSIBLE_ERROR() to the caller?
> I think plenty of other callers don't need this, so it adds
> extra overhead for them and down the road it'll be difficult
> to untangle which caller needs it and which doesn't.
The caller would be pci_dev_save_and_disable(). Are you suggesting
moving the PCI_POSSIBLE_ERROR() check prior to calling pci_save_state()?
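(For illustration only, something along these lines, with the existing
reset_prepare and disable steps of pci_dev_save_and_disable() elided;
this is a sketch, not the actual function body:)
static void pci_dev_save_and_disable(struct pci_dev *dev)
{
	u32 val;

	/* ... notify the driver via ->reset_prepare() as today ... */

	/* hypothetical: skip saving state if config space reads fail */
	pci_read_config_dword(dev, PCI_COMMAND, &val);
	if (!PCI_POSSIBLE_ERROR(val))
		pci_save_state(dev);
	else
		pci_warn(dev, "Config space inaccessible, not saving state\n");

	/* ... disable the device as today ... */
}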
>
>> As part of the error recovery userspace can use the VFIO_DEVICE_RESET to
>> reset the device (pci_try_reset_function()). The function call for this is:
>>
>> pci_dev_save_and_disable <https://elixir.bootlin.com/linux/v6.17.1/C/ident/pci_dev_save_and_disable>();
>>
>> __pci_reset_function_locked <https://elixir.bootlin.com/linux/v6.17.1/C/ident/__pci_reset_function_locked>();
>>
>> pci_dev_restore
>> <https://elixir.bootlin.com/linux/v6.17.1/C/ident/pci_dev_restore>();
>>
>> So we can end up overwriting the initial saved state (added by you in
>> pci_bus_add_device()). Do we need to update the pci_dev_save_and_disable()
>> not to save the state?
> The state saved on device addition is just the initial state and
> it is fine if later on it gets updated (which is a nicer term than
> "overwritten"). E.g. when portdrv.c instantiates port services
> and drivers are bound to them, various registers in Config Space
> are changed, hence pcie_portdrv_probe() calls pci_save_state()
> again.
>
> However we can discuss whether pci_save_state() is still needed
> in pci_dev_save_and_disable().
The commit 8dd7f8036c12 ("PCI: add support for function level reset")
introduced the logic of saving/restoring the device state after an FLR.
My assumption is it was done to save the most recent state of the device
(as the state could be updated by drivers). So I think it would still
make sense to save the device state in pci_dev_save_and_disable() if the
Config Space is still accessible?
Thanks
Farhan
On Mon, Oct 06, 2025 at 02:35:49PM -0700, Farhan Ali wrote:
> On 10/6/2025 12:26 PM, Lukas Wunner wrote:
> > On Mon, Oct 06, 2025 at 10:54:51AM -0700, Farhan Ali wrote:
> > > On 10/4/2025 7:54 AM, Lukas Wunner wrote:
> > > > I believe this also makes patch [01/10] in your series unnecessary.
> > > I tested your patches + patches 2-10 of this series. It unfortunately didn't
> > > completely help with the s390x use case. We still need the check to in
> > > pci_save_state() from this patch to make sure we are not saving error
> > > values, which can be written back to the device in pci_restore_state().
> > What's the caller of pci_save_state() that needs this?
> >
> > Can you move the check for PCI_POSSIBLE_ERROR() to the caller?
> > I think plenty of other callers don't need this, so it adds
> > extra overhead for them and down the road it'll be difficult
> > to untangle which caller needs it and which doesn't.
>
> The caller would be pci_dev_save_and_disable(). Are you suggesting moving
> the PCI_POSSIBLE_ERROR() prior to calling pci_save_state()?
I'm not sure yet. Let's back up a little: I'm missing an
architectural description how you're intending to do error
recovery in the VM. If I understand correctly, you're
informing the VM of the error via the ->error_detected() callback.
You're saying you need to check for accessibility of the device
prior to resetting it from the VM, does that mean you're attempting
a reset from the ->error_detected() callback?
According to Documentation/PCI/pci-error-recovery.rst, the device
isn't supposed to be considered accessible in ->error_detected().
The first callback which allows access is ->mmio_enabled().
I also don't quite understand why the VM needs to perform a reset.
Why can't you just let the VM tell the host that a reset is needed
(PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
behalf of the VM?
Furthermore, I'm thinking that you should be using pci_channel_offline()
to detect accessibility of the device, rather than reading from
Config Space and checking for PCI_POSSIBLE_ERROR().
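(Sketch only, assuming the check stays at the top of pci_save_state();
pci_channel_offline() keys off dev->error_state instead of reading
Config Space:)
	if (pci_channel_offline(dev)) {
		pci_warn(dev, "Device in error state, not saving config space\n");
		return -EIO;
	}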
> > The state saved on device addition is just the initial state and
> > it is fine if later on it gets updated (which is a nicer term than
> > "overwritten"). E.g. when portdrv.c instantiates port services
> > and drivers are bound to them, various registers in Config Space
> > are changed, hence pcie_portdrv_probe() calls pci_save_state()
> > again.
> >
> > However we can discuss whether pci_save_state() is still needed
> > in pci_dev_save_and_disable().
>
> The commit 8dd7f8036c12 ("PCI: add support for function level reset")
> introduced the logic of saving/restoring the device state after an FLR. My
> assumption is it was done to save the most recent state of the device (as
> the state could be updated by drivers). So I think it would still make sense
> to save the device state in pci_dev_save_and_disable() if the Config Space
> is still accessible?
Yes, right now we can't assume that drivers call pci_save_state()
in their probe hook if they modified Config Space. They may rely
on the state being saved prior to reset or a D3hot/D3cold transition.
So we need to keep the pci_dev_save_and_disable() call for now.
Generally the expectation is that Config Space is accessible when
performing a reset with pci_try_reset_function(). Since that's
apparently not guaranteed for your use case, I'm wondering if you
might be using the function in a context it's not supposed to be used.
Thanks,
Lukas
On 10/8/2025 6:34 AM, Lukas Wunner wrote:
> On Mon, Oct 06, 2025 at 02:35:49PM -0700, Farhan Ali wrote:
>> On 10/6/2025 12:26 PM, Lukas Wunner wrote:
>>> On Mon, Oct 06, 2025 at 10:54:51AM -0700, Farhan Ali wrote:
>>>> On 10/4/2025 7:54 AM, Lukas Wunner wrote:
>>>>> I believe this also makes patch [01/10] in your series unnecessary.
>>>> I tested your patches + patches 2-10 of this series. It unfortunately didn't
>>>> completely help with the s390x use case. We still need the check to in
>>>> pci_save_state() from this patch to make sure we are not saving error
>>>> values, which can be written back to the device in pci_restore_state().
>>> What's the caller of pci_save_state() that needs this?
>>>
>>> Can you move the check for PCI_POSSIBLE_ERROR() to the caller?
>>> I think plenty of other callers don't need this, so it adds
>>> extra overhead for them and down the road it'll be difficult
>>> to untangle which caller needs it and which doesn't.
>> The caller would be pci_dev_save_and_disable(). Are you suggesting moving
>> the PCI_POSSIBLE_ERROR() prior to calling pci_save_state()?
> I'm not sure yet. Let's back up a little: I'm missing an
> architectural description how you're intending to do error
> recovery in the VM. If I understand correctly, you're
> informing the VM of the error via the ->error_detected() callback.
>
> You're saying you need to check for accessibility of the device
> prior to resetting it from the VM, does that mean you're attempting
> a reset from the ->error_detected() callback?
>
> According to Documentation/PCI/pci-error-recovery.rst, the device
> isn't supposed to be considered accessible in ->error_detected().
> The first callback which allows access is ->mmio_enabled().
>
> I also don't quite understand why the VM needs to perform a reset.
> Why can't you just let the VM tell the host that a reset is needed
> (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
> behalf of the VM?
The ->error_detected() callback is used to inform userspace of an error.
In the case of a VM, using QEMU as a userspace, once notified of an
error QEMU will inject an error into the guest in s390x architecture
specific way [1] (probably should have linked the QEMU series in the
cover letter). Once notified of the error VM's device driver will drive
the recovery action. The recovery action require a reset of the device
and on s390x PCI devices are reset using architecture specific
instructions (zpci_device_hot_reset()). QEMU will intercept any low
level recovery instructions from the VM and then perform a reset of
device on behalf of the VM [2]. QEMU performs a reset by invoking the
VFIO_DEVICE_RESET ioctl, which attempts to reset the device
using pci_try_reset_function().
Once a device is in an error state, MMIO to the device is blocked and so
any PCI reads to the Config Space will return -1. Since
pci_try_reset_function() will try to save the state of device's Config
Space with pci_dev_save_and_disable(), it will end up saving -1 as the
state. Later when we try to restore the state after a reset, we end up
corrupting device registers which can send the device into an error
state again. I was trying to avoid this with the patch.
Hopefully, this answers your questions.
[1] QEMU series
https://lore.kernel.org/all/20250925174852.1302-1-alifm@linux.ibm.com/
[2] v1 patch series discussion on some nuances of reset mechanism
https://lore.kernel.org/all/20250814145743.204ca19a.alex.williamson@redhat.com/
>
> Furthermore, I'm thinking that you should be using pci_channel_offline()
> to detect accessibility of the device, rather than reading from
> Config Space and checking for PCI_POSSIBLE_ERROR().
>
>>> The state saved on device addition is just the initial state and
>>> it is fine if later on it gets updated (which is a nicer term than
>>> "overwritten"). E.g. when portdrv.c instantiates port services
>>> and drivers are bound to them, various registers in Config Space
>>> are changed, hence pcie_portdrv_probe() calls pci_save_state()
>>> again.
>>>
>>> However we can discuss whether pci_save_state() is still needed
>>> in pci_dev_save_and_disable().
>> The commit 8dd7f8036c12 ("PCI: add support for function level reset")
>> introduced the logic of saving/restoring the device state after an FLR. My
>> assumption is it was done to save the most recent state of the device (as
>> the state could be updated by drivers). So I think it would still make sense
>> to save the device state in pci_dev_save_and_disable() if the Config Space
>> is still accessible?
> Yes, right now we can't assume that drivers call pci_save_state()
> in their probe hook if they modified Config Space. They may rely
> on the state being saved prior to reset or a D3hot/D3cold transition.
> So we need to keep the pci_dev_save_and_disable() call for now.
>
> Generally the expectation is that Config Space is accessible when
> performing a reset with pci_try_reset_function(). Since that's
> apparently not guaranteed for your use case, I'm wondering if you
> might be using the function in a context it's not supposed to be used.
I am open to suggestions on how we can do this.
Thanks
Farhan
On Wed, Oct 08, 2025 at 10:56:35AM -0700, Farhan Ali wrote:
> On 10/8/2025 6:34 AM, Lukas Wunner wrote:
> > I'm not sure yet. Let's back up a little: I'm missing an
> > architectural description how you're intending to do error
> > recovery in the VM. If I understand correctly, you're
> > informing the VM of the error via the ->error_detected() callback.
> >
> > You're saying you need to check for accessibility of the device
> > prior to resetting it from the VM, does that mean you're attempting
> > a reset from the ->error_detected() callback?
> >
> > According to Documentation/PCI/pci-error-recovery.rst, the device
> > isn't supposed to be considered accessible in ->error_detected().
> > The first callback which allows access is ->mmio_enabled().
> >
>
> The ->error_detected() callback is used to inform userspace of an error. In
> the case of a VM, using QEMU as a userspace, once notified of an error QEMU
> will inject an error into the guest in s390x architecture specific way [1]
> (probably should have linked the QEMU series in the cover letter). Once
> notified of the error VM's device driver will drive the recovery action. The
> recovery action require a reset of the device and on s390x PCI devices are
> reset using architecture specific instructions (zpci_device_hot_reset()).
According to Documentation/PCI/pci-error-recovery.rst:
"STEP 1: Notification
--------------------
Platform calls the error_detected() callback on every instance of
every driver affected by the error.
At this point, the device might not be accessible anymore, [...]
it gives the driver a chance to cleanup, waiting for pending stuff
(timers, whatever, etc...) to complete; it can take semaphores,
schedule, etc... everything but touch the device."
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
And yet you're touching the device by trying to reset it.
The code you're introducing in patch [01/10] only becomes necessary
because you're not following the above-quoted protocol. If you
follow the protocol, patch [01/10] becomes unnecessary.
> > I also don't quite understand why the VM needs to perform a reset.
> > Why can't you just let the VM tell the host that a reset is needed
> > (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
> > behalf of the VM?
Could you answer this question please?
Thanks,
Lukas
On Wed, 2025-10-08 at 20:14 +0200, Lukas Wunner wrote:
> On Wed, Oct 08, 2025 at 10:56:35AM -0700, Farhan Ali wrote:
> > On 10/8/2025 6:34 AM, Lukas Wunner wrote:
> > > I'm not sure yet. Let's back up a little: I'm missing an
> > > architectural description how you're intending to do error
> > > recovery in the VM. If I understand correctly, you're
> > > informing the VM of the error via the ->error_detected() callback.
> > >
> > > You're saying you need to check for accessibility of the device
> > > prior to resetting it from the VM, does that mean you're attempting
> > > a reset from the ->error_detected() callback?
> > >
> > > According to Documentation/PCI/pci-error-recovery.rst, the device
> > > isn't supposed to be considered accessible in ->error_detected().
> > > The first callback which allows access is ->mmio_enabled().
> > >
> >
> > The ->error_detected() callback is used to inform userspace of an error. In
> > the case of a VM, using QEMU as a userspace, once notified of an error QEMU
> > will inject an error into the guest in s390x architecture specific way [1]
> > (probably should have linked the QEMU series in the cover letter). Once
> > notified of the error VM's device driver will drive the recovery action. The
> > recovery action require a reset of the device and on s390x PCI devices are
> > reset using architecture specific instructions (zpci_device_hot_reset()).
>
> According to Documentation/PCI/pci-error-recovery.rst:
>
> "STEP 1: Notification
> --------------------
> Platform calls the error_detected() callback on every instance of
> every driver affected by the error.
> At this point, the device might not be accessible anymore, [...]
> it gives the driver a chance to cleanup, waiting for pending stuff
> (timers, whatever, etc...) to complete; it can take semaphores,
> schedule, etc... everything but touch the device."
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> And yet you're touching the device by trying to reset it.
>
> The code you're introducing in patch [01/10] only becomes necessary
> because you're not following the above-quoted protocol. If you
> follow the protocol, patch [01/10] becomes unnecessary.
>
I agree with your point above that error_detected() should not touch the
device. My understanding of Farhan's series though is that it follows
that rule. As I understand it error_detected() is only used to inject
the s390 specific PCI error event into the VM using the information
stored in patch 7. As before vfio-pci returns
PCI_ERS_RESULT_CAN_RECOVER from error_detected() but then with patch 7
the pass-through case is detected and this gets turned into
PCI_ERS_RESULT_RECOVERED and the rest of the s390 recovery code gets
skipped. And yeah, writing it down I'm not super happy with this part,
maybe it would be better to have an explicit
PCI_ERS_RESULT_LEAVE_AS_IS.
Either way this leaves the PCI device in the error state just like for
the host the platform leaves the device in the error state. Up until
this point even if the VM/QEMU tried to do a reset already it would get
blocked on at least the zdev->state_lock until the recovery code is
done. Only after that would the VM run its recovery code and with that
drive the reset.
> > > I also don't quite understand why the VM needs to perform a reset.
> > > Why can't you just let the VM tell the host that a reset is needed
> > > (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
> > > behalf of the VM?
The reason is that we want the behavior from the VM's perspective to
follow s390's PCI error event handling architecture. In this model
however there is no mechanism to synchronously ask the OS "An error
occurred, do you want the device reset?" or to tell it that we as
hypervisor already unblocked MMIO/DMA or performed a reset. So instead
our idea was that we just do the error_detected() part in the host's
recovery code and then leave the device as is, driving the rest from
the guest.
Thanks,
Niklas
On Thu, Oct 09, 2025 at 11:12:03AM +0200, Niklas Schnelle wrote:
> On Wed, 2025-10-08 at 20:14 +0200, Lukas Wunner wrote:
> > And yet you're touching the device by trying to reset it.
> >
> > The code you're introducing in patch [01/10] only becomes necessary
> > because you're not following the above-quoted protocol. If you
> > follow the protocol, patch [01/10] becomes unnecessary.
>
> I agree with your point above error_detected() should not touch the
> device. My understanding of Farhan's series though is that it follows
> that rule. As I understand it error_detected() is only used to inject
> the s390 specific PCI error event into the VM using the information
> stored in patch 7. As before vfio-pci returns
> PCI_ERS_RESULT_CAN_RECOVER from error_detected() but then with patch 7
> the pass-through case is detected and this gets turned into
> PCI_ERS_RESULT_RECOVERED and the rest of the s390 recovery code gets
> skipped. And yeah, writing it down I'm not super happy with this part,
> maybe it would be better to have an explicit
> PCI_ERS_RESULT_LEAVE_AS_IS.
Thanks, that's the high-level overview I was looking for.
It would be good to include something like this at least
in the cover letter or additionally in the commit messages
so that it's easier for reviewers to connect the dots.
I was expecting paravirtualized error handling, i.e. the
VM is aware it's virtualized and vfio essentially proxies
the pci_ers_result return value of the driver (e.g. nvme)
back to the host, thereby allowing the host to drive error
recovery normally. I'm not sure if there are technical
reasons preventing such an approach.
If you do want to stick with your alternative approach,
maybe doing the error handling in the ->mmio_enabled() phase
instead of ->error_detected() would make more sense.
In that phase you're allowed to access the device,
you can also attempt a local reset and return
PCI_ERS_RESULT_RECOVERED on success.
You'd have to return PCI_ERS_RESULT_CAN_RECOVER though
from the ->error_detected() callback in order to progress
to the ->mmio_enabled() step.
Does that make sense?
Thanks,
Lukas
On Sun, 2025-10-12 at 08:34 +0200, Lukas Wunner wrote:
> On Thu, Oct 09, 2025 at 11:12:03AM +0200, Niklas Schnelle wrote:
> > On Wed, 2025-10-08 at 20:14 +0200, Lukas Wunner wrote:
> > > And yet you're touching the device by trying to reset it.
> > >
> > > The code you're introducing in patch [01/10] only becomes necessary
> > > because you're not following the above-quoted protocol. If you
> > > follow the protocol, patch [01/10] becomes unnecessary.
> >
> > I agree with your point above error_detected() should not touch the
> > device. My understanding of Farhan's series though is that it follows
> > that rule. As I understand it error_detected() is only used to inject
> > the s390 specific PCI error event into the VM using the information
> > stored in patch 7. As before vfio-pci returns
> > PCI_ERS_RESULT_CAN_RECOVER from error_detected() but then with patch 7
> > the pass-through case is detected and this gets turned into
> > PCI_ERS_RESULT_RECOVERED and the rest of the s390 recovery code gets
> > skipped. And yeah, writing it down I'm not super happy with this part,
> > maybe it would be better to have an explicit
> > PCI_ERS_RESULT_LEAVE_AS_IS.
>
> Thanks, that's the high-level overview I was looking for.
>
> It would be good to include something like this at least
> in the cover letter or additionally in the commit messages
> so that it's easier for reviewers to connect the dots.
>
> I was expecting paravirtualized error handling, i.e. the
> VM is aware it's virtualized and vfio essentially proxies
> the pci_ers_result return value of the driver (e.g. nvme)
> back to the host, thereby allowing the host to drive error
> recovery normally. I'm not sure if there are technical
> reasons preventing such an approach.
It does sound technically feasible but sticking to the already
architected error reporting and recovery has clear advantages. For one
it will work with existing Linux versions without guest changes and it
also has precedent with it working already in the z/VM hypervisor for
years. I agree that there is some level of mismatch with Linux'
recovery support but I don't think that outweighs having a clean
virtualization support where the host and guest use the same interface.
>
> If you do want to stick with your alternative approach,
> maybe doing the error handling in the ->mmio_enabled() phase
> instead of ->error_detected() would make more sense.
> In that phase you're allowed to access the device,
> you can also attempt a local reset and return
> PCI_ERS_RESULT_RECOVERED on success.
>
> You'd have to return PCI_ERS_RESULT_CAN_RECOVER though
> from the ->error_detected() callback in order to progress
> to the ->mmio_enabled() step.
>
> Does that make sense?
>
> Thanks,
>
> Lukas
The problem with using ->mmio_enabled() is twofold. For one we
sometimes have to do a reset instead of clearing the error state, for
example if the device was not only put in the error state but also
disabled, or if the guest driver wants it, so we would also have to use
->slot_reset() and could end up with two resets. Second and more
importantly this would break the guest's assumption that the device will
be in the error state with MMIO and DMA blocked when it gets an error
event. On the other hand, that's exactly the state it is in if we
report the error in the ->error_detected() callback and then skip the
rest of recovery so it can be done in the guest, likely with the exact
same Linux code. I'd assume this should be similar if QEMU/KVM wanted
to virtualize AER+DPC except that there MMIO remains accessible?
Here's an idea. Could it be an option to detect the pass-through in the
vfio-pci driver's ->error_detected() callback, possibly with feedback
from QEMU (@Alex?), and then return PCI_ERS_RESULT_RECOVERED from there
skipping the rest of recovery?
The skipping would be in-line with the below section of the
documentation i.e. "no further intervention":
- PCI_ERS_RESULT_RECOVERED
Driver returns this if it thinks the device is usable despite
the error and does not need further intervention.
It's just that in this case the device really remains with MMIO and DMA
blocked, usable only in the sense that the vfio-pci + guest VM combo
knows how to use a device with MMIO and DMA blocked with the guest
recovery.
Thanks,
Niklas
On Tue, Oct 14, 2025 at 02:07:57PM +0200, Niklas Schnelle wrote:
> On Sun, 2025-10-12 at 08:34 +0200, Lukas Wunner wrote:
> > If you do want to stick with your alternative approach,
> > maybe doing the error handling in the ->mmio_enabled() phase
> > instead of ->error_detected() would make more sense.
> > In that phase you're allowed to access the device,
> > you can also attempt a local reset and return
> > PCI_ERS_RESULT_RECOVERED on success.
> >
> > You'd have to return PCI_ERS_RESULT_CAN_RECOVER though
> > from the ->error_detected() callback in order to progress
> > to the ->mmio_enabled() step.
>
> The problem with using ->mmio_enabled() is two fold. For one we
> sometimes have to do a reset instead of clearing the error state, for
> example if the device was not only put in the error state but also
> disabled, or if the guest driver wants it,
Well in that case you could reset the device in the ->mmio_enabled() step
from the guest using the vfio reset ioctl.
> Second and more
> importantly this would break the guests assumption that the device will
> be in the error state with MMIO and DMA blocked when it gets an error
> event. On the other hand, that's exactly the state it is in if we
> report the error in the ->error_detected() callback
At the risk of continuously talking past each other:
How about this, the host notifies the guest of the error in the
->error_detected() callback. The guest notifies the driver and
collects the result (whether a reset is requested or not), then
returns PCI_ERS_RESULT_CAN_RECOVER to the host.
The host re-enables I/O to the device, invokes the ->mmio_enabled()
callback. The guest then resets the device based on the result it
collected earlier or invokes the driver's ->mmio_enabled() callback.
If the driver returns PCI_ERS_RESULT_NEED_RESET from the
->mmio_enabled() callback, you can likewise reset the device from
the guest using the ioctl method.
My concern is that by insisting that you handle device recovery
completely in the ->error_detected() phase, you're not complying
with the protocol specified in Documentation/PCI/pci-error-recovery.rst
and as a result, you have to amend the reset code in the PCI core
because it assumes that all arches adhere to the protocol.
In my view, that suggests that the approach needs to be reworked
to comply with the protocol. Then the workarounds for performing
a reset while I/O is blocked become unnecessary.
Thanks,
Lukas
On Sun, 2025-10-19 at 16:34 +0200, Lukas Wunner wrote:
> On Tue, Oct 14, 2025 at 02:07:57PM +0200, Niklas Schnelle wrote:
> > On Sun, 2025-10-12 at 08:34 +0200, Lukas Wunner wrote:
> > > If you do want to stick with your alternative approach,
> > > maybe doing the error handling in the ->mmio_enabled() phase
> > > instead of ->error_detected() would make more sense.
> > > In that phase you're allowed to access the device,
> > > you can also attempt a local reset and return
> > > PCI_ERS_RESULT_RECOVERED on success.
> > >
> > > You'd have to return PCI_ERS_RESULT_CAN_RECOVER though
> > > from the ->error_detected() callback in order to progress
> > > to the ->mmio_enabled() step.
> >
> > The problem with using ->mmio_enabled() is two fold. For one we
> > sometimes have to do a reset instead of clearing the error state, for
> > example if the device was not only put in the error state but also
> > disabled, or if the guest driver wants it,
>
> Well in that case you could reset the device in the ->mmio_enabled() step
> from the guest using the vfio reset ioctl.
>
> > Second and more
> > importantly this would break the guests assumption that the device will
> > be in the error state with MMIO and DMA blocked when it gets an error
> > event. On the other hand, that's exactly the state it is in if we
> > report the error in the ->error_detected() callback
>
> At the risk of continuously talking past each other:
>
> How about this, the host notifies the guest of the error in the
> ->error_detected() callback. The guest notifies the driver and
> collects the result (whether a reset is requested or not), then
> returns PCI_ERS_RESULT_CAN_RECOVER to the host.
>
> The host re-enables I/O to the device, invokes the ->mmio_enabled()
> callback. The guest then resets the device based on the result it
> collected earlier or invokes the driver's ->mmio_enabled() callback.
>
> If the driver returns PCI_ERS_RESULT_NEED_RESET from the
> ->mmio_enabled() callback, you can likewise reset the device from
> the guest using the ioctl method.
>
> My concern is that by insisting that you handle device recovery
> completely in the ->error_detected() phase, you're not complying
> with the protocol specified in Documentation/PCI/pci-error-recovery.rst
> and as a result, you have to amend the reset code in the PCI core
> because it assumes that all arches adhere to the protocol.
> In my view, that suggests that the approach needs to be reworked
> to comply with the protocol. Then the workarounds for performing
> a reset while I/O is blocked become unnecessary.
>
> Thanks,
>
> Lukas
Yeah I think we're talking past each other a bit. In my mind we're
really not doing the recovery in ->error_detected() at all. Within that
callback we only do the notify, and then do nothing in the rest of
recovery. Only after will the guest do recovery though I do see your
point that leaving the device in the error state kind of means that
recovery is still ongoing even if we're not in the recovery handler
anymore. But then any driver could also just return
PCI_ERS_RESULT_RECOVERED in error_detected() and land us in the same
situation.
And at least on s390 there is also the case that the device actually
enters the error state before we get the error event so we could
already race into a situation where the guest does a reset and the
device is already in the error state but we haven't seen the event yet.
And this seems inherent to hardware blocking I/O on error which by its
nature can happen at any time.
But let's put that aside, say we want to implement your model where we
do check with the guest and its device driver. How would that work?
Somehow error_detected() would have to wait for the guest to proceed
into recovery and since the guest could just not do that we'd have to
have some kind of timeout. Also we can't allow the guest to choose
PCI_ERS_RESULT_RECOVERED because otherwise we'd again be in the
situation where recovery is completed without unblocking I/O. And if we
want to stick to the architecture QEMU/KVM will have to kind of have a
mode where after being informed of ongoing recovery for a device they
intercept attempts to reset / firmware calls for reset and turn that
into the correct return. And somehow also deal with the timeout because
e.g. old Linux guests won't do recovery but there is also no
architected way for a guest to say that it does recovery.
To me this seems worse than accepting that even with PCI error
recovery, resets can encounter PCI devices with blocked I/O, which is
already true now if e.g. user-space happens to drive a reset racing
with hardware I/O blocking.
Thanks,
Niklas
On Mon, Oct 20, 2025 at 10:59:48AM +0200, Niklas Schnelle wrote:
> Yeah I think we're talking past each other a bit. In my mind we're
> really not doing the recovery in ->error_detected() at all. Within that
> callback we only do the notify, and then do nothing in the rest of
> recovery. Only after will the guest do recovery though I do see your
> point that leaving the device in the error state kind of means that
> recovery is still ongoing even if we're not in the recovery handler
> anymore. But then any driver could also just return
> PCI_ERS_RESULT_RECOVERED in error_detected() and land us in the same
> situation.
That would be a bug in the driver. The point of the pci_error_handlers
is to attempt recovery of the device in concert with the driver.
If the driver "fakes" a recovered device towards the PCI core and then
attempts recovery behind the PCI core's back, it gets to keep the
pieces...
> But let's put that aside, say we want to implement your model where we
> do check with the guest and its device driver. How would that work,
> somehow error_detected() would have to wait for the guest to proceed
> into recovery and since the guest could just not do that we'd have to
> have some kind of timeout.
Right, a timeout seems reasonable.
> Also we can't allow the guest to choose
> PCI_ERS_RESULT_RECOVERED because otherwise we'd again be in the
> situation where recovery is completed without unblocking I/O.
The guest should only return that if the device has really recovered.
On an architecture which blocks I/O upon an error, by definition the
device cannot already be recovered in the ->error_detected() stage.
> And if we
> want to stick to the architecture QEMU/KVM will have to kind of have a
> mode where after being informed of ongoing recovery for a device they
> intercept attempts to reset / firmware calls for reset and turn that
> into the correct return. And somehow also deal with the timeout because
> e.g. old Linux guests won't do recovery but there is also no
> architected way for a guest to say that it does recovery.
I guess there are gaps in qemu with regards to error recovery, but
I think the solution is to add the missing functionality, not try
to work around the gaps.
Thanks,
Lukas
On 10/14/2025 5:07 AM, Niklas Schnelle wrote:
> On Sun, 2025-10-12 at 08:34 +0200, Lukas Wunner wrote:
>> On Thu, Oct 09, 2025 at 11:12:03AM +0200, Niklas Schnelle wrote:
>>> On Wed, 2025-10-08 at 20:14 +0200, Lukas Wunner wrote:
>>>> And yet you're touching the device by trying to reset it.
>>>>
>>>> The code you're introducing in patch [01/10] only becomes necessary
>>>> because you're not following the above-quoted protocol. If you
>>>> follow the protocol, patch [01/10] becomes unnecessary.
>>> I agree with your point above error_detected() should not touch the
>>> device. My understanding of Farhan's series though is that it follows
>>> that rule. As I understand it error_detected() is only used to inject
>>> the s390 specific PCI error event into the VM using the information
>>> stored in patch 7. As before vfio-pci returns
>>> PCI_ERS_RESULT_CAN_RECOVER from error_detected() but then with patch 7
>>> the pass-through case is detected and this gets turned into
>>> PCI_ERS_RESULT_RECOVERED and the rest of the s390 recovery code gets
>>> skipped. And yeah, writing it down I'm not super happy with this part,
>>> maybe it would be better to have an explicit
>>> PCI_ERS_RESULT_LEAVE_AS_IS.
>> Thanks, that's the high-level overview I was looking for.
>>
>> It would be good to include something like this at least
>> in the cover letter or additionally in the commit messages
>> so that it's easier for reviewers to connect the dots.
>>
>> I was expecting paravirtualized error handling, i.e. the
>> VM is aware it's virtualized and vfio essentially proxies
>> the pci_ers_result return value of the driver (e.g. nvme)
>> back to the host, thereby allowing the host to drive error
>> recovery normally. I'm not sure if there are technical
>> reasons preventing such an approach.
> It does sound technically feasible but sticking to the already
> architected error reporting and recovery has clear advantages. For one
> it will work with existing Linux versions without guest changes and it
> also has precedent with it working already in the z/VM hypervisor for
> years. I agree that there is some level of mismatch with Linux'
> recovery support but I don't think that outweighs having a clean
> virtualization support where the host and guest use the same interface.
>
>> If you do want to stick with your alternative approach,
>> maybe doing the error handling in the ->mmio_enabled() phase
>> instead of ->error_detected() would make more sense.
>> In that phase you're allowed to access the device,
>> you can also attempt a local reset and return
>> PCI_ERS_RESULT_RECOVERED on success.
>>
>> You'd have to return PCI_ERS_RESULT_CAN_RECOVER though
>> from the ->error_detected() callback in order to progress
>> to the ->mmio_enabled() step.
>>
>> Does that make sense?
>>
>> Thanks,
>>
>> Lukas
> The problem with using ->mmio_enabled() is two fold. For one we
> sometimes have to do a reset instead of clearing the error state, for
> example if the device was not only put in the error state but also
> disabled, or if the guest driver wants it, so we would also have to use
> ->slot_reset() and could end up with two resets. Second and more
> importantly this would break the guests assumption that the device will
> be in the error state with MMIO and DMA blocked when it gets an error
> event. On the other hand, that's exactly the state it is in if we
> report the error in the ->error_detected() callback and then skip the
> rest of recovery so it can be done in the guest, likely with the exact
> same Linux code.
> I'd assume this should be similar if QEMU/KVM wanted
> to virtualize AER+DPC except that there MMIO remains accessible?
>
> Here's an idea. Could it be an option to detect the pass-through in the
> vfio-pci driver's ->error_detected() callback, possibly with feedback
> from QEMU (@Alex?), and then return PCI_ERS_RESULT_RECOVERED from there
> skipping the rest of recovery?
>
> The skipping would be in-line with the below section of the
> documentation i.e. "no further intervention":
>
> - PCI_ERS_RESULT_RECOVERED
> Driver returns this if it thinks the device is usable despite
> the error and does not need further intervention.
>
> It's just that in this case the device really remains with MMIO and DMA
> blocked, usable only in the sense that the vfio-pci + guest VM combo
> knows how to use a device with MMIO and DMA blocked with the guest
> recovery.
>
> Thanks,
> Niklas
Hi Lukas,
Hope this helps to clarify why we still need patch [01/10] (or at least
the check in pci_save_state() to see if the device responds with an
error value or not if we move forward with your patch series "PCI:
Universal error recoverability of devices"). We can discuss if that
check needs to be moved somewhere else if there is concern with
overhead in pci_save_state().
Discussing with Niklas (off mailing list), we were thinking if it makes
sense if vfio_pci_core_aer_err_detected() returned
PCI_ERS_RESULT_RECOVERED if it doesn't need any further intervention
from platform recovery, to align closer to the pci-error-recovery
documentation? One proposal would be to have a flag in struct
vfio_pci_core_device (e.g. vdev->mediated_recovery), which can be used
to return PCI_ERS_RESULT_RECOVERED in vfio_pci_core_aer_err_detected()
if the flag was set. The flag could be set by userspace using
VFIO_DEVICE_FEATURE_SET for the device feature
VFIO_DEVICE_FEATURE_ZPCI_ERROR (would like to hear Alex's thoughts on
this proposal).
Thanks
Farhan
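(A rough sketch of that idea, for illustration only; "mediated_recovery"
is a hypothetical field rather than an existing vfio_pci_core_device
member, and the eventfd signalling is reduced to a comment:)
static pci_ers_result_t
vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
			       pci_channel_state_t state)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

	/* ... signal the error eventfd to notify userspace (QEMU) ... */

	/* hypothetical flag set by userspace via VFIO_DEVICE_FEATURE_SET */
	if (vdev->mediated_recovery)
		return PCI_ERS_RESULT_RECOVERED; /* guest drives recovery */

	return PCI_ERS_RESULT_CAN_RECOVER;
}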
On 10/8/2025 11:14 AM, Lukas Wunner wrote:
> On Wed, Oct 08, 2025 at 10:56:35AM -0700, Farhan Ali wrote:
>> On 10/8/2025 6:34 AM, Lukas Wunner wrote:
>>> I'm not sure yet. Let's back up a little: I'm missing an
>>> architectural description how you're intending to do error
>>> recovery in the VM. If I understand correctly, you're
>>> informing the VM of the error via the ->error_detected() callback.
>>>
>>> You're saying you need to check for accessibility of the device
>>> prior to resetting it from the VM, does that mean you're attempting
>>> a reset from the ->error_detected() callback?
>>>
>>> According to Documentation/PCI/pci-error-recovery.rst, the device
>>> isn't supposed to be considered accessible in ->error_detected().
>>> The first callback which allows access is ->mmio_enabled().
>>>
>> The ->error_detected() callback is used to inform userspace of an error. In
>> the case of a VM, using QEMU as a userspace, once notified of an error QEMU
>> will inject an error into the guest in s390x architecture specific way [1]
>> (probably should have linked the QEMU series in the cover letter). Once
>> notified of the error VM's device driver will drive the recovery action. The
>> recovery action require a reset of the device and on s390x PCI devices are
>> reset using architecture specific instructions (zpci_device_hot_reset()).
> According to Documentation/PCI/pci-error-recovery.rst:
>
> "STEP 1: Notification
> --------------------
> Platform calls the error_detected() callback on every instance of
> every driver affected by the error.
> At this point, the device might not be accessible anymore, [...]
> it gives the driver a chance to cleanup, waiting for pending stuff
> (timers, whatever, etc...) to complete; it can take semaphores,
> schedule, etc... everything but touch the device."
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> And yet you're touching the device by trying to reset it.
>
> The code you're introducing in patch [01/10] only becomes necessary
> because you're not following the above-quoted protocol. If you
> follow the protocol, patch [01/10] becomes unnecessary.
>
>>> I also don't quite understand why the VM needs to perform a reset.
>>> Why can't you just let the VM tell the host that a reset is needed
>>> (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
>>> behalf of the VM?
> Could you answer this question please?
>
>
The reset is not performed by the VM, reset is still done by the host.
My approach for a VM to let the host know that reset was needed, was to
intercept any reset instructions for the PCI device in QEMU. QEMU would
then drive a reset via VFIO_DEVICE_RESET. Maybe I am missing something,
but based on what we have today in vfio driver, we don't have a
mechanism for userspace to reset a device other than VFIO_DEVICE_RESET
and VFIO_PCI_DEVICE_HOT_RESET ioctls.
On Wed, Oct 08, 2025 at 02:55:56PM -0700, Farhan Ali wrote:
> > > On 10/8/2025 6:34 AM, Lukas Wunner wrote:
> > > > I also don't quite understand why the VM needs to perform a reset.
> > > > Why can't you just let the VM tell the host that a reset is needed
> > > > (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
> > > > behalf of the VM?
> The reset is not performed by the VM, reset is still done by the host. My
> approach for a VM to let the host know that reset was needed, was to
> intercept any reset instructions for the PCI device in QEMU. QEMU would then
> drive a reset via VFIO_DEVICE_RESET. Maybe I am missing something, but based
> on what we have today in vfio driver, we don't have a mechanism for
> userspace to reset a device other than VFIO_DEVICE_RESET and
> VFIO_PCI_DEVICE_HOT_RESET ioctls.

The ask is for the host to notify the VM of the ->error_detected() event
and the VM then responding with one of the "enum pci_ers_result" values.

If the VM returns PCI_ERS_RESULT_NEED_RESET from ->error_detected(),
the host knows that it shall reset the device.

There is no need for the VM to initiate a reset through an ioctl.
Just integrate with the existing error handling flow described in
Documentation/PCI/pci-error-recovery.rst and thereby use the existing
mechanism to reset the device.

Thanks,

Lukas
On 10/8/2025 9:52 PM, Lukas Wunner wrote:
> On Wed, Oct 08, 2025 at 02:55:56PM -0700, Farhan Ali wrote:
>>>> On 10/8/2025 6:34 AM, Lukas Wunner wrote:
>>>>> I also don't quite understand why the VM needs to perform a reset.
>>>>> Why can't you just let the VM tell the host that a reset is needed
>>>>> (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
>>>>> behalf of the VM?
>> The reset is not performed by the VM, reset is still done by the host. My
>> approach for a VM to let the host know that reset was needed, was to
>> intercept any reset instructions for the PCI device in QEMU. QEMU would then
>> drive a reset via VFIO_DEVICE_RESET. Maybe I am missing something, but based
>> on what we have today in vfio driver, we don't have a mechanism for
>> userspace to reset a device other than VFIO_DEVICE_RESET and
>> VFIO_PCI_DEVICE_HOT_RESET ioctls.
> The ask is for the host to notify the VM of the ->error_detected() event
> and the VM then responding with one of the "enum pci_ers_result" values.

Maybe there is some confusion here. Could you clarify what you mean by the
VM responding with "enum pci_ers_result" values? Is it a device driver (for
example an NVMe driver) running in the VM that should do that? Or is it
something else you are suggesting?

Let me try to clarify what I am trying to do with this patch series. For
passthrough devices to a VM, the driver bound to the device on the host is
vfio-pci. The vfio-pci driver does support the error_detected() callback
(vfio_pci_core_aer_err_detected()), and on a PCI error the s390x recovery
code on the host will call the vfio-pci error_detected() callback. The
vfio-pci error_detected() callback will notify userspace/QEMU via an
eventfd, and return PCI_ERS_RESULT_CAN_RECOVER. At this point the s390x
error recovery on the host will skip any further action (see patch 7) and
let userspace drive the error recovery.

Once userspace/QEMU is notified, it then injects this error into the VM so
device drivers in the VM can take recovery actions. For example, for a
passthrough NVMe device it is the VM's NVMe driver that accesses the
device. At this point the VM's NVMe driver's error_detected() will drive
the recovery by returning PCI_ERS_RESULT_NEED_RESET, and the s390x error
recovery in the VM's OS will try to do a reset. Resets are privileged
operations and so the VM will need intervention from QEMU to perform the
reset. QEMU will invoke the ioctls to now notify the host that the VM is
requesting a reset of the device. The vfio-pci driver on the host will
then perform the reset on the device.

Thanks
Farhan
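For reference, the guest-side piece of this flow is just the normal
pci_error_handlers contract. A minimal, hypothetical sketch of a guest
driver requesting the reset (the demo_* names are made up and stand in
for e.g. the NVMe driver's handlers) could look like:

#include <linux/pci.h>

static pci_ers_result_t demo_error_detected(struct pci_dev *pdev,
					    pci_channel_state_t state)
{
	/* Stop I/O here, but do not touch the device. */
	if (state == pci_channel_io_perm_failure)
		return PCI_ERS_RESULT_DISCONNECT;

	/* Ask the platform (here: the VM's OS, then QEMU/the host) to reset. */
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t demo_slot_reset(struct pci_dev *pdev)
{
	/* The device has been reset; re-enable it and restore its state. */
	if (pci_enable_device(pdev))
		return PCI_ERS_RESULT_DISCONNECT;
	pci_restore_state(pdev);
	return PCI_ERS_RESULT_RECOVERED;
}

static void demo_resume(struct pci_dev *pdev)
{
	/* Restart I/O. */
}

static const struct pci_error_handlers demo_err_handlers = {
	.error_detected	= demo_error_detected,
	.slot_reset	= demo_slot_reset,
	.resume		= demo_resume,
};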
On Thu, Oct 09, 2025 at 10:02:12AM -0700, Farhan Ali wrote:
> On 10/8/2025 9:52 PM, Lukas Wunner wrote:
> > On Wed, Oct 08, 2025 at 02:55:56PM -0700, Farhan Ali wrote:
> > > > > On 10/8/2025 6:34 AM, Lukas Wunner wrote:
> > > > > > I also don't quite understand why the VM needs to perform a reset.
> > > > > > Why can't you just let the VM tell the host that a reset is needed
> > > > > > (PCI_ERS_RESULT_NEED_RESET) and then the host resets the device on
> > > > > > behalf of the VM?
> > > The reset is not performed by the VM, reset is still done by the host. My
> > > approach for a VM to let the host know that reset was needed, was to
> > > intercept any reset instructions for the PCI device in QEMU. QEMU would
> > > then drive a reset via VFIO_DEVICE_RESET. Maybe I am missing something,
> > > but based on what we have today in vfio driver, we don't have a mechanism
> > > for userspace to reset a device other than VFIO_DEVICE_RESET and
> > > VFIO_PCI_DEVICE_HOT_RESET ioctls.
> > The ask is for the host to notify the VM of the ->error_detected() event
> > and the VM then responding with one of the "enum pci_ers_result" values.
>
> Maybe there is some confusion here. Could you clarify what you mean by the
> VM responding with "enum pci_ers_result" values? Is it a device driver (for
> example an NVMe driver) running in the VM that should do that? Or is it
> something else you are suggesting?

My expectation was that the host notifies the VM of the error, the kernel
in the VM notifies the nvme driver of the error, the nvme driver returns a
pci_ers_result return value from its pci_error_handlers which the VM
passes back to the host, and the host drives error recovery normally.

I was missing the high-level architectural overview that Niklas
subsequently provided. You should provide it as part of your series
because otherwise it's difficult for reviewers to understand what the
individual patches are trying to achieve as a whole.

Thanks,

Lukas
On Wed, Oct 01, 2025 at 10:12:03AM -0700, Farhan Ali wrote:
>
> On 10/1/2025 8:15 AM, Benjamin Block wrote:
> > On Wed, Sep 24, 2025 at 10:16:19AM -0700, Farhan Ali wrote:
> >> @@ -1792,6 +1798,14 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
> >> int pci_save_state(struct pci_dev *dev)
> >> {
> >> int i;
> >> + u32 val;
> >> +
> >> + pci_read_config_dword(dev, PCI_COMMAND, &val);
> >> + if (PCI_POSSIBLE_ERROR(val)) {
> >> + pci_warn(dev, "Device config space inaccessible, will only be partially restored\n");
> >> + return -EIO;
> >
> > Should it set `dev->state_saved` to `false`, to be on the safe side?
> > Not sure whether we run a risk of restoring an old, outdated state otherwise.
>
> AFAIU if the state_saved flag was set to true then any state that we
> have saved should be valid and should be okay to be restored from. We
> just want to avoid saving any invalid data.
Hmm, so I dug a bit more, and I see
void pci_restore_state(struct pci_dev *dev) {}
has `dev->state_saved = false` at the end, so I guess if a device is put into
suspend, and then later woken again, the flag gets reset every time.
And then there is also code like this:
static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev)
{
        ...
        err = pci_enable_device_mem(pdev);
        if (err) {
                ...
        } else {
                pdev->state_saved = true;
                pci_restore_state(pdev);
I don't know..
But I see Alex suggested this before.
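For reference, a minimal variant of the posted check that also invalidates
previously saved state, as suggested above, could look like the fragment
below. This is illustrative only, not what the posted patch does:

int pci_save_state(struct pci_dev *dev)
{
	int i;
	u32 val;

	pci_read_config_dword(dev, PCI_COMMAND, &val);
	if (PCI_POSSIBLE_ERROR(val)) {
		/*
		 * Suggested variant: drop any previously saved state so a
		 * later restore cannot write back stale data.
		 */
		dev->state_saved = false;
		pci_warn(dev, "Device config space inaccessible, will only be partially restored\n");
		return -EIO;
	}
	...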
> >> + }
> >> +
> >> /* XXX: 100% dword access ok here? */
> >> for (i = 0; i < 16; i++) {
> >> pci_read_config_dword(dev, i * 4, &dev->saved_config_space[i]);
> >> @@ -1854,6 +1868,14 @@ static void pci_restore_config_space_range(struct pci_dev *pdev,
> >>
> >> static void pci_restore_config_space(struct pci_dev *pdev)
> >> {
> >> + if (!pdev->state_saved) {
> >> + pci_warn(pdev, "No saved config space, restoring BARs\n");
> >> + pci_restore_bars(pdev);
> >> + pci_write_config_word(pdev, PCI_COMMAND,
> >> + PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
> >
> > Is this really something that ought to be universally enabled? I thought this
> > depends on whether attached resources are IO and/or MEM?
> >
> > int pci_enable_resources(struct pci_dev *dev, int mask)
> > {
> > ...
> > pci_dev_for_each_resource(dev, r, i) {
> > ...
> > if (r->flags & IORESOURCE_IO)
> > cmd |= PCI_COMMAND_IO;
> > if (r->flags & IORESOURCE_MEM)
> > cmd |= PCI_COMMAND_MEMORY;
> > }
> > ...
> > }
> >
> > Also IIRC, especially on s390, we never have IO resources?
> >
> > int zpci_setup_bus_resources(struct zpci_dev *zdev)
> > {
> > ...
> > for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> > ...
> > /* only MMIO is supported */
> > flags = IORESOURCE_MEM;
> > if (zdev->bars[i].val & 8)
> > flags |= IORESOURCE_PREFETCH;
> > if (zdev->bars[i].val & 4)
> > flags |= IORESOURCE_MEM_64;
> > ...
> > }
> > ...
> > }
> >
> > So I guess this would have to have some form of the same logic as in
> > `pci_enable_resources()`, after restoring the BARs.
> >
> > Or am I missing something?
>
> As per my understanding of the spec, setting both I/O Space and Memory
> Space should be safe. The spec also mentions that if a function doesn't
> support I/O or Memory Space accesses it can hardwire the bit to zero. We
> could add the logic to iterate through all the resources and set the
> bits accordingly, but since in this case we are trying a best-effort
> restoration, it should be fine?
>
> Also I didn't see any issues testing on s390x with the NVMe, RoCE and
> NETD devices, but I could have missed something.
Well, just taking a coarse look at how some other PCI device drivers use the
Command register (this being non-s390 specific after all): some of them base
decisions on whether either or both of these flags are set in config space.
Now that this sets both flags, that might have surprising side-effects.
On the other hand, iterating over the resources might not even be enough for
some device drivers, since they base their decision on whether to enable
either of them on other knowledge.
So I don't know. Enabling both just in case might be a good compromise.
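For comparison, a hedged sketch of the resource-based alternative,
mirroring the pci_enable_resources() logic quoted above; the helper name
is made up and this is not part of the posted patch:

/*
 * Hypothetical helper: derive the command register bits from the
 * device's resources after the BARs have been restored, instead of
 * unconditionally setting both PCI_COMMAND_IO and PCI_COMMAND_MEMORY.
 */
static void pci_restore_command_bits(struct pci_dev *pdev)
{
	struct resource *r;
	u16 cmd = 0;
	int i;

	pci_dev_for_each_resource(pdev, r, i) {
		if (!r->flags)
			continue;
		if (r->flags & IORESOURCE_IO)
			cmd |= PCI_COMMAND_IO;
		if (r->flags & IORESOURCE_MEM)
			cmd |= PCI_COMMAND_MEMORY;
	}

	pci_write_config_word(pdev, PCI_COMMAND, cmd);
}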
--
Best Regards, Benjamin Block / Linux on IBM Z Kernel Development
IBM Deutschland Research & Development GmbH / https://www.ibm.com/privacy
Vors. Aufs.-R.: Wolfgang Wendt / Geschäftsführung: David Faller
Sitz der Ges.: Böblingen / Registergericht: AmtsG Stuttgart, HRB 243294