The current implementation of pcie_do_recovery() assumes that the
recovery process is executed for the device that detected the error.
However, the DPC driver currently passes the error port that experienced
the DPC event to pcie_do_recovery().
Use the SOURCE ID register to correctly identify the device that
detected the error. When the error device is passed instead,
pcie_do_recovery() will find its upstream bridge and walk the bridges
that are potentially affected by AER. Subsequent commits will then be
able to accurately access the AER status of the error device.
No functional change intended.
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
---
drivers/pci/pci.h | 2 +-
drivers/pci/pcie/dpc.c | 25 +++++++++++++++++++++----
drivers/pci/pcie/edr.c | 7 ++++---
3 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 0e67014aa001..58640e656897 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -771,7 +771,7 @@ struct rcec_ea {
void pci_save_dpc_state(struct pci_dev *dev);
void pci_restore_dpc_state(struct pci_dev *dev);
void pci_dpc_init(struct pci_dev *pdev);
-void dpc_process_error(struct pci_dev *pdev);
+struct pci_dev *dpc_process_error(struct pci_dev *pdev);
pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
bool pci_dpc_recovered(struct pci_dev *pdev);
unsigned int dpc_tlp_log_len(struct pci_dev *dev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index bff29726c6a5..f6069f621683 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -260,10 +260,20 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
return 1;
}
-void dpc_process_error(struct pci_dev *pdev)
+/**
+ * dpc_process_error - handle the DPC error status
+ * @pdev: the port that experienced the containment event
+ *
+ * Return: the device that detected the error.
+ *
+ * NOTE: The device reference count is increased, the caller must decrement
+ * the reference count by calling pci_dev_put().
+ */
+struct pci_dev *dpc_process_error(struct pci_dev *pdev)
{
u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
struct aer_err_info info = {};
+ struct pci_dev *err_dev;
pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
@@ -279,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
pci_aer_clear_nonfatal_status(pdev);
pci_aer_clear_fatal_status(pdev);
}
+ err_dev = pci_dev_get(pdev);
break;
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
@@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
"ERR_FATAL" : "ERR_NONFATAL",
pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
PCI_SLOT(source), PCI_FUNC(source));
+ err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
+ PCI_BUS_NUM(source), source & 0xff);
break;
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
@@ -304,8 +317,11 @@ void dpc_process_error(struct pci_dev *pdev)
if (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO &&
pdev->dpc_rp_extensions)
dpc_process_rp_pio_error(pdev);
+ err_dev = pci_dev_get(pdev);
break;
}
+
+ return err_dev;
}
static void pci_clear_surpdn_errors(struct pci_dev *pdev)
@@ -361,7 +377,7 @@ static bool dpc_is_surprise_removal(struct pci_dev *pdev)
static irqreturn_t dpc_handler(int irq, void *context)
{
- struct pci_dev *err_port = context;
+ struct pci_dev *err_port = context, *err_dev;
/*
* According to PCIe r6.0 sec 6.7.6, errors are an expected side effect
@@ -372,10 +388,11 @@ static irqreturn_t dpc_handler(int irq, void *context)
return IRQ_HANDLED;
}
- dpc_process_error(err_port);
+ err_dev = dpc_process_error(err_port);
/* We configure DPC so it only triggers on ERR_FATAL */
- pcie_do_recovery(err_port, pci_channel_io_frozen, dpc_reset_link);
+ pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
+ pci_dev_put(err_dev);
return IRQ_HANDLED;
}
diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c
index 521fca2f40cb..b6e9d652297e 100644
--- a/drivers/pci/pcie/edr.c
+++ b/drivers/pci/pcie/edr.c
@@ -150,7 +150,7 @@ static int acpi_send_edr_status(struct pci_dev *pdev, struct pci_dev *edev,
static void edr_handle_event(acpi_handle handle, u32 event, void *data)
{
- struct pci_dev *pdev = data, *err_port;
+ struct pci_dev *pdev = data, *err_port, *err_dev;
pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT;
u16 status;
@@ -190,7 +190,7 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
goto send_ost;
}
- dpc_process_error(err_port);
+ err_dev = dpc_process_error(err_port);
pci_aer_raw_clear_status(err_port);
/*
@@ -198,7 +198,8 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
* or ERR_NONFATAL, since the link is already down, use the FATAL
* error recovery path for both cases.
*/
- estate = pcie_do_recovery(err_port, pci_channel_io_frozen, dpc_reset_link);
+ estate = pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
+ pci_dev_put(err_dev);
send_ost:
--
2.39.3
On Sat, Jan 24, 2026 at 03:45:54PM +0800, Shuai Xue wrote:
> +++ b/drivers/pci/pcie/dpc.c
> @@ -372,10 +388,11 @@ static irqreturn_t dpc_handler(int irq, void *context)
> 		return IRQ_HANDLED;
> 	}
>
> -	dpc_process_error(err_port);
> +	err_dev = dpc_process_error(err_port);
>
> 	/* We configure DPC so it only triggers on ERR_FATAL */
> -	pcie_do_recovery(err_port, pci_channel_io_frozen, dpc_reset_link);
> +	pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
> +	pci_dev_put(err_dev);
>
> 	return IRQ_HANDLED;
> }

You're assuming that the parent of the Requester is always identical
to the containing Downstream Port. But that's not necessarily the case:

E.g., imagine a DPC-capable Root Port with a PCIe switch below
whose Downstream Ports are not DPC-capable. Let's say an Endpoint
beneath the PCIe switch sends ERR_FATAL upstream. AFAICS, your patch
will cause pcie_do_recovery() to invoke dpc_reset_link() with the
Downstream Port of the PCIe switch as argument. That function will
then happily use pdev->dpc_cap even though it's 0.

A possible solution may be to amend dpc_reset_link() to walk upwards,
starting at pdev, and search for a pci_dev whose pdev->dpc_cap is
non-zero. That should be the containing DPC-capable port.

I don't really like the err_dev and err_port terminology invented here.
We generally use spec terms to make it easy to connect the code to the
spec. The spec uses the term "Downstream Port" to denote the containing
Downstream Port, so a name such as "dport" may be better than "err_port".

Similarly, the spec uses "Requester" or "Source" for the reporting agent,
so "req", "requester", "src" or "source" may all be better names than
"err_dev".

Thanks,

Lukas
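For illustration, such an upward walk could look roughly like the sketch
below (untested; dpc_containing_port() is a made-up helper name, and it
assumes dpc_cap remains zero on ports without DPC support):

static struct pci_dev *dpc_containing_port(struct pci_dev *pdev)
{
	struct pci_dev *port;

	/* Walk upwards from pdev until a DPC-capable port is found */
	for (port = pdev; port; port = pci_upstream_bridge(port))
		if (port->dpc_cap)
			return port;

	return NULL;	/* no DPC-capable port above pdev */
}

dpc_reset_link() could then operate on the port returned here instead of
the pdev it was handed.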
On 2/2/26 10:02 PM, Lukas Wunner wrote:
> On Sat, Jan 24, 2026 at 03:45:54PM +0800, Shuai Xue wrote:
>> +++ b/drivers/pci/pcie/dpc.c
>> @@ -372,10 +388,11 @@ static irqreturn_t dpc_handler(int irq, void *context)
>> 		return IRQ_HANDLED;
>> 	}
>>
>> -	dpc_process_error(err_port);
>> +	err_dev = dpc_process_error(err_port);
>>
>> 	/* We configure DPC so it only triggers on ERR_FATAL */
>> -	pcie_do_recovery(err_port, pci_channel_io_frozen, dpc_reset_link);
>> +	pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
>> +	pci_dev_put(err_dev);
>>
>> 	return IRQ_HANDLED;
>> }
>
> You're assuming that the parent of the Requester is always identical
> to the containing Downstream Port. But that's not necessarily the case:
>
> E.g., imagine a DPC-capable Root Port with a PCIe switch below
> whose Downstream Ports are not DPC-capable. Let's say an Endpoint
> beneath the PCIe switch sends ERR_FATAL upstream. AFAICS, your patch
> will cause pcie_do_recovery() to invoke dpc_reset_link() with the
> Downstream Port of the PCIe switch as argument. That function will
> then happily use pdev->dpc_cap even though it's 0.

I see. Good point.

> A possible solution may be to amend dpc_reset_link() to walk upwards,
> starting at pdev, and search for a pci_dev whose pdev->dpc_cap is
> non-zero. That should be the containing DPC-capable port.

See my reply in your later patch.

> I don't really like the err_dev and err_port terminology invented here.
> We generally use spec terms to make it easy to connect the code to the
> spec. The spec uses the term "Downstream Port" to denote the containing
> Downstream Port, so a name such as "dport" may be better than "err_port".
>
> Similarly, the spec uses "Requester" or "Source" for the reporting agent,
> so "req", "requester", "src" or "source" may all be better names than
> "err_dev".

Sure, I will use the common "dport" and "req" terms from the spec to make
it more readable.

> Thanks,
>
> Lukas

Thanks for valuable comments.

Shuai
On Mon, Feb 02, 2026 at 03:02:54PM +0100, Lukas Wunner wrote:
> You're assuming that the parent of the Requester is always identical
> to the containing Downstream Port. But that's not necessarily the case:
>
> E.g., imagine a DPC-capable Root Port with a PCIe switch below
> whose Downstream Ports are not DPC-capable. Let's say an Endpoint
> beneath the PCIe switch sends ERR_FATAL upstream. AFAICS, your patch
> will cause pcie_do_recovery() to invoke dpc_reset_link() with the
> Downstream Port of the PCIe switch as argument. That function will
> then happily use pdev->dpc_cap even though it's 0.

Thinking about this some more, I realized there's another problem:

In a scenario like the one I've outlined above, after your change,
pcie_do_recovery() will only broadcast error_detected (and other
callbacks) below the downstream port of the PCIe switch -- and not
to any other devices below the containing Root Port.

However, the DPC-induced Link Down event at the Root Port results
in a Hot Reset being propagated down the hierarchy to any device
below the Root Port. So with your change, the siblings of the
downstream port on the PCIe switch will no longer be informed of
the reset and thus are no longer given an opportunity to recover
after reset.

The premise on which this patch is built is false -- that the bridge
upstream of the error-reporting device is always equal to the
containing Downstream Port.

It seems the only reason why you want to pass the reporting device
to pcie_do_recovery() is that you want to call pcie_clear_device_status()
and pci_aer_clear_nonfatal_status() with that device.

However as I've said before, those calls are AER-specific and should
be moved out of pcie_do_recovery() so that it becomes generic and can
be used by EEH and s390:

https://lore.kernel.org/all/aPYKe1UKKkR7qrt1@wunner.de/

There's another problem: When a device experiences an error while DPC
is ongoing (i.e. while the link is down), its ERR_FATAL or ERR_NONFATAL
Message may not come through. Still the error bits are set in the
device's Uncorrectable Error Status register. So I think what we need to
do is walk the hierarchy below the containing Downstream Port after the
link is back up and search for devices with any error bits set,
then report and clear those errors. We may do this after first
examining the device in the DPC Error Source ID register.

Any additional errors found while walking the hierarchy can then
be identified as "occurred during DPC recovery".

Thanks,

Lukas
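As a rough illustration of that walk (a sketch only; dpc_find_uncor_errors()
is a made-up name, dport stands for the containing DPC-capable Downstream
Port, and the report/clear step is left as a comment), something like this
could run once the link is back up:

static int dpc_find_uncor_errors(struct pci_dev *dev, void *data)
{
	u32 status;
	int aer = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);

	if (!aer)
		return 0;

	pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status);
	if (status) {
		/* report the error as "occurred during DPC recovery",
		 * then clear the status bits
		 */
		pci_warn(dev, "errors found during DPC recovery, uncor status %#010x\n",
			 status);
	}

	return 0;	/* keep walking */
}

	pci_walk_bus(dport->subordinate, dpc_find_uncor_errors, NULL);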
On 2/3/26 5:09 AM, Lukas Wunner wrote:
> On Mon, Feb 02, 2026 at 03:02:54PM +0100, Lukas Wunner wrote:
>> You're assuming that the parent of the Requester is always identical
>> to the containing Downstream Port. But that's not necessarily the case:
>>
>> E.g., imagine a DPC-capable Root Port with a PCIe switch below
>> whose Downstream Ports are not DPC-capable. Let's say an Endpoint
>> beneath the PCIe switch sends ERR_FATAL upstream. AFAICS, your patch
>> will cause pcie_do_recovery() to invoke dpc_reset_link() with the
>> Downstream Port of the PCIe switch as argument. That function will
>> then happily use pdev->dpc_cap even though it's 0.
>
> Thinking about this some more, I realized there's another problem:
>
> In a scenario like the one I've outlined above, after your change,
> pcie_do_recovery() will only broadcast error_detected (and other
> callbacks) below the downstream port of the PCIe switch -- and not
> to any other devices below the containing Root Port.
>
> However, the DPC-induced Link Down event at the Root Port results
> in a Hot Reset being propagated down the hierarchy to any device
> below the Root Port. So with your change, the siblings of the
> downstream port on the PCIe switch will no longer be informed of
> the reset and thus are no longer given an opportunity to recover
> after reset.
>
> The premise on which this patch is built is false -- that the bridge
> upstream of the error-reporting device is always equal to the
> containing Downstream Port.

Thanks again for the very detailed analysis and for the pointers to your
earlier mail. You are right, thanks for pointing it out.

> It seems the only reason why you want to pass the reporting device
> to pcie_do_recovery() is that you want to call pcie_clear_device_status()
> and pci_aer_clear_nonfatal_status() with that device.

In the AER path, pcie_do_recovery() is indeed invoked with the Error
Source device found by find_source_device(), and internally it treats
that dev as the function that detected the error and derives the
containing Downstream Port (bridge) from it. For DPC, however, the
error-detecting function is the DPC-capable Downstream Port itself, not
the Endpoint identified as Error Source, so passing the Endpoint to
pcie_do_recovery() breaks that assumption.

> However as I've said before, those calls are AER-specific and should
> be moved out of pcie_do_recovery() so that it becomes generic and can
> be used by EEH and s390:
>
> https://lore.kernel.org/all/aPYKe1UKKkR7qrt1@wunner.de/

Sure, I'd like to move it out. I will remove the AER-specific calls
(pcie_clear_device_status() and pci_aer_raw_clear_status()) from
pcie_do_recovery() itself, and instead handle them in the AER and DPC
code paths where we already know which device(s) are the actual error
sources. That way, pcie_do_recovery() becomes a generic recovery
framework that can be reused by EEH and s390.

> There's another problem: When a device experiences an error while DPC
> is ongoing (i.e. while the link is down), its ERR_FATAL or ERR_NONFATAL
> Message may not come through. Still the error bits are set in the
> device's Uncorrectable Error Status register. So I think what we need to
> do is walk the hierarchy below the containing Downstream Port after the
> link is back up and search for devices with any error bits set,
> then report and clear those errors. We may do this after first
> examining the device in the DPC Error Source ID register.
>
> Any additional errors found while walking the hierarchy can then
> be identified as "occurred during DPC recovery".

I agree this is similar in spirit to find_source_device() -- both walk
the bus and check AER Status registers. For the DPC case, I'll perform
this walk after the link is back up (i.e., after dpc_reset_link()
succeeds).

Regarding pci_restore_state() in slot_reset(): I see now that it does
call pci_aer_clear_status(dev) (at line 1844 in pci.c), which will clear
the AER Status registers. So if we walk the hierarchy after the
slot_reset callbacks, the error bits accumulated during DPC will already
be cleared.

To avoid losing those errors, I think the walk should happen after
dpc_reset_link() succeeds but *before* pcie_do_recovery() invokes the
slot_reset callbacks. That way, we can capture the AER Status bits
before pci_restore_state() clears them.

Does that sound like the right approach, or would you prefer a different
placement?

Thanks a lot for your guidance.

Best Regards,
Shuai
On Sat, 24 Jan 2026 15:45:54 +0800
Shuai Xue <xueshuai@linux.alibaba.com> wrote:
> The current implementation of pcie_do_recovery() assumes that the
> recovery process is executed for the device that detected the error.
> However, the DPC driver currently passes the error port that experienced
> the DPC event to pcie_do_recovery().
>
> Use the SOURCE ID register to correctly identify the device that
> detected the error. When passing the error device, the
> pcie_do_recovery() will find the upstream bridge and walk bridges
> potentially AER affected. And subsequent commits will be able to
> accurately access AER status of the error device.
>
> Should not observe any functional changes.
>
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> ---
> drivers/pci/pci.h | 2 +-
> drivers/pci/pcie/dpc.c | 25 +++++++++++++++++++++----
> drivers/pci/pcie/edr.c | 7 ++++---
> 3 files changed, 26 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 0e67014aa001..58640e656897 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -771,7 +771,7 @@ struct rcec_ea {
> void pci_save_dpc_state(struct pci_dev *dev);
> void pci_restore_dpc_state(struct pci_dev *dev);
> void pci_dpc_init(struct pci_dev *pdev);
> -void dpc_process_error(struct pci_dev *pdev);
> +struct pci_dev *dpc_process_error(struct pci_dev *pdev);
> pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
> bool pci_dpc_recovered(struct pci_dev *pdev);
> unsigned int dpc_tlp_log_len(struct pci_dev *dev);
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index bff29726c6a5..f6069f621683 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -260,10 +260,20 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
> return 1;
> }
>
> -void dpc_process_error(struct pci_dev *pdev)
> +/**
> + * dpc_process_error - handle the DPC error status
> + * @pdev: the port that experienced the containment event
> + *
> + * Return: the device that detected the error.
> + *
> + * NOTE: The device reference count is increased, the caller must decrement
> + * the reference count by calling pci_dev_put().
> + */
> +struct pci_dev *dpc_process_error(struct pci_dev *pdev)
Maybe it makes sense to carry the err_port naming for the pci_dev
in here as well? Seems stronger than just relying on people
reading the documentation you've added.
> {
> u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
> struct aer_err_info info = {};
> + struct pci_dev *err_dev;
>
> pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
>
> @@ -279,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
> pci_aer_clear_nonfatal_status(pdev);
> pci_aer_clear_fatal_status(pdev);
> }
> + err_dev = pci_dev_get(pdev);
> break;
> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
> @@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
> "ERR_FATAL" : "ERR_NONFATAL",
> pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
> PCI_SLOT(source), PCI_FUNC(source));
> + err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
> + PCI_BUS_NUM(source), source & 0xff);
Bunch of replication in here with the pci_warn(). Maybe some local variables?
Maybe even rebuild the final parameter from PCI_DEVFN(slot, func) just to make the
association with the print really obvious?
Is there any chance that this might return NULL? Feels like maybe that's
only a possibility on a broken setup, but I'm not sure of all the wonderful
races around hotplug and DPC occurring before the OS has caught up.
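If it can return NULL (say the Requester was already hot-removed before
the OS caught up), one defensive option -- only a sketch, not something
this patch does -- would be to fall back to the port itself so recovery
still runs:

	err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
					      PCI_BUS_NUM(source), source & 0xff);
	if (!err_dev) {
		/* Requester not found; recover from the port instead */
		err_dev = pci_dev_get(pdev);
	}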
> break;
> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
> ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
> @@ -304,8 +317,11 @@ void dpc_process_error(struct pci_dev *pdev)
> if (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO &&
> pdev->dpc_rp_extensions)
> dpc_process_rp_pio_error(pdev);
> + err_dev = pci_dev_get(pdev);
> break;
> }
> +
> + return err_dev;
> }
> diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c
> index 521fca2f40cb..b6e9d652297e 100644
> --- a/drivers/pci/pcie/edr.c
> +++ b/drivers/pci/pcie/edr.c
> @@ -150,7 +150,7 @@ static int acpi_send_edr_status(struct pci_dev *pdev, struct pci_dev *edev,
>
> static void edr_handle_event(acpi_handle handle, u32 event, void *data)
> {
> - struct pci_dev *pdev = data, *err_port;
> + struct pci_dev *pdev = data, *err_port, *err_dev;
> pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT;
> u16 status;
>
> @@ -190,7 +190,7 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
> goto send_ost;
> }
>
> - dpc_process_error(err_port);
> + err_dev = dpc_process_error(err_port);
> pci_aer_raw_clear_status(err_port);
>
> /*
> @@ -198,7 +198,8 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
> * or ERR_NONFATAL, since the link is already down, use the FATAL
> * error recovery path for both cases.
> */
> - estate = pcie_do_recovery(err_port, pci_channel_io_frozen, dpc_reset_link);
> + estate = pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
> + pci_dev_put(err_dev);
>
> send_ost:
>
On 1/27/26 6:24 PM, Jonathan Cameron wrote:
> On Sat, 24 Jan 2026 15:45:54 +0800
> Shuai Xue <xueshuai@linux.alibaba.com> wrote:
>
>> The current implementation of pcie_do_recovery() assumes that the
>> recovery process is executed for the device that detected the error.
>> However, the DPC driver currently passes the error port that experienced
>> the DPC event to pcie_do_recovery().
>>
>> Use the SOURCE ID register to correctly identify the device that
>> detected the error. When passing the error device, the
>> pcie_do_recovery() will find the upstream bridge and walk bridges
>> potentially AER affected. And subsequent commits will be able to
>> accurately access AER status of the error device.
>>
>> Should not observe any functional changes.
>>
>> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
>> ---
>> drivers/pci/pci.h | 2 +-
>> drivers/pci/pcie/dpc.c | 25 +++++++++++++++++++++----
>> drivers/pci/pcie/edr.c | 7 ++++---
>> 3 files changed, 26 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index 0e67014aa001..58640e656897 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -771,7 +771,7 @@ struct rcec_ea {
>> void pci_save_dpc_state(struct pci_dev *dev);
>> void pci_restore_dpc_state(struct pci_dev *dev);
>> void pci_dpc_init(struct pci_dev *pdev);
>> -void dpc_process_error(struct pci_dev *pdev);
>> +struct pci_dev *dpc_process_error(struct pci_dev *pdev);
>> pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
>> bool pci_dpc_recovered(struct pci_dev *pdev);
>> unsigned int dpc_tlp_log_len(struct pci_dev *dev);
>> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
>> index bff29726c6a5..f6069f621683 100644
>> --- a/drivers/pci/pcie/dpc.c
>> +++ b/drivers/pci/pcie/dpc.c
>> @@ -260,10 +260,20 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
>> return 1;
>> }
>>
>> -void dpc_process_error(struct pci_dev *pdev)
>> +/**
>> + * dpc_process_error - handle the DPC error status
>> + * @pdev: the port that experienced the containment event
>> + *
>> + * Return: the device that detected the error.
>> + *
>> + * NOTE: The device reference count is increased, the caller must decrement
>> + * the reference count by calling pci_dev_put().
>> + */
>> +struct pci_dev *dpc_process_error(struct pci_dev *pdev)
>
> Maybe it makes sense to carry the err_port naming for the pci_dev
> in here as well? Seems stronger than just relying on people
> reading the documentation you've added.
Good point. I think renaming the parameter would improve clarity. However,
I'd prefer to handle it in a separate patch to keep this change focused on
the functional modification. Would that work for you?
>
>> {
>> u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
>> struct aer_err_info info = {};
>> + struct pci_dev *err_dev;
>>
>> pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
>>
>> @@ -279,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
>> pci_aer_clear_nonfatal_status(pdev);
>> pci_aer_clear_fatal_status(pdev);
>> }
>> + err_dev = pci_dev_get(pdev);
>> break;
>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
>> @@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
>> "ERR_FATAL" : "ERR_NONFATAL",
>> pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
>> PCI_SLOT(source), PCI_FUNC(source));
>> + err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
>> + PCI_BUS_NUM(source), source & 0xff);
>
> Bunch of replication in her with the pci_warn(). Maybe some local variables?
> Maybe even rebuild the final parameter from PCI_DEVFN(slot, func) just to make the
> association with the print really obvious?
Agreed. Here's the improved version:
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -293,17 +293,28 @@ struct pci_dev *dpc_process_error(struct pci_dev *pdev)
break;
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
- pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID,
- &source);
+ {
+ int domain, bus;
+ u8 slot, func, devfn;
+ const char *err_type;
+
+ pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source);
+
+ /* Extract source device location */
+ domain = pci_domain_nr(pdev->bus);
+ bus = PCI_BUS_NUM(source);
+ slot = PCI_SLOT(source);
+ func = PCI_FUNC(source);
+ devfn = PCI_DEVFN(slot, func);
+
+ err_type = (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ?
+ "ERR_FATAL" : "ERR_NONFATAL";
+
pci_warn(pdev, "containment event, status:%#06x, %s received from %04x:%02x:%02x.%d\n",
- status,
- (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ?
- "ERR_FATAL" : "ERR_NONFATAL",
- pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
- PCI_SLOT(source), PCI_FUNC(source));
- err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
- PCI_BUS_NUM(source), source & 0xff);
+ status, err_type, domain, bus, slot, func);
+ err_dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
break;
+ }
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
pci_warn(pdev, "containment event, status:%#06x: %s detected\n",
>
> Is there any chance that this might return NULL? Feels like maybe that's
> only a possibility on a broken setup, but I'm not sure of all the wonderful
> races around hotplug and DPC occurring before the OS has caught up.
Surprise Down events are handled separately in
dpc_handle_surprise_removal() and won't reach dpc_process_error().
Please correct me if I missed anything.
Thanks for valuable comments.
Best Regards,
Shuai
On Wed, 28 Jan 2026 20:27:31 +0800
Shuai Xue <xueshuai@linux.alibaba.com> wrote:
> On 1/27/26 6:24 PM, Jonathan Cameron wrote:
> > On Sat, 24 Jan 2026 15:45:54 +0800
> > Shuai Xue <xueshuai@linux.alibaba.com> wrote:
> >
> >> The current implementation of pcie_do_recovery() assumes that the
> >> recovery process is executed for the device that detected the error.
> >> However, the DPC driver currently passes the error port that experienced
> >> the DPC event to pcie_do_recovery().
> >>
> >> Use the SOURCE ID register to correctly identify the device that
> >> detected the error. When passing the error device, the
> >> pcie_do_recovery() will find the upstream bridge and walk bridges
> >> potentially AER affected. And subsequent commits will be able to
> >> accurately access AER status of the error device.
> >>
> >> Should not observe any functional changes.
> >>
> >> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> >> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Hi Shuai,
> >> ---
> >> drivers/pci/pci.h | 2 +-
> >> drivers/pci/pcie/dpc.c | 25 +++++++++++++++++++++----
> >> drivers/pci/pcie/edr.c | 7 ++++---
> >> 3 files changed, 26 insertions(+), 8 deletions(-)
> >>
...
> >> -void dpc_process_error(struct pci_dev *pdev)
> >> +/**
> >> + * dpc_process_error - handle the DPC error status
> >> + * @pdev: the port that experienced the containment event
> >> + *
> >> + * Return: the device that detected the error.
> >> + *
> >> + * NOTE: The device reference count is increased, the caller must decrement
> >> + * the reference count by calling pci_dev_put().
> >> + */
> >> +struct pci_dev *dpc_process_error(struct pci_dev *pdev)
> >
> > Maybe it makes sense to carry the err_port naming for the pci_dev
> > in here as well? Seems stronger than just relying on people
> > reading the documentation you've added.
>
> Good point. I think renaming the parameter would improve clarity. However,
> I'd prefer to handle it in a separate patch to keep this change focused on
> the functional modification. Would that work for you?
>
Sure. Ideal would be a precursor patch, but if it's much easier to
do on top of this I'm fine with that.
You are absolutely correct that it should be a separate patch!
> >
> >> {
> >> u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
> >> struct aer_err_info info = {};
> >> + struct pci_dev *err_dev;
> >>
> >> pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
> >>
> >> @@ -279,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
> >> pci_aer_clear_nonfatal_status(pdev);
> >> pci_aer_clear_fatal_status(pdev);
> >> }
> >> + err_dev = pci_dev_get(pdev);
> >> break;
> >> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
> >> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
> >> @@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
> >> "ERR_FATAL" : "ERR_NONFATAL",
> >> pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
> >> PCI_SLOT(source), PCI_FUNC(source));
> >> + err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
> >> + PCI_BUS_NUM(source), source & 0xff);
> >
> > Bunch of replication in her with the pci_warn(). Maybe some local variables?
> > Maybe even rebuild the final parameter from PCI_DEVFN(slot, func) just to make the
> > association with the print really obvious?
>
> Agreed. Here's the improved version:
>
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -293,17 +293,28 @@ struct pci_dev *dpc_process_error(struct pci_dev *pdev)
> break;
> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
> - pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID,
> - &source);
> + {
> + int domain, bus;
> + u8 slot, func, devfn;
> + const char *err_type;
> +
> + pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source);
> +
> + /* Extract source device location */
> + domain = pci_domain_nr(pdev->bus);
> + bus = PCI_BUS_NUM(source);
> + slot = PCI_SLOT(source);
> + func = PCI_FUNC(source);
> + devfn = PCI_DEVFN(slot, func);
> +
> + err_type = (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ?
> + "ERR_FATAL" : "ERR_NONFATAL";
> +
> pci_warn(pdev, "containment event, status:%#06x, %s received from %04x:%02x:%02x.%d\n",
> - status,
> - (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ?
> - "ERR_FATAL" : "ERR_NONFATAL",
> - pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
> - PCI_SLOT(source), PCI_FUNC(source));
> - err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
> - PCI_BUS_NUM(source), source & 0xff);
> + status, err_type, domain, bus, slot, func);
> + err_dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
Maybe don't bother with local variables for the things only used once.
e.g.
err_dev = pci_get_domain_bus_and_slot(domain, bus, PCI_DEVFN(slot, func));
and possibly the same for err_type.
I don't mind though if you prefer to use locals for everything.
> break;
> + }
> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
> ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
> pci_warn(pdev, "containment event, status:%#06x: %s detected\n",
>
> >
> > Is there any chance that this might return NULL? Feels like maybe that's
> > only a possibility on a broken setup, but I'm not sure of all the wonderful
> > races around hotplug and DPC occurring before the OS has caught up.
>
> Surprise Down events are handled separately in
> dpc_handle_surprise_removal() and won't reach dpc_process_error().
> Please correct me if I missed anything.
Probably fine. I just didn't check particularly closely.
Jonathan
>
> Thanks for valuable comments.
>
> Best Regards,
> Shuai
On 1/28/26 11:02 PM, Jonathan Cameron wrote:
> On Wed, 28 Jan 2026 20:27:31 +0800
> Shuai Xue <xueshuai@linux.alibaba.com> wrote:
>
>> On 1/27/26 6:24 PM, Jonathan Cameron wrote:
>>> On Sat, 24 Jan 2026 15:45:54 +0800
>>> Shuai Xue <xueshuai@linux.alibaba.com> wrote:
>>>
>>>> The current implementation of pcie_do_recovery() assumes that the
>>>> recovery process is executed for the device that detected the error.
>>>> However, the DPC driver currently passes the error port that experienced
>>>> the DPC event to pcie_do_recovery().
>>>>
>>>> Use the SOURCE ID register to correctly identify the device that
>>>> detected the error. When passing the error device, the
>>>> pcie_do_recovery() will find the upstream bridge and walk bridges
>>>> potentially AER affected. And subsequent commits will be able to
>>>> accurately access AER status of the error device.
>>>>
>>>> Should not observe any functional changes.
>>>>
>>>> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>>> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> Hi Shuai,
>
>>>> ---
>>>> drivers/pci/pci.h | 2 +-
>>>> drivers/pci/pcie/dpc.c | 25 +++++++++++++++++++++----
>>>> drivers/pci/pcie/edr.c | 7 ++++---
>>>> 3 files changed, 26 insertions(+), 8 deletions(-)
>>>>
> ...
>
>>>> -void dpc_process_error(struct pci_dev *pdev)
>>>> +/**
>>>> + * dpc_process_error - handle the DPC error status
>>>> + * @pdev: the port that experienced the containment event
>>>> + *
>>>> + * Return: the device that detected the error.
>>>> + *
>>>> + * NOTE: The device reference count is increased, the caller must decrement
>>>> + * the reference count by calling pci_dev_put().
>>>> + */
>>>> +struct pci_dev *dpc_process_error(struct pci_dev *pdev)
>>>
>>> Maybe it makes sense to carry the err_port naming for the pci_dev
>>> in here as well? Seems stronger than just relying on people
>>> reading the documentation you've added.
>>
>> Good point. I think renaming the parameter would improve clarity. However,
>> I'd prefer to handle it in a separate patch to keep this change focused on
>> the functional modification. Would that work for you?
>>
> Sure. Ideal would be a precursor patch, but if it's much easier to
> do on top of this I'm fine with that.
>
> You are absolutely correct that it should be a separate patch!
Got it.
>>>
>>>> {
>>>> u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
>>>> struct aer_err_info info = {};
>>>> + struct pci_dev *err_dev;
>>>>
>>>> pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
>>>>
>>>> @@ -279,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
>>>> pci_aer_clear_nonfatal_status(pdev);
>>>> pci_aer_clear_fatal_status(pdev);
>>>> }
>>>> + err_dev = pci_dev_get(pdev);
>>>> break;
>>>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
>>>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
>>>> @@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
>>>> "ERR_FATAL" : "ERR_NONFATAL",
>>>> pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
>>>> PCI_SLOT(source), PCI_FUNC(source));
>>>> + err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
>>>> + PCI_BUS_NUM(source), source & 0xff);
>>>
>>> Bunch of replication in her with the pci_warn(). Maybe some local variables?
>>> Maybe even rebuild the final parameter from PCI_DEVFN(slot, func) just to make the
>>> association with the print really obvious?
>>
>> Agreed. Here's the improved version:
>>
>> --- a/drivers/pci/pcie/dpc.c
>> +++ b/drivers/pci/pcie/dpc.c
>> @@ -293,17 +293,28 @@ struct pci_dev *dpc_process_error(struct pci_dev *pdev)
>> break;
>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
>> - pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID,
>> - &source);
>> + {
>> + int domain, bus;
>> + u8 slot, func, devfn;
>> + const char *err_type;
>> +
>> + pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source);
>> +
>> + /* Extract source device location */
>> + domain = pci_domain_nr(pdev->bus);
>> + bus = PCI_BUS_NUM(source);
>> + slot = PCI_SLOT(source);
>> + func = PCI_FUNC(source);
>> + devfn = PCI_DEVFN(slot, func);
>> +
>> + err_type = (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ?
>> + "ERR_FATAL" : "ERR_NONFATAL";
>> +
>> pci_warn(pdev, "containment event, status:%#06x, %s received from %04x:%02x:%02x.%d\n",
>> - status,
>> - (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ?
>> - "ERR_FATAL" : "ERR_NONFATAL",
>> - pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
>> - PCI_SLOT(source), PCI_FUNC(source));
>> - err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
>> - PCI_BUS_NUM(source), source & 0xff);
>> + status, err_type, domain, bus, slot, func);
>> + err_dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
> Maybe don't bother with local variables for the things only used once.
> e.g.
>
> err_dev = pci_get_domain_bus_and_slot(domain, bus, PCI_DEVFN(slot, func));
>
> and possibly the same for err_type.
>
> I don't mind though if you prefer to use locals for everything.
Sure, will remove local devfn and err_type.
>
>
>
>> break;
>> + }
>> case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
>> ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
>> pci_warn(pdev, "containment event, status:%#06x: %s detected\n",
>>
>>>
>>> Is there any chance that this might return NULL? Feels like maybe that's
>>> only a possibility on a broken setup, but I'm not sure of all the wonderful
>>> races around hotplug and DPC occurring before the OS has caught up.
>>
>> Surprise Down events are handled separately in
>> dpc_handle_surprise_removal() and won't reach dpc_process_error().
>> Please correct me if I missed anything.
>
> Probably fine. I just didn't check particularly closely.
>
> Jonathan
Thanks for valuable comments.
Best Regards,
Shuai