drivers/pci/hotplug/pciehp_ctrl.c | 33 ++++++++++++--- drivers/pci/hotplug/trace.h | 68 +++++++++++++++++++++++++++++++ include/uapi/linux/pci.h | 7 ++++ 3 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 drivers/pci/hotplug/trace.h
Hotplug events are critical indicators for analyzing hardware health,
particularly in AI supercomputers where surprise link downs can
significantly impact system performance and reliability. The failure
characterization analysis illustrates the significance of failures
caused by the Infiniband link errors. Meta observes that 2% in a machine
learning cluster and 6% in a vision application cluster of Infiniband
failures co-occur with GPU failures, such as falling off the bus, which
may indicate a correlation with PCIe.[1]
To enhance hardware health monitoring and diagnostics, introduces a generic
tracepoint for hotplug events. Additionally, specific tracepoints for PCIe
hotplug events are generated. To enable userspace monitoring of these
tracepoints, for instance using tools like rasdaemon, the
`pci_hotplug_event` is included in the UAPI (User API) header.
$ echo 1 > /sys/kernel/debug/tracing/events/hotplug/pci_hp_event/enable
$ cat /sys/kernel/debug/tracing/trace_pipe
<...>-206 [001] ..... 40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down
<...>-206 [001] ..... 40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present
[1]https://arxiv.org/abs/2410.21680
Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
---
changes since v2 per Lukas:
- rewrite commit log to note why add pci_hotplug_event in the UAPI header.
- define TRACE_SYSTEM as pci_hotplug to avoid conflict with other
- squash previous two commits into one.
- rename _TRANS_STATE as _EVENT
- rename PCI_HOTPLUG_CARD_NO_PRESENT as PCI_HOTPLUG_CARD_NOT_PRESENT
---
drivers/pci/hotplug/pciehp_ctrl.c | 33 ++++++++++++---
drivers/pci/hotplug/trace.h | 68 +++++++++++++++++++++++++++++++
include/uapi/linux/pci.h | 7 ++++
3 files changed, 102 insertions(+), 6 deletions(-)
create mode 100644 drivers/pci/hotplug/trace.h
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index dcdbfcf404dd..c836462ff067 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -21,6 +21,9 @@
#include <linux/pci.h>
#include "pciehp.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
/* The following routines constitute the bulk of the
hotplug controller logic
*/
@@ -239,12 +242,20 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
case ON_STATE:
ctrl->state = POWEROFF_STATE;
mutex_unlock(&ctrl->state_lock);
- if (events & PCI_EXP_SLTSTA_DLLSC)
+ if (events & PCI_EXP_SLTSTA_DLLSC) {
ctrl_info(ctrl, "Slot(%s): Link Down\n",
slot_name(ctrl));
- if (events & PCI_EXP_SLTSTA_PDC)
+ trace_pci_hp_event(pci_name(ctrl->pcie->port),
+ slot_name(ctrl),
+ PCI_HOTPLUG_LINK_DOWN);
+ }
+ if (events & PCI_EXP_SLTSTA_PDC) {
ctrl_info(ctrl, "Slot(%s): Card not present\n",
slot_name(ctrl));
+ trace_pci_hp_event(pci_name(ctrl->pcie->port),
+ slot_name(ctrl),
+ PCI_HOTPLUG_CARD_NOT_PRESENT);
+ }
pciehp_disable_slot(ctrl, SURPRISE_REMOVAL);
break;
default:
@@ -264,6 +275,9 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
INDICATOR_NOOP);
ctrl_info(ctrl, "Slot(%s): Card not present\n",
slot_name(ctrl));
+ trace_pci_hp_event(pci_name(ctrl->pcie->port),
+ slot_name(ctrl),
+ PCI_HOTPLUG_CARD_NOT_PRESENT);
}
mutex_unlock(&ctrl->state_lock);
return;
@@ -276,12 +290,19 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
case OFF_STATE:
ctrl->state = POWERON_STATE;
mutex_unlock(&ctrl->state_lock);
- if (present)
+ if (present) {
ctrl_info(ctrl, "Slot(%s): Card present\n",
slot_name(ctrl));
- if (link_active)
- ctrl_info(ctrl, "Slot(%s): Link Up\n",
- slot_name(ctrl));
+ trace_pci_hp_event(pci_name(ctrl->pcie->port),
+ slot_name(ctrl),
+ PCI_HOTPLUG_CARD_PRESENT);
+ }
+ if (link_active) {
+ ctrl_info(ctrl, "Slot(%s): Link Up\n", slot_name(ctrl));
+ trace_pci_hp_event(pci_name(ctrl->pcie->port),
+ slot_name(ctrl),
+ PCI_HOTPLUG_LINK_UP);
+ }
ctrl->request_result = pciehp_enable_slot(ctrl);
break;
default:
diff --git a/drivers/pci/hotplug/trace.h b/drivers/pci/hotplug/trace.h
new file mode 100644
index 000000000000..45e4ca8b83c4
--- /dev/null
+++ b/drivers/pci/hotplug/trace.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_HW_EVENT_PCI_HP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HW_EVENT_PCI_HP_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pci_hotplug
+
+#define PCI_HOTPLUG_EVENT \
+ EM(PCI_HOTPLUG_LINK_UP, "Link Up") \
+ EM(PCI_HOTPLUG_LINK_DOWN, "Link Down") \
+ EM(PCI_HOTPLUG_CARD_PRESENT, "Card present") \
+ EMe(PCI_HOTPLUG_CARD_NOT_PRESENT, "Card not present")
+
+/* Enums require being exported to userspace, for user tool parsing */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+PCI_HOTPLUG_EVENT
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
+TRACE_EVENT(pci_hp_event,
+
+ TP_PROTO(const char *port_name,
+ const char *slot,
+ const int event),
+
+ TP_ARGS(port_name, slot, event),
+
+ TP_STRUCT__entry(
+ __string( port_name, port_name )
+ __string( slot, slot )
+ __field( int, event )
+ ),
+
+ TP_fast_assign(
+ __assign_str(port_name);
+ __assign_str(slot);
+ __entry->event = event;
+ ),
+
+ TP_printk("%s slot:%s, event:%s\n",
+ __get_str(port_name),
+ __get_str(slot),
+ __print_symbolic(__entry->event, PCI_HOTPLUG_EVENT)
+ )
+);
+
+#endif /* _TRACE_HW_EVENT_PCI_HP_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/pci/hotplug
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/pci.h b/include/uapi/linux/pci.h
index a769eefc5139..4f150028965d 100644
--- a/include/uapi/linux/pci.h
+++ b/include/uapi/linux/pci.h
@@ -39,4 +39,11 @@
#define PCIIOC_MMAP_IS_MEM (PCIIOC_BASE | 0x02) /* Set mmap state to MEM space. */
#define PCIIOC_WRITE_COMBINE (PCIIOC_BASE | 0x03) /* Enable/disable write-combining. */
+enum pci_hotplug_event {
+ PCI_HOTPLUG_LINK_UP,
+ PCI_HOTPLUG_LINK_DOWN,
+ PCI_HOTPLUG_CARD_PRESENT,
+ PCI_HOTPLUG_CARD_NOT_PRESENT,
+};
+
#endif /* _UAPILINUX_PCI_H */
--
2.39.3
On Wed, Nov 20, 2024 at 08:43:28PM +0800, Shuai Xue wrote: > $ echo 1 > /sys/kernel/debug/tracing/events/hotplug/pci_hp_event/enable ^^^^^^^ I think this should now be "pci_hotplug" because you've renamed the TRACE_SYSTEM in v3. I'm wondering if we'll have other categories besides "pci_hp_event" below "pci_hotplug". Maybe not. Is it possible to omit the "pci_hotplug" and make "pci_hp_event" top level? Or should this be grouped below "pci" instead of "pci_hotplug"? I'm somewhat at a loss here as I'm not familiar with the conventions used in the tracing subsystem. From a PCI hotplug perspective, this patch LGTM, so: Reviewed-by: Lukas Wunner <lukas@wunner.de>
在 2024/11/21 17:27, Lukas Wunner 写道: > On Wed, Nov 20, 2024 at 08:43:28PM +0800, Shuai Xue wrote: >> $ echo 1 > /sys/kernel/debug/tracing/events/hotplug/pci_hp_event/enable > ^^^^^^^ > I think this should now be "pci_hotplug" because you've renamed the > TRACE_SYSTEM in v3. Yes, you are right. Will fix it. > > I'm wondering if we'll have other categories besides "pci_hp_event" > below "pci_hotplug". Maybe not. Is it possible to omit the "pci_hotplug" > and make "pci_hp_event" top level? Or should this be grouped below "pci" > instead of "pci_hotplug"? I'm somewhat at a loss here as I'm not > familiar with the conventions used in the tracing subsystem. Sure, I can reorganize the directory. You have two optional proposals: 1. /sys/kernel/debug/tracing/events/pci_hp_event 2. /sys/kernel/debug/tracing/events/pci/pci_hp_event I personally prefer the second approach. However, I'll defer the final decision to the tracing subsystem maintainer, considering their expertise and familiarity with the existing conventions. > > From a PCI hotplug perspective, this patch LGTM, so: > > Reviewed-by: Lukas Wunner <lukas@wunner.de> Thank you. Best Regards, Shuai
On Thu, 21 Nov 2024 19:34:31 +0800 Shuai Xue <xueshuai@linux.alibaba.com> wrote: > Sure, I can reorganize the directory. You have two optional proposals: > > 1. /sys/kernel/debug/tracing/events/pci_hp_event I'm guessing the above has TRACING_SYSTEM = pci_hp_event ? That is, the above is a system and not an event. > 2. /sys/kernel/debug/tracing/events/pci/pci_hp_event Whereas here, it's an event. > > I personally prefer the second approach. However, I'll defer the final decision > to the tracing subsystem maintainer, considering their expertise and > familiarity with the existing conventions. Normally the system is determined by the users of the tracing infrastructure and not the tracing maintainers. But I can give you some guidelines. The "system" level: /sys/kernel/tracing/events/<system> is just a way to group like events making it easier to enable them all at once: echo 1 > /sys/kernel/tracing/events/<system>/enable or trace-cmd start -e <system> The name of the "system" should be something that all the events underneath represent. Note, although events in two different systems can have the same name, it's best to try to keep them all unique. That's because if you have: systemA/foo and systemB/foo If you add to the kernel command line: trace_event=foo it will enable both events. Although that could be fixed with: trace_event=systemA:foo Note: trace_event=systemA would enable all systemA events at boot. That said, is all these events going to be related to hotplug? If so, you probably want "hotplug" or "hp" in the system name. If you make it just "pci", then it will be expected that all the events will be related to PCI in general. Does that help? -- Steve
在 2024/11/21 21:43, Steven Rostedt 写道: > On Thu, 21 Nov 2024 19:34:31 +0800 > Shuai Xue <xueshuai@linux.alibaba.com> wrote: > >> Sure, I can reorganize the directory. You have two optional proposals: >> >> 1. /sys/kernel/debug/tracing/events/pci_hp_event > > I'm guessing the above has TRACING_SYSTEM = pci_hp_event ? That is, the > above is a system and not an event. > >> 2. /sys/kernel/debug/tracing/events/pci/pci_hp_event > > Whereas here, it's an event. > >> >> I personally prefer the second approach. However, I'll defer the final decision >> to the tracing subsystem maintainer, considering their expertise and >> familiarity with the existing conventions. > > Normally the system is determined by the users of the tracing > infrastructure and not the tracing maintainers. But I can give you some > guidelines. > > The "system" level: > > /sys/kernel/tracing/events/<system> > > is just a way to group like events making it easier to enable them all at once: > > echo 1 > /sys/kernel/tracing/events/<system>/enable > > or > > trace-cmd start -e <system> > > The name of the "system" should be something that all the events underneath > represent. Note, although events in two different systems can have the same > name, it's best to try to keep them all unique. That's because if you have: > > > systemA/foo and systemB/foo > > If you add to the kernel command line: trace_event=foo > it will enable both events. Although that could be fixed with: trace_event=systemA:foo > > Note: trace_event=systemA would enable all systemA events at boot. > > That said, is all these events going to be related to hotplug? If so, you > probably want "hotplug" or "hp" in the system name. If you make it just > "pci", then it will be expected that all the events will be related to PCI > in general. I think Lukas is asking for a system for pci related tracepoint. > > Does that help? Yes, it really does. > > -- Steve Thanks a lot. Best Regards, Shuai
On Thu, Nov 21, 2024 at 08:43:54AM -0500, Steven Rostedt wrote: > On Thu, 21 Nov 2024 19:34:31 +0800 > Shuai Xue <xueshuai@linux.alibaba.com> wrote: > > Sure, I can reorganize the directory. You have two optional proposals: > > > > 1. /sys/kernel/debug/tracing/events/pci_hp_event > > 2. /sys/kernel/debug/tracing/events/pci/pci_hp_event > > > > I personally prefer the second approach. [...] > That said, is all these events going to be related to hotplug? If so, you > probably want "hotplug" or "hp" in the system name. If you make it just > "pci", then it will be expected that all the events will be related to PCI > in general. Grouping them under "pci" probably makes more sense than grouping them under "hotplug". A lot can be hotplugged (cpu, memory, PCI, USB, ...) but it's mostly unrelated hardware and enabling all their events at once doesn't seem to make much sense. Whereas I can imagine people want to enable all PCI trace events. We currently only have tracepoints for AER in the PCI core, but perhaps people will want to add new ones for power faults, DPC recovery, bandwidth control etc. Would something like /sys/kernel/debug/tracing/events/pci/hotplug_event be possible? It would seem more elegant than /sys/kernel/debug/tracing/events/pci/pci_hp_event because it avoids the duplication of "pci" in the path. Thanks, Lukas
On Fri, 22 Nov 2024 00:09:50 +0100 Lukas Wunner <lukas@wunner.de> wrote: > Would something like > > /sys/kernel/debug/tracing/events/pci/hotplug_event > > be possible? It would seem more elegant than > > /sys/kernel/debug/tracing/events/pci/pci_hp_event > > because it avoids the duplication of "pci" in the path. Most events have the name of the system in it (see the sched events). That's to prevent the duplicates in other places. Also, in the trace output, only the event name is shown and not the system. That is, if you have more than one "hotplug_event" enabled, the trace will not differentiate them. My suggestion is the "pci_hp_event" as it will be more obvious in the trace output. -- Steve
在 2024/11/22 08:08, Steven Rostedt 写道: > On Fri, 22 Nov 2024 00:09:50 +0100 > Lukas Wunner <lukas@wunner.de> wrote: > >> Would something like >> >> /sys/kernel/debug/tracing/events/pci/hotplug_event >> >> be possible? It would seem more elegant than >> >> /sys/kernel/debug/tracing/events/pci/pci_hp_event >> >> because it avoids the duplication of "pci" in the path. > > Most events have the name of the system in it (see the sched events). > That's to prevent the duplicates in other places. Also, in the trace > output, only the event name is shown and not the system. That is, if you > have more than one "hotplug_event" enabled, the trace will not > differentiate them. > > My suggestion is the "pci_hp_event" as it will be more obvious in the trace > output. > > -- Steve Agreed. pci_hp_event is more reasonable. Than is: - system: pci - event: pci_hp_event /sys/kernel/debug/tracing/events/pci/pci_hp_event @Lukas, if you have any other concerns, please let me know. Thank you. Best Regards, Shuai
On Fri, Nov 22, 2024 at 02:25:33PM +0800, Shuai Xue wrote: > Agreed. pci_hp_event is more reasonable. Than is: > > - system: pci > - event: pci_hp_event > > /sys/kernel/debug/tracing/events/pci/pci_hp_event > > @Lukas, if you have any other concerns, please let me know. This LGTM, thank you.
© 2016 - 2024 Red Hat, Inc.