[PATCH] efi/cper: Add NVIDIA CPER section support

Kai-Heng Feng posted 1 patch 1 month, 3 weeks ago
drivers/firmware/efi/Kconfig       | 16 ++++++
drivers/firmware/efi/Makefile      |  1 +
drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++
drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++
drivers/firmware/efi/cper.c        |  3 ++
include/linux/cper.h               |  4 ++
6 files changed, 136 insertions(+)
create mode 100644 drivers/firmware/efi/cper-nvidia.c
create mode 100644 drivers/firmware/efi/cper-nvidia.h
[PATCH] efi/cper: Add NVIDIA CPER section support
Posted by Kai-Heng Feng 1 month, 3 weeks ago
Add support for decoding NVIDIA-specific error sections in UEFI CPER
records. NVIDIA hardware generates vendor-specific CPER sections
containing error signatures and diagnostic register dumps. This
implementation decodes these sections and prints error details to the
kernel log.

The NVIDIA CPER section contains a fixed header with error metadata
(signature, error type, severity, socket) followed by variable-length
register address-value pairs for hardware diagnostics.

This work is based on libcper [0].

Example output:
Hardware error from APEI Generic Hardware Error Source: 816
event severity: info
 imprecise tstamp: 2025-11-17 07:57:38
 Error 0, type: info
  section_type: NVIDIA, error_data_length: 224
  signature: HSS-IDLE
  error_type: 0
  error_instance: 0
  severity: 0
  socket: 255
  number_regs: 12
  instance_base: 0x0000000000000000
  register[0]: address=0x0000000004f10008 value=0x0000000000002019
  register[1]: address=0x0000000000000000 value=0x0000000000000000

[0] https://github.com/openbmc/libcper/commit/683e055061ce
Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
---
 drivers/firmware/efi/Kconfig       | 16 ++++++
 drivers/firmware/efi/Makefile      |  1 +
 drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++
 drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++
 drivers/firmware/efi/cper.c        |  3 ++
 include/linux/cper.h               |  4 ++
 6 files changed, 136 insertions(+)
 create mode 100644 drivers/firmware/efi/cper-nvidia.c
 create mode 100644 drivers/firmware/efi/cper-nvidia.h

diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 29e0729299f5..ed1f53b8e878 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -329,6 +329,22 @@ config UEFI_CPER_X86
 	depends on UEFI_CPER && X86
 	default y
 
+config UEFI_CPER_NVIDIA
+	bool "UEFI CPER NVIDIA support"
+	depends on UEFI_CPER
+	help
+	  This option enables support for decoding NVIDIA-specific error
+	  sections in UEFI Common Platform Error Records (CPER). These
+	  sections contain additional diagnostic information for errors
+	  occurring in NVIDIA hardware such as GPUs, switches, and other
+	  devices.
+
+	  The NVIDIA CPER sections include error signatures (e.g., PCIe-DPC,
+	  DCC-ECC, GPU-STATUS) and diagnostic registers that provide detailed
+	  information about hardware errors for debugging and analysis.
+
+	  If unsure, say N.
+
 config TEE_STMM_EFI
 	tristate "TEE-based EFI runtime variable service driver"
 	depends on EFI && OPTEE
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 8efbcf699e4f..a571b6086860 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -42,5 +42,6 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER)	+= capsule-loader.o
 obj-$(CONFIG_EFI_EARLYCON)		+= earlycon.o
 obj-$(CONFIG_UEFI_CPER_ARM)		+= cper-arm.o
 obj-$(CONFIG_UEFI_CPER_X86)		+= cper-x86.o
+obj-$(CONFIG_UEFI_CPER_NVIDIA)		+= cper-nvidia.o
 obj-$(CONFIG_UNACCEPTED_MEMORY)		+= unaccepted_memory.o
 obj-$(CONFIG_TEE_STMM_EFI)		+= stmm/tee_stmm_efi.o
diff --git a/drivers/firmware/efi/cper-nvidia.c b/drivers/firmware/efi/cper-nvidia.c
new file mode 100644
index 000000000000..8f96318c8e95
--- /dev/null
+++ b/drivers/firmware/efi/cper-nvidia.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * UEFI Common Platform Error Record (CPER) support for NVIDIA sections
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cper.h>
+#include <linux/unaligned.h>
+#include <acpi/ghes.h>
+#include "cper-nvidia.h"
+
+/*
+ * Print the fixed NVIDIA section header, then the register address/value
+ * pairs after it, provided all number_regs pairs fit in error_data_length.
+ */
+static void cper_print_nvidia_error(const char *pfx,
+				    const struct cper_sec_nvidia *nvidia_err,
+				    size_t error_data_length)
+{
+	/* One register entry: an address and a value, each a little-endian u64. */
+	const size_t pair_size = 2 * sizeof(u64);
+	const u8 *pair = (const u8 *)(nvidia_err + 1);
+	size_t needed;
+	int idx;
+
+	printk("%s""signature: %.16s\n", pfx, nvidia_err->signature);
+	printk("%s""error_type: %u\n", pfx, le16_to_cpu(nvidia_err->error_type));
+	printk("%s""error_instance: %u\n", pfx, le16_to_cpu(nvidia_err->error_instance));
+	printk("%s""severity: %u\n", pfx, nvidia_err->severity);
+	printk("%s""socket: %u\n", pfx, nvidia_err->socket);
+	printk("%s""number_regs: %u\n", pfx, nvidia_err->number_regs);
+	printk("%s""instance_base: 0x%016llx\n", pfx,
+	       (unsigned long long)le64_to_cpu(nvidia_err->instance_base));
+
+	if (!nvidia_err->number_regs)
+		return;
+
+	/* Refuse to walk the pairs unless every one of them lies in the section. */
+	needed = sizeof(*nvidia_err) + nvidia_err->number_regs * pair_size;
+	if (error_data_length < needed) {
+		printk("%s""NVIDIA: Invalid number_regs %u (section size %zu, need %zu)\n",
+		       pfx, nvidia_err->number_regs, error_data_length, needed);
+		return;
+	}
+
+	/* Pairs start right after the header; read them with unaligned accessors. */
+	for (idx = 0; idx < nvidia_err->number_regs; idx++, pair += pair_size) {
+		u64 reg_addr = get_unaligned_le64(pair);
+		u64 reg_val = get_unaligned_le64(pair + sizeof(u64));
+
+		printk("%s""register[%d]: address=0x%016llx value=0x%016llx\n",
+		       pfx, idx, (unsigned long long)reg_addr,
+		       (unsigned long long)reg_val);
+	}
+}
+
+/* Print the NVIDIA section in @gdata once its fixed header is known to fit. */
+void cper_estatus_print_nvidia(const char *pfx,
+			       const struct acpi_hest_generic_data *gdata)
+{
+	struct cper_sec_nvidia *sec =
+		acpi_hest_get_payload((struct acpi_hest_generic_data *)gdata);
+
+	if (!sec) {
+		printk("%s""NVIDIA error: Failed to get payload\n", pfx);
+		return;
+	}
+
+	printk("%s""section_type: NVIDIA, error_data_length: %u\n", pfx, gdata->error_data_length);
+	if (gdata->error_data_length >= sizeof(*sec)) {
+		cper_print_nvidia_error(pfx, sec, gdata->error_data_length);
+		return;
+	}
+
+	printk("%s""NVIDIA error: Section too small (%u < %zu)\n",
+	       pfx, gdata->error_data_length, sizeof(*sec));
+}
diff --git a/drivers/firmware/efi/cper-nvidia.h b/drivers/firmware/efi/cper-nvidia.h
new file mode 100644
index 000000000000..c489f8f05f0f
--- /dev/null
+++ b/drivers/firmware/efi/cper-nvidia.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * UEFI Common Platform Error Record (CPER) support for NVIDIA sections
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef LINUX_CPER_NVIDIA_H
+#define LINUX_CPER_NVIDIA_H
+
+#include <linux/cper.h>
+
+struct cper_sec_nvidia {	/* fixed header; register pairs follow it */
+	char signature[16];	/* printed with "%.16s", e.g. "HSS-IDLE" */
+	__le16 error_type;	/* little-endian; printed as a decimal number */
+	__le16 error_instance;	/* little-endian; printed as a decimal number */
+	u8 severity;		/* printed as a decimal number */
+	u8 socket;		/* printed as a decimal number */
+	u8 number_regs;		/* count of 16-byte address/value pairs after the header */
+	u8 reserved;		/* not printed by the decoder */
+	__le64 instance_base;	/* little-endian; printed as 64-bit hex */
+} __packed;
+
+struct acpi_hest_generic_data;	/* forward declaration used by both branches */
+#ifdef CONFIG_UEFI_CPER_NVIDIA
+void cper_estatus_print_nvidia(const char *pfx,
+			       const struct acpi_hest_generic_data *gdata);
+#else	/* decoder not built: the stub below prints nothing */
+static inline void cper_estatus_print_nvidia(const char *pfx,
+					     const struct acpi_hest_generic_data *gdata) { }
+#endif	/* CONFIG_UEFI_CPER_NVIDIA */
+
+#endif
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 06b4fdb59917..0b5216aaa8c4 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -26,6 +26,7 @@
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
 #include <cxl/event.h>
+#include "cper-nvidia.h"
 
 /*
  * CPER record ID need to be unique even after reboot, because record
@@ -697,6 +698,8 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
 			cxl_cper_print_prot_err(newpfx, prot_err);
 		else
 			goto err_section_too_small;
+	} else if (guid_equal(sec_type, &CPER_SEC_NVIDIA)) {
+		cper_estatus_print_nvidia(newpfx, gdata);
 	} else {
 		const void *err = acpi_hest_get_payload(gdata);
 
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 440b35e459e5..b5790e48fbef 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -224,6 +224,10 @@ enum {
 #define CPER_SEC_DMAR_IOMMU						\
 	GUID_INIT(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F,	\
 		  0xDF, 0xAA, 0x84, 0xEC)
+/* NVIDIA vendor-specific error section (decoded by cper_estatus_print_nvidia()) */
+#define CPER_SEC_NVIDIA							\
+	GUID_INIT(0x6d5244f2, 0x2712, 0x11ec, 0xbe, 0xa7, 0xcb, 0x3f,	\
+		  0xdb, 0x95, 0xc7, 0x86)
 
 #define CPER_PROC_VALID_TYPE			0x0001
 #define CPER_PROC_VALID_ISA			0x0002
-- 
2.43.0
RE: [PATCH] efi/cper: Add NVIDIA CPER section support
Posted by Shiju Jose 1 month, 3 weeks ago
>-----Original Message-----
>From: Kai-Heng Feng <kaihengf@nvidia.com>
>Sent: 23 February 2026 06:49
>To: ardb@kernel.org
>Cc: Kai-Heng Feng <kaihengf@nvidia.com>; Rafael J. Wysocki
><rafael@kernel.org>; Tony Luck <tony.luck@intel.com>; Borislav Petkov
><bp@alien8.de>; Guohanjun (Hanjun Guo) <guohanjun@huawei.com>; Mauro
>Carvalho Chehab <mchehab@kernel.org>; Shuai Xue
><xueshuai@linux.alibaba.com>; Jonathan Cameron
><jonathan.cameron@huawei.com>; Morduan Zang
><zhangdandan@uniontech.com>; linux-kernel@vger.kernel.org; linux-
>efi@vger.kernel.org; linux-acpi@vger.kernel.org
>Subject: [PATCH] efi/cper: Add NVIDIA CPER section support
>
>Add support for decoding NVIDIA-specific error sections in UEFI CPER records.
>NVIDIA hardware generates vendor-specific CPER sections containing error
>signatures and diagnostic register dumps. This implementation decodes these
>sections and prints error details to the kernel log.
>
>The NVIDIA CPER section contains a fixed header with error metadata (signature,
>error type, severity, socket) followed by variable-length register address-value
>pairs for hardware diagnostics.
>
>This work is based on libcper [0].
>
>Example output:
>Hardware error from APEI Generic Hardware Error Source: 816 event severity:
>info  imprecise tstamp: 2025-11-17 07:57:38  Error 0, type: info
>  section_type: NVIDIA, error_data_length: 224
>  signature: HSS-IDLE
>  error_type: 0
>  error_instance: 0
>  severity: 0
>  socket: 255
>  number_regs: 12
>  instance_base: 0x0000000000000000
>  register[0]: address=0x0000000004f10008 value=0x0000000000002019
>  register[1]: address=0x0000000000000000 value=0x0000000000000000
>
>[0] https://github.com/openbmc/libcper/commit/683e055061ce
>Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
>---
> drivers/firmware/efi/Kconfig       | 16 ++++++
> drivers/firmware/efi/Makefile      |  1 +
> drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++
>drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++
> drivers/firmware/efi/cper.c        |  3 ++
> include/linux/cper.h               |  4 ++
> 6 files changed, 136 insertions(+)
> create mode 100644 drivers/firmware/efi/cper-nvidia.c
> create mode 100644 drivers/firmware/efi/cper-nvidia.h
>
>diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index
>29e0729299f5..ed1f53b8e878 100644
>--- a/drivers/firmware/efi/Kconfig
>+++ b/drivers/firmware/efi/Kconfig
>@@ -329,6 +329,22 @@ config UEFI_CPER_X86
> 	depends on UEFI_CPER && X86
> 	default y
>
>+config UEFI_CPER_NVIDIA
>+	bool "UEFI CPER NVIDIA support"
>+	depends on UEFI_CPER
>+	help
>+	  This option enables support for decoding NVIDIA-specific error
>+	  sections in UEFI Common Platform Error Records (CPER). These
>+	  sections contain additional diagnostic information for errors
>+	  occurring in NVIDIA hardware such as GPUs, switches, and other
>+	  devices.
>+
>+	  The NVIDIA CPER sections include error signatures (e.g., PCIe-DPC,
>+	  DCC-ECC, GPU-STATUS) and diagnostic registers that provide detailed
>+	  information about hardware errors for debugging and analysis.
>+
>+	  If unsure, say N.
>+
> config TEE_STMM_EFI
> 	tristate "TEE-based EFI runtime variable service driver"
> 	depends on EFI && OPTEE
>diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index
>8efbcf699e4f..a571b6086860 100644
>--- a/drivers/firmware/efi/Makefile
>+++ b/drivers/firmware/efi/Makefile
>@@ -42,5 +42,6 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER)	+= capsule-
>loader.o
> obj-$(CONFIG_EFI_EARLYCON)		+= earlycon.o
> obj-$(CONFIG_UEFI_CPER_ARM)		+= cper-arm.o
> obj-$(CONFIG_UEFI_CPER_X86)		+= cper-x86.o
>+obj-$(CONFIG_UEFI_CPER_NVIDIA)		+= cper-nvidia.o

Hi,

Is drivers/firmware/efi/cper.c the right place to log vendor-specific errors, 
given that so far drivers/firmware/efi/ only logs CPER information defined by the standards?
Vendor-specific errors are currently logged and recorded in rasdaemon.
https://github.com/mchehab/rasdaemon
https://github.com/mchehab/rasdaemon/blob/master/ras-non-standard-handler.c#L52

If some kernel-level recovery action or logging is required, we can also register with
acpi/apei/ghes using ghes_register_vendor_record_notifier() to receive a callback.
https://elixir.bootlin.com/linux/v6.19.3/source/drivers/acpi/apei/ghes.c#L652

[...]
>+/* NVIDIA Error Section */
>+#define CPER_SEC_NVIDIA
>	\
>+	GUID_INIT(0x6d5244f2, 0x2712, 0x11ec, 0xbe, 0xa7, 0xcb, 0x3f,	\
>+		  0xdb, 0x95, 0xc7, 0x86)
>
> #define CPER_PROC_VALID_TYPE			0x0001
> #define CPER_PROC_VALID_ISA			0x0002
>--
>2.43.0
>

Thanks,
Shiju
Re: [PATCH] efi/cper: Add NVIDIA CPER section support
Posted by Kai-Heng Feng 1 month, 3 weeks ago
Hi Shiju,

On 2026/2/24 7:23 PM, Shiju Jose wrote:
> External email: Use caution opening links or attachments
> 
> 
>> -----Original Message-----
>> From: Kai-Heng Feng <kaihengf@nvidia.com>
>> Sent: 23 February 2026 06:49
>> To: ardb@kernel.org
>> Cc: Kai-Heng Feng <kaihengf@nvidia.com>; Rafael J. Wysocki
>> <rafael@kernel.org>; Tony Luck <tony.luck@intel.com>; Borislav Petkov
>> <bp@alien8.de>; Guohanjun (Hanjun Guo) <guohanjun@huawei.com>; Mauro
>> Carvalho Chehab <mchehab@kernel.org>; Shuai Xue
>> <xueshuai@linux.alibaba.com>; Jonathan Cameron
>> <jonathan.cameron@huawei.com>; Morduan Zang
>> <zhangdandan@uniontech.com>; linux-kernel@vger.kernel.org; linux-
>> efi@vger.kernel.org; linux-acpi@vger.kernel.org
>> Subject: [PATCH] efi/cper: Add NVIDIA CPER section support
>>
>> Add support for decoding NVIDIA-specific error sections in UEFI CPER records.
>> NVIDIA hardware generates vendor-specific CPER sections containing error
>> signatures and diagnostic register dumps. This implementation decodes these
>> sections and prints error details to the kernel log.
>>
>> The NVIDIA CPER section contains a fixed header with error metadata (signature,
>> error type, severity, socket) followed by variable-length register address-value
>> pairs for hardware diagnostics.
>>
>> This work is based on libcper [0].
>>
>> Example output:
>> Hardware error from APEI Generic Hardware Error Source: 816 event severity:
>> info  imprecise tstamp: 2025-11-17 07:57:38  Error 0, type: info
>>   section_type: NVIDIA, error_data_length: 224
>>   signature: HSS-IDLE
>>   error_type: 0
>>   error_instance: 0
>>   severity: 0
>>   socket: 255
>>   number_regs: 12
>>   instance_base: 0x0000000000000000
>>   register[0]: address=0x0000000004f10008 value=0x0000000000002019
>>   register[1]: address=0x0000000000000000 value=0x0000000000000000
>>
>> [0] https://github.com/openbmc/libcper/commit/683e055061ce
>> Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
>> ---
>> drivers/firmware/efi/Kconfig       | 16 ++++++
>> drivers/firmware/efi/Makefile      |  1 +
>> drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++
>> drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++
>> drivers/firmware/efi/cper.c        |  3 ++
>> include/linux/cper.h               |  4 ++
>> 6 files changed, 136 insertions(+)
>> create mode 100644 drivers/firmware/efi/cper-nvidia.c
>> create mode 100644 drivers/firmware/efi/cper-nvidia.h
>>
>> diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index
>> 29e0729299f5..ed1f53b8e878 100644
>> --- a/drivers/firmware/efi/Kconfig
>> +++ b/drivers/firmware/efi/Kconfig
>> @@ -329,6 +329,22 @@ config UEFI_CPER_X86
>>        depends on UEFI_CPER && X86
>>        default y
>>
>> +config UEFI_CPER_NVIDIA
>> +      bool "UEFI CPER NVIDIA support"
>> +      depends on UEFI_CPER
>> +      help
>> +        This option enables support for decoding NVIDIA-specific error
>> +        sections in UEFI Common Platform Error Records (CPER). These
>> +        sections contain additional diagnostic information for errors
>> +        occurring in NVIDIA hardware such as GPUs, switches, and other
>> +        devices.
>> +
>> +        The NVIDIA CPER sections include error signatures (e.g., PCIe-DPC,
>> +        DCC-ECC, GPU-STATUS) and diagnostic registers that provide detailed
>> +        information about hardware errors for debugging and analysis.
>> +
>> +        If unsure, say N.
>> +
>> config TEE_STMM_EFI
>>        tristate "TEE-based EFI runtime variable service driver"
>>        depends on EFI && OPTEE
>> diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index
>> 8efbcf699e4f..a571b6086860 100644
>> --- a/drivers/firmware/efi/Makefile
>> +++ b/drivers/firmware/efi/Makefile
>> @@ -42,5 +42,6 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER)     += capsule-
>> loader.o
>> obj-$(CONFIG_EFI_EARLYCON)            += earlycon.o
>> obj-$(CONFIG_UEFI_CPER_ARM)           += cper-arm.o
>> obj-$(CONFIG_UEFI_CPER_X86)           += cper-x86.o
>> +obj-$(CONFIG_UEFI_CPER_NVIDIA)                += cper-nvidia.o
> 
> Hi,
> 
> Is drivers/firmware/efi/cper.c the right place to log vendor-specific errors,
> given that so far drivers/firmware/efi/ only logs CPER information defined by the standards?
> Vendor-specific errors are currently logged and recorded in rasdaemon.
> https://github.com/mchehab/rasdaemon
> https://github.com/mchehab/rasdaemon/blob/master/ras-non-standard-handler.c#L52
> 
> If some kernel-level  recovery action or logging is required, we can also register with
> acpi/apei/ghes using ghes_register_vendor_record_notifier() to receive a callback.
> https://elixir.bootlin.com/linux/v6.19.3/source/drivers/acpi/apei/ghes.c#L652

Thank you for the info. There is indeed an ACPI node for CPER purposes. I'll see
whether that ACPI HID can be used to implement this with
ghes_register_vendor_record_notifier().

Kai-Heng

> 
> [...]
>> +/* NVIDIA Error Section */
>> +#define CPER_SEC_NVIDIA
>>        \
>> +      GUID_INIT(0x6d5244f2, 0x2712, 0x11ec, 0xbe, 0xa7, 0xcb, 0x3f,   \
>> +                0xdb, 0x95, 0xc7, 0x86)
>>
>> #define CPER_PROC_VALID_TYPE                  0x0001
>> #define CPER_PROC_VALID_ISA                   0x0002
>> --
>> 2.43.0
>>
> 
> Thanks,
> Shiju