drivers/firmware/efi/Kconfig | 16 ++++++ drivers/firmware/efi/Makefile | 1 + drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++ drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++ drivers/firmware/efi/cper.c | 3 ++ include/linux/cper.h | 4 ++ 6 files changed, 136 insertions(+) create mode 100644 drivers/firmware/efi/cper-nvidia.c create mode 100644 drivers/firmware/efi/cper-nvidia.h
Add support for decoding NVIDIA-specific error sections in UEFI CPER
records. NVIDIA hardware generates vendor-specific CPER sections
containing error signatures and diagnostic register dumps. This
implementation decodes these sections and prints error details to the
kernel log.
The NVIDIA CPER section contains a fixed header with error metadata
(signature, error type, severity, socket) followed by variable-length
register address-value pairs for hardware diagnostics.
This work is based on libcper [0].
Example output:
Hardware error from APEI Generic Hardware Error Source: 816
event severity: info
imprecise tstamp: 2025-11-17 07:57:38
Error 0, type: info
section_type: NVIDIA, error_data_length: 224
signature: HSS-IDLE
error_type: 0
error_instance: 0
severity: 0
socket: 255
number_regs: 12
instance_base: 0x0000000000000000
register[0]: address=0x0000000004f10008 value=0x0000000000002019
register[1]: address=0x0000000000000000 value=0x0000000000000000
[0] https://github.com/openbmc/libcper/commit/683e055061ce
Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
---
drivers/firmware/efi/Kconfig | 16 ++++++
drivers/firmware/efi/Makefile | 1 +
drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++
drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++
drivers/firmware/efi/cper.c | 3 ++
include/linux/cper.h | 4 ++
6 files changed, 136 insertions(+)
create mode 100644 drivers/firmware/efi/cper-nvidia.c
create mode 100644 drivers/firmware/efi/cper-nvidia.h
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 29e0729299f5..ed1f53b8e878 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -329,6 +329,22 @@ config UEFI_CPER_X86
depends on UEFI_CPER && X86
default y
+config UEFI_CPER_NVIDIA
+ bool "UEFI CPER NVIDIA support"
+ depends on UEFI_CPER
+ help
+ This option enables support for decoding NVIDIA-specific error
+ sections in UEFI Common Platform Error Records (CPER). These
+ sections contain additional diagnostic information for errors
+ occurring in NVIDIA hardware such as GPUs, switches, and other
+ devices.
+
+ The NVIDIA CPER sections include error signatures (e.g., PCIe-DPC,
+ DCC-ECC, GPU-STATUS) and diagnostic registers that provide detailed
+ information about hardware errors for debugging and analysis.
+
+ If unsure, say N.
+
config TEE_STMM_EFI
tristate "TEE-based EFI runtime variable service driver"
depends on EFI && OPTEE
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 8efbcf699e4f..a571b6086860 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -42,5 +42,6 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o
obj-$(CONFIG_EFI_EARLYCON) += earlycon.o
obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o
obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o
+obj-$(CONFIG_UEFI_CPER_NVIDIA) += cper-nvidia.o
obj-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o
obj-$(CONFIG_TEE_STMM_EFI) += stmm/tee_stmm_efi.o
diff --git a/drivers/firmware/efi/cper-nvidia.c b/drivers/firmware/efi/cper-nvidia.c
new file mode 100644
index 000000000000..8f96318c8e95
--- /dev/null
+++ b/drivers/firmware/efi/cper-nvidia.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * UEFI Common Platform Error Record (CPER) support for NVIDIA sections
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cper.h>
+#include <linux/unaligned.h>
+#include <acpi/ghes.h>
+#include "cper-nvidia.h"
+
+/*
+ * Dump one NVIDIA section: the fixed header fields first, then the
+ * address/value register pairs that trail it, provided they all fit in
+ * the error_data_length bytes reported for this section.
+ */
+static void cper_print_nvidia_error(const char *pfx,
+				    const struct cper_sec_nvidia *nvidia_err,
+				    size_t error_data_length)
+{
+	/* Each trailing entry is a little-endian address u64 then a value u64. */
+	const size_t pair_len = 2 * sizeof(u64);
+	const u8 *pair = (const u8 *)(nvidia_err + 1);
+	const u8 nregs = nvidia_err->number_regs;
+	size_t needed;
+	int n;
+
+	printk("%s""signature: %.16s\n", pfx, nvidia_err->signature);
+	printk("%s""error_type: %u\n", pfx, le16_to_cpu(nvidia_err->error_type));
+	printk("%s""error_instance: %u\n", pfx, le16_to_cpu(nvidia_err->error_instance));
+	printk("%s""severity: %u\n", pfx, nvidia_err->severity);
+	printk("%s""socket: %u\n", pfx, nvidia_err->socket);
+	printk("%s""number_regs: %u\n", pfx, nregs);
+	printk("%s""instance_base: 0x%016llx\n", pfx,
+	       le64_to_cpu(nvidia_err->instance_base));
+
+	if (!nregs)
+		return;
+
+	/* Never read past the section when the advertised count is too large. */
+	needed = sizeof(*nvidia_err) + nregs * pair_len;
+	if (error_data_length < needed) {
+		printk("%s""NVIDIA: Invalid number_regs %u (section size %zu, need %zu)\n",
+		       pfx, nregs, error_data_length, needed);
+		return;
+	}
+
+	for (n = 0; n < nregs; n++, pair += pair_len) {
+		u64 addr = get_unaligned_le64(pair);
+		u64 val = get_unaligned_le64(pair + sizeof(u64));
+
+		printk("%s""register[%d]: address=0x%016llx value=0x%016llx\n",
+		       pfx, n, addr, val);
+	}
+}
+
+/* Decode and print a generic data entry that carries an NVIDIA CPER section. */
+void cper_estatus_print_nvidia(const char *pfx,
+			       const struct acpi_hest_generic_data *gdata)
+{
+	const struct cper_sec_nvidia *sec =
+		acpi_hest_get_payload((struct acpi_hest_generic_data *)gdata);
+	const u32 len = gdata->error_data_length;
+
+	if (!sec) {
+		printk("%s""NVIDIA error: Failed to get payload\n", pfx);
+		return;
+	}
+	printk("%s""section_type: NVIDIA, error_data_length: %u\n", pfx, len);
+	/* The fixed header must be complete before any of its fields is read. */
+	if (len < sizeof(*sec)) {
+		printk("%s""NVIDIA error: Section too small (%u < %zu)\n",
+		       pfx, len, sizeof(*sec));
+		return;
+	}
+	cper_print_nvidia_error(pfx, sec, len);
+}
diff --git a/drivers/firmware/efi/cper-nvidia.h b/drivers/firmware/efi/cper-nvidia.h
new file mode 100644
index 000000000000..c489f8f05f0f
--- /dev/null
+++ b/drivers/firmware/efi/cper-nvidia.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * UEFI Common Platform Error Record (CPER) support for NVIDIA sections
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef LINUX_CPER_NVIDIA_H
+#define LINUX_CPER_NVIDIA_H
+
+#include <linux/cper.h>
+
+struct cper_sec_nvidia { /* fixed 32-byte header at the start of an NVIDIA section */
+ char signature[16]; /* error signature text; printed with "%.16s" */
+ __le16 error_type; /* little-endian; printed as a plain number */
+ __le16 error_instance; /* little-endian; printed as a plain number */
+ u8 severity; /* printed as a plain number */
+ u8 socket; /* printed as a plain number */
+ u8 number_regs; /* count of 16-byte address/value pairs after this header */
+ u8 reserved;
+ __le64 instance_base; /* little-endian; printed in hex */
+} __packed; /* __packed: no compiler padding between the fields */
+
+struct acpi_hest_generic_data;	/* declared for both the prototype and the stub */
+#ifdef CONFIG_UEFI_CPER_NVIDIA
+void cper_estatus_print_nvidia(const char *pfx,
+			       const struct acpi_hest_generic_data *gdata);
+#else	/* no-op stub: callers need no #ifdef when the decoder is not built */
+static inline void cper_estatus_print_nvidia(const char *pfx,
+					     const struct acpi_hest_generic_data *gdata) { }
+#endif /* CONFIG_UEFI_CPER_NVIDIA */
+
+#endif
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 06b4fdb59917..0b5216aaa8c4 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -26,6 +26,7 @@
#include <acpi/ghes.h>
#include <ras/ras_event.h>
#include <cxl/event.h>
+#include "cper-nvidia.h"
/*
* CPER record ID need to be unique even after reboot, because record
@@ -697,6 +698,8 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
cxl_cper_print_prot_err(newpfx, prot_err);
else
goto err_section_too_small;
+ } else if (guid_equal(sec_type, &CPER_SEC_NVIDIA)) {
+ cper_estatus_print_nvidia(newpfx, gdata);
} else {
const void *err = acpi_hest_get_payload(gdata);
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 440b35e459e5..b5790e48fbef 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -224,6 +224,10 @@ enum {
#define CPER_SEC_DMAR_IOMMU \
GUID_INIT(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \
0xDF, 0xAA, 0x84, 0xEC)
+/* NVIDIA Error Section: GUID 6d5244f2-2712-11ec-bea7-cb3fdb95c786 */
+#define CPER_SEC_NVIDIA \
+ GUID_INIT(0x6d5244f2, 0x2712, 0x11ec, 0xbe, 0xa7, 0xcb, 0x3f, \
+ 0xdb, 0x95, 0xc7, 0x86)
#define CPER_PROC_VALID_TYPE 0x0001
#define CPER_PROC_VALID_ISA 0x0002
--
2.43.0
>-----Original Message----- >From: Kai-Heng Feng <kaihengf@nvidia.com> >Sent: 23 February 2026 06:49 >To: ardb@kernel.org >Cc: Kai-Heng Feng <kaihengf@nvidia.com>; Rafael J. Wysocki ><rafael@kernel.org>; Tony Luck <tony.luck@intel.com>; Borislav Petkov ><bp@alien8.de>; Guohanjun (Hanjun Guo) <guohanjun@huawei.com>; Mauro >Carvalho Chehab <mchehab@kernel.org>; Shuai Xue ><xueshuai@linux.alibaba.com>; Jonathan Cameron ><jonathan.cameron@huawei.com>; Morduan Zang ><zhangdandan@uniontech.com>; linux-kernel@vger.kernel.org; linux- >efi@vger.kernel.org; linux-acpi@vger.kernel.org >Subject: [PATCH] efi/cper: Add NVIDIA CPER section support > >Add support for decoding NVIDIA-specific error sections in UEFI CPER records. >NVIDIA hardware generates vendor-specific CPER sections containing error >signatures and diagnostic register dumps. This implementation decodes these >sections and prints error details to the kernel log. > >The NVIDIA CPER section contains a fixed header with error metadata (signature, >error type, severity, socket) followed by variable-length register address-value >pairs for hardware diagnostics. > >This work is based on libcper [0]. 
> >Example output: >Hardware error from APEI Generic Hardware Error Source: 816 event severity: >info imprecise tstamp: 2025-11-17 07:57:38 Error 0, type: info > section_type: NVIDIA, error_data_length: 224 > signature: HSS-IDLE > error_type: 0 > error_instance: 0 > severity: 0 > socket: 255 > number_regs: 12 > instance_base: 0x0000000000000000 > register[0]: address=0x0000000004f10008 value=0x0000000000002019 > register[1]: address=0x0000000000000000 value=0x0000000000000000 > >[0] https://github.com/openbmc/libcper/commit/683e055061ce >Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com> >--- > drivers/firmware/efi/Kconfig | 16 ++++++ > drivers/firmware/efi/Makefile | 1 + > drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++ >drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++ > drivers/firmware/efi/cper.c | 3 ++ > include/linux/cper.h | 4 ++ > 6 files changed, 136 insertions(+) > create mode 100644 drivers/firmware/efi/cper-nvidia.c > create mode 100644 drivers/firmware/efi/cper-nvidia.h > >diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index >29e0729299f5..ed1f53b8e878 100644 >--- a/drivers/firmware/efi/Kconfig >+++ b/drivers/firmware/efi/Kconfig >@@ -329,6 +329,22 @@ config UEFI_CPER_X86 > depends on UEFI_CPER && X86 > default y > >+config UEFI_CPER_NVIDIA >+ bool "UEFI CPER NVIDIA support" >+ depends on UEFI_CPER >+ help >+ This option enables support for decoding NVIDIA-specific error >+ sections in UEFI Common Platform Error Records (CPER). These >+ sections contain additional diagnostic information for errors >+ occurring in NVIDIA hardware such as GPUs, switches, and other >+ devices. >+ >+ The NVIDIA CPER sections include error signatures (e.g., PCIe-DPC, >+ DCC-ECC, GPU-STATUS) and diagnostic registers that provide detailed >+ information about hardware errors for debugging and analysis. >+ >+ If unsure, say N. 
>+ > config TEE_STMM_EFI > tristate "TEE-based EFI runtime variable service driver" > depends on EFI && OPTEE >diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index >8efbcf699e4f..a571b6086860 100644 >--- a/drivers/firmware/efi/Makefile >+++ b/drivers/firmware/efi/Makefile >@@ -42,5 +42,6 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule- >loader.o > obj-$(CONFIG_EFI_EARLYCON) += earlycon.o > obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o > obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o >+obj-$(CONFIG_UEFI_CPER_NVIDIA) += cper-nvidia.o Hi, Is drivers/firmware/efi/cper.c the right place to log vendor-specific errors, given that so far drivers/firmware/efi/ only logs CPER information defined by the standards? Vendor-specific errors are currently logged and recorded in rasdaemon. https://github.com/mchehab/rasdaemon https://github.com/mchehab/rasdaemon/blob/master/ras-non-standard-handler.c#L52 If some kernel-level recovery action or logging is required, we can also register with acpi/apei/ghes using ghes_register_vendor_record_notifier() to receive a callback. https://elixir.bootlin.com/linux/v6.19.3/source/drivers/acpi/apei/ghes.c#L652 [...] >+/* NVIDIA Error Section */ >+#define CPER_SEC_NVIDIA > \ >+ GUID_INIT(0x6d5244f2, 0x2712, 0x11ec, 0xbe, 0xa7, 0xcb, 0x3f, \ >+ 0xdb, 0x95, 0xc7, 0x86) > > #define CPER_PROC_VALID_TYPE 0x0001 > #define CPER_PROC_VALID_ISA 0x0002 >-- >2.43.0 > Thanks, Shiju
Hi Shiju, On 2026/2/24 7:23 PM, Shiju Jose wrote: > External email: Use caution opening links or attachments > > >> -----Original Message----- >> From: Kai-Heng Feng <kaihengf@nvidia.com> >> Sent: 23 February 2026 06:49 >> To: ardb@kernel.org >> Cc: Kai-Heng Feng <kaihengf@nvidia.com>; Rafael J. Wysocki >> <rafael@kernel.org>; Tony Luck <tony.luck@intel.com>; Borislav Petkov >> <bp@alien8.de>; Guohanjun (Hanjun Guo) <guohanjun@huawei.com>; Mauro >> Carvalho Chehab <mchehab@kernel.org>; Shuai Xue >> <xueshuai@linux.alibaba.com>; Jonathan Cameron >> <jonathan.cameron@huawei.com>; Morduan Zang >> <zhangdandan@uniontech.com>; linux-kernel@vger.kernel.org; linux- >> efi@vger.kernel.org; linux-acpi@vger.kernel.org >> Subject: [PATCH] efi/cper: Add NVIDIA CPER section support >> >> Add support for decoding NVIDIA-specific error sections in UEFI CPER records. >> NVIDIA hardware generates vendor-specific CPER sections containing error >> signatures and diagnostic register dumps. This implementation decodes these >> sections and prints error details to the kernel log. >> >> The NVIDIA CPER section contains a fixed header with error metadata (signature, >> error type, severity, socket) followed by variable-length register address-value >> pairs for hardware diagnostics. >> >> This work is based on libcper [0]. 
>> >> Example output: >> Hardware error from APEI Generic Hardware Error Source: 816 event severity: >> info imprecise tstamp: 2025-11-17 07:57:38 Error 0, type: info >> section_type: NVIDIA, error_data_length: 224 >> signature: HSS-IDLE >> error_type: 0 >> error_instance: 0 >> severity: 0 >> socket: 255 >> number_regs: 12 >> instance_base: 0x0000000000000000 >> register[0]: address=0x0000000004f10008 value=0x0000000000002019 >> register[1]: address=0x0000000000000000 value=0x0000000000000000 >> >> [0] https://github.com/openbmc/libcper/commit/683e055061ce >> Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com> >> --- >> drivers/firmware/efi/Kconfig | 16 ++++++ >> drivers/firmware/efi/Makefile | 1 + >> drivers/firmware/efi/cper-nvidia.c | 79 ++++++++++++++++++++++++++++++ >> drivers/firmware/efi/cper-nvidia.h | 33 +++++++++++++ >> drivers/firmware/efi/cper.c | 3 ++ >> include/linux/cper.h | 4 ++ >> 6 files changed, 136 insertions(+) >> create mode 100644 drivers/firmware/efi/cper-nvidia.c >> create mode 100644 drivers/firmware/efi/cper-nvidia.h >> >> diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index >> 29e0729299f5..ed1f53b8e878 100644 >> --- a/drivers/firmware/efi/Kconfig >> +++ b/drivers/firmware/efi/Kconfig >> @@ -329,6 +329,22 @@ config UEFI_CPER_X86 >> depends on UEFI_CPER && X86 >> default y >> >> +config UEFI_CPER_NVIDIA >> + bool "UEFI CPER NVIDIA support" >> + depends on UEFI_CPER >> + help >> + This option enables support for decoding NVIDIA-specific error >> + sections in UEFI Common Platform Error Records (CPER). These >> + sections contain additional diagnostic information for errors >> + occurring in NVIDIA hardware such as GPUs, switches, and other >> + devices. >> + >> + The NVIDIA CPER sections include error signatures (e.g., PCIe-DPC, >> + DCC-ECC, GPU-STATUS) and diagnostic registers that provide detailed >> + information about hardware errors for debugging and analysis. >> + >> + If unsure, say N. 
>> + >> config TEE_STMM_EFI >> tristate "TEE-based EFI runtime variable service driver" >> depends on EFI && OPTEE >> diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index >> 8efbcf699e4f..a571b6086860 100644 >> --- a/drivers/firmware/efi/Makefile >> +++ b/drivers/firmware/efi/Makefile >> @@ -42,5 +42,6 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule- >> loader.o >> obj-$(CONFIG_EFI_EARLYCON) += earlycon.o >> obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o >> obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o >> +obj-$(CONFIG_UEFI_CPER_NVIDIA) += cper-nvidia.o > > Hi, > > Is drivers/firmware/efi/cper.c the right place to log vendor-specific errors, > given that so far drivers/firmware/efi/ only logs CPER information defined by the standards? > Vendor-specific errors are currently logged and recorded in rasdaemon. > https://github.com/mchehab/rasdaemon > https://github.com/mchehab/rasdaemon/blob/master/ras-non-standard-handler.c#L52 > > If some kernel-level recovery action or logging is required, we can also register with > acpi/apei/ghes using ghes_register_vendor_record_notifier() to receive a callback. > https://elixir.bootlin.com/linux/v6.19.3/source/drivers/acpi/apei/ghes.c#L652 Thank you for the info. There's indeed an ACPI node for CPER purpose. I'll see if that ACPI HID can be used for implementing using ghes_register_vendor_record_notifier(). Kai-Heng > > [...] >> +/* NVIDIA Error Section */ >> +#define CPER_SEC_NVIDIA >> \ >> + GUID_INIT(0x6d5244f2, 0x2712, 0x11ec, 0xbe, 0xa7, 0xcb, 0x3f, \ >> + 0xdb, 0x95, 0xc7, 0x86) >> >> #define CPER_PROC_VALID_TYPE 0x0001 >> #define CPER_PROC_VALID_ISA 0x0002 >> -- >> 2.43.0 >> > > Thanks, > Shiju
© 2016 - 2026 Red Hat, Inc.