[PATCH v3 1/2] apei/ghes: ARM processor Error: don't go past allocated memory

Mauro Carvalho Chehab posted 2 patches 1 month, 3 weeks ago
There is a newer version of this series
[PATCH v3 1/2] apei/ghes: ARM processor Error: don't go past allocated memory
Posted by Mauro Carvalho Chehab 1 month, 3 weeks ago
If the BIOS generates a very small ARM Processor Error, or
an incomplete one, the current logic will fail to deferrence

	err->section_length
and
	ctx_info->size

Add checks to avoid that. With such changes, such GHESv2
records won't cause OOPSes like this:

[    1.492129] Internal error: Oops: 0000000096000005 [#1]  SMP
[    1.495449] Modules linked in:
[    1.495820] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:0 Not tainted 6.18.0-rc1-00017-gabadcc3553dd-dirty #18 PREEMPT
[    1.496125] Hardware name: QEMU QEMU Virtual Machine, BIOS unknown 02/02/2022
[    1.496433] Workqueue: kacpi_notify acpi_os_execute_deferred
[    1.496967] pstate: 814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
[    1.497199] pc : log_arm_hw_error+0x5c/0x200
[    1.497380] lr : ghes_handle_arm_hw_error+0x94/0x220

0xffff8000811c5324 is in log_arm_hw_error (../drivers/ras/ras.c:75).
70		err_info = (struct cper_arm_err_info *)(err + 1);
71		ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
72		ctx_err = (u8 *)ctx_info;
73
74		for (n = 0; n < err->context_info_num; n++) {
75			sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
76			ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
77			ctx_len += sz;
78		}
79

and similar ones while trying to access section_length on an
error dump with too small size.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 drivers/acpi/apei/ghes.c | 33 +++++++++++++++++++++++++++++----
 drivers/ras/ras.c        |  6 +++++-
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 0dc767392a6c..9bf4ec84f160 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -552,21 +552,46 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 {
 	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
 	int flags = sync ? MF_ACTION_REQUIRED : 0;
+	int length = gdata->error_data_length;
 	char error_type[120];
 	bool queued = false;
 	int sec_sev, i;
 	char *p;
 
 	sec_sev = ghes_severity(gdata->error_severity);
-	log_arm_hw_error(err, sec_sev);
+	if (length >= sizeof(*err)) {
+		log_arm_hw_error(err, sec_sev);
+	} else {
+		pr_warn(FW_BUG "arm error length: %d\n", length);
+		pr_warn(FW_BUG "length is too small\n");
+		pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
+		return false;
+	}
+
 	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
 		return false;
 
 	p = (char *)(err + 1);
+	length -= sizeof(err);
+
 	for (i = 0; i < err->err_info_num; i++) {
-		struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
-		bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
-		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
+		struct cper_arm_err_info *err_info;
+		bool is_cache, has_pa;
+
+		/* Ensure we have enough data for the error info header */
+		length -= sizeof(*err_info);
+		if (length < 0)
+			break;
+
+		err_info = (struct cper_arm_err_info *)p;
+
+		/* Validate the claimed length before using it */
+		length -= err_info->length;
+		if (length < 0)
+			break;
+
+		is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
+		has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
 
 		/*
 		 * The field (err_info->error_info & BIT(26)) is fixed to set to
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 2a5b5a9fdcb3..03df3db62334 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -72,7 +72,11 @@ void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
 	ctx_err = (u8 *)ctx_info;
 
 	for (n = 0; n < err->context_info_num; n++) {
-		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+		sz = sizeof(struct cper_arm_ctx_info);
+
+		if (sz + (long)ctx_info - (long)err >= err->section_length)
+			sz += ctx_info->size;
+
 		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
 		ctx_len += sz;
 	}
-- 
2.52.0
Re: [PATCH v3 1/2] apei/ghes: ARM processor Error: don't go past allocated memory
Posted by Jonathan Cameron 1 month, 2 weeks ago
On Fri, 19 Dec 2025 11:49:59 +0100
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> If the BIOS generates a very small ARM Processor Error, or
> an incomplete one, the current logic will fail to deferrence
> 
> 	err->section_length
> and
> 	ctx_info->size
> 
> Add checks to avoid that. With such changes, such GHESv2
> records won't cause OOPSes like this:
> 
> [    1.492129] Internal error: Oops: 0000000096000005 [#1]  SMP
> [    1.495449] Modules linked in:
> [    1.495820] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:0 Not tainted 6.18.0-rc1-00017-gabadcc3553dd-dirty #18 PREEMPT
> [    1.496125] Hardware name: QEMU QEMU Virtual Machine, BIOS unknown 02/02/2022
> [    1.496433] Workqueue: kacpi_notify acpi_os_execute_deferred
> [    1.496967] pstate: 814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> [    1.497199] pc : log_arm_hw_error+0x5c/0x200
> [    1.497380] lr : ghes_handle_arm_hw_error+0x94/0x220
> 
> 0xffff8000811c5324 is in log_arm_hw_error (../drivers/ras/ras.c:75).
> 70		err_info = (struct cper_arm_err_info *)(err + 1);
> 71		ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
> 72		ctx_err = (u8 *)ctx_info;
> 73
> 74		for (n = 0; n < err->context_info_num; n++) {
> 75			sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
> 76			ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
> 77			ctx_len += sz;
> 78		}
> 79
> 
> and similar ones while trying to access section_length on an
> error dump with too small size.
> 
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Hi Mauro,

This is fiddly stuff to read in the spec but I think you have a double
counting of the "ARM Processors Error Information Structure" size as
the length in that this time is the length of the structure itself,
not a following body.

Jonathan


> ---
>  drivers/acpi/apei/ghes.c | 33 +++++++++++++++++++++++++++++----
>  drivers/ras/ras.c        |  6 +++++-
>  2 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 0dc767392a6c..9bf4ec84f160 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -552,21 +552,46 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
>  {
>  	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>  	int flags = sync ? MF_ACTION_REQUIRED : 0;
> +	int length = gdata->error_data_length;
>  	char error_type[120];
>  	bool queued = false;
>  	int sec_sev, i;
>  	char *p;
>  
>  	sec_sev = ghes_severity(gdata->error_severity);
> -	log_arm_hw_error(err, sec_sev);
> +	if (length >= sizeof(*err)) {
> +		log_arm_hw_error(err, sec_sev);
> +	} else {
> +		pr_warn(FW_BUG "arm error length: %d\n", length);
> +		pr_warn(FW_BUG "length is too small\n");
> +		pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
> +		return false;
> +	}
> +
>  	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
>  		return false;
>  
>  	p = (char *)(err + 1);
> +	length -= sizeof(err);
Hacks off the bit of the section that is fixed size.
> +
>  	for (i = 0; i < err->err_info_num; i++) {
> -		struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
> -		bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
> -		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> +		struct cper_arm_err_info *err_info;
> +		bool is_cache, has_pa;
> +
> +		/* Ensure we have enough data for the error info header */
> +		length -= sizeof(*err_info);
hacks of length of one processor error information structure (fixed 32 bytes)

> +		if (length < 0)
> +			break;
> +
> +		err_info = (struct cper_arm_err_info *)p;
> +
> +		/* Validate the claimed length before using it */
> +		length -= err_info->length;

This one confuses me.  err_info->length is the same 32 bytes you removed above.

So I think this check is wrong.

 
> +		if (length < 0)
> +			break;
> +
> +		is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
> +		has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
>  
>  		/*
>  		 * The field (err_info->error_info & BIT(26)) is fixed to set to
> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
> index 2a5b5a9fdcb3..03df3db62334 100644
> --- a/drivers/ras/ras.c
> +++ b/drivers/ras/ras.c
> @@ -72,7 +72,11 @@ void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
>  	ctx_err = (u8 *)ctx_info;
>  
>  	for (n = 0; n < err->context_info_num; n++) {
> -		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
> +		sz = sizeof(struct cper_arm_ctx_info);
> +
> +		if (sz + (long)ctx_info - (long)err >= err->section_length)
> +			sz += ctx_info->size;
> +
>  		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
>  		ctx_len += sz;
>  	}
Re: [PATCH v3 1/2] apei/ghes: ARM processor Error: don't go past allocated memory
Posted by Mauro Carvalho Chehab 1 month, 2 weeks ago
On Mon, Dec 22, 2025 at 11:38:51AM +0000, Jonathan Cameron wrote:
> On Fri, 19 Dec 2025 11:49:59 +0100
> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> 
> > If the BIOS generates a very small ARM Processor Error, or
> > an incomplete one, the current logic will fail to deferrence
> > 
> > 	err->section_length
> > and
> > 	ctx_info->size
> > 
> > Add checks to avoid that. With such changes, such GHESv2
> > records won't cause OOPSes like this:
> > 
> > [    1.492129] Internal error: Oops: 0000000096000005 [#1]  SMP
> > [    1.495449] Modules linked in:
> > [    1.495820] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:0 Not tainted 6.18.0-rc1-00017-gabadcc3553dd-dirty #18 PREEMPT
> > [    1.496125] Hardware name: QEMU QEMU Virtual Machine, BIOS unknown 02/02/2022
> > [    1.496433] Workqueue: kacpi_notify acpi_os_execute_deferred
> > [    1.496967] pstate: 814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > [    1.497199] pc : log_arm_hw_error+0x5c/0x200
> > [    1.497380] lr : ghes_handle_arm_hw_error+0x94/0x220
> > 
> > 0xffff8000811c5324 is in log_arm_hw_error (../drivers/ras/ras.c:75).
> > 70		err_info = (struct cper_arm_err_info *)(err + 1);
> > 71		ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
> > 72		ctx_err = (u8 *)ctx_info;
> > 73
> > 74		for (n = 0; n < err->context_info_num; n++) {
> > 75			sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
> > 76			ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
> > 77			ctx_len += sz;
> > 78		}
> > 79
> > 
> > and similar ones while trying to access section_length on an
> > error dump with too small size.
> > 
> > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Hi Mauro,
> 
> This is fiddly stuff to read in the spec but I think you have a double
> counting of the "ARM Processors Error Information Structure" size as
> the length in that this time is the length of the structure itself,
> not a following body.

True. The change below should fix it:

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d37540ef8c00..aacb8d66a3e1 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -582,8 +582,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 		bool is_cache, has_pa;
 
 		/* Ensure we have enough data for the error info header */
-		length -= sizeof(*err_info);
-		if (length < 0)
+		if (length < sizeof(*err_info))
 			break;
 
 		err_info = (struct cper_arm_err_info *)p;


I'll run some tests here after the change before submitting v4.

Thanks!

Mauro

> 
> Jonathan
> 
> 
> > ---
> >  drivers/acpi/apei/ghes.c | 33 +++++++++++++++++++++++++++++----
> >  drivers/ras/ras.c        |  6 +++++-
> >  2 files changed, 34 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > index 0dc767392a6c..9bf4ec84f160 100644
> > --- a/drivers/acpi/apei/ghes.c
> > +++ b/drivers/acpi/apei/ghes.c
> > @@ -552,21 +552,46 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
> >  {
> >  	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> >  	int flags = sync ? MF_ACTION_REQUIRED : 0;
> > +	int length = gdata->error_data_length;
> >  	char error_type[120];
> >  	bool queued = false;
> >  	int sec_sev, i;
> >  	char *p;
> >  
> >  	sec_sev = ghes_severity(gdata->error_severity);
> > -	log_arm_hw_error(err, sec_sev);
> > +	if (length >= sizeof(*err)) {
> > +		log_arm_hw_error(err, sec_sev);
> > +	} else {
> > +		pr_warn(FW_BUG "arm error length: %d\n", length);
> > +		pr_warn(FW_BUG "length is too small\n");
> > +		pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
> > +		return false;
> > +	}
> > +
> >  	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> >  		return false;
> >  
> >  	p = (char *)(err + 1);
> > +	length -= sizeof(err);
> Hacks off the bit of the section that is fixed size.
> > +
> >  	for (i = 0; i < err->err_info_num; i++) {
> > -		struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
> > -		bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
> > -		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> > +		struct cper_arm_err_info *err_info;
> > +		bool is_cache, has_pa;
> > +
> > +		/* Ensure we have enough data for the error info header */
> > +		length -= sizeof(*err_info);
> hacks of length of one processor error information structure (fixed 32 bytes)
> 
> > +		if (length < 0)
> > +			break;
> > +
> > +		err_info = (struct cper_arm_err_info *)p;
> > +
> > +		/* Validate the claimed length before using it */
> > +		length -= err_info->length;
> 
> This one confuses me.  err_info->length is the same 32 bytes you removed above.
> 
> So I think this check is wrong.
> 
>  
> > +		if (length < 0)
> > +			break;
> > +
> > +		is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
> > +		has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> >  
> >  		/*
> >  		 * The field (err_info->error_info & BIT(26)) is fixed to set to
> > diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
> > index 2a5b5a9fdcb3..03df3db62334 100644
> > --- a/drivers/ras/ras.c
> > +++ b/drivers/ras/ras.c
> > @@ -72,7 +72,11 @@ void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
> >  	ctx_err = (u8 *)ctx_info;
> >  
> >  	for (n = 0; n < err->context_info_num; n++) {
> > -		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
> > +		sz = sizeof(struct cper_arm_ctx_info);
> > +
> > +		if (sz + (long)ctx_info - (long)err >= err->section_length)
> > +			sz += ctx_info->size;
> > +
> >  		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
> >  		ctx_len += sz;
> >  	}
> 

-- 
Thanks,
Mauro