When booting a new kernel with kexec_file, the kernel picks a target
location that the kernel should live at, then allocates random pages,
checks whether any of those pages magically happens to coincide with
a target address range and if so, uses them for that range.
For every page allocated this way, it then creates a page list that the
relocation code - code that executes while all CPUs are off and we are
just about to jump into the new kernel - copies to their final memory
location. We cannot put the pages there earlier, because chances are
pretty good that at least some page in the target range is already in
use by the currently running Linux environment.
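(For illustration only, a simplified C sketch of what the relocation step
conceptually does with that page list - the real loop lives in
arch-specific assembly; flag names are from include/linux/kexec.h:)

	kimage_entry_t *ptr, entry;
	unsigned long dest = 0;

	/* Walk the entry list: set destinations, copy source pages */
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
	     ptr = (entry & IND_INDIRECTION) ?
			phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
		if (entry & IND_DESTINATION)
			dest = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			copy_page(phys_to_virt(dest),
				  phys_to_virt(entry & PAGE_MASK));
			dest += PAGE_SIZE;
		}
	}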
All of this is inefficient.
Since kexec got introduced, Linux has gained the CMA framework, which
can provide physically contiguous memory allocations while keeping that
memory available as movable memory whenever it is not needed for
contiguous allocations. The default CMA area is the one used for DMA
allocations.
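(Such allocations only succeed if a CMA area was reserved at boot, e.g.
via the kernel command line - the size here is purely illustrative:)

	cma=512M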
This patch adds logic to the kexec file loader to attempt to place the
target payload at a location allocated from CMA. If successful, it uses
that memory range directly instead of creating copy instructions during
the hot phase. To provide a safety net in case anything goes wrong with
the CMA allocation, it also adds a flag that lets user space
force-disable CMA allocations.
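(For illustration, user space could opt out roughly like this - a sketch
that assumes a libc without a kexec_file_load() wrapper, hence the raw
syscall, and a uapi header that already carries this patch's flag:)

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/kexec.h>

	static int load_kernel_no_cma(const char *kernel, const char *initrd,
				      const char *cmdline)
	{
		int kfd = open(kernel, O_RDONLY);
		int ifd = open(initrd, O_RDONLY);

		if (kfd < 0 || ifd < 0)
			return -1;

		/* cmdline_len must include the trailing NUL */
		return syscall(SYS_kexec_file_load, kfd, ifd,
			       strlen(cmdline) + 1, cmdline,
			       (unsigned long)KEXEC_FILE_NO_CMA);
	}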
Using CMA allocations has two advantages:
1) Faster. There is no more need to copy in the hot phase.
2) More robust. Even if by accident some page is still in use for DMA,
the new kernel image will be safe from that access because it resides
in a memory region that the old kernel considered allocated, and the
new kernel gets a chance to reinitialize the offending component.
Signed-off-by: Alexander Graf <graf@amazon.com>
---
include/linux/kexec.h | 6 +++
include/uapi/linux/kexec.h | 1 +
kernel/kexec_core.c | 82 ++++++++++++++++++++++++++++++++++++++
kernel/kexec_file.c | 55 +++++++++++++++++++++++--
4 files changed, 140 insertions(+), 4 deletions(-)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index c8971861521a..421af2a303b6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -91,6 +91,9 @@ struct kexec_segment {
size_t bufsz;
unsigned long mem;
size_t memsz;
+
+ /* Pointer to contiguous CMA allocation or NULL */
+ struct page *cma;
};
#ifdef CONFIG_COMPAT
@@ -169,6 +172,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image);
* @buf_min: The buffer can't be placed below this address.
* @buf_max: The buffer can't be placed above this address.
* @top_down: Allocate from top of memory.
+ * @cma: CMA page if the buffer is backed by CMA.
*/
struct kexec_buf {
struct kimage *image;
@@ -180,6 +184,7 @@ struct kexec_buf {
unsigned long buf_min;
unsigned long buf_max;
bool top_down;
+ struct page *cma;
};
int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf);
@@ -331,6 +336,7 @@ struct kimage {
*/
unsigned int hotplug_support:1;
#endif
+ unsigned int no_cma : 1;
#ifdef ARCH_HAS_KIMAGE_ARCH
struct kimage_arch arch;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 5ae1741ea8ea..8958ebfcff94 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -27,6 +27,7 @@
#define KEXEC_FILE_ON_CRASH 0x00000002
#define KEXEC_FILE_NO_INITRAMFS 0x00000004
#define KEXEC_FILE_DEBUG 0x00000008
+#define KEXEC_FILE_NO_CMA 0x00000010
/* These values match the ELF architecture values.
* Unless there is a good reason that should continue to be the case.
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 3e62b944c883..4c2b2ef7825d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -40,6 +40,7 @@
#include <linux/hugetlb.h>
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry)
kimage_free_pages(page);
}
+static void kimage_free_cma(struct kimage *image)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct page *cma = image->segment[i].cma;
+ u32 pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+ if (!cma)
+ continue;
+
+ arch_kexec_pre_free_pages(page_address(cma), pages);
+ dma_release_from_contiguous(NULL, cma, pages);
+ image->segment[i].cma = NULL;
+ }
+
+}
+
void kimage_free(struct kimage *image)
{
kimage_entry_t *ptr, entry;
@@ -591,6 +610,9 @@ void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+ /* Free CMA allocations */
+ kimage_free_cma(image);
+
/*
* Free up any temporary buffers allocated. This might hit if
* error occurred much later after buffer allocation.
@@ -716,6 +738,63 @@ static struct page *kimage_alloc_page(struct kimage *image,
return page;
}
+static int kimage_load_cma_segment(struct kimage *image, struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result = 0;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+ char *ptr = page_address(segment->cma);
+
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ /* Initialize the buffer with zeros to allow for smaller input buffers */
+ memset(ptr, 0, mbytes);
+
+ /* Then copy from source buffer to the CMA one */
+ while (mbytes) {
+ size_t uchunk, mchunk;
+
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+
+ ptr += mchunk;
+ maddr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+out:
+ return result;
+}
+
static int kimage_load_normal_segment(struct kimage *image,
struct kexec_segment *segment)
{
@@ -733,6 +812,9 @@ static int kimage_load_normal_segment(struct kimage *image,
mbytes = segment->memsz;
maddr = segment->mem;
+ if (segment->cma)
+ return kimage_load_cma_segment(image, segment);
+
result = kimage_set_destination(image, maddr);
if (result < 0)
goto out;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index fba686487e3b..92bf4ab7b7be 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -27,6 +27,7 @@
#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
#include "kexec_internal.h"
#ifdef CONFIG_KEXEC_SIG
@@ -230,6 +231,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = 0;
}
+ image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
if (IS_ERR(image->cmdline_buf)) {
@@ -632,6 +635,38 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+ u32 pages = (u32)(kbuf->memsz >> PAGE_SHIFT);
+ unsigned long mem;
+ struct page *p;
+
+ if (kbuf->image->no_cma)
+ return -EPERM;
+
+ p = dma_alloc_from_contiguous(NULL, pages, get_order(kbuf->buf_align), true);
+ if (!p)
+ return -EADDRNOTAVAIL;
+
+ pr_debug("allocated %d DMA pages at 0x%lx", pages, page_to_boot_pfn(p));
+
+ mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+ if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+ /* Our region is already in use by a statically defined one. Bail out. */
+ pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+ dma_release_from_contiguous(NULL, p, pages);
+ return -EADDRNOTAVAIL;
+ }
+
+ kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+ kbuf->cma = p;
+
+ arch_kexec_post_alloc_pages(page_address(p), pages, 0);
+
+ return 0;
+}
+
/**
* kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
* @kbuf: Parameters for the memory search.
@@ -694,10 +729,21 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
- /* Walk the RAM ranges and allocate a suitable range for the buffer */
- ret = arch_kexec_locate_mem_hole(kbuf);
- if (ret)
- return ret;
+ /*
+ * Try to find a free physically contiguous block of memory first. With that, we
+ * can avoid any copying at kexec time.
+ */
+ kbuf->cma = NULL;
+ ret = kexec_alloc_contig(kbuf);
+ if (ret) {
+ /*
+ * Could not find one. Walk the RAM ranges and allocate pages for the
+ * buffer. Maybe some even get us into the target range.
+ */
+ ret = arch_kexec_locate_mem_hole(kbuf);
+ if (ret)
+ return ret;
+ }
/* Found a suitable memory range */
ksegment = &kbuf->image->segment[kbuf->image->nr_segments];
@@ -705,6 +751,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
+ ksegment->cma = kbuf->cma;
kbuf->image->nr_segments++;
return 0;
}
--
2.34.1
On Mon, 2025-05-12 at 22:57 +0000, Alexander Graf wrote:
>
> --- a/include/uapi/linux/kexec.h
> +++ b/include/uapi/linux/kexec.h
> @@ -27,6 +27,7 @@
> #define KEXEC_FILE_ON_CRASH 0x00000002
> #define KEXEC_FILE_NO_INITRAMFS 0x00000004
> #define KEXEC_FILE_DEBUG 0x00000008
> +#define KEXEC_FILE_NO_CMA 0x00000010
>
Gives me EINVAL when I try to use it. This helps (but should it be
permitted only on architectures which implement it?):
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -385,7 +385,7 @@ extern int kexec_load_disabled;
/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS | \
- KEXEC_FILE_COOPERATIVE)
+ KEXEC_FILE_COOPERATIVE | KEXEC_FILE_NO_CMA)
/* Location of a reserved region to hold the crash kernel.
*/
On Mon, 12 May 2025 22:57:52 +0000 Alexander Graf <graf@amazon.com> wrote:
> When booting a new kernel with kexec_file, the kernel picks a target
> location that the kernel should live at, then allocates random pages,
> checks whether any of those patches magically happens to coincide with
> a target address range and if so, uses them for that range.
>
> For every page allocated this way, it then creates a page list that the
> relocation code - code that executes while all CPUs are off and we are
> just about to jump into the new kernel - copies to their final memory
> location. We can not put them there before, because chances are pretty
> good that at least some page in the target range is already in use by
> the currently running Linux environment.
>
> All of this is inefficient.
>
> Since kexec got introduced, Linux has gained the CMA framework which
> can perform physically contiguous memory mappings, while keeping that
> memory available for movable memory when it is not needed for contiguous
> allocations. The default CMA allocator is for DMA allocations.
>
> This patch adds logic to the kexec file loader to attempt to place the
> target payload at a location allocated from CMA. If successful, it uses
> that memory range directly instead of creating copy instructions during
> the hot phase. To ensure that there is a safety net in case anything goes
> wrong with the CMA allocation, it also adds a flag for user space to force
> disable CMA allocations.
>
> Using CMA allocations has two advantages:
>
> 1) Faster. There is no more need to copy in the hot phase.
How much faster? Kinda matters as "fast" is the whole point of the patch!
> 2) More robust. Even if by accident some page is still in use for DMA,
> the new kernel image will be safe from that access because it resides
> in a memory region that is considered allocated in the old kernel and
> has a chance to reinitialize that component.
Is this known to be a problem in current code?
Some minor observations:
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
>
> ...
>
> @@ -331,6 +336,7 @@ struct kimage {
> */
> unsigned int hotplug_support:1;
> #endif
> + unsigned int no_cma : 1;
"no_cma:1" is more conventional.
> #ifdef ARCH_HAS_KIMAGE_ARCH
> struct kimage_arch arch;
> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
> index 5ae1741ea8ea..8958ebfcff94 100644
> --- a/include/uapi/linux/kexec.h
> +++ b/include/uapi/linux/kexec.h
>
> ...
>
> +static int kimage_load_cma_segment(struct kimage *image, struct kexec_segment *segment)
> +{
> + unsigned long maddr;
> + size_t ubytes, mbytes;
> + int result = 0;
> + unsigned char __user *buf = NULL;
> + unsigned char *kbuf = NULL;
> + char *ptr = page_address(segment->cma);
> +
> + if (image->file_mode)
> + kbuf = segment->kbuf;
> + else
> + buf = segment->buf;
> + ubytes = segment->bufsz;
> + mbytes = segment->memsz;
> + maddr = segment->mem;
> +
> + /* Initialize the buffer with zeros to allow for smaller input buffers */
> + memset(ptr, 0, mbytes);
Would it be more efficient to zero the remainder after performing the copy?
> + /* Then copy from source buffer to the CMA one */
> + while (mbytes) {
> + size_t uchunk, mchunk;
> +
> + ptr += maddr & ~PAGE_MASK;
> + mchunk = min_t(size_t, mbytes,
> + PAGE_SIZE - (maddr & ~PAGE_MASK));
> + uchunk = min(ubytes, mchunk);
> +
> + if (uchunk) {
> + /* For file based kexec, source pages are in kernel memory */
> + if (image->file_mode)
> + memcpy(ptr, kbuf, uchunk);
> + else
> + result = copy_from_user(ptr, buf, uchunk);
> + ubytes -= uchunk;
> + if (image->file_mode)
> + kbuf += uchunk;
> + else
> + buf += uchunk;
> + }
> +
> + if (result) {
> + result = -EFAULT;
> + goto out;
> + }
> +
> + ptr += mchunk;
> + maddr += mchunk;
> + mbytes -= mchunk;
> +
> + cond_resched();
> + }
> +out:
> + return result;
> +}
> +
>
> ...
>
> --- a/kernel/kexec_file.c
> +++ b/kernel/kexec_file.c
>
> ...
>
> @@ -632,6 +635,38 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
> return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
> }
>
> +static int kexec_alloc_contig(struct kexec_buf *kbuf)
> +{
> + u32 pages = (u32)(kbuf->memsz >> PAGE_SHIFT);
I don't think the cast is needed?
> + unsigned long mem;
> + struct page *p;
> +
> + if (kbuf->image->no_cma)
> + return -EPERM;
> +
> + p = dma_alloc_from_contiguous(NULL, pages, get_order(kbuf->buf_align), true);
dma_alloc_from_contiguous()'s `count' arg is size_t. Making `pages'
size_t seems best here. (And nr_pages would be a better identifier!)
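I.e., something like (sketch):

	size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;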
> + if (!p)
> + return -EADDRNOTAVAIL;
EADDRNOTAVAIL is a networking thing. People will be surprised to see
kexec returning networking errors. Perhaps choose something more
appropriate?
> + pr_debug("allocated %d DMA pages at 0x%lx", pages, page_to_boot_pfn(p));
> +
> + mem = page_to_boot_pfn(p) << PAGE_SHIFT;
> +
> + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
> + /* Our region is already in use by a statically defined one. Bail out. */
> + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
> + dma_release_from_contiguous(NULL, p, pages);
> + return -EADDRNOTAVAIL;
> + }
> +
> + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
> + kbuf->cma = p;
> +
> + arch_kexec_post_alloc_pages(page_address(p), pages, 0);
> +
> + return 0;
> +}
> +
>
> ...
>
Hi Andrew,
On 13.05.25 01:59, Andrew Morton wrote:
> On Mon, 12 May 2025 22:57:52 +0000 Alexander Graf <graf@amazon.com> wrote:
>
>> When booting a new kernel with kexec_file, the kernel picks a target
>> location that the kernel should live at, then allocates random pages,
>> checks whether any of those patches magically happens to coincide with
>> a target address range and if so, uses them for that range.
>>
>> For every page allocated this way, it then creates a page list that the
>> relocation code - code that executes while all CPUs are off and we are
>> just about to jump into the new kernel - copies to their final memory
>> location. We can not put them there before, because chances are pretty
>> good that at least some page in the target range is already in use by
>> the currently running Linux environment.
>>
>> All of this is inefficient.
>>
>> Since kexec got introduced, Linux has gained the CMA framework which
>> can perform physically contiguous memory mappings, while keeping that
>> memory available for movable memory when it is not needed for contiguous
>> allocations. The default CMA allocator is for DMA allocations.
>>
>> This patch adds logic to the kexec file loader to attempt to place the
>> target payload at a location allocated from CMA. If successful, it uses
>> that memory range directly instead of creating copy instructions during
>> the hot phase. To ensure that there is a safety net in case anything goes
>> wrong with the CMA allocation, it also adds a flag for user space to force
>> disable CMA allocations.
>>
>> Using CMA allocations has two advantages:
>>
>> 1) Faster. There is no more need to copy in the hot phase.
> How much faster? Kinda matters as "fast" is the whole point of the patch!
It mostly depends on your memory bandwidth, cache size and the size of
the payload you kexec. Today, we copy kernel, initrd and dtb during the
kexec hot phase. With this patch, we copy none of them.
With a hypothetical payload size of 100MiB, we're looking at about 50ms
on an AMLogic S905 and about 4ms on an Ice Lake system.
As for real life benchmarks, on an Apple M1 VM with a 53MiB payload I
measure a 2.2ms speedup.
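(Back of the envelope: 100MiB / 50ms works out to ~2GiB/s of effective
copy bandwidth, and 100MiB / 4ms to ~25GiB/s, so the saving scales
roughly linearly with payload size and memory bandwidth.)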
That said, to me robustness is at least as important as speed here. The
current scheme creates memory allocation conflicts almost by design,
which increase the odds of something going wrong.
>> 2) More robust. Even if by accident some page is still in use for DMA,
>> the new kernel image will be safe from that access because it resides
>> in a memory region that is considered allocated in the old kernel and
>> has a chance to reinitialize that component.
> Is this known to be a problem in current code?
Yes, see David's email. I'll clarify in the patch description.
>
> Some minor observations:
>
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>>
>> ...
>>
>> @@ -331,6 +336,7 @@ struct kimage {
>> */
>> unsigned int hotplug_support:1;
>> #endif
>> + unsigned int no_cma : 1;
> "no_cma:1" is more conventional.
Fixed.
>
>> #ifdef ARCH_HAS_KIMAGE_ARCH
>> struct kimage_arch arch;
>> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
>> index 5ae1741ea8ea..8958ebfcff94 100644
>> --- a/include/uapi/linux/kexec.h
>> +++ b/include/uapi/linux/kexec.h
>>
>> ...
>>
>> +static int kimage_load_cma_segment(struct kimage *image, struct kexec_segment *segment)
>> +{
>> + unsigned long maddr;
>> + size_t ubytes, mbytes;
>> + int result = 0;
>> + unsigned char __user *buf = NULL;
>> + unsigned char *kbuf = NULL;
>> + char *ptr = page_address(segment->cma);
>> +
>> + if (image->file_mode)
>> + kbuf = segment->kbuf;
>> + else
>> + buf = segment->buf;
>> + ubytes = segment->bufsz;
>> + mbytes = segment->memsz;
>> + maddr = segment->mem;
>> +
>> + /* Initialize the buffer with zeros to allow for smaller input buffers */
>> + memset(ptr, 0, mbytes);
> Would it be more efficient to zero the remainder after performing the copy?
Yup. Changed :).
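Roughly like this (a sketch of the shape for v2, not the final code):

	char *ptr = page_address(segment->cma);

	/* ... copy loop as before, without the up-front memset ... */

	/* Then zero only the tail that had no source data */
	if (segment->memsz > segment->bufsz)
		memset(ptr + segment->bufsz, 0,
		       segment->memsz - segment->bufsz);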
>
>> + /* Then copy from source buffer to the CMA one */
>> + while (mbytes) {
>> + size_t uchunk, mchunk;
>> +
>> + ptr += maddr & ~PAGE_MASK;
>> + mchunk = min_t(size_t, mbytes,
>> + PAGE_SIZE - (maddr & ~PAGE_MASK));
>> + uchunk = min(ubytes, mchunk);
>> +
>> + if (uchunk) {
>> + /* For file based kexec, source pages are in kernel memory */
>> + if (image->file_mode)
>> + memcpy(ptr, kbuf, uchunk);
>> + else
>> + result = copy_from_user(ptr, buf, uchunk);
>> + ubytes -= uchunk;
>> + if (image->file_mode)
>> + kbuf += uchunk;
>> + else
>> + buf += uchunk;
>> + }
>> +
>> + if (result) {
>> + result = -EFAULT;
>> + goto out;
>> + }
>> +
>> + ptr += mchunk;
>> + maddr += mchunk;
>> + mbytes -= mchunk;
>> +
>> + cond_resched();
>> + }
>> +out:
>> + return result;
>> +}
>> +
>>
>> ...
>>
>> --- a/kernel/kexec_file.c
>> +++ b/kernel/kexec_file.c
>>
>> ...
>>
>> @@ -632,6 +635,38 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
>> return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
>> }
>>
>> +static int kexec_alloc_contig(struct kexec_buf *kbuf)
>> +{
>> + u32 pages = (u32)(kbuf->memsz >> PAGE_SHIFT);
> I don't think the cast is needed?
I added it to make it more explicit that we're casting (and hence
potentially losing a few bits). With size_t, that is no longer necessary.
>
>> + unsigned long mem;
>> + struct page *p;
>> +
>> + if (kbuf->image->no_cma)
>> + return -EPERM;
>> +
>> + p = dma_alloc_from_contiguous(NULL, pages, get_order(kbuf->buf_align), true);
> dma_alloc_from_contiguous()'s `count' arg is size_t. Making `pages'
> size_t seems best here. (And nr_pages would be a better identifier!)
>
>
>> + if (!p)
>> + return -EADDRNOTAVAIL;
> EADDRNOTAVAIL is a networking thing. People will be surprised to see
> kexec returning networking errors. Perhaps choose something more
> appropriate?
I was surprised too when I saw it in the surrounding code. It's the
error code that locate_mem_hole() returns, so I copied it. Let me change
it at least in this function to something sensible.
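Something like -ENOMEM for the failed allocation and -EBUSY for the
overlap case, e.g. (sketch):

	p = dma_alloc_from_contiguous(NULL, nr_pages,
				      get_order(kbuf->buf_align), true);
	if (!p)
		return -ENOMEM;	/* instead of -EADDRNOTAVAIL */

	/* ... */

	if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
		dma_release_from_contiguous(NULL, p, nr_pages);
		return -EBUSY;	/* region already claimed */
	}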
Alex
On Mon, 2025-05-12 at 16:59 -0700, Andrew Morton wrote:
> On Mon, 12 May 2025 22:57:52 +0000 Alexander Graf <graf@amazon.com> wrote:
>
> > 2) More robust. Even if by accident some page is still in use for DMA,
> > the new kernel image will be safe from that access because it resides
> > in a memory region that is considered allocated in the old kernel and
> > has a chance to reinitialize that component.
>
> Is this known to be a problem in current code?

Oh $DEITY yes. The Arm Generic Interrupt Controller is, to quote a dear
friend, "a cautionary tale of how not to approach a hardware design".

It does a whole bunch of arbitrary DMA all over the place, and doesn't
even live behind an IOMMU. And doesn't *stop* doing DMA unless you ask
it *really* nicely; merely shutting down the offending high-level
components isn't always enough, because they might still write back
some caches.

Here's one of the latest examples (not actually the one which has been
breaking kexec for us, as far as we know, but an example of the genre):
https://lore.kernel.org/all/20250512140909.3464-1-dssauerw@amazon.de/

So putting the new kernel into a physical memory region which was
considered 'free' by the previous kernel, as Alex explains, is actually
a very good defence-in-depth mechanism to protect against such issues.
On Mon, 12 May 2025 19:02:20 -0700 David Woodhouse <dwmw2@infradead.org> wrote:

> > > 2) More robust. Even if by accident some page is still in use for DMA,
> > > the new kernel image will be safe from that access because it resides
> > > in a memory region that is considered allocated in the old kernel and
> > > has a chance to reinitialize that component.
> >
> > Is this known to be a problem in current code?
>
> Oh $DEITY yes. The Arm Generic Interrupt Controller is, to quote a dear
> friend, "a cautionary tale of how not to approach a hardware design".
>
> [...]
>
> So putting the new kernel into a physical memory region which was
> considered 'free' by the previous kernel, as Alex explains, is actually
> a very good defence-in-depth mechanism to protect against such issues.

Lol, it sounds like you're having fun over there.

Alexander, can you please repackage David's info to your taste and
include it in the changelog? Escalating the value of the patch from
"might speed it up, don't know how much" to "addresses grievous
real-world issues" is helpful to the patch's case!