[PATCH 1/2] kho: add support for preserving vmalloc allocations

Mike Rapoport posted 2 patches 1 month ago
There is a newer version of this series
[PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Mike Rapoport 1 month ago
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

A vmalloc allocation is preserved using a binary structure similar to
the global KHO memory tracker. It's a linked list of pages where each
page is an array of physical addresses of pages in the vmalloc area.

kho_preserve_vmalloc() hands out the physical address of the head page
to the caller. This address is used as the argument to
kho_restore_vmalloc() to restore the mapping in the vmalloc address
space and populate it with the preserved pages.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 include/linux/kexec_handover.h |  12 +++
 kernel/kexec_handover.c        | 140 +++++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+)

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..b7bf3bf11019 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -42,8 +42,10 @@ struct kho_serialization;
 bool kho_is_enabled(void);
 
 int kho_preserve_folio(struct folio *folio);
+int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation);
 int kho_preserve_phys(phys_addr_t phys, size_t size);
 struct folio *kho_restore_folio(phys_addr_t phys);
+void *kho_restore_vmalloc(phys_addr_t preservation);
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
@@ -70,11 +72,21 @@ static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
+{
+	return -EOPNOTSUPP;	/* stub: ignores both arguments, nothing is preserved */
+}
+
 static inline struct folio *kho_restore_folio(phys_addr_t phys)
 {
 	return NULL;
 }
 
+static inline void *kho_restore_vmalloc(phys_addr_t preservation)
+{
+	return NULL;	/* stub: nothing is restored, callers always get NULL */
+}
+
 static inline int kho_add_subtree(struct kho_serialization *ser,
 				  const char *name, void *fdt)
 {
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index ecd1ac210dbd..a11ae79d6bc9 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -18,6 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
 
 #include <asm/early_ioremap.h>
 
@@ -733,6 +734,145 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
 }
 EXPORT_SYMBOL_GPL(kho_preserve_phys);
 
+struct kho_vmalloc_chunk;
+
+struct kho_vmalloc_hdr {	/* bookkeeping at the start of every chunk */
+	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);	/* next chunk in the list */
+	unsigned int total_pages;	/* only valid in the first chunk */
+	unsigned int num_elms;		/* phys[] entries filled in this chunk */
+};
+
+#define KHO_VMALLOC_SIZE				\
+	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+	 sizeof(phys_addr_t))	/* phys[] slots left in a page after the header */
+
+struct kho_vmalloc_chunk {	/* header followed by physical addresses of preserved pages */
+	struct kho_vmalloc_hdr hdr;
+	phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);	/* chunks are handled as whole pages */
+/* Allocate a zeroed chunk page, preserve it, and link it after @cur if given. */
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+	struct kho_vmalloc_chunk *chunk;
+	int err;
+
+	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	err = kho_preserve_phys(virt_to_phys(chunk), PAGE_SIZE);
+	if (err)
+		goto err_free;	/* the error code is dropped: callers only see NULL */
+	if (cur)
+		KHOSER_STORE_PTR(cur->hdr.next, chunk);
+	return chunk;
+
+err_free:
+	kfree(chunk);
+	return NULL;
+}
+/* Unpreserve and free every chunk page of the list starting at @first_chunk. */
+static void kho_vmalloc_free_chunks(struct kho_vmalloc_chunk *first_chunk)
+{
+	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_vmalloc_chunk *chunk = first_chunk;
+
+	while (chunk) {
+		unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+		struct kho_vmalloc_chunk *tmp = chunk;
+		/* NOTE(review): only the chunk page is unpreserved, not the pages in phys[] */
+		__kho_unpreserve(track, pfn, pfn + 1);
+		/* load the link before the page that holds it is freed */
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		kfree(tmp);
+	}
+}
+/* Preserve pages of vmalloc area @ptr; *@preservation gets the first chunk's phys. */
+int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
+{
+	struct kho_vmalloc_chunk *chunk, *first_chunk;
+	struct vm_struct *vm = find_vm_area(ptr);
+	int err;
+
+	if (!vm)
+		return -EINVAL;
+
+	/* we don't support HUGE_VMAP yet */
+	if (get_vm_area_page_order(vm))
+		return -EOPNOTSUPP;
+
+	chunk = new_vmalloc_chunk(NULL);
+	if (!chunk)
+		return -ENOMEM;
+	first_chunk = chunk;
+	first_chunk->hdr.total_pages = vm->nr_pages;
+
+	for (int i = 0; i < vm->nr_pages; i++) {
+		phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+		err = kho_preserve_phys(phys, PAGE_SIZE);
+		if (err)
+			goto err_free;
+
+		chunk->phys[chunk->hdr.num_elms++] = phys;
+		if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->phys)) {
+			err = -ENOMEM;	/* new_vmalloc_chunk() reports failure only as NULL */
+			chunk = new_vmalloc_chunk(chunk);
+			if (!chunk)
+				goto err_free;
+		}
+	}
+
+	*preservation = virt_to_phys(first_chunk);
+	return 0;
+
+err_free:
+	kho_vmalloc_free_chunks(first_chunk);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+/* Restore a preserved vmalloc area from the chunk list at @preservation. */
+void *kho_restore_vmalloc(phys_addr_t preservation)
+{
+	struct kho_vmalloc_chunk *chunk = phys_to_virt(preservation);
+	unsigned int idx = 0, nr = 0;
+	struct page **pages;
+	void *ptr;
+
+	nr = chunk->hdr.total_pages;
+	pages = kvcalloc(nr, sizeof(*pages), GFP_KERNEL);	/* unfilled slots stay NULL */
+	if (!pages)
+		return NULL;
+
+	while (chunk) {
+		struct page *page;
+		/* never store more than the nr entries pages[] was sized for */
+		for (int i = 0; i < chunk->hdr.num_elms && idx < nr; i++) {
+			page = phys_to_page(chunk->phys[i]);
+			kho_restore_page(page, 0);
+			pages[idx++] = page;
+		}
+		/* the chunk page is bookkeeping only: load the link, then release the page */
+		page = virt_to_page(chunk);
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		kho_restore_page(page, 0);
+		__free_page(page);
+	}
+
+	ptr = vmap(pages, nr, VM_MAP_PUT_PAGES, PAGE_KERNEL);
+	if (!ptr)
+		goto err_free_pages_array;	/* NOTE(review): restored pages are not freed */
+
+	return ptr;
+
+err_free_pages_array:
+	kvfree(pages);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
 /* Handling for debug/kho/out */
 
 static struct dentry *debugfs_root;
-- 
2.50.1
Re: [PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Jason Gunthorpe 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 09:30:17AM +0300, Mike Rapoport wrote:
> +int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
> +{
> +	struct kho_vmalloc_chunk *chunk, *first_chunk;
> +	struct vm_struct *vm = find_vm_area(ptr);
> +	int err;
> +
> +	if (!vm)
> +		return -EINVAL;
> +
> +	/* we don't support HUGE_VMAP yet */
> +	if (get_vm_area_page_order(vm))
> +		return -EOPNOTSUPP;

This is a compatibility problem.. Should have some way to indicate
that future kernels have an incompatible serialization so restore can
fail..

> +	chunk = new_vmalloc_chunk(NULL);
> +	if (!chunk)
> +		return -ENOMEM;
> +	first_chunk = chunk;
> +	first_chunk->hdr.total_pages = vm->nr_pages;
> +
> +	for (int i = 0; i < vm->nr_pages; i++) {
> +		phys_addr_t phys = page_to_phys(vm->pages[i]);
> +
> +		err = kho_preserve_phys(phys, PAGE_SIZE);

Don't call kho_preserve_phys() if you already have a page! We should be
getting rid of kho_preserve_phys() :(

Jason
Re: [PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Mike Rapoport 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 09:56:20AM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 03, 2025 at 09:30:17AM +0300, Mike Rapoport wrote:
> > +int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
> > +{
> > +	struct kho_vmalloc_chunk *chunk, *first_chunk;
> > +	struct vm_struct *vm = find_vm_area(ptr);
> > +	int err;
> > +
> > +	if (!vm)
> > +		return -EINVAL;
> > +
> > +	/* we don't support HUGE_VMAP yet */
> > +	if (get_vm_area_page_order(vm))
> > +		return -EOPNOTSUPP;
> 
> This is a compatibility problem.. Should have some way to indicate
> that future kernels have an incompatible serialization so restore can
> fail..

We can add version or flags to kho_vmalloc_chunk, e.g. make it

struct kho_vmalloc_hdr {
	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
	unsigned int total_pages;	/* only valid in the first chunk */
	unsigned short version;		/* only valid in the first chunk */
	unsigned short num_elms;
};

I'm thinking about actually adding support for HUGE_VMAP for the next
respin, but version/flags seems useful anyway.

> > +	chunk = new_vmalloc_chunk(NULL);
> > +	if (!chunk)
> > +		return -ENOMEM;
> > +	first_chunk = chunk;
> > +	first_chunk->hdr.total_pages = vm->nr_pages;
> > +
> > +	for (int i = 0; i < vm->nr_pages; i++) {
> > +		phys_addr_t phys = page_to_phys(vm->pages[i]);
> > +
> > +		err = kho_preserve_phys(phys, PAGE_SIZE);
> 
> > Don't call kho_preserve_phys() if you already have a page!

Ok, I'll add kho_preserve_page() ;-P.

Now seriously, by no means this is a folio, so it's either
kho_preserve_phys() or __kho_preserve_order(). I don't mind switching to
the latter, but I really see no point in doing it.

> We should be getting rid of kho_preserve_phys() :(

How do you suggest to preserve memblock?

> Jason

-- 
Sincerely yours,
Mike.
Re: [PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Jason Gunthorpe 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 06:38:00PM +0300, Mike Rapoport wrote:
> > Don't call kho_preserve_phys() if you already have a page!
> 
> Ok, I'll add kho_preserve_page() ;-P.

Cast it to a folio :P
 
> Now seriously, by no means this is a folio, 

It really is. The entire bitmap thing is about preserving folios/page
which are basically the same thing ATM. folio is the preferred type for
what used to be compound pages.

As Matthew moves ahead it will effectively become preserving
memdescs. This may even start to happen this year..

Every memdesc has a type, so whenever the physical pages are restored
KHO will need to recreate the struct page and page->memdesc with the
correct values, including the memdesc type code and any memdesc
allocation that Matthew plans.

Meaning everything should be struct page or folio based at this API
level, and restore functions should be logically paired with the
allocation functions that created the memory in the first place.

vmalloc() is calling alloc_pages_bulk_node_noprof() to allocate the
memory, so the restore of that memory should also have a 'kho restore
page' type of name that clearly refers back to the allocator it pairs
with.

In the more general case this should be setting the cgroup and
charging it as well.

> How do you suggest to preserve memblock?

Does the memory have a struct page? Then it should be a preserved
folio list so you get back struct pages in the right state for what
memblock is doing. Someday that will turn into some specific memdesc
type and so on.

If it doesn't have a struct page then it shouldn't be in the bitmaps
at all.

Jason
Re: [PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Mike Rapoport 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 02:06:31PM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 03, 2025 at 06:38:00PM +0300, Mike Rapoport wrote:
> > > Don't call kho_preserve_phys() if you already have a page!
>  
> > Now seriously, by no means this is a folio, 
> 
> It really is. The entire bitmap thing is about preserving folios/page
> which are basically the same thing ATM. folio is the preferred type for
> what used to be compound pages.
 
> As Matthew moves ahead it will effectively become preserving
> memdescs. This may even start to happen this year..
> 
> Every memdesc has a type, so whenever the physical pages are restored
> KHO will need to recreate the struct page and page->memdesc with the
> correct values, including the memdesc type code and any memdesc
> allocation that Matthew plans.
> 
> Meaning everything should be struct page or folio based at this API
> level, and restore functions should be logically paired with the
> allocation functions that created the memory in the first place.
> 
> vmalloc() is calling alloc_pages_bulk_node_noprof() to allocate the
> memory, so the restore of that memory should also have a 'kho restore
> page' type of name that clearly refers back to the allocator it pairs
> with.

I'm actually all for having a single entry point
kho_{preserve,restore}_page() that will do

	if (folio)
		do_folio()
	else if (vmalloc)
		do_vmalloc()

etc.

It seems that our major disagreement is about using 'folio' vs 'page' in
the naming.

In my view calling everything 'folio' is a bad idea as we are moving fast
from Ottawa interpretation to New York interpretation of folio.

I'd rather stick to the good old 'page' and when the time comes we can
's/page/memdesc/g' supposing Matthew actually plans for it.

This way we won't need to handle the fallback from divorce of folio from
page. This indeed is less relevant to KHO, but there are a lot of
folio_alloc() in LUO and PCI patches that will have to be changed to a
different allocation apparently this year.

> In the more general case this should be setting the cgroup and
> charging it as well.

Yes, eventually :)

> > How do you suggest to preserve memblock?
> 
> Does the memory have a struct page? Then it should be a preserved
> folio list so you get back struct pages in the right state for what

page list you mean ;-)

> memblock is doing. Someday that will turn into some specific memdesc
> type and so on.

> If it doesn't have a struct page then it shouldn't be in the bitmaps
> at all.

There is a struct page for everything that's memblock_alloc()ed. And we can
do page list, but for large  physically contiguous allocation it does not
make sense. 

I'd rather replace kho_preserve_phys() with kho_preserve_memblock() and add
a restore counterpart to properly set the struct pages for it which we lack
now.

> Jason

-- 
Sincerely yours,
Mike.
Re: [PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Jason Gunthorpe 4 weeks ago
On Wed, Sep 03, 2025 at 10:25:02PM +0300, Mike Rapoport wrote:

> It seems that our major disagreement is about using 'folio' vs 'page' in
> the naming.

It is a folio because folio is the name for something that is a high
order page and it signals that the pointer is the head page. Which is
exactly what KHO preservation works on.

I don't know what the next step is when folio is split - presumably we
will get a new type to represent an abstract memdesc head of a high
order allocation that the lowest KHO primitives will change over to.

> I'd rather stick to the good old 'page' and when the time comes we can
> 's/page/memdesc/g' supposing Matthew actually plans for it.

I think you should just convert from the vmap page to folio for now
and most likely vmap will stop using page someday..
 
> This way we won't need to handle the fallback from divorce of folio from
> page. This indeed is less relevant to KHO, but there are a lot of
> folio_alloc() in LUO and PCI patches that will have to be changed to a
> different allocation apparently this year.

I'm not sure what to do about this, really we should make slab work :\

But yes whatever it gets changed into needs to have matching
restoration. Most likely we will get some kind of API to allocate
PAGE_SIZE units of frozen pages with a special memdesc, so that will
need to be paired through the preserver/restore as well.

> There is a struct page for everything that's memblock_alloc()ed. And we can
> do page list, but for large  physically contiguous allocation it does not
> make sense. 

Arguably you could make them into high order pages and preserve those..
 
> I'd rather replace kho_preserve_phys() with kho_preserve_memblock() and add
> a restore counterpart to properly set the struct pages for it which we lack
> now.

Sure, but my point is if memblock physical memory has struct pages
then you should be preserving the struct pages as struct pages not as
phys. Upon restoring the 64 bytes of struct page memory should be
restored back to whatever memblock expects, which will eventually
include the proper memdesc value.

Jason
Re: [PATCH 1/2] kho: add support for preserving vmalloc allocations
Posted by Mike Rapoport 2 weeks, 3 days ago
On Thu, Sep 04, 2025 at 09:30:32AM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 03, 2025 at 10:25:02PM +0300, Mike Rapoport wrote:
> 
> > It seems that our major disagreement is about using 'folio' vs 'page' in
> > the naming.
> 
> It is a folio because folio is the name for something that is a high
> order page and it signals that the pointer is the head page. Which is
> exactly what KHO preservation works on.

kmalloc_large() and vmalloc(VMAP_HUGE) are not folios and won't be.
 
> I don't know what the next step is when folio is split - presumably we
> will get a new type to represent an abstract memdesc head of a high
> order allocation that the lowest KHO primitives will change over to.
> 
> > I'd rather stick to the good old 'page' and when the time comes we can
> > 's/page/memdesc/g' supposing Matthew actually plans for it.
> 
> I think you should just convert from the vmap page to folio for now
> and most likely vmap will stop using page someday..

This is wrong. vmalloc is not a folio and according to memdesc plan [1] it
will be a page until it becomes a memdesc.
  
> > There is a struct page for everything that's memblock_alloc()ed. And we can
> > do page list, but for large  physically contiguous allocation it does not
> > make sense. 
> 
> Arguably you could make them into high order pages and preserve those..

They are not aligned by order and they may be partially freed starting at
arbitrary page. Making them high order pages will be a mess.

[1] https://kernelnewbies.org/MatthewWilcox/Memdescs

-- 
Sincerely yours,
Mike.