[PATCH v5 08/23] KVM: s390: KVM page table management functions: allocation

Claudio Imbrenda posted 23 patches 1 week ago
[PATCH v5 08/23] KVM: s390: KVM page table management functions: allocation
Posted by Claudio Imbrenda 1 week ago
Add page table management functions to be used for KVM guest (gmap)
page tables.

This patch adds the boilerplate and functions for the allocation and
deallocation of DAT tables.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 arch/s390/kvm/Makefile     |   1 +
 arch/s390/kvm/dat.c        | 103 +++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/dat.h        |  77 +++++++++++++++++++++++++++
 arch/s390/mm/page-states.c |   1 +
 4 files changed, 182 insertions(+)
 create mode 100644 arch/s390/kvm/dat.c

diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 9a723c48b05a..84315d2f75fb 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -9,6 +9,7 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
 kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
+kvm-y += dat.o
 
 kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
new file mode 100644
index 000000000000..c324a27f379f
--- /dev/null
+++ b/arch/s390/kvm/dat.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2007, 2020, 2024
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *		 Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *		 David Hildenbrand <david@redhat.com>
+ *		 Janosch Frank <frankja@linux.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pagewalk.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/pgtable.h>
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/pgalloc.h>
+
+#include <asm/page-states.h>
+#include <asm/tlb.h>
+#include "dat.h"
+
+/**
+ * kvm_s390_mmu_cache_topup() - refill an MMU cache to its full capacity
+ * @mc: the cache to refill
+ *
+ * Allocates with GFP_KERNEL_ACCOUNT until the cache holds
+ * KVM_S390_MMU_CACHE_N_CRSTS CRST tables, KVM_S390_MMU_CACHE_N_PTS page
+ * tables and KVM_S390_MMU_CACHE_N_RMAPS rmap objects. Objects obtained
+ * before a failed allocation stay in the cache and remain accounted for in
+ * the n_* counters, so the function can simply be called again.
+ *
+ * Return: 0 on success, -ENOMEM if an allocation failed.
+ */
+int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc)
+{
+	void *o;
+
+	for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) {
+		o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
+		if (!o)
+			return -ENOMEM;
+		mc->crsts[mc->n_crsts] = o;
+	}
+	for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) {
+		o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+		if (!o)
+			return -ENOMEM;
+		mc->pts[mc->n_pts] = o;
+	}
+	for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) {
+		/*
+		 * mc->rmaps[] stores void pointers, so sizeof(*mc->rmaps[0]) would
+		 * be sizeof(void), i.e. 1 in GNU C. Size the object by the type
+		 * kvm_s390_mmu_cache_alloc_rmap() hands it out as.
+		 */
+		o = kzalloc(sizeof(struct vsie_rmap), GFP_KERNEL_ACCOUNT);
+		if (!o)
+			return -ENOMEM;
+		mc->rmaps[mc->n_rmaps] = o;
+	}
+	return 0;
+}
+
+/*
+ * Obtain a page table through kvm_s390_mmu_cache_alloc_pt() and mark its page
+ * with __arch_set_page_dat(). The table contents are not initialised here.
+ * Returns NULL if no page table could be obtained.
+ */
+static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc)
+{
+	struct page_table *pt = kvm_s390_mmu_cache_alloc_pt(mc);
+
+	if (!pt)
+		return NULL;
+	__arch_set_page_dat(pt, 1);
+	return pt;
+}
+
+/*
+ * Obtain a CRST table through kvm_s390_mmu_cache_alloc_crst() and mark all of
+ * its 1 << CRST_ALLOC_ORDER pages with __arch_set_page_dat(). The table
+ * contents are not initialised here. Returns NULL if no table could be
+ * obtained.
+ */
+static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc)
+{
+	struct crst_table *table = kvm_s390_mmu_cache_alloc_crst(mc);
+
+	if (!table)
+		return NULL;
+	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
+	return table;
+}
+
+/**
+ * dat_alloc_crst_sleepable() - allocate and initialise a CRST table
+ * @init: value handed to crst_table_init() for the new table
+ *
+ * The table is allocated directly with GFP_KERNEL_ACCOUNT (no MMU cache is
+ * involved), its pages are marked with __arch_set_page_dat() and it is then
+ * initialised with crst_table_init().
+ *
+ * Return: the new table, or NULL if the allocation failed.
+ */
+struct crst_table *dat_alloc_crst_sleepable(unsigned long init)
+{
+	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
+	void *table;
+
+	if (!pages)
+		return NULL;
+
+	table = page_to_virt(pages);
+	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
+	crst_table_init(table, init);
+
+	return table;
+}
+
+/**
+ * dat_free_level() - recursively free a CRST table and the tables below it
+ * @table: the table to free
+ * @owns_ptes: if true, page tables referenced by pmd entries are freed too
+ *
+ * Entries that have h.fc or h.i set are skipped (presumably large-frame and
+ * invalid entries, which reference no lower-level table). Every other
+ * non-pmd entry is followed recursively. @table itself is freed last.
+ */
+void dat_free_level(struct crst_table *table, bool owns_ptes)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < _CRST_ENTRIES; idx++) {
+		if (table->crstes[idx].h.fc || table->crstes[idx].h.i)
+			continue;
+		if (is_pmd(table->crstes[idx])) {
+			/* Lowest CRST level: the entry references a page table. */
+			if (owns_ptes)
+				dat_free_pt(dereference_pmd(table->crstes[idx].pmd));
+			continue;
+		}
+		dat_free_level(dereference_crste(table->crstes[idx]), owns_ptes);
+	}
+	dat_free_crst(table);
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 4d2b7a7bf898..486b7dfc5df2 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -418,6 +418,46 @@ struct vsie_rmap {
 
 static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long));
 
+/* Number of objects of each kind held by a fully topped-up cache. */
+#define KVM_S390_MMU_CACHE_N_CRSTS	6
+#define KVM_S390_MMU_CACHE_N_PTS	2
+#define KVM_S390_MMU_CACHE_N_RMAPS	16
+/**
+ * struct kvm_s390_mmu_cache - pool of preallocated memory objects
+ * @crsts: preallocated CRST tables (allocations of order CRST_ALLOC_ORDER)
+ * @pts: preallocated page tables (one page each)
+ * @rmaps: preallocated rmap objects, handed out as struct vsie_rmap
+ * @n_crsts: number of valid entries in @crsts
+ * @n_pts: number of valid entries in @pts
+ * @n_rmaps: number of valid entries in @rmaps
+ *
+ * Each array is used as a stack: kvm_s390_mmu_cache_topup() pushes objects
+ * until the array is full, and the kvm_s390_mmu_cache_alloc_*() helpers pop
+ * them again.
+ */
+struct kvm_s390_mmu_cache {
+	void *crsts[KVM_S390_MMU_CACHE_N_CRSTS];
+	void *pts[KVM_S390_MMU_CACHE_N_PTS];
+	void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS];
+	short int n_crsts;
+	short int n_pts;
+	short int n_rmaps;
+};
+
+void dat_free_level(struct crst_table *table, bool owns_ptes);
+struct crst_table *dat_alloc_crst_sleepable(unsigned long init);
+
+int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc);
+
+/*
+ * Allocation flags used by the kvm_s390_mmu_cache_alloc_*() helpers when the
+ * cache has run empty. They are GFP_ATOMIC based, so the fallback does not
+ * sleep, and callers have to cope with a NULL result.
+ */
+#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN)
+
+/*
+ * Hand out a page table: pop one from the cache if it holds any, otherwise
+ * fall back to a GFP_KVM_S390_MMU_CACHE page allocation, which may return
+ * NULL.
+ */
+static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc)
+{
+	if (!mc->n_pts)
+		return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE);
+	mc->n_pts--;
+	return mc->pts[mc->n_pts];
+}
+
+/*
+ * Hand out a CRST table: pop one from the cache if it holds any, otherwise
+ * fall back to a GFP_KVM_S390_MMU_CACHE allocation of order CRST_ALLOC_ORDER,
+ * which may return NULL.
+ */
+static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc)
+{
+	if (!mc->n_crsts)
+		return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER);
+	mc->n_crsts--;
+	return mc->crsts[mc->n_crsts];
+}
+
+/*
+ * Hand out an rmap object: pop one from the cache if it holds any, otherwise
+ * fall back to a zeroed GFP_KVM_S390_MMU_CACHE allocation, which may return
+ * NULL.
+ */
+static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc)
+{
+	if (!mc->n_rmaps)
+		return kzalloc(sizeof(struct vsie_rmap), GFP_KVM_S390_MMU_CACHE);
+	mc->n_rmaps--;
+	return mc->rmaps[mc->n_rmaps];
+}
+
 static inline struct crst_table *crste_table_start(union crste *crstep)
 {
 	return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE);
@@ -717,4 +757,41 @@ static inline void pgste_set_unlock(union pte *ptep, union pgste pgste)
 	WRITE_ONCE(*pgste_of(ptep), pgste);
 }
 
+/* Release the single page backing the page table @pt. */
+static inline void dat_free_pt(struct page_table *pt)
+{
+	unsigned long addr = (unsigned long)pt;
+
+	free_page(addr);
+}
+
+/* Release the 1 << CRST_ALLOC_ORDER pages backing the CRST table @table. */
+static inline void _dat_free_crst(struct crst_table *table)
+{
+	unsigned long addr = (unsigned long)table;
+
+	free_pages(addr, CRST_ALLOC_ORDER);
+}
+
+/* Free a CRST table after passing the argument through _CRSTP(). */
+#define dat_free_crst(x) _dat_free_crst(_CRSTP(x))
+
+/*
+ * Free every object still held by the cache, then the cache structure
+ * itself. A NULL @mc is accepted and ignored.
+ */
+static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc)
+{
+	if (!mc)
+		return;
+	for ( ; mc->n_pts; mc->n_pts--)
+		dat_free_pt(mc->pts[mc->n_pts - 1]);
+	for ( ; mc->n_crsts; mc->n_crsts--)
+		_dat_free_crst(mc->crsts[mc->n_crsts - 1]);
+	for ( ; mc->n_rmaps; mc->n_rmaps--)
+		kfree(mc->rmaps[mc->n_rmaps - 1]);
+	kfree(mc);
+}
+
+/*
+ * Scope-based cleanup for variables declared with __free(kvm_s390_mmu_cache):
+ * a non-NULL cache is released with kvm_s390_free_mmu_cache() when the
+ * variable goes out of scope.
+ */
+DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T))
+
+/**
+ * kvm_s390_new_mmu_cache() - allocate a new, completely filled MMU cache
+ *
+ * Allocates a zeroed cache structure with GFP_KERNEL_ACCOUNT and tops it up.
+ * If topping up fails, the partially filled cache is released again by the
+ * __free() cleanup handler.
+ *
+ * Return: the new cache, to be released with kvm_s390_free_mmu_cache(), or
+ * NULL if any allocation failed.
+ */
+static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void)
+{
+	/*
+	 * Define and assign in one statement so that the cleanup handler can
+	 * never run on an uninitialised pointer.
+	 */
+	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) =
+		kzalloc(sizeof(*mc), GFP_KERNEL_ACCOUNT);
+
+	if (mc && !kvm_s390_mmu_cache_topup(mc))
+		return_ptr(mc);
+	return NULL;
+}
+
 #endif /* __KVM_S390_DAT_H */
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index 01f9b39e65f5..5bee173db72e 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -13,6 +13,7 @@
 #include <asm/page.h>
 
 int __bootdata_preserved(cmma_flag);
+EXPORT_SYMBOL(cmma_flag);
 
 void arch_free_page(struct page *page, int order)
 {
-- 
2.51.1
Re: [PATCH v5 08/23] KVM: s390: KVM page table management functions: allocation
Posted by Janosch Frank 1 week ago
On 11/24/25 12:55, Claudio Imbrenda wrote:
> Add page table management functions to be used for KVM guest (gmap)
> page tables.
> 
> This patch adds the boilerplate and functions for the allocation and
> deallocation of DAT tables.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>

What's the cache for, why did you add it?

> ---
>   arch/s390/kvm/Makefile     |   1 +
>   arch/s390/kvm/dat.c        | 103 +++++++++++++++++++++++++++++++++++++
>   arch/s390/kvm/dat.h        |  77 +++++++++++++++++++++++++++
>   arch/s390/mm/page-states.c |   1 +
>   4 files changed, 182 insertions(+)
>   create mode 100644 arch/s390/kvm/dat.c
> 
> diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> index 9a723c48b05a..84315d2f75fb 100644
> --- a/arch/s390/kvm/Makefile
> +++ b/arch/s390/kvm/Makefile
> @@ -9,6 +9,7 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
>   
>   kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
>   kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
> +kvm-y += dat.o
>   
>   kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
>   obj-$(CONFIG_KVM) += kvm.o
> diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
> new file mode 100644
> index 000000000000..c324a27f379f
> --- /dev/null
> +++ b/arch/s390/kvm/dat.c
> @@ -0,0 +1,103 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + *  KVM guest address space mapping code
> + *
> + *    Copyright IBM Corp. 2007, 2020, 2024

Should definitely add 2025

> + *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
> + *		 Martin Schwidefsky <schwidefsky@de.ibm.com>
> + *		 David Hildenbrand <david@redhat.com>
> + *		 Janosch Frank <frankja@linux.ibm.com>

Did you retain the authors here because you bring in code from other 
files with these authors in the future?
Re: [PATCH v5 08/23] KVM: s390: KVM page table management functions: allocation
Posted by Claudio Imbrenda 1 week ago
On Mon, 24 Nov 2025 13:27:24 +0100
Janosch Frank <frankja@linux.ibm.com> wrote:

> On 11/24/25 12:55, Claudio Imbrenda wrote:
> > Add page table management functions to be used for KVM guest (gmap)
> > page tables.
> > 
> > This patch adds the boilerplate and functions for the allocation and
> > deallocation of DAT tables.
> > 
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>  
> 
> What's the cache for, why did you add it?

the cache is needed to allocate memory when we can sleep (to avoid
atomic allocations), and use it when holding spinlocks (when we can't
sleep).

this is similar to what other architectures do, except that in their
case they only have one type of page table to worry about, we need at
least 3 types of objects.

unlike other architectures, allocations from the cache can fail, and
the calling code needs to handle failures (e.g. by replenishing the
cache and trying again)

> 
> > ---
> >   arch/s390/kvm/Makefile     |   1 +
> >   arch/s390/kvm/dat.c        | 103 +++++++++++++++++++++++++++++++++++++
> >   arch/s390/kvm/dat.h        |  77 +++++++++++++++++++++++++++
> >   arch/s390/mm/page-states.c |   1 +
> >   4 files changed, 182 insertions(+)
> >   create mode 100644 arch/s390/kvm/dat.c
> > 
> > diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> > index 9a723c48b05a..84315d2f75fb 100644
> > --- a/arch/s390/kvm/Makefile
> > +++ b/arch/s390/kvm/Makefile
> > @@ -9,6 +9,7 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
> >   
> >   kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
> >   kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
> > +kvm-y += dat.o
> >   
> >   kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
> >   obj-$(CONFIG_KVM) += kvm.o
> > diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
> > new file mode 100644
> > index 000000000000..c324a27f379f
> > --- /dev/null
> > +++ b/arch/s390/kvm/dat.c
> > @@ -0,0 +1,103 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + *  KVM guest address space mapping code
> > + *
> > + *    Copyright IBM Corp. 2007, 2020, 2024  
> 
> Should definitely add 2025

yes

> 
> > + *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
> > + *		 Martin Schwidefsky <schwidefsky@de.ibm.com>
> > + *		 David Hildenbrand <david@redhat.com>
> > + *		 Janosch Frank <frankja@linux.ibm.com>  
> 
> Did you retain the authors here because you bring in code from other 
> files with these authors in the future?

in the beginning I had copied stuff from other files, but things have
changed a lot. do you think I should drop the other names?
Re: [PATCH v5 08/23] KVM: s390: KVM page table management functions: allocation
Posted by Janosch Frank 1 week ago
On 11/24/25 13:41, Claudio Imbrenda wrote:
> On Mon, 24 Nov 2025 13:27:24 +0100
> Janosch Frank <frankja@linux.ibm.com> wrote:
> 
>> On 11/24/25 12:55, Claudio Imbrenda wrote:
>>> Add page table management functions to be used for KVM guest (gmap)
>>> page tables.
>>>
>>> This patch adds the boilerplate and functions for the allocation and
>>> deallocation of DAT tables.
>>>
>>> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>
>> What's the cache for, why did you add it?
> 
> the cache is needed to allocate memory when we can sleep (to avoid
> atomic allocations), and use it when holding spinlocks (when we can't
> sleep).
> 
> this is similar to what other architectures do, except that in their
> case they only have one type of page table to worry about, we need at
> least 3 types of objects.
> 
> unlike other architectures, allocations from the cache can fail, and
> the calling code needs to handle failures (e.g. by replenishing the
> cache and trying again)

Well, this information should make it into this patch, either into the 
description or in the body.

> 
>>
>>> ---
>>>    arch/s390/kvm/Makefile     |   1 +
>>>    arch/s390/kvm/dat.c        | 103 +++++++++++++++++++++++++++++++++++++
>>>    arch/s390/kvm/dat.h        |  77 +++++++++++++++++++++++++++
>>>    arch/s390/mm/page-states.c |   1 +
>>>    4 files changed, 182 insertions(+)
>>>    create mode 100644 arch/s390/kvm/dat.c