[PATCH v4 2/2] RISC-V: KVM: Split huge pages during fault handling for dirty logging

wang.yechao255@zte.com.cn posted 1 patch 2 days, 22 hours ago
arch/riscv/include/asm/kvm_gstage.h |   4 +
arch/riscv/kvm/gstage.c             | 126 ++++++++++++++++++++++++++++
2 files changed, 130 insertions(+)
[PATCH v4 2/2] RISC-V: KVM: Split huge pages during fault handling for dirty logging
Posted by wang.yechao255@zte.com.cn 2 days, 22 hours ago
From: Wang Yechao <wang.yechao255@zte.com.cn>

During dirty logging, all huge pages are write-protected. When the guest
writes to a write-protected huge page, a page fault is triggered. Before
recovering the write permission, the huge page must be split into smaller
pages (e.g., 4K). After splitting, the normal mapping process proceeds,
allowing write permission to be restored at the smaller page granularity.

If dirty logging is disabled because migration failed or was cancelled,
only recover the write permission at the 4K level, and skip recovering the
huge page mapping at this time to avoid the overhead of freeing page tables.
The huge page mapping can be recovered in the ioctl context, similar to x86,
in a later patch.

Signed-off-by: Wang Yechao <wang.yechao255@zte.com.cn>
---
 arch/riscv/include/asm/kvm_gstage.h |   4 +
 arch/riscv/kvm/gstage.c             | 126 ++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)

diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
index 595e2183173e..373748c6745e 100644
--- a/arch/riscv/include/asm/kvm_gstage.h
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 			      bool page_rdonly, bool page_exec,
 			      struct kvm_gstage_mapping *out_map);

+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+                                struct kvm_mmu_memory_cache *pcache,
+                                gpa_t addr, u32 target_level, bool flush);
+
 enum kvm_riscv_gstage_op {
 	GSTAGE_OP_NOP = 0,	/* Nothing */
 	GSTAGE_OP_CLEAR,	/* Clear/Unmap */
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
index d2001d508046..ffec3e5ddcaf 100644
--- a/arch/riscv/kvm/gstage.c
+++ b/arch/riscv/kvm/gstage.c
@@ -163,13 +163,32 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
 	return 0;
 }

+static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level,
+					     gpa_t addr, pte_t *ptep, pgprot_t prot)
+{
+	pte_t new_pte;
+
+	if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot))
+		return;
+
+	new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot);
+	new_pte = pte_mkdirty(new_pte);
+
+	set_pte(ptep, new_pte);
+
+	gstage_tlb_flush(gstage, level, addr);
+}
+
 int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 			      struct kvm_mmu_memory_cache *pcache,
 			      gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
 			      bool page_rdonly, bool page_exec,
 			      struct kvm_gstage_mapping *out_map)
 {
+	bool found_leaf;
+	u32 ptep_level;
 	pgprot_t prot;
+	pte_t *ptep;
 	int ret;

 	out_map->addr = gpa;
@@ -203,12 +222,119 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 		else
 			prot = PAGE_WRITE;
 	}
+
+	found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level);
+	if (found_leaf) {
+		/*
+		 * ptep_level is the current gstage mapping level of addr, out_map->level
+		 * is the required mapping level during fault handling.
+		 *
+		 * 1) ptep_level > out_map->level
+		 * This happens when dirty logging is enabled and huge pages are used.
+		 * KVM must track the pages at 4K level, and split the huge mapping
+		 * into 4K mappings.
+		 *
+		 * 2) ptep_level < out_map->level
+		 * This happens when dirty logging is disabled and huge pages are used.
+		 * The gstage is split into 4K mappings, but the out_map level is now
+		 * back to the huge page level. Ignore the out_map level this time, and
+		 * just update the pte prot here. Otherwise, we would fall back to mapping
+		 * the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the
+		 * overhead of freeing the page tables (not supported yet), which would slow
+		 * down the vCPUs' performance.
+		 *
+		 * It is better to recover the huge page mapping in the ioctl context when
+		 * disabling dirty logging.
+		 *
+		 * 3) ptep_level == out_map->level
+		 * We already have the ptep, just update the pte prot if the pfn does not change.
+		 * There is no need to invoke `kvm_riscv_gstage_set_pte` again.
+		 */
+		if (ptep_level > out_map->level) {
+			kvm_riscv_gstage_split_huge(gstage, pcache, gpa,
+						    out_map->level, true);
+		} else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) {
+			kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot);
+			return 0;
+		}
+	}
+
 	out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
 	out_map->pte = pte_mkdirty(out_map->pte);

 	return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
 }

+static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
+					   unsigned long child_page_size)
+{
+	unsigned long child_pte = huge_pte;
+	unsigned long child_pfn_offset;
+
+	/*
+	 * The child_pte already has the base address of the huge page being
+	 * split. So we just have to OR in the offset to the page at the next
+	 * lower level for the given index.
+	 */
+	child_pfn_offset = index * (child_page_size / PAGE_SIZE);
+	child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0)));
+
+	return child_pte;
+}
+
+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+				struct kvm_mmu_memory_cache *pcache,
+				gpa_t addr, u32 target_level, bool flush)
+{
+	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	pte_t *next_ptep = (pte_t *)gstage->pgd;
+	unsigned long huge_pte, child_pte;
+	unsigned long child_page_size;
+	pte_t *ptep;
+	int i, ret;
+
+	if (!pcache)
+		return -ENOMEM;
+
+	while(current_level > target_level) {
+		ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
+
+		if (!pte_val(ptep_get(ptep)))
+			break;
+
+		if (!gstage_pte_leaf(ptep)) {
+			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+			current_level--;
+			continue;
+		}
+
+		huge_pte = pte_val(ptep_get(ptep));
+
+		ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
+		if (ret)
+			return ret;
+
+		next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+		if (!next_ptep)
+			return -ENOMEM;
+
+		for (i = 0; i < PTRS_PER_PTE; i++) {
+			child_pte = make_child_pte(huge_pte, i, child_page_size);
+			set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
+		}
+
+		set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
+				__pgprot(_PAGE_TABLE)));
+
+		if (flush)
+			gstage_tlb_flush(gstage, current_level, addr);
+
+		current_level--;
+	}
+
+	return 0;
+}
+
 void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 			     pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
 {
-- 
2.47.3
Re:  [PATCH v4 2/2] RISC-V: KVM: Split huge pages during fault handling for dirty logging
Posted by Anup Patel 2 days, 21 hours ago
On Mon, Mar 30, 2026 at 1:43 PM <wang.yechao255@zte.com.cn> wrote:
>
> From: Wang Yechao <wang.yechao255@zte.com.cn>
>
> During dirty logging, all huge pages are write-protected. When the guest
> writes to a write-protected huge page, a page fault is triggered. Before
> recovering the write permission, the huge page must be split into smaller
> pages (e.g., 4K). After splitting, the normal mapping process proceeds,
> allowing write permission to be restored at the smaller page granularity.
>
> If dirty logging is disabled because migration failed or was cancelled,
> only recover the write permission at the 4K level, and skip recovering the
> huge page mapping at this time to avoid the overhead of freeing page tables.
> The huge page mapping can be recovered in the ioctl context, similar to x86,
> in a later patch.
>
> Signed-off-by: Wang Yechao <wang.yechao255@zte.com.cn>

LGTM.

Reviewed-by: Anup Patel <anup@brainfault.org>

Thanks,
Anup

> ---
>  arch/riscv/include/asm/kvm_gstage.h |   4 +
>  arch/riscv/kvm/gstage.c             | 126 ++++++++++++++++++++++++++++
>  2 files changed, 130 insertions(+)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..373748c6745e 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>                               bool page_rdonly, bool page_exec,
>                               struct kvm_gstage_mapping *out_map);
>
> +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> +                                struct kvm_mmu_memory_cache *pcache,
> +                                gpa_t addr, u32 target_level, bool flush);
> +
>  enum kvm_riscv_gstage_op {
>         GSTAGE_OP_NOP = 0,      /* Nothing */
>         GSTAGE_OP_CLEAR,        /* Clear/Unmap */
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index d2001d508046..ffec3e5ddcaf 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -163,13 +163,32 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>         return 0;
>  }
>
> +static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level,
> +                                            gpa_t addr, pte_t *ptep, pgprot_t prot)
> +{
> +       pte_t new_pte;
> +
> +       if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot))
> +               return;
> +
> +       new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot);
> +       new_pte = pte_mkdirty(new_pte);
> +
> +       set_pte(ptep, new_pte);
> +
> +       gstage_tlb_flush(gstage, level, addr);
> +}
> +
>  int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>                               struct kvm_mmu_memory_cache *pcache,
>                               gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
>                               bool page_rdonly, bool page_exec,
>                               struct kvm_gstage_mapping *out_map)
>  {
> +       bool found_leaf;
> +       u32 ptep_level;
>         pgprot_t prot;
> +       pte_t *ptep;
>         int ret;
>
>         out_map->addr = gpa;
> @@ -203,12 +222,119 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>                 else
>                         prot = PAGE_WRITE;
>         }
> +
> +       found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level);
> +       if (found_leaf) {
> +               /*
> +                * ptep_level is the current gstage mapping level of addr, out_map->level
> +                * is the required mapping level during fault handling.
> +                *
> +                * 1) ptep_level > out_map->level
> +                * This happens when dirty logging is enabled and huge pages are used.
> +                * KVM must track the pages at 4K level, and split the huge mapping
> +                * into 4K mappings.
> +                *
> +                * 2) ptep_level < out_map->level
> +                * This happens when dirty logging is disabled and huge pages are used.
> +                * The gstage is split into 4K mappings, but the out_map level is now
> +                * back to the huge page level. Ignore the out_map level this time, and
> +                * just update the pte prot here. Otherwise, we would fall back to mapping
> +                * the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the
> +                * overhead of freeing the page tables(not support now), which would slow
> +                * down the vCPUs' performance.
> +                *
> +                * It is better to recover the huge page mapping in the ioctl context when
> +                * disabling dirty logging.
> +                *
> +                * 3) ptep_level == out_map->level
> +                * We already have the ptep, just update the pte prot if the pfn not change.
> +                * There is no need to invoke `kvm_riscv_gstage_set_pte` again.
> +                */
> +               if (ptep_level > out_map->level) {
> +                       kvm_riscv_gstage_split_huge(gstage, pcache, gpa,
> +                                                   out_map->level, true);
> +               } else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) {
> +                       kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot);
> +                       return 0;
> +               }
> +       }
> +
>         out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
>         out_map->pte = pte_mkdirty(out_map->pte);
>
>         return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
>  }
>
> +static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
> +                                          unsigned long child_page_size)
> +{
> +       unsigned long child_pte = huge_pte;
> +       unsigned long child_pfn_offset;
> +
> +       /*
> +        * The child_pte already has the base address of the huge page being
> +        * split. So we just have to OR in the offset to the page at the next
> +        * lower level for the given index.
> +        */
> +       child_pfn_offset = index * (child_page_size / PAGE_SIZE);
> +       child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0)));
> +
> +       return child_pte;
> +}
> +
> +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> +                               struct kvm_mmu_memory_cache *pcache,
> +                               gpa_t addr, u32 target_level, bool flush)
> +{
> +       u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +       pte_t *next_ptep = (pte_t *)gstage->pgd;
> +       unsigned long huge_pte, child_pte;
> +       unsigned long child_page_size;
> +       pte_t *ptep;
> +       int i, ret;
> +
> +       if (!pcache)
> +               return -ENOMEM;
> +
> +       while(current_level > target_level) {
> +               ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
> +
> +               if (!pte_val(ptep_get(ptep)))
> +                       break;
> +
> +               if (!gstage_pte_leaf(ptep)) {
> +                       next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> +                       current_level--;
> +                       continue;
> +               }
> +
> +               huge_pte = pte_val(ptep_get(ptep));
> +
> +               ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
> +               if (ret)
> +                       return ret;
> +
> +               next_ptep = kvm_mmu_memory_cache_alloc(pcache);
> +               if (!next_ptep)
> +                       return -ENOMEM;
> +
> +               for (i = 0; i < PTRS_PER_PTE; i++) {
> +                       child_pte = make_child_pte(huge_pte, i, child_page_size);
> +                       set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
> +               }
> +
> +               set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
> +                               __pgprot(_PAGE_TABLE)));
> +
> +               if (flush)
> +                       gstage_tlb_flush(gstage, current_level, addr);
> +
> +               current_level--;
> +       }
> +
> +       return 0;
> +}
> +
>  void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                              pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
>  {
> --
> 2.47.3
>
> --
> kvm-riscv mailing list
> kvm-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kvm-riscv